1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
21#include "SIInstrInfo.h"
23#include "SIRegisterInfo.h"
25#include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use "
49 "rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
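// For example, <3 x s16> rounds up to <4 x s16> and <5 x s32> rounds up to
// <8 x s32>; vectors whose element count is already a power of two are
// returned unchanged.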
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
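// For example, s24 becomes s32 and s48 becomes s64; power-of-two scalars are
// returned unchanged.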
68
69/// \returns true if this is an odd-sized vector that should be widened by
70/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
71/// This excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
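// For example, <3 x s16> (48 bits) matches, while <4 x s16>, <3 x s32> and
// <3 x s1> do not (even element count, 32-bit elements, and s1 elements,
// respectively).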
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
109
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
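// For example, <4 x s32> (128 bits, two 64-bit pieces) becomes <2 x s32>, and
// <6 x s16> (96 bits, two pieces) becomes <3 x s16>.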
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
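// For example, <3 x s16> (48 bits) is padded to the next 32-bit multiple
// (64 bits), giving <4 x s16>; <5 x s8> (40 bits) becomes <8 x s8>.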
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160}
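// For example, assuming no SGPR register class exists between 384 and 512
// bits, a <13 x s32> (416 bits) would be widened to <16 x s32> (512 bits).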
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
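// For sizes above 32 bits the result is a vector of s32: e.g. <6 x s16>
// (96 bits) maps to <3 x s32> and <8 x s8> (64 bits) maps to <2 x s32>.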
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242static bool isRegisterType(LLT Ty) {
243 if (!isRegisterSize(Ty.getSizeInBits()))
244 return false;
245
246 if (Ty.isVector())
247 return isRegisterVectorType(Ty);
248
249 return true;
250}
251
252// Any combination of 32 or 64-bit elements up to the maximum register size, and
253// multiples of v2s16.
254static LegalityPredicate isRegisterType(unsigned TypeIdx) {
255 return [=](const LegalityQuery &Query) {
256 return isRegisterType(Query.Types[TypeIdx]);
257 };
258}
259
260// RegisterType that doesn't have a corresponding RegClass.
261static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
262 return [=](const LegalityQuery &Query) {
263 LLT Ty = Query.Types[TypeIdx];
264 return isRegisterType(Ty) &&
265 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
266 };
267}
268
269static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
270 return [=](const LegalityQuery &Query) {
271 const LLT QueryTy = Query.Types[TypeIdx];
272 if (!QueryTy.isVector())
273 return false;
274 const LLT EltTy = QueryTy.getElementType();
275 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
276 };
277}
278
279// If we have a truncating store or an extending load with a data size larger
280// than 32-bits, we need to reduce to a 32-bit type.
281static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
282 return [=](const LegalityQuery &Query) {
283 const LLT Ty = Query.Types[TypeIdx];
284 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
285 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
286 };
287}
288
289// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
290// handle some operations by just promoting the register during
291// selection. There are also d16 loads on GFX9+ which preserve the high bits.
292static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
293 bool IsLoad, bool IsAtomic) {
294 switch (AS) {
295 case AMDGPUAS::PRIVATE_ADDRESS:
296 // FIXME: Private element size.
297 return ST.enableFlatScratch() ? 128 : 32;
298 case AMDGPUAS::LOCAL_ADDRESS:
299 return ST.useDS128() ? 128 : 64;
300 case AMDGPUAS::GLOBAL_ADDRESS:
301 case AMDGPUAS::CONSTANT_ADDRESS:
302 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
303 case AMDGPUAS::BUFFER_RESOURCE:
304 // Treat constant and global as identical. SMRD loads are sometimes usable for
305 // global loads (ideally constant address space should be eliminated)
306 // depending on the context. Legality cannot be context dependent, but
307 // RegBankSelect can split the load as necessary depending on the pointer
308 // register bank/uniformity and if the memory is invariant or not written in a
309 // kernel.
310 return IsLoad ? 512 : 128;
311 default:
312 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
313 // if they may alias scratch depending on the subtarget. This needs to be
314 // moved to custom handling to use addressMayBeAccessedAsPrivate
315 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
316 }
317}
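// For example, a global or constant-address-space load may be up to 512 bits
// wide (it can still be split later as needed), while a global store is
// limited to 128 bits.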
318
319static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
320 const LegalityQuery &Query) {
321 const LLT Ty = Query.Types[0];
322
323 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
324 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
325
326 unsigned RegSize = Ty.getSizeInBits();
327 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
328 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
329 unsigned AS = Query.Types[1].getAddressSpace();
330
331 // All of these need to be custom lowered to cast the pointer operand.
332 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
333 return false;
334
335 // Do not handle extending vector loads.
336 if (Ty.isVector() && MemSize != RegSize)
337 return false;
338
339 // TODO: We should be able to widen loads if the alignment is high enough, but
340 // we also need to modify the memory access size.
341#if 0
342 // Accept widening loads based on alignment.
343 if (IsLoad && MemSize < Size)
344 MemSize = std::max(MemSize, Align);
345#endif
346
347 // Only 1-byte and 2-byte to 32-bit extloads are valid.
348 if (MemSize != RegSize && RegSize != 32)
349 return false;
350
351 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
352 Query.MMODescrs[0].Ordering !=
353 AtomicOrdering::NotAtomic))
354 return false;
355
356 switch (MemSize) {
357 case 8:
358 case 16:
359 case 32:
360 case 64:
361 case 128:
362 break;
363 case 96:
364 if (!ST.hasDwordx3LoadStores())
365 return false;
366 break;
367 case 256:
368 case 512:
369 // These may contextually need to be broken down.
370 break;
371 default:
372 return false;
373 }
374
375 assert(RegSize >= MemSize);
376
377 if (AlignBits < MemSize) {
378 const SITargetLowering *TLI = ST.getTargetLowering();
379 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
380 Align(AlignBits / 8)))
381 return false;
382 }
383
384 return true;
385}
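// For example, an extending load of an 8-bit or 16-bit memory type into a
// 32-bit register is accepted, but extending into a 64-bit register is not,
// and a 96-bit access additionally requires dwordx3 load/store support.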
386
387// The newer buffer intrinsic forms take their resource arguments as
388// pointers in address space 8, aka s128 values. However, in order to not break
389// SelectionDAG, the underlying operations have to continue to take v4i32
390// arguments. Therefore, we convert resource pointers (or vectors of them)
391// to integer values here.
392static bool hasBufferRsrcWorkaround(const LLT Ty) {
393 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
394 return true;
395 if (Ty.isVector()) {
396 const LLT ElemTy = Ty.getElementType();
397 return hasBufferRsrcWorkaround(ElemTy);
398 }
399 return false;
400}
401
402// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
403// workaround this. Eventually it should ignore the type for loads and only care
404// about the size. Return true in cases where we will workaround this for now by
405// bitcasting.
406static bool loadStoreBitcastWorkaround(const LLT Ty) {
407 if (EnableNewLegality)
408 return false;
409
410 const unsigned Size = Ty.getSizeInBits();
411 if (Size <= 64)
412 return false;
413 // Address space 8 pointers get their own workaround.
414 if (hasBufferRsrcWorkaround(Ty))
415 return false;
416 if (!Ty.isVector())
417 return true;
418
419 LLT EltTy = Ty.getElementType();
420 if (EltTy.isPointer())
421 return true;
422
423 unsigned EltSize = EltTy.getSizeInBits();
424 return EltSize != 32 && EltSize != 64;
425}
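// For example, <8 x s16> (128 bits) reports true here and may later be
// bitcast to <4 x s32> for selection, while <4 x s32> and <2 x s64> report
// false.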
426
427static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
428 const LLT Ty = Query.Types[0];
429 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
430 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
431}
432
433/// Return true if a load or store of the type should be lowered with a bitcast
434/// to a different type.
435static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
436 const LLT MemTy) {
437 const unsigned MemSizeInBits = MemTy.getSizeInBits();
438 const unsigned Size = Ty.getSizeInBits();
439 if (Size != MemSizeInBits)
440 return Size <= 32 && Ty.isVector();
441
442 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
443 return true;
444
445 // Don't try to handle bitcasting vector ext loads for now.
446 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
447 (Size <= 32 || isRegisterSize(Size)) &&
448 !loadStoreBitcastWorkaround(MemTy);
449}
450
451/// Return true if we should legalize a load by widening an odd-sized memory
452/// access up to the alignment. Note that in this case the memory access itself
453/// changes, not the size of the result register.
454static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
455 uint64_t AlignInBits, unsigned AddrSpace,
456 unsigned Opcode) {
457 unsigned SizeInBits = MemoryTy.getSizeInBits();
458 // We don't want to widen cases that are naturally legal.
459 if (isPowerOf2_32(SizeInBits))
460 return false;
461
462 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
463 // end up widening these for a scalar load during RegBankSelect, since there
464 // aren't 96-bit scalar loads.
465 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
466 return false;
467
468 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
469 return false;
470
471 // A load is known dereferenceable up to the alignment, so it's legal to widen
472 // to it.
473 //
474 // TODO: Could check dereferenceable for less aligned cases.
475 unsigned RoundedSize = NextPowerOf2(SizeInBits);
476 if (AlignInBits < RoundedSize)
477 return false;
478
479 // Do not widen if it would introduce a slow unaligned load.
480 const SITargetLowering *TLI = ST.getTargetLowering();
481 unsigned Fast = 0;
482 return TLI->allowsMisalignedMemoryAccessesImpl(
483 RoundedSize, AddrSpace, Align(AlignInBits / 8),
484 MachineMemOperand::MOLoad, &Fast) &&
485 Fast;
486}
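// For example, a 96-bit load with 128-bit alignment may be widened to a
// 128-bit load on subtargets without dwordx3 load/store support, provided the
// target lowering reports the widened access as fast.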
487
488static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
489 unsigned Opcode) {
490 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
491 return false;
492
493 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
494 Query.MMODescrs[0].AlignInBits,
495 Query.Types[1].getAddressSpace(), Opcode);
496}
497
498/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
499/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
500/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
501static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
502 MachineRegisterInfo &MRI, unsigned Idx) {
503 MachineOperand &MO = MI.getOperand(Idx);
504
505 const LLT PointerTy = MRI.getType(MO.getReg());
506
507 // Paranoidly prevent us from doing this multiple times.
508 if (!hasBufferRsrcWorkaround(PointerTy))
509 return PointerTy;
510
511 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
512 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
513 if (!PointerTy.isVector()) {
514 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
515 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
516 const LLT S32 = LLT::scalar(32);
517
518 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
519 std::array<Register, 4> VectorElems;
520 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
521 for (unsigned I = 0; I < NumParts; ++I)
522 VectorElems[I] =
523 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
524 B.buildMergeValues(MO, VectorElems);
525 MO.setReg(VectorReg);
526 return VectorTy;
527 }
528 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
529 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
530 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
531 B.buildIntToPtr(MO, Scalar);
532 MO.setReg(BitcastReg);
533
534 return VectorTy;
535}
536
537/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
538/// the form in which the value must be in order to be passed to the low-level
539/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
540/// needed in order to account for the fact that we can't define a register
541/// class for s128 without breaking SelectionDAG.
542static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
543 MachineRegisterInfo &MRI = *B.getMRI();
544 const LLT PointerTy = MRI.getType(Pointer);
545 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
546 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
547
548 if (!PointerTy.isVector()) {
549 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
550 SmallVector<Register, 4> PointerParts;
551 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
552 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
553 for (unsigned I = 0; I < NumParts; ++I)
554 PointerParts.push_back(Unmerged.getReg(I));
555 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
556 }
557 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
558 return B.buildBitcast(VectorTy, Scalar).getReg(0);
559}
560
561static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
562 unsigned Idx) {
563 MachineOperand &MO = MI.getOperand(Idx);
564
565 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
566 // Paranoidly prevent us from doing this multiple times.
567 if (!hasBufferRsrcWorkaround(PointerTy))
568 return;
569 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
570}
571
572AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
573 const GCNTargetMachine &TM)
574 : ST(ST_) {
575 using namespace TargetOpcode;
576
577 auto GetAddrSpacePtr = [&TM](unsigned AS) {
578 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
579 };
580
581 const LLT S1 = LLT::scalar(1);
582 const LLT S8 = LLT::scalar(8);
583 const LLT S16 = LLT::scalar(16);
584 const LLT S32 = LLT::scalar(32);
585 const LLT S64 = LLT::scalar(64);
586 const LLT S128 = LLT::scalar(128);
587 const LLT S256 = LLT::scalar(256);
588 const LLT S512 = LLT::scalar(512);
589 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
590
591 const LLT V2S8 = LLT::fixed_vector(2, 8);
592 const LLT V2S16 = LLT::fixed_vector(2, 16);
593 const LLT V4S16 = LLT::fixed_vector(4, 16);
594
595 const LLT V2S32 = LLT::fixed_vector(2, 32);
596 const LLT V3S32 = LLT::fixed_vector(3, 32);
597 const LLT V4S32 = LLT::fixed_vector(4, 32);
598 const LLT V5S32 = LLT::fixed_vector(5, 32);
599 const LLT V6S32 = LLT::fixed_vector(6, 32);
600 const LLT V7S32 = LLT::fixed_vector(7, 32);
601 const LLT V8S32 = LLT::fixed_vector(8, 32);
602 const LLT V9S32 = LLT::fixed_vector(9, 32);
603 const LLT V10S32 = LLT::fixed_vector(10, 32);
604 const LLT V11S32 = LLT::fixed_vector(11, 32);
605 const LLT V12S32 = LLT::fixed_vector(12, 32);
606 const LLT V13S32 = LLT::fixed_vector(13, 32);
607 const LLT V14S32 = LLT::fixed_vector(14, 32);
608 const LLT V15S32 = LLT::fixed_vector(15, 32);
609 const LLT V16S32 = LLT::fixed_vector(16, 32);
610 const LLT V32S32 = LLT::fixed_vector(32, 32);
611
612 const LLT V2S64 = LLT::fixed_vector(2, 64);
613 const LLT V3S64 = LLT::fixed_vector(3, 64);
614 const LLT V4S64 = LLT::fixed_vector(4, 64);
615 const LLT V5S64 = LLT::fixed_vector(5, 64);
616 const LLT V6S64 = LLT::fixed_vector(6, 64);
617 const LLT V7S64 = LLT::fixed_vector(7, 64);
618 const LLT V8S64 = LLT::fixed_vector(8, 64);
619 const LLT V16S64 = LLT::fixed_vector(16, 64);
620
621 std::initializer_list<LLT> AllS32Vectors =
622 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
623 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
624 std::initializer_list<LLT> AllS64Vectors =
625 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
626
627 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
628 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
629 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
630 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
631 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
632 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
633 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
634 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
635 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
636
637 const LLT CodePtr = FlatPtr;
638
639 const std::initializer_list<LLT> AddrSpaces64 = {
640 GlobalPtr, ConstantPtr, FlatPtr
641 };
642
643 const std::initializer_list<LLT> AddrSpaces32 = {
644 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
645 };
646
647 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
648
649 const std::initializer_list<LLT> FPTypesBase = {
650 S32, S64
651 };
652
653 const std::initializer_list<LLT> FPTypes16 = {
654 S32, S64, S16
655 };
656
657 const std::initializer_list<LLT> FPTypesPK16 = {
658 S32, S64, S16, V2S16
659 };
660
661 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
662
663 // s1 for VCC branches, s32 for SCC branches.
664 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
665
666 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
667 // elements for v3s16
668 getActionDefinitionsBuilder(G_PHI)
669 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
670 .legalFor(AllS32Vectors)
671 .legalFor(AllS64Vectors)
672 .legalFor(AddrSpaces64)
673 .legalFor(AddrSpaces32)
674 .legalFor(AddrSpaces128)
675 .legalIf(isPointer(0))
676 .clampScalar(0, S16, S256)
677 .widenScalarToNextPow2(0, 32)
678 .clampMaxNumElements(0, S32, 16)
679 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
680 .scalarize(0);
681
682 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
683 // Full set of gfx9 features.
684 getActionDefinitionsBuilder({G_ADD, G_SUB})
685 .legalFor({S32, S16, V2S16})
686 .clampMaxNumElementsStrict(0, S16, 2)
687 .scalarize(0)
688 .minScalar(0, S16)
689 .widenScalarToNextMultipleOf(0, 32)
690 .maxScalar(0, S32);
691
692 getActionDefinitionsBuilder(G_MUL)
693 .legalFor({S32, S16, V2S16})
694 .clampMaxNumElementsStrict(0, S16, 2)
695 .scalarize(0)
696 .minScalar(0, S16)
697 .widenScalarToNextMultipleOf(0, 32)
698 .custom();
699 assert(ST.hasMad64_32());
700
701 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
702 .legalFor({S32, S16, V2S16}) // Clamp modifier
703 .minScalarOrElt(0, S16)
704 .clampMaxNumElementsStrict(0, S16, 2)
705 .scalarize(0)
706 .widenScalarToNextPow2(0, 32)
707 .lower();
708 } else if (ST.has16BitInsts()) {
709 getActionDefinitionsBuilder({G_ADD, G_SUB})
710 .legalFor({S32, S16})
711 .minScalar(0, S16)
712 .widenScalarToNextMultipleOf(0, 32)
713 .maxScalar(0, S32)
714 .scalarize(0);
715
716 getActionDefinitionsBuilder(G_MUL)
717 .legalFor({S32, S16})
718 .scalarize(0)
719 .minScalar(0, S16)
720 .widenScalarToNextMultipleOf(0, 32)
721 .custom();
722 assert(ST.hasMad64_32());
723
724 // Technically the saturating operations require clamp bit support, but this
725 // was introduced at the same time as 16-bit operations.
726 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
727 .legalFor({S32, S16}) // Clamp modifier
728 .minScalar(0, S16)
729 .scalarize(0)
730 .widenScalarToNextPow2(0, 16)
731 .lower();
732
733 // We're just lowering this, but it helps get a better result to try to
734 // coerce to the desired type first.
735 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
736 .minScalar(0, S16)
737 .scalarize(0)
738 .lower();
739 } else {
740 getActionDefinitionsBuilder({G_ADD, G_SUB})
741 .legalFor({S32})
742 .widenScalarToNextMultipleOf(0, 32)
743 .clampScalar(0, S32, S32)
744 .scalarize(0);
745
746 auto &Mul = getActionDefinitionsBuilder(G_MUL)
747 .legalFor({S32})
748 .scalarize(0)
749 .minScalar(0, S32)
750 .widenScalarToNextMultipleOf(0, 32);
751
752 if (ST.hasMad64_32())
753 Mul.custom();
754 else
755 Mul.maxScalar(0, S32);
756
757 if (ST.hasIntClamp()) {
758 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
759 .legalFor({S32}) // Clamp modifier.
760 .scalarize(0)
761 .minScalarOrElt(0, S32)
762 .lower();
763 } else {
764 // Clamp bit support was added in VI, along with 16-bit operations.
765 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
766 .minScalar(0, S32)
767 .scalarize(0)
768 .lower();
769 }
770
771 // FIXME: DAG expansion gets better results. The widening uses the smaller
772 // range values and goes for the min/max lowering directly.
773 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
774 .minScalar(0, S32)
775 .scalarize(0)
776 .lower();
777 }
778
779 getActionDefinitionsBuilder(
780 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
781 .customFor({S32, S64})
782 .clampScalar(0, S32, S64)
783 .widenScalarToNextPow2(0, 32)
784 .scalarize(0);
785
786 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
787 .legalFor({S32})
788 .maxScalar(0, S32);
789
790 if (ST.hasVOP3PInsts()) {
791 Mulh
792 .clampMaxNumElements(0, S8, 2)
793 .lowerFor({V2S8});
794 }
795
796 Mulh
797 .scalarize(0)
798 .lower();
799
800 // Report legal for any types we can handle anywhere. For the cases only legal
801 // on the SALU, RegBankSelect will be able to re-legalize.
802 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
803 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
804 .clampScalar(0, S32, S64)
805 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
806 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
807 .widenScalarToNextPow2(0)
808 .scalarize(0);
809
810 getActionDefinitionsBuilder(
811 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
812 .legalFor({{S32, S1}, {S32, S32}})
813 .clampScalar(0, S32, S32)
814 .scalarize(0);
815
816 getActionDefinitionsBuilder(G_BITCAST)
817 // Don't worry about the size constraint.
818 .legalIf(all(isRegisterType(0), isRegisterType(1)))
819 .lower();
820
821
822 getActionDefinitionsBuilder(G_CONSTANT)
823 .legalFor({S1, S32, S64, S16, GlobalPtr,
824 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
825 .legalIf(isPointer(0))
826 .clampScalar(0, S32, S64)
827 .widenScalarToNextPow2(0);
828
829 getActionDefinitionsBuilder(G_FCONSTANT)
830 .legalFor({S32, S64, S16})
831 .clampScalar(0, S16, S64);
832
833 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
834 .legalIf(isRegisterType(0))
835 // s1 and s16 are special cases because they have legal operations on
836 // them, but don't really occupy registers in the normal way.
837 .legalFor({S1, S16})
838 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
839 .clampScalarOrElt(0, S32, MaxScalar)
840 .widenScalarToNextPow2(0, 32)
841 .clampMaxNumElements(0, S32, 16);
842
843 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
844
845 // If the amount is divergent, we have to do a wave reduction to get the
846 // maximum value, so this is expanded during RegBankSelect.
847 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
848 .legalFor({{PrivatePtr, S32}});
849
850 getActionDefinitionsBuilder(G_STACKSAVE)
851 .customFor({PrivatePtr});
852 getActionDefinitionsBuilder(G_STACKRESTORE)
853 .legalFor({PrivatePtr});
854
855 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
856 .customIf(typeIsNot(0, PrivatePtr));
857
858 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
859
860 auto &FPOpActions = getActionDefinitionsBuilder(
861 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
862 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
863 .legalFor({S32, S64});
864 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
865 .customFor({S32, S64});
866 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
867 .customFor({S32, S64});
868
869 if (ST.has16BitInsts()) {
870 if (ST.hasVOP3PInsts())
871 FPOpActions.legalFor({S16, V2S16});
872 else
873 FPOpActions.legalFor({S16});
874
875 TrigActions.customFor({S16});
876 FDIVActions.customFor({S16});
877 }
878
879 if (ST.hasPackedFP32Ops()) {
880 FPOpActions.legalFor({V2S32});
881 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
882 }
883
884 auto &MinNumMaxNum = getActionDefinitionsBuilder({
885 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
886
887 if (ST.hasVOP3PInsts()) {
888 MinNumMaxNum.customFor(FPTypesPK16)
889 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
890 .clampMaxNumElements(0, S16, 2)
891 .clampScalar(0, S16, S64)
892 .scalarize(0);
893 } else if (ST.has16BitInsts()) {
894 MinNumMaxNum.customFor(FPTypes16)
895 .clampScalar(0, S16, S64)
896 .scalarize(0);
897 } else {
898 MinNumMaxNum.customFor(FPTypesBase)
899 .clampScalar(0, S32, S64)
900 .scalarize(0);
901 }
902
903 if (ST.hasVOP3PInsts())
904 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
905
906 FPOpActions
907 .scalarize(0)
908 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
909
910 TrigActions
911 .scalarize(0)
912 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
913
914 FDIVActions
915 .scalarize(0)
916 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
917
918 getActionDefinitionsBuilder({G_FNEG, G_FABS})
919 .legalFor(FPTypesPK16)
920 .clampMaxNumElementsStrict(0, S16, 2)
921 .scalarize(0)
922 .clampScalar(0, S16, S64);
923
924 if (ST.has16BitInsts()) {
925 getActionDefinitionsBuilder(G_FSQRT)
926 .legalFor({S16})
927 .customFor({S32, S64})
928 .scalarize(0)
929 .unsupported();
930 getActionDefinitionsBuilder(G_FFLOOR)
931 .legalFor({S32, S64, S16})
932 .scalarize(0)
933 .clampScalar(0, S16, S64);
934
935 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
936 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
937 .scalarize(0)
938 .maxScalarIf(typeIs(0, S16), 1, S16)
939 .clampScalar(1, S32, S32)
940 .lower();
941
942 getActionDefinitionsBuilder(G_FFREXP)
943 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
944 .scalarize(0)
945 .lower();
946 } else {
947 getActionDefinitionsBuilder(G_FSQRT)
948 .customFor({S32, S64, S16})
949 .scalarize(0)
950 .unsupported();
951
952
953 if (ST.hasFractBug()) {
954 getActionDefinitionsBuilder(G_FFLOOR)
955 .customFor({S64})
956 .legalFor({S32, S64})
957 .scalarize(0)
958 .clampScalar(0, S32, S64);
959 } else {
960 getActionDefinitionsBuilder(G_FFLOOR)
961 .legalFor({S32, S64})
962 .scalarize(0)
963 .clampScalar(0, S32, S64);
964 }
965
966 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
967 .legalFor({{S32, S32}, {S64, S32}})
968 .scalarize(0)
969 .clampScalar(0, S32, S64)
970 .clampScalar(1, S32, S32)
971 .lower();
972
973 getActionDefinitionsBuilder(G_FFREXP)
974 .customFor({{S32, S32}, {S64, S32}})
975 .scalarize(0)
976 .minScalar(0, S32)
977 .clampScalar(1, S32, S32)
978 .lower();
979 }
980
981 getActionDefinitionsBuilder(G_FPTRUNC)
982 .legalFor({{S32, S64}, {S16, S32}})
983 .scalarize(0)
984 .lower();
985
986 getActionDefinitionsBuilder(G_FPEXT)
987 .legalFor({{S64, S32}, {S32, S16}})
988 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
989 .scalarize(0);
990
991 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
992 if (ST.has16BitInsts()) {
993 FSubActions
994 // Use actual fsub instruction
995 .legalFor({S32, S16})
996 // Must use fadd + fneg
997 .lowerFor({S64, V2S16});
998 } else {
999 FSubActions
1000 // Use actual fsub instruction
1001 .legalFor({S32})
1002 // Must use fadd + fneg
1003 .lowerFor({S64, S16, V2S16});
1004 }
1005
1006 FSubActions
1007 .scalarize(0)
1008 .clampScalar(0, S32, S64);
1009
1010 // Whether this is legal depends on the floating point mode for the function.
1011 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1012 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1013 FMad.customFor({S32, S16});
1014 else if (ST.hasMadMacF32Insts())
1015 FMad.customFor({S32});
1016 else if (ST.hasMadF16())
1017 FMad.customFor({S16});
1018 FMad.scalarize(0)
1019 .lower();
1020
1021 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1022 if (ST.has16BitInsts()) {
1023 FRem.customFor({S16, S32, S64});
1024 } else {
1025 FRem.minScalar(0, S32)
1026 .customFor({S32, S64});
1027 }
1028 FRem.scalarize(0);
1029
1030 // TODO: Do we need to clamp maximum bitwidth?
1031 getActionDefinitionsBuilder(G_TRUNC)
1032 .legalIf(isScalar(0))
1033 .legalFor({{V2S16, V2S32}})
1034 .clampMaxNumElements(0, S16, 2)
1035 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1036 // situations (like an invalid implicit use), we don't want to infinite loop
1037 // in the legalizer.
1038 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1039 .alwaysLegal();
1040
1041 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1042 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1043 {S32, S1}, {S64, S1}, {S16, S1}})
1044 .scalarize(0)
1045 .clampScalar(0, S32, S64)
1046 .widenScalarToNextPow2(1, 32);
1047
1048 // TODO: Split s1->s64 during regbankselect for VALU.
1049 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1050 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1051 .lowerIf(typeIs(1, S1))
1052 .customFor({{S32, S64}, {S64, S64}});
1053 if (ST.has16BitInsts())
1054 IToFP.legalFor({{S16, S16}});
1055 IToFP.clampScalar(1, S32, S64)
1056 .minScalar(0, S32)
1057 .scalarize(0)
1058 .widenScalarToNextPow2(1);
1059
1060 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1061 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1062 .customFor({{S64, S32}, {S64, S64}})
1063 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1064 if (ST.has16BitInsts())
1065 FPToI.legalFor({{S16, S16}});
1066 else
1067 FPToI.minScalar(1, S32);
1068
1069 FPToI.minScalar(0, S32)
1070 .widenScalarToNextPow2(0, 32)
1071 .scalarize(0)
1072 .lower();
1073
1074 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1075 .customFor({S16, S32})
1076 .scalarize(0)
1077 .lower();
1078
1079 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1080 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1081 .scalarize(0)
1082 .lower();
1083
1084 if (ST.has16BitInsts()) {
1085 getActionDefinitionsBuilder(
1086 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1087 .legalFor({S16, S32, S64})
1088 .clampScalar(0, S16, S64)
1089 .scalarize(0);
1090 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1091 getActionDefinitionsBuilder(
1092 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1093 .legalFor({S32, S64})
1094 .clampScalar(0, S32, S64)
1095 .scalarize(0);
1096 } else {
1097 getActionDefinitionsBuilder(
1098 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1099 .legalFor({S32})
1100 .customFor({S64})
1101 .clampScalar(0, S32, S64)
1102 .scalarize(0);
1103 }
1104
1105 getActionDefinitionsBuilder(G_PTR_ADD)
1106 .unsupportedFor({BufferFatPtr, RsrcPtr})
1107 .legalIf(all(isPointer(0), sameSize(0, 1)))
1108 .scalarize(0)
1109 .scalarSameSizeAs(1, 0);
1110
1111 getActionDefinitionsBuilder(G_PTRMASK)
1112 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1113 .scalarSameSizeAs(1, 0)
1114 .scalarize(0);
1115
1116 auto &CmpBuilder =
1117 getActionDefinitionsBuilder(G_ICMP)
1118 // The compare output type differs based on the register bank of the output,
1119 // so make both s1 and s32 legal.
1120 //
1121 // Scalar compares producing output in scc will be promoted to s32, as that
1122 // is the allocatable register type that will be needed for the copy from
1123 // scc. This will be promoted during RegBankSelect, and we assume something
1124 // before that won't try to use s32 result types.
1125 //
1126 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1127 // bank.
1128 .legalForCartesianProduct(
1129 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1130 .legalForCartesianProduct(
1131 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1132 if (ST.has16BitInsts()) {
1133 CmpBuilder.legalFor({{S1, S16}});
1134 }
1135
1136 CmpBuilder
1137 .widenScalarToNextPow2(1)
1138 .clampScalar(1, S32, S64)
1139 .scalarize(0)
1140 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1141
1142 auto &FCmpBuilder =
1143 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1144 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1145
1146 if (ST.hasSALUFloatInsts())
1147 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1148
1149 FCmpBuilder
1150 .widenScalarToNextPow2(1)
1151 .clampScalar(1, S32, S64)
1152 .scalarize(0);
1153
1154 // FIXME: fpow has a selection pattern that should move to custom lowering.
1155 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1156 if (ST.has16BitInsts())
1157 ExpOps.customFor({{S32}, {S16}});
1158 else
1159 ExpOps.customFor({S32});
1160 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1161 .scalarize(0);
1162
1163 getActionDefinitionsBuilder(G_FPOWI)
1164 .clampScalar(0, MinScalarFPTy, S32)
1165 .lower();
1166
1167 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1168 Log2Ops.customFor({S32});
1169 if (ST.has16BitInsts())
1170 Log2Ops.legalFor({S16});
1171 else
1172 Log2Ops.customFor({S16});
1173 Log2Ops.scalarize(0)
1174 .lower();
1175
1176 auto &LogOps =
1177 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1178 LogOps.customFor({S32, S16});
1179 LogOps.clampScalar(0, MinScalarFPTy, S32)
1180 .scalarize(0);
1181
1182 // The 64-bit versions produce 32-bit results, but only on the SALU.
1183 getActionDefinitionsBuilder(G_CTPOP)
1184 .legalFor({{S32, S32}, {S32, S64}})
1185 .clampScalar(0, S32, S32)
1186 .widenScalarToNextPow2(1, 32)
1187 .clampScalar(1, S32, S64)
1188 .scalarize(0)
1189 .widenScalarToNextPow2(0, 32);
1190
1191 // If no 16-bit instruction is available, lower into different instructions.
1192 if (ST.has16BitInsts())
1193 getActionDefinitionsBuilder(G_IS_FPCLASS)
1194 .legalForCartesianProduct({S1}, FPTypes16)
1195 .widenScalarToNextPow2(1)
1196 .scalarize(0)
1197 .lower();
1198 else
1199 getActionDefinitionsBuilder(G_IS_FPCLASS)
1200 .legalForCartesianProduct({S1}, FPTypesBase)
1201 .lowerFor({S1, S16})
1202 .widenScalarToNextPow2(1)
1203 .scalarize(0)
1204 .lower();
1205
1206 // The hardware instructions return a different result on 0 than the generic
1207 // instructions expect. The hardware produces -1, but these produce the
1208 // bitwidth.
1209 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1210 .scalarize(0)
1211 .clampScalar(0, S32, S32)
1212 .clampScalar(1, S32, S64)
1213 .widenScalarToNextPow2(0, 32)
1214 .widenScalarToNextPow2(1, 32)
1215 .custom();
1216
1217 // The 64-bit versions produce 32-bit results, but only on the SALU.
1218 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1219 .legalFor({{S32, S32}, {S32, S64}})
1220 .clampScalar(0, S32, S32)
1221 .clampScalar(1, S32, S64)
1222 .scalarize(0)
1223 .widenScalarToNextPow2(0, 32)
1224 .widenScalarToNextPow2(1, 32);
1225
1226 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1227 // RegBankSelect.
1228 getActionDefinitionsBuilder(G_BITREVERSE)
1229 .legalFor({S32, S64})
1230 .clampScalar(0, S32, S64)
1231 .scalarize(0)
1232 .widenScalarToNextPow2(0);
1233
1234 if (ST.has16BitInsts()) {
1235 getActionDefinitionsBuilder(G_BSWAP)
1236 .legalFor({S16, S32, V2S16})
1237 .clampMaxNumElementsStrict(0, S16, 2)
1238 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1239 // narrowScalar limitation.
1240 .widenScalarToNextPow2(0)
1241 .clampScalar(0, S16, S32)
1242 .scalarize(0);
1243
1244 if (ST.hasVOP3PInsts()) {
1245 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1246 .legalFor({S32, S16, V2S16})
1247 .clampMaxNumElements(0, S16, 2)
1248 .minScalar(0, S16)
1249 .widenScalarToNextPow2(0)
1250 .scalarize(0)
1251 .lower();
1252 } else {
1253 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1254 .legalFor({S32, S16})
1255 .widenScalarToNextPow2(0)
1256 .minScalar(0, S16)
1257 .scalarize(0)
1258 .lower();
1259 }
1260 } else {
1261 // TODO: Should have same legality without v_perm_b32
1262 getActionDefinitionsBuilder(G_BSWAP)
1263 .legalFor({S32})
1264 .lowerIf(scalarNarrowerThan(0, 32))
1265 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1266 // narrowScalar limitation.
1267 .widenScalarToNextPow2(0)
1268 .maxScalar(0, S32)
1269 .scalarize(0)
1270 .lower();
1271
1272 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1273 .legalFor({S32})
1274 .minScalar(0, S32)
1275 .widenScalarToNextPow2(0)
1276 .scalarize(0)
1277 .lower();
1278 }
1279
1280 getActionDefinitionsBuilder(G_INTTOPTR)
1281 // List the common cases
1282 .legalForCartesianProduct(AddrSpaces64, {S64})
1283 .legalForCartesianProduct(AddrSpaces32, {S32})
1284 .scalarize(0)
1285 // Accept any address space as long as the size matches
1286 .legalIf(sameSize(0, 1))
1287 .widenScalarIf(smallerThan(1, 0),
1288 [](const LegalityQuery &Query) {
1289 return std::pair(
1290 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1291 })
1292 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1293 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1294 });
1295
1296 getActionDefinitionsBuilder(G_PTRTOINT)
1297 // List the common cases
1298 .legalForCartesianProduct(AddrSpaces64, {S64})
1299 .legalForCartesianProduct(AddrSpaces32, {S32})
1300 .scalarize(0)
1301 // Accept any address space as long as the size matches
1302 .legalIf(sameSize(0, 1))
1303 .widenScalarIf(smallerThan(0, 1),
1304 [](const LegalityQuery &Query) {
1305 return std::pair(
1306 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1307 })
1308 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1309 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1310 });
1311
1312 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1313 .scalarize(0)
1314 .custom();
1315
1316 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1317 bool IsLoad) -> bool {
1318 const LLT DstTy = Query.Types[0];
1319
1320 // Split vector extloads.
1321 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1322
1323 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1324 return true;
1325
1326 const LLT PtrTy = Query.Types[1];
1327 unsigned AS = PtrTy.getAddressSpace();
1328 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1329 Query.MMODescrs[0].Ordering !=
1330 AtomicOrdering::NotAtomic))
1331 return true;
1332
1333 // Catch weird sized loads that don't evenly divide into the access sizes
1334 // TODO: May be able to widen depending on alignment etc.
1335 unsigned NumRegs = (MemSize + 31) / 32;
1336 if (NumRegs == 3) {
1337 if (!ST.hasDwordx3LoadStores())
1338 return true;
1339 } else {
1340 // If the alignment allows, these should have been widened.
1341 if (!isPowerOf2_32(NumRegs))
1342 return true;
1343 }
1344
1345 return false;
1346 };
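// For example, a 96-bit (dwordx3) access is split on subtargets without
// dwordx3 load/store support, and any access wider than the per-address-space
// maximum above is split as well.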
1347
1348 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1349 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1350 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1351
1352 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1353 // LDS
1354 // TODO: Unsupported flat for SI.
1355
1356 for (unsigned Op : {G_LOAD, G_STORE}) {
1357 const bool IsStore = Op == G_STORE;
1358
1359 auto &Actions = getActionDefinitionsBuilder(Op);
1360 // Explicitly list some common cases.
1361 // TODO: Does this help compile time at all?
1362 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1363 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1364 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1365 {S64, GlobalPtr, S64, GlobalAlign32},
1366 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1367 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1368 {S32, GlobalPtr, S8, GlobalAlign8},
1369 {S32, GlobalPtr, S16, GlobalAlign16},
1370
1371 {S32, LocalPtr, S32, 32},
1372 {S64, LocalPtr, S64, 32},
1373 {V2S32, LocalPtr, V2S32, 32},
1374 {S32, LocalPtr, S8, 8},
1375 {S32, LocalPtr, S16, 16},
1376 {V2S16, LocalPtr, S32, 32},
1377
1378 {S32, PrivatePtr, S32, 32},
1379 {S32, PrivatePtr, S8, 8},
1380 {S32, PrivatePtr, S16, 16},
1381 {V2S16, PrivatePtr, S32, 32},
1382
1383 {S32, ConstantPtr, S32, GlobalAlign32},
1384 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1385 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1386 {S64, ConstantPtr, S64, GlobalAlign32},
1387 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1388 Actions.legalIf(
1389 [=](const LegalityQuery &Query) -> bool {
1390 return isLoadStoreLegal(ST, Query);
1391 });
1392
1393 // The custom pointers (fat pointers, buffer resources) don't work with load
1394 // and store at this level. Fat pointers should have been lowered to
1395 // intrinsics before the translation to MIR.
1396 Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));
1397
1398 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1399 // ptrtoint. This is needed to account for the fact that we can't have i128
1400 // as a register class for SelectionDAG reasons.
1401 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1402 return hasBufferRsrcWorkaround(Query.Types[0]);
1403 });
1404
1405 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1406 // 64-bits.
1407 //
1408 // TODO: Should generalize bitcast action into coerce, which will also cover
1409 // inserting addrspacecasts.
1410 Actions.customIf(typeIs(1, Constant32Ptr));
1411
1412 // Turn any illegal element vectors into something easier to deal
1413 // with. These will ultimately produce 32-bit scalar shifts to extract the
1414 // parts anyway.
1415 //
1416 // For odd 16-bit element vectors, prefer to split those into pieces with
1417 // 16-bit vector parts.
1418 Actions.bitcastIf(
1419 [=](const LegalityQuery &Query) -> bool {
1420 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1421 Query.MMODescrs[0].MemoryTy);
1422 }, bitcastToRegisterType(0));
1423
1424 if (!IsStore) {
1425 // Widen suitably aligned loads by loading extra bytes. The standard
1426 // legalization actions can't properly express widening memory operands.
1427 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1428 return shouldWidenLoad(ST, Query, G_LOAD);
1429 });
1430 }
1431
1432 // FIXME: load/store narrowing should be moved to lower action
1433 Actions
1434 .narrowScalarIf(
1435 [=](const LegalityQuery &Query) -> bool {
1436 return !Query.Types[0].isVector() &&
1437 needToSplitMemOp(Query, Op == G_LOAD);
1438 },
1439 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1440 const LLT DstTy = Query.Types[0];
1441 const LLT PtrTy = Query.Types[1];
1442
1443 const unsigned DstSize = DstTy.getSizeInBits();
1444 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1445
1446 // Split extloads.
1447 if (DstSize > MemSize)
1448 return std::pair(0, LLT::scalar(MemSize));
1449
1450 unsigned MaxSize = maxSizeForAddrSpace(
1451 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1452 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1453 if (MemSize > MaxSize)
1454 return std::pair(0, LLT::scalar(MaxSize));
1455
1456 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1457 return std::pair(0, LLT::scalar(Align));
1458 })
1459 .fewerElementsIf(
1460 [=](const LegalityQuery &Query) -> bool {
1461 return Query.Types[0].isVector() &&
1462 needToSplitMemOp(Query, Op == G_LOAD);
1463 },
1464 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1465 const LLT DstTy = Query.Types[0];
1466 const LLT PtrTy = Query.Types[1];
1467
1468 LLT EltTy = DstTy.getElementType();
1469 unsigned MaxSize = maxSizeForAddrSpace(
1470 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1471 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1472
1473 // FIXME: Handle widened to power of 2 results better. This ends
1474 // up scalarizing.
1475 // FIXME: 3 element stores scalarized on SI
1476
1477 // Split if it's too large for the address space.
1478 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1479 if (MemSize > MaxSize) {
1480 unsigned NumElts = DstTy.getNumElements();
1481 unsigned EltSize = EltTy.getSizeInBits();
1482
1483 if (MaxSize % EltSize == 0) {
1484 return std::pair(
1485 0, LLT::scalarOrVector(
1486 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1487 }
1488
1489 unsigned NumPieces = MemSize / MaxSize;
1490
1491 // FIXME: Refine when odd breakdowns handled
1492 // The scalars will need to be re-legalized.
1493 if (NumPieces == 1 || NumPieces >= NumElts ||
1494 NumElts % NumPieces != 0)
1495 return std::pair(0, EltTy);
1496
1497 return std::pair(0,
1498 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1499 }
1500
1501 // FIXME: We could probably handle weird extending loads better.
1502 if (DstTy.getSizeInBits() > MemSize)
1503 return std::pair(0, EltTy);
1504
1505 unsigned EltSize = EltTy.getSizeInBits();
1506 unsigned DstSize = DstTy.getSizeInBits();
1507 if (!isPowerOf2_32(DstSize)) {
1508 // We're probably decomposing an odd sized store. Try to split
1509 // to the widest type. TODO: Account for alignment. As-is it
1510 // should be OK, since the new parts will be further legalized.
1511 unsigned FloorSize = llvm::bit_floor(DstSize);
1512 return std::pair(
1513 0, LLT::scalarOrVector(
1514 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1515 }
1516
1517 // May need relegalization for the scalars.
1518 return std::pair(0, EltTy);
1519 })
1520 .minScalar(0, S32)
1521 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1522 .widenScalarToNextPow2(0)
1523 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1524 .lower();
1525 }
1526
1527 // FIXME: Unaligned accesses not lowered.
1528 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1529 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1530 {S32, GlobalPtr, S16, 2 * 8},
1531 {S32, LocalPtr, S8, 8},
1532 {S32, LocalPtr, S16, 16},
1533 {S32, PrivatePtr, S8, 8},
1534 {S32, PrivatePtr, S16, 16},
1535 {S32, ConstantPtr, S8, 8},
1536 {S32, ConstantPtr, S16, 2 * 8}})
1537 .legalIf(
1538 [=](const LegalityQuery &Query) -> bool {
1539 return isLoadStoreLegal(ST, Query);
1540 });
1541
1542 if (ST.hasFlatAddressSpace()) {
1543 ExtLoads.legalForTypesWithMemDesc(
1544 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1545 }
1546
1547 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1548 // 64-bits.
1549 //
1550 // TODO: Should generalize bitcast action into coerce, which will also cover
1551 // inserting addrspacecasts.
1552 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1553
1554 ExtLoads.clampScalar(0, S32, S32)
1555 .widenScalarToNextPow2(0)
1556 .lower();
1557
1558 auto &Atomics = getActionDefinitionsBuilder(
1559 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1560 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1561 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1562 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1563 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1564 {S64, GlobalPtr}, {S64, LocalPtr},
1565 {S32, RegionPtr}, {S64, RegionPtr}});
1566 if (ST.hasFlatAddressSpace()) {
1567 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1568 }
1569
1570 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1571 if (ST.hasLDSFPAtomicAdd()) {
1572 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1573 if (ST.hasGFX90AInsts())
1574 Atomic.legalFor({{S64, LocalPtr}});
1575 if (ST.hasAtomicDsPkAdd16Insts())
1576 Atomic.legalFor({{V2S16, LocalPtr}});
1577 }
1578 if (ST.hasAtomicFaddInsts())
1579 Atomic.legalFor({{S32, GlobalPtr}});
1580 if (ST.hasFlatAtomicFaddF32Inst())
1581 Atomic.legalFor({{S32, FlatPtr}});
1582
1583 if (ST.hasGFX90AInsts()) {
1584 // These are legal with some caveats, and should have undergone expansion in
1585 // the IR in most situations
1586 // TODO: Move atomic expansion into legalizer
1587 Atomic.legalFor({
1588 {S32, GlobalPtr},
1589 {S64, GlobalPtr},
1590 {S64, FlatPtr}
1591 });
1592 }
1593
1594 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1595 // demarshalling
1596 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1597 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1598 {S32, FlatPtr}, {S64, FlatPtr}})
1599 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1600 {S32, RegionPtr}, {S64, RegionPtr}});
1601 // TODO: Pointer types, any 32-bit or 64-bit vector
1602
1603 // Condition should be s32 for scalar, s1 for vector.
1604 getActionDefinitionsBuilder(G_SELECT)
1605 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1606 LocalPtr, FlatPtr, PrivatePtr,
1607 LLT::fixed_vector(2, LocalPtr),
1608 LLT::fixed_vector(2, PrivatePtr)},
1609 {S1, S32})
1610 .clampScalar(0, S16, S64)
1611 .scalarize(1)
1612 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1613 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1614 .clampMaxNumElements(0, S32, 2)
1615 .clampMaxNumElements(0, LocalPtr, 2)
1616 .clampMaxNumElements(0, PrivatePtr, 2)
1617 .scalarize(0)
1618 .widenScalarToNextPow2(0)
1619 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1620
1621 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1622 // be more flexible with the shift amount type.
1623 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1624 .legalFor({{S32, S32}, {S64, S32}});
1625 if (ST.has16BitInsts()) {
1626 if (ST.hasVOP3PInsts()) {
1627 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1628 .clampMaxNumElements(0, S16, 2);
1629 } else
1630 Shifts.legalFor({{S16, S16}});
1631
1632 // TODO: Support 16-bit shift amounts for all types
1633 Shifts.widenScalarIf(
1634 [=](const LegalityQuery &Query) {
1635 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1636 // 32-bit amount.
1637 const LLT ValTy = Query.Types[0];
1638 const LLT AmountTy = Query.Types[1];
1639 return ValTy.getSizeInBits() <= 16 &&
1640 AmountTy.getSizeInBits() < 16;
1641 }, changeTo(1, S16));
1642 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1643 Shifts.clampScalar(1, S32, S32);
1644 Shifts.widenScalarToNextPow2(0, 16);
1645 Shifts.clampScalar(0, S16, S64);
1646
1647 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1648 .minScalar(0, S16)
1649 .scalarize(0)
1650 .lower();
1651 } else {
1652 // Make sure we legalize the shift amount type first, as the general
1653 // expansion for the shifted type will produce much worse code if it hasn't
1654 // been truncated already.
1655 Shifts.clampScalar(1, S32, S32);
1656 Shifts.widenScalarToNextPow2(0, 32);
1657 Shifts.clampScalar(0, S32, S64);
1658
1659 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1660 .minScalar(0, S32)
1661 .scalarize(0)
1662 .lower();
1663 }
1664 Shifts.scalarize(0);
1665
1666 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1667 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1668 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1669 unsigned IdxTypeIdx = 2;
1670
1671 getActionDefinitionsBuilder(Op)
1672 .customIf([=](const LegalityQuery &Query) {
1673 const LLT EltTy = Query.Types[EltTypeIdx];
1674 const LLT VecTy = Query.Types[VecTypeIdx];
1675 const LLT IdxTy = Query.Types[IdxTypeIdx];
1676 const unsigned EltSize = EltTy.getSizeInBits();
1677 const bool isLegalVecType =
1678 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1679 // Address space 8 pointers are 128-bit wide values, but the logic
1680 // below will try to bitcast them to 2N x s64, which will fail.
1681 // Therefore, as an intermediate step, wrap extracts/insertions by
1682 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1683 // extraction result) in order to produce a vector operation that can
1684 // be handled by the logic below.
1685 if (EltTy.isPointer() && EltSize > 64)
1686 return true;
1687 return (EltSize == 32 || EltSize == 64) &&
1688 VecTy.getSizeInBits() % 32 == 0 &&
1689 VecTy.getSizeInBits() <= MaxRegisterSize &&
1690 IdxTy.getSizeInBits() == 32 &&
1691 isLegalVecType;
1692 })
1693 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1694 bitcastToVectorElement32(VecTypeIdx))
1695 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1696 .bitcastIf(
1697 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1698 [=](const LegalityQuery &Query) {
1699 // For > 64-bit element types, try to turn this into a 64-bit
1700 // element vector since we may be able to do better indexing
1701 // if this is scalar. If not, fall back to 32.
1702 const LLT EltTy = Query.Types[EltTypeIdx];
1703 const LLT VecTy = Query.Types[VecTypeIdx];
1704 const unsigned DstEltSize = EltTy.getSizeInBits();
1705 const unsigned VecSize = VecTy.getSizeInBits();
1706
1707 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1708 return std::pair(
1709 VecTypeIdx,
1710 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1711 })
1712 .clampScalar(EltTypeIdx, S32, S64)
1713 .clampScalar(VecTypeIdx, S32, S64)
1714 .clampScalar(IdxTypeIdx, S32, S32)
1715 .clampMaxNumElements(VecTypeIdx, S32, 32)
1716 // TODO: Clamp elements for 64-bit vectors?
1717 .moreElementsIf(
1718 isIllegalRegisterType(VecTypeIdx),
1719 moreElementsToNextExistingRegClass(VecTypeIdx))
1720 // It should only be necessary with variable indexes.
1721 // As a last resort, lower to the stack
1722 .lower();
1723 }
1724
1725 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1726 .unsupportedIf([=](const LegalityQuery &Query) {
1727 const LLT &EltTy = Query.Types[1].getElementType();
1728 return Query.Types[0] != EltTy;
1729 });
1730
1731 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1732 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1733 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1734
1735 // FIXME: Doesn't handle extract of illegal sizes.
1736 getActionDefinitionsBuilder(Op)
1737 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1738 .lowerIf([=](const LegalityQuery &Query) {
1739 // Sub-vector (or single element) insert and extract.
1740 // TODO: verify immediate offset here since lower only works with
1741 // whole elements.
1742 const LLT BigTy = Query.Types[BigTyIdx];
1743 return BigTy.isVector();
1744 })
1745 // FIXME: Multiples of 16 should not be legal.
1746 .legalIf([=](const LegalityQuery &Query) {
1747 const LLT BigTy = Query.Types[BigTyIdx];
1748 const LLT LitTy = Query.Types[LitTyIdx];
1749 return (BigTy.getSizeInBits() % 32 == 0) &&
1750 (LitTy.getSizeInBits() % 16 == 0);
1751 })
1752 .widenScalarIf(
1753 [=](const LegalityQuery &Query) {
1754 const LLT BigTy = Query.Types[BigTyIdx];
1755 return (BigTy.getScalarSizeInBits() < 16);
1756 },
1758 .widenScalarIf(
1759 [=](const LegalityQuery &Query) {
1760 const LLT LitTy = Query.Types[LitTyIdx];
1761 return (LitTy.getScalarSizeInBits() < 16);
1762 },
1764 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1765 .widenScalarToNextPow2(BigTyIdx, 32);
1766
1767 }
1768
1769 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1770 .legalForCartesianProduct(AllS32Vectors, {S32})
1771 .legalForCartesianProduct(AllS64Vectors, {S64})
1772 .clampNumElements(0, V16S32, V32S32)
1773 .clampNumElements(0, V2S64, V16S64)
1774 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1775 .moreElementsIf(
1778
1779 if (ST.hasScalarPackInsts()) {
1780 BuildVector
1781 // FIXME: Should probably widen s1 vectors straight to s32
1782 .minScalarOrElt(0, S16)
1783 .minScalar(1, S16);
1784
1785 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1786 .legalFor({V2S16, S32})
1787 .lower();
1788 } else {
1789 BuildVector.customFor({V2S16, S16});
1790 BuildVector.minScalarOrElt(0, S32);
1791
1792 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1793 .customFor({V2S16, S32})
1794 .lower();
1795 }
1796
1797 BuildVector.legalIf(isRegisterType(0));
1798
1799 // FIXME: Clamp maximum size
1800 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1801 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1802 .clampMaxNumElements(0, S32, 32)
1803 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1804 .clampMaxNumElements(0, S16, 64);
1805
1806 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1807
1808 // Merge/Unmerge
1809 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1810 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1811 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1812
1813 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1814 const LLT Ty = Query.Types[TypeIdx];
1815 if (Ty.isVector()) {
1816 const LLT &EltTy = Ty.getElementType();
1817 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1818 return true;
1819 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1820 return true;
1821 }
1822 return false;
1823 };
1824
1825 auto &Builder = getActionDefinitionsBuilder(Op)
1826 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1827 .lowerFor({{S16, V2S16}})
1828 .lowerIf([=](const LegalityQuery &Query) {
1829 const LLT BigTy = Query.Types[BigTyIdx];
1830 return BigTy.getSizeInBits() == 32;
1831 })
1832 // Try to widen to s16 first for small types.
1833 // TODO: Only do this on targets with legal s16 shifts
1834 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1835 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1836 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1837 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1838 elementTypeIs(1, S16)),
1839 changeTo(1, V2S16))
1840 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1841 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1842 // valid.
1843 .clampScalar(LitTyIdx, S32, S512)
1844 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1845 // Break up vectors with weird elements into scalars
1846 .fewerElementsIf(
1847 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1848 scalarize(0))
1849 .fewerElementsIf(
1850 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1851 scalarize(1))
1852 .clampScalar(BigTyIdx, S32, MaxScalar);
1853
1854 if (Op == G_MERGE_VALUES) {
1855 Builder.widenScalarIf(
1856 // TODO: Use 16-bit shifts if legal for 8-bit values?
1857 [=](const LegalityQuery &Query) {
1858 const LLT Ty = Query.Types[LitTyIdx];
1859 return Ty.getSizeInBits() < 32;
1860 },
1861 changeTo(LitTyIdx, S32));
1862 }
1863
1864 Builder.widenScalarIf(
1865 [=](const LegalityQuery &Query) {
1866 const LLT Ty = Query.Types[BigTyIdx];
1867 return Ty.getSizeInBits() % 16 != 0;
1868 },
1869 [=](const LegalityQuery &Query) {
1870 // Pick the next power of 2, or a multiple of 64 over 128,
1871 // whichever is smaller.
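  // e.g. a 65-bit scalar is widened to s128, while a 260-bit scalar is
  // widened to s320 rather than s512 (alignTo<64>(261) == 320).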
1872 const LLT &Ty = Query.Types[BigTyIdx];
1873 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1874 if (NewSizeInBits >= 256) {
1875 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1876 if (RoundedTo < NewSizeInBits)
1877 NewSizeInBits = RoundedTo;
1878 }
1879 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1880 })
1881 // Any vectors left are the wrong size. Scalarize them.
1882 .scalarize(0)
1883 .scalarize(1);
1884 }
1885
1886 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1887 // RegBankSelect.
1888 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1889 .legalFor({{S32}, {S64}});
1890
1891 if (ST.hasVOP3PInsts()) {
1892 SextInReg.lowerFor({{V2S16}})
1893 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1894 // get more vector shift opportunities, since we'll get those when
1895 // expanded.
1896 .clampMaxNumElementsStrict(0, S16, 2);
1897 } else if (ST.has16BitInsts()) {
1898 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1899 } else {
1900 // Prefer to promote to s32 before lowering if we don't have 16-bit
1901 // shifts. This avoids a lot of intermediate truncate and extend operations.
1902 SextInReg.lowerFor({{S32}, {S64}});
1903 }
1904
1905 SextInReg
1906 .scalarize(0)
1907 .clampScalar(0, S32, S64)
1908 .lower();
1909
1910 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1911 .scalarize(0)
1912 .lower();
1913
1914 // TODO: Only try to form v2s16 with legal packed instructions.
1915 getActionDefinitionsBuilder(G_FSHR)
1916 .legalFor({{S32, S32}})
1917 .lowerFor({{V2S16, V2S16}})
1918 .clampMaxNumElementsStrict(0, S16, 2)
1919 .scalarize(0)
1920 .lower();
1921
1922 if (ST.hasVOP3PInsts()) {
1923 getActionDefinitionsBuilder(G_FSHL)
1924 .lowerFor({{V2S16, V2S16}})
1925 .clampMaxNumElementsStrict(0, S16, 2)
1926 .scalarize(0)
1927 .lower();
1928 } else {
1929 getActionDefinitionsBuilder(G_FSHL)
1930 .scalarize(0)
1931 .lower();
1932 }
1933
1934 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1935 .legalFor({S64});
1936
1937 getActionDefinitionsBuilder(G_FENCE)
1938 .alwaysLegal();
1939
1940 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1941 .scalarize(0)
1942 .minScalar(0, S32)
1943 .lower();
1944
1945 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1946 .legalFor({{S32, S32}, {S64, S32}})
1947 .clampScalar(1, S32, S32)
1948 .clampScalar(0, S32, S64)
1949 .widenScalarToNextPow2(0)
1950 .scalarize(0);
1951
1952 getActionDefinitionsBuilder({
1953 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1954 G_FCOPYSIGN,
1955
1956 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1957 G_ATOMICRMW_NAND,
1958 G_ATOMICRMW_FSUB,
1959 G_READ_REGISTER,
1960 G_WRITE_REGISTER,
1961
1962 G_SADDO, G_SSUBO,
1963
1964 // TODO: Implement
1965 G_FMINIMUM, G_FMAXIMUM}).lower();
1966
1967 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1968 .lower();
1969
1970 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1971 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1972 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1973 .unsupported();
1974
1975 getLegacyLegalizerInfo().computeTables();
1976 verify(*ST.getInstrInfo());
1977}
1978
1980 MachineInstr &MI) const {
1981 MachineIRBuilder &B = Helper.MIRBuilder;
1982 MachineRegisterInfo &MRI = *B.getMRI();
1983
1984 switch (MI.getOpcode()) {
1985 case TargetOpcode::G_ADDRSPACE_CAST:
1986 return legalizeAddrSpaceCast(MI, MRI, B);
1987 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
1988 return legalizeFroundeven(MI, MRI, B);
1989 case TargetOpcode::G_FCEIL:
1990 return legalizeFceil(MI, MRI, B);
1991 case TargetOpcode::G_FREM:
1992 return legalizeFrem(MI, MRI, B);
1993 case TargetOpcode::G_INTRINSIC_TRUNC:
1994 return legalizeIntrinsicTrunc(MI, MRI, B);
1995 case TargetOpcode::G_SITOFP:
1996 return legalizeITOFP(MI, MRI, B, true);
1997 case TargetOpcode::G_UITOFP:
1998 return legalizeITOFP(MI, MRI, B, false);
1999 case TargetOpcode::G_FPTOSI:
2000 return legalizeFPTOI(MI, MRI, B, true);
2001 case TargetOpcode::G_FPTOUI:
2002 return legalizeFPTOI(MI, MRI, B, false);
2003 case TargetOpcode::G_FMINNUM:
2004 case TargetOpcode::G_FMAXNUM:
2005 case TargetOpcode::G_FMINNUM_IEEE:
2006 case TargetOpcode::G_FMAXNUM_IEEE:
2007 return legalizeMinNumMaxNum(Helper, MI);
2008 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2009 return legalizeExtractVectorElt(MI, MRI, B);
2010 case TargetOpcode::G_INSERT_VECTOR_ELT:
2011 return legalizeInsertVectorElt(MI, MRI, B);
2012 case TargetOpcode::G_FSIN:
2013 case TargetOpcode::G_FCOS:
2014 return legalizeSinCos(MI, MRI, B);
2015 case TargetOpcode::G_GLOBAL_VALUE:
2016 return legalizeGlobalValue(MI, MRI, B);
2017 case TargetOpcode::G_LOAD:
2018 case TargetOpcode::G_SEXTLOAD:
2019 case TargetOpcode::G_ZEXTLOAD:
2020 return legalizeLoad(Helper, MI);
2021 case TargetOpcode::G_STORE:
2022 return legalizeStore(Helper, MI);
2023 case TargetOpcode::G_FMAD:
2024 return legalizeFMad(MI, MRI, B);
2025 case TargetOpcode::G_FDIV:
2026 return legalizeFDIV(MI, MRI, B);
2027 case TargetOpcode::G_FFREXP:
2028 return legalizeFFREXP(MI, MRI, B);
2029 case TargetOpcode::G_FSQRT:
2030 return legalizeFSQRT(MI, MRI, B);
2031 case TargetOpcode::G_UDIV:
2032 case TargetOpcode::G_UREM:
2033 case TargetOpcode::G_UDIVREM:
2034 return legalizeUnsignedDIV_REM(MI, MRI, B);
2035 case TargetOpcode::G_SDIV:
2036 case TargetOpcode::G_SREM:
2037 case TargetOpcode::G_SDIVREM:
2038 return legalizeSignedDIV_REM(MI, MRI, B);
2039 case TargetOpcode::G_ATOMIC_CMPXCHG:
2040 return legalizeAtomicCmpXChg(MI, MRI, B);
2041 case TargetOpcode::G_FLOG2:
2042 return legalizeFlog2(MI, B);
2043 case TargetOpcode::G_FLOG:
2044 case TargetOpcode::G_FLOG10:
2045 return legalizeFlogCommon(MI, B);
2046 case TargetOpcode::G_FEXP2:
2047 return legalizeFExp2(MI, B);
2048 case TargetOpcode::G_FEXP:
2049 case TargetOpcode::G_FEXP10:
2050 return legalizeFExp(MI, B);
2051 case TargetOpcode::G_FPOW:
2052 return legalizeFPow(MI, B);
2053 case TargetOpcode::G_FFLOOR:
2054 return legalizeFFloor(MI, MRI, B);
2055 case TargetOpcode::G_BUILD_VECTOR:
2056 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2057 return legalizeBuildVector(MI, MRI, B);
2058 case TargetOpcode::G_MUL:
2059 return legalizeMul(Helper, MI);
2060 case TargetOpcode::G_CTLZ:
2061 case TargetOpcode::G_CTTZ:
2062 return legalizeCTLZ_CTTZ(MI, MRI, B);
2063 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2064 return legalizeFPTruncRound(MI, B);
2065 case TargetOpcode::G_STACKSAVE:
2066 return legalizeStackSave(MI, B);
2067 default:
2068 return false;
2069 }
2070
2071 llvm_unreachable("expected switch to return");
2072}
2073
2075 unsigned AS,
2077 MachineIRBuilder &B) const {
2078 MachineFunction &MF = B.getMF();
2079 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2080 const LLT S32 = LLT::scalar(32);
2081 const LLT S64 = LLT::scalar(64);
2082
2084
2085 if (ST.hasApertureRegs()) {
2086 // Note: this register is somewhat broken. When used as a 32-bit operand,
2087 // it only returns zeroes. The real value is in the upper 32 bits.
2088 // Thus, we must extract the high 32 bits.
2089 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2090 ? AMDGPU::SRC_SHARED_BASE
2091 : AMDGPU::SRC_PRIVATE_BASE;
2092 // FIXME: It would be more natural to emit a COPY here, but then copy
2093 // coalescing would kick in and it would think it's okay to use the "HI"
2094 // subregister (instead of extracting the HI 32 bits) which is an artificial
2095 // (unusable) register.
2096 // Register TableGen definitions would need an overhaul to get rid of the
2097 // artificial "HI" aperture registers and prevent this kind of issue from
2098 // happening.
2099 Register Dst = MRI.createGenericVirtualRegister(S64);
2100 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2101 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2102 return B.buildUnmerge(S32, Dst).getReg(1);
2103 }
2104
2105 // TODO: can we be smarter about machine pointer info?
2107 Register LoadAddr = MRI.createGenericVirtualRegister(
2109 // For code object version 5, private_base and shared_base are passed through
2110 // implicit kernargs.
2117 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2118
2119 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2121
2122 if (!loadInputValue(KernargPtrReg, B,
2124 return Register();
2125
2127 PtrInfo,
2131
2132 // Pointer address
2133 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2134 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2135 // Load address
2136 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2137 }
2138
2139 Register QueuePtr = MRI.createGenericVirtualRegister(
2141
2143 return Register();
2144
2145 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2146 // private_segment_aperture_base_hi.
2147 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2148
2150 PtrInfo,
2153 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2154
2155 B.buildPtrAdd(LoadAddr, QueuePtr,
2156 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2157 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2158}
2159
2160/// Return true if the value is a known valid address, such that a null check is
2161/// not necessary.
2163 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2164 MachineInstr *Def = MRI.getVRegDef(Val);
2165 switch (Def->getOpcode()) {
2166 case AMDGPU::G_FRAME_INDEX:
2167 case AMDGPU::G_GLOBAL_VALUE:
2168 case AMDGPU::G_BLOCK_ADDR:
2169 return true;
2170 case AMDGPU::G_CONSTANT: {
2171 const ConstantInt *CI = Def->getOperand(1).getCImm();
2172 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2173 }
2174 default:
2175 return false;
2176 }
2177
2178 return false;
2179}
2180
2183 MachineIRBuilder &B) const {
2184 MachineFunction &MF = B.getMF();
2185
2186 const LLT S32 = LLT::scalar(32);
2187 Register Dst = MI.getOperand(0).getReg();
2188 Register Src = MI.getOperand(1).getReg();
2189
2190 LLT DstTy = MRI.getType(Dst);
2191 LLT SrcTy = MRI.getType(Src);
2192 unsigned DestAS = DstTy.getAddressSpace();
2193 unsigned SrcAS = SrcTy.getAddressSpace();
2194
2195 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2196 // vector element.
2197 assert(!DstTy.isVector());
2198
2199 const AMDGPUTargetMachine &TM
2200 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2201
2202 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2203 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2204 return true;
2205 }
2206
2207 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2208 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2209 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2210 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2211 // Extract low 32-bits of the pointer.
2212 B.buildExtract(Dst, Src, 0);
2213 MI.eraseFromParent();
2214 return true;
2215 }
2216
2217 unsigned NullVal = TM.getNullPointerValue(DestAS);
2218
2219 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2220 auto FlatNull = B.buildConstant(SrcTy, 0);
2221
2222 // Extract low 32-bits of the pointer.
2223 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2224
2225 auto CmpRes =
2226 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2227 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2228
2229 MI.eraseFromParent();
2230 return true;
2231 }
2232
2233 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2234 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2235 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2236 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2237 if (!ApertureReg.isValid())
2238 return false;
2239
2240 // Coerce the type of the low half of the result so we can use merge_values.
2241 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2242
2243 // TODO: Should we allow mismatched types but matching sizes in merges to
2244 // avoid the ptrtoint?
2245 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2246
2247 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2248 B.buildCopy(Dst, BuildPtr);
2249 MI.eraseFromParent();
2250 return true;
2251 }
2252
2253 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2254 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2255
2256 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2257 SegmentNull.getReg(0));
2258
2259 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2260
2261 MI.eraseFromParent();
2262 return true;
2263 }
2264
2265 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2266 SrcTy.getSizeInBits() == 64) {
2267 // Truncate.
2268 B.buildExtract(Dst, Src, 0);
2269 MI.eraseFromParent();
2270 return true;
2271 }
2272
2273 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2274 DstTy.getSizeInBits() == 64) {
2276 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2277 auto PtrLo = B.buildPtrToInt(S32, Src);
2278 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2279 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2280 MI.eraseFromParent();
2281 return true;
2282 }
2283
2284 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2285 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2286
2287 LLVMContext &Ctx = MF.getFunction().getContext();
2288 Ctx.diagnose(InvalidAddrSpaceCast);
2289 B.buildUndef(Dst);
2290 MI.eraseFromParent();
2291 return true;
2292}
2293
2296 MachineIRBuilder &B) const {
2297 Register Src = MI.getOperand(1).getReg();
2298 LLT Ty = MRI.getType(Src);
2299 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2300
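  // Add and then subtract copysign(2^52, src): at magnitude 2^52 every f64 is
  // an integer, so the addition rounds away the fraction bits
  // (round-to-nearest-even) and the subtraction recovers the rounded value.
  // Inputs whose magnitude already exceeds 0x1.fffffffffffffp+51 are integral
  // and are returned unchanged by the select below.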
2301 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2302 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2303
2304 auto C1 = B.buildFConstant(Ty, C1Val);
2305 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2306
2307 // TODO: Should this propagate fast-math-flags?
2308 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2309 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2310
2311 auto C2 = B.buildFConstant(Ty, C2Val);
2312 auto Fabs = B.buildFAbs(Ty, Src);
2313
2314 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2315 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2316 MI.eraseFromParent();
2317 return true;
2318}
2319
2322 MachineIRBuilder &B) const {
2323
2324 const LLT S1 = LLT::scalar(1);
2325 const LLT S64 = LLT::scalar(64);
2326
2327 Register Src = MI.getOperand(1).getReg();
2328 assert(MRI.getType(Src) == S64);
2329
2330 // result = trunc(src)
2331 // if (src > 0.0 && src != result)
2332 // result += 1.0
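  // e.g. fceil(2.3): trunc = 2.0, 2.3 > 0.0 and 2.3 != 2.0, so the result is
  // 3.0; fceil(-2.3): trunc = -2.0, the source is negative, so -2.0 is kept.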
2333
2334 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2335
2336 const auto Zero = B.buildFConstant(S64, 0.0);
2337 const auto One = B.buildFConstant(S64, 1.0);
2338 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2339 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2340 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2341 auto Add = B.buildSelect(S64, And, One, Zero);
2342
2343 // TODO: Should this propagate fast-math-flags?
2344 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2345 MI.eraseFromParent();
2346 return true;
2347}
2348
2351 MachineIRBuilder &B) const {
2352 Register DstReg = MI.getOperand(0).getReg();
2353 Register Src0Reg = MI.getOperand(1).getReg();
2354 Register Src1Reg = MI.getOperand(2).getReg();
2355 auto Flags = MI.getFlags();
2356 LLT Ty = MRI.getType(DstReg);
2357
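  // frem(x, y) == x - trunc(x / y) * y, formed here as fma(-trunc(x / y), y, x).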
2358 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2359 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2360 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2361 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2362 MI.eraseFromParent();
2363 return true;
2364}
2365
2368 const unsigned FractBits = 52;
2369 const unsigned ExpBits = 11;
2370 LLT S32 = LLT::scalar(32);
2371
2372 auto Const0 = B.buildConstant(S32, FractBits - 32);
2373 auto Const1 = B.buildConstant(S32, ExpBits);
2374
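  // The biased exponent occupies bits [62:52] of the f64, i.e. bits [30:20] of
  // the high 32-bit half; ubfe(Hi, 20, 11) extracts it and the bias of 1023 is
  // subtracted below.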
2375 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2376 .addUse(Hi)
2377 .addUse(Const0.getReg(0))
2378 .addUse(Const1.getReg(0));
2379
2380 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2381}
2382
2385 MachineIRBuilder &B) const {
2386 const LLT S1 = LLT::scalar(1);
2387 const LLT S32 = LLT::scalar(32);
2388 const LLT S64 = LLT::scalar(64);
2389
2390 Register Src = MI.getOperand(1).getReg();
2391 assert(MRI.getType(Src) == S64);
2392
2393 // TODO: Should this use extract since the low half is unused?
2394 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2395 Register Hi = Unmerge.getReg(1);
2396
2397 // Extract the upper half, since this is where we will find the sign and
2398 // exponent.
2399 auto Exp = extractF64Exponent(Hi, B);
2400
2401 const unsigned FractBits = 52;
2402
2403 // Extract the sign bit.
2404 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2405 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2406
2407 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2408
2409 const auto Zero32 = B.buildConstant(S32, 0);
2410
2411 // Extend back to 64-bits.
2412 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2413
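  // FractMask >> Exp has set bits exactly over the sub-integer fraction bits,
  // so clearing them in Src truncates toward zero. Exponents below 0 yield a
  // signed zero and exponents above 51 mean Src is already integral; both are
  // handled by the selects below.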
2414 auto Shr = B.buildAShr(S64, FractMask, Exp);
2415 auto Not = B.buildNot(S64, Shr);
2416 auto Tmp0 = B.buildAnd(S64, Src, Not);
2417 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2418
2419 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2420 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2421
2422 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2423 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2424 MI.eraseFromParent();
2425 return true;
2426}
2427
2430 MachineIRBuilder &B, bool Signed) const {
2431
2432 Register Dst = MI.getOperand(0).getReg();
2433 Register Src = MI.getOperand(1).getReg();
2434
2435 const LLT S64 = LLT::scalar(64);
2436 const LLT S32 = LLT::scalar(32);
2437
2438 assert(MRI.getType(Src) == S64);
2439
2440 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2441 auto ThirtyTwo = B.buildConstant(S32, 32);
2442
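  // A 64-bit result is built as itofp(hi) * 2^32 + uitofp(lo), using ldexp for
  // the 2^32 scaling. The 32-bit path further below first normalizes the input
  // so its significant bits fit in the f32 mantissa.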
2443 if (MRI.getType(Dst) == S64) {
2444 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2445 : B.buildUITOFP(S64, Unmerge.getReg(1));
2446
2447 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2448 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2449
2450 // TODO: Should this propagate fast-math-flags?
2451 B.buildFAdd(Dst, LdExp, CvtLo);
2452 MI.eraseFromParent();
2453 return true;
2454 }
2455
2456 assert(MRI.getType(Dst) == S32);
2457
2458 auto One = B.buildConstant(S32, 1);
2459
2460 MachineInstrBuilder ShAmt;
2461 if (Signed) {
2462 auto ThirtyOne = B.buildConstant(S32, 31);
2463 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2464 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2465 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2466 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2467 .addUse(Unmerge.getReg(1));
2468 auto LS2 = B.buildSub(S32, LS, One);
2469 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2470 } else
2471 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2472 auto Norm = B.buildShl(S64, Src, ShAmt);
2473 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2474 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2475 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2476 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2477 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2478 B.buildFLdexp(Dst, FVal, Scale);
2479 MI.eraseFromParent();
2480 return true;
2481}
2482
2483// TODO: Copied from DAG implementation. Verify logic and document how this
2484// actually works.
2488 bool Signed) const {
2489
2490 Register Dst = MI.getOperand(0).getReg();
2491 Register Src = MI.getOperand(1).getReg();
2492
2493 const LLT S64 = LLT::scalar(64);
2494 const LLT S32 = LLT::scalar(32);
2495
2496 const LLT SrcLT = MRI.getType(Src);
2497 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2498
2499 unsigned Flags = MI.getFlags();
2500
2501 // The basic idea of converting a floating point number into a pair of 32-bit
2502 // integers is illustrated as follows:
2503 //
2504 // tf := trunc(val);
2505 // hif := floor(tf * 2^-32);
2506 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2507 // hi := fptoi(hif);
2508 // lo := fptoi(lof);
2509 //
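  // For example, with val = 2^40 + 12 (exactly representable as an f64):
  // tf = 2^40 + 12, hif = floor(256.0000000028) = 256.0, lof = 12.0, so
  // hi = 256 and lo = 12, reassembling to (256 << 32) | 12 = 2^40 + 12.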
2510 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2512 if (Signed && SrcLT == S32) {
2513 // However, a 32-bit floating point number has only 23 bits mantissa and
2514 // it's not enough to hold all the significant bits of `lof` if val is
2515 // negative. To avoid the loss of precision, we need to take the absolute
2516 // value after truncating and flip the result back based on the original
2517 // signedness.
2518 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2519 Trunc = B.buildFAbs(S32, Trunc, Flags);
2520 }
2521 MachineInstrBuilder K0, K1;
2522 if (SrcLT == S64) {
2523 K0 = B.buildFConstant(
2524 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2525 K1 = B.buildFConstant(
2526 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2527 } else {
2528 K0 = B.buildFConstant(
2529 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2530 K1 = B.buildFConstant(
2531 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2532 }
2533
2534 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2535 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2536 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2537
2538 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2539 : B.buildFPTOUI(S32, FloorMul);
2540 auto Lo = B.buildFPTOUI(S32, Fma);
2541
2542 if (Signed && SrcLT == S32) {
2543 // Flip the result based on the signedness, which is either all 0s or 1s.
2544 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2545 // r := xor({lo, hi}, sign) - sign;
2546 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2547 Sign);
2548 } else
2549 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2550 MI.eraseFromParent();
2551
2552 return true;
2553}
2554
2556 MachineInstr &MI) const {
2557 MachineFunction &MF = Helper.MIRBuilder.getMF();
2559
2560 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2561 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2562
2563 // With ieee_mode disabled, the instructions have the correct behavior
2564 // already for G_FMINNUM/G_FMAXNUM
2565 if (!MFI->getMode().IEEE)
2566 return !IsIEEEOp;
2567
2568 if (IsIEEEOp)
2569 return true;
2570
2572}
2573
2576 MachineIRBuilder &B) const {
2577 // TODO: Should move some of this into LegalizerHelper.
2578
2579 // TODO: Promote dynamic indexing of s16 to s32
2580
2581 Register Dst = MI.getOperand(0).getReg();
2582 Register Vec = MI.getOperand(1).getReg();
2583
2584 LLT VecTy = MRI.getType(Vec);
2585 LLT EltTy = VecTy.getElementType();
2586 assert(EltTy == MRI.getType(Dst));
2587
2588 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2589 // but we can't go directly to that logic because you can't bitcast a vector
2590 // of pointers to a vector of integers. Therefore, introduce an intermediate
2591 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2592 // drive the legalization forward.
2593 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2594 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2595 LLT IntVecTy = VecTy.changeElementType(IntTy);
2596
2597 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2598 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2599 B.buildIntToPtr(Dst, IntElt);
2600
2601 MI.eraseFromParent();
2602 return true;
2603 }
2604
2605 // FIXME: Artifact combiner probably should have replaced the truncated
2606 // constant before this, so we shouldn't need
2607 // getIConstantVRegValWithLookThrough.
2608 std::optional<ValueAndVReg> MaybeIdxVal =
2609 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2610 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2611 return true;
2612 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2613
2614 if (IdxVal < VecTy.getNumElements()) {
2615 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2616 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2617 } else {
2618 B.buildUndef(Dst);
2619 }
2620
2621 MI.eraseFromParent();
2622 return true;
2623}
2624
2627 MachineIRBuilder &B) const {
2628 // TODO: Should move some of this into LegalizerHelper.
2629
2630 // TODO: Promote dynamic indexing of s16 to s32
2631
2632 Register Dst = MI.getOperand(0).getReg();
2633 Register Vec = MI.getOperand(1).getReg();
2634 Register Ins = MI.getOperand(2).getReg();
2635
2636 LLT VecTy = MRI.getType(Vec);
2637 LLT EltTy = VecTy.getElementType();
2638 assert(EltTy == MRI.getType(Ins));
2639
2640 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2641 // but we can't go directly to that logic because you can't bitcast a vector
2642 // of pointers to a vector of integers. Therefore, make the pointer vector
2643 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2644 // new value, and then inttoptr the result vector back. This will then allow
2645 // the rest of legalization to take over.
2646 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2647 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2648 LLT IntVecTy = VecTy.changeElementType(IntTy);
2649
2650 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2651 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2652 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2653 MI.getOperand(3));
2654 B.buildIntToPtr(Dst, IntVecDest);
2655 MI.eraseFromParent();
2656 return true;
2657 }
2658
2659 // FIXME: Artifact combiner probably should have replaced the truncated
2660 // constant before this, so we shouldn't need
2661 // getIConstantVRegValWithLookThrough.
2662 std::optional<ValueAndVReg> MaybeIdxVal =
2663 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2664 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2665 return true;
2666
2667 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2668
2669 unsigned NumElts = VecTy.getNumElements();
2670 if (IdxVal < NumElts) {
2672 for (unsigned i = 0; i < NumElts; ++i)
2673 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2674 B.buildUnmerge(SrcRegs, Vec);
2675
2676 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2677 B.buildMergeLikeInstr(Dst, SrcRegs);
2678 } else {
2679 B.buildUndef(Dst);
2680 }
2681
2682 MI.eraseFromParent();
2683 return true;
2684}
2685
2688 MachineIRBuilder &B) const {
2689
2690 Register DstReg = MI.getOperand(0).getReg();
2691 Register SrcReg = MI.getOperand(1).getReg();
2692 LLT Ty = MRI.getType(DstReg);
2693 unsigned Flags = MI.getFlags();
2694
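  // The amdgcn sin/cos intrinsics take their input pre-scaled by 1/(2*pi); on
  // subtargets with a reduced trig range, amdgcn_fract additionally brings the
  // scaled input into [0, 1).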
2695 Register TrigVal;
2696 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2697 if (ST.hasTrigReducedRange()) {
2698 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2699 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2700 .addUse(MulVal.getReg(0))
2701 .setMIFlags(Flags)
2702 .getReg(0);
2703 } else
2704 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2705
2706 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2707 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2708 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2709 .addUse(TrigVal)
2710 .setMIFlags(Flags);
2711 MI.eraseFromParent();
2712 return true;
2713}
2714
2717 const GlobalValue *GV,
2718 int64_t Offset,
2719 unsigned GAFlags) const {
2720 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2721 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2722 // to the following code sequence:
2723 //
2724 // For constant address space:
2725 // s_getpc_b64 s[0:1]
2726 // s_add_u32 s0, s0, $symbol
2727 // s_addc_u32 s1, s1, 0
2728 //
2729 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2730 // a fixup or relocation is emitted to replace $symbol with a literal
2731 // constant, which is a pc-relative offset from the encoding of the $symbol
2732 // operand to the global variable.
2733 //
2734 // For global address space:
2735 // s_getpc_b64 s[0:1]
2736 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2737 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2738 //
2739 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2740 // fixups or relocations are emitted to replace $symbol@*@lo and
2741 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2742 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2743 // operand to the global variable.
2744
2746
2747 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2748 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2749
2750 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2751 .addDef(PCReg);
2752
2753 MIB.addGlobalAddress(GV, Offset, GAFlags);
2754 if (GAFlags == SIInstrInfo::MO_NONE)
2755 MIB.addImm(0);
2756 else
2757 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2758
2759 if (!B.getMRI()->getRegClassOrNull(PCReg))
2760 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2761
2762 if (PtrTy.getSizeInBits() == 32)
2763 B.buildExtract(DstReg, PCReg, 0);
2764 return true;
2765}
2766
2767 // Emit an ABS32_LO / ABS32_HI relocation stub.
2769 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2770 MachineRegisterInfo &MRI) const {
2771 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2772
2773 LLT S32 = LLT::scalar(32);
2774
2775 // Use the destination directly, if and only if we are storing only the lower
2776 // address part and no register class has been set.
2777 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2778 ? DstReg
2779 : MRI.createGenericVirtualRegister(S32);
2780
2781 if (!MRI.getRegClassOrNull(AddrLo))
2782 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2783
2784 // Write the lower half.
2785 B.buildInstr(AMDGPU::S_MOV_B32)
2786 .addDef(AddrLo)
2787 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2788
2789 // If required, write the upper half as well.
2790 if (RequiresHighHalf) {
2791 assert(PtrTy.getSizeInBits() == 64 &&
2792 "Must provide a 64-bit pointer type!");
2793
2794 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2795 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2796
2797 B.buildInstr(AMDGPU::S_MOV_B32)
2798 .addDef(AddrHi)
2799 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2800
2801 // Use the destination directly, if and only if we don't have a register
2802 // class being set.
2803 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2804 ? DstReg
2805 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2806
2807 if (!MRI.getRegClassOrNull(AddrDst))
2808 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2809
2810 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2811
2812 // If we created a new register for the destination, cast the result into
2813 // the final output.
2814 if (AddrDst != DstReg)
2815 B.buildCast(DstReg, AddrDst);
2816 } else if (AddrLo != DstReg) {
2817 // If we created a new register for the destination, cast the result into
2818 // the final output.
2819 B.buildCast(DstReg, AddrLo);
2820 }
2821}
2822
2825 MachineIRBuilder &B) const {
2826 Register DstReg = MI.getOperand(0).getReg();
2827 LLT Ty = MRI.getType(DstReg);
2828 unsigned AS = Ty.getAddressSpace();
2829
2830 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2831 MachineFunction &MF = B.getMF();
2833
2835 if (!MFI->isModuleEntryFunction() &&
2836 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2837 const Function &Fn = MF.getFunction();
2838 DiagnosticInfoUnsupported BadLDSDecl(
2839 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2840 DS_Warning);
2841 Fn.getContext().diagnose(BadLDSDecl);
2842
2843 // We currently don't have a way to correctly allocate LDS objects that
2844 // aren't directly associated with a kernel. We do force inlining of
2845 // functions that use local objects. However, if these dead functions are
2846 // not eliminated, we don't want a compile time error. Just emit a warning
2847 // and a trap, since there should be no callable path here.
2848 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
2849 B.buildUndef(DstReg);
2850 MI.eraseFromParent();
2851 return true;
2852 }
2853
2854 // TODO: We could emit code to handle the initialization somewhere.
2855 // We ignore the initializer for now and legalize it to allow selection.
2856 // The initializer will be rejected during assembly emission anyway.
2857 const SITargetLowering *TLI = ST.getTargetLowering();
2858 if (!TLI->shouldUseLDSConstAddress(GV)) {
2859 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2860 return true; // Leave in place;
2861 }
2862
2863 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2864 Type *Ty = GV->getValueType();
2865 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2866 // zero-sized type in other languages to declare the dynamic shared
2867 // memory whose size is not known at compile time. Such arrays are
2868 // allocated by the runtime and placed directly after the statically
2869 // allocated ones; they all share the same offset.
2870 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2871 // Adjust alignment for that dynamic shared memory array.
2872 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2873 LLT S32 = LLT::scalar(32);
2874 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2875 B.buildIntToPtr(DstReg, Sz);
2876 MI.eraseFromParent();
2877 return true;
2878 }
2879 }
2880
2881 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2882 *cast<GlobalVariable>(GV)));
2883 MI.eraseFromParent();
2884 return true;
2885 }
2886
2887 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2888 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
2889 MI.eraseFromParent();
2890 return true;
2891 }
2892
2893 const SITargetLowering *TLI = ST.getTargetLowering();
2894
2895 if (TLI->shouldEmitFixup(GV)) {
2896 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2897 MI.eraseFromParent();
2898 return true;
2899 }
2900
2901 if (TLI->shouldEmitPCReloc(GV)) {
2902 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2903 MI.eraseFromParent();
2904 return true;
2905 }
2906
2908 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2909
2910 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2915 LoadTy, Align(8));
2916
2917 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2918
2919 if (Ty.getSizeInBits() == 32) {
2920 // Truncate if this is a 32-bit constant address.
2921 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2922 B.buildExtract(DstReg, Load, 0);
2923 } else
2924 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2925
2926 MI.eraseFromParent();
2927 return true;
2928}
2929
2931 if (Ty.isVector())
2932 return Ty.changeElementCount(
2935}
2936
2938 MachineInstr &MI) const {
2939 MachineIRBuilder &B = Helper.MIRBuilder;
2940 MachineRegisterInfo &MRI = *B.getMRI();
2941 GISelChangeObserver &Observer = Helper.Observer;
2942
2943 Register PtrReg = MI.getOperand(1).getReg();
2944 LLT PtrTy = MRI.getType(PtrReg);
2945 unsigned AddrSpace = PtrTy.getAddressSpace();
2946
2947 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2949 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2950 Observer.changingInstr(MI);
2951 MI.getOperand(1).setReg(Cast.getReg(0));
2952 Observer.changedInstr(MI);
2953 return true;
2954 }
2955
2956 if (MI.getOpcode() != AMDGPU::G_LOAD)
2957 return false;
2958
2959 Register ValReg = MI.getOperand(0).getReg();
2960 LLT ValTy = MRI.getType(ValReg);
2961
2962 if (hasBufferRsrcWorkaround(ValTy)) {
2963 Observer.changingInstr(MI);
2965 Observer.changedInstr(MI);
2966 return true;
2967 }
2968
2969 MachineMemOperand *MMO = *MI.memoperands_begin();
2970 const unsigned ValSize = ValTy.getSizeInBits();
2971 const LLT MemTy = MMO->getMemoryType();
2972 const Align MemAlign = MMO->getAlign();
2973 const unsigned MemSize = MemTy.getSizeInBits();
2974 const uint64_t AlignInBits = 8 * MemAlign.value();
2975
2976 // Widen non-power-of-2 loads to the alignment if needed
2977 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2978 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2979
2980 // This was already the correct extending load result type, so just adjust
2981 // the memory type.
2982 if (WideMemSize == ValSize) {
2983 MachineFunction &MF = B.getMF();
2984
2985 MachineMemOperand *WideMMO =
2986 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2987 Observer.changingInstr(MI);
2988 MI.setMemRefs(MF, {WideMMO});
2989 Observer.changedInstr(MI);
2990 return true;
2991 }
2992
2993 // Don't bother handling an edge case that should probably never be produced.
2994 if (ValSize > WideMemSize)
2995 return false;
2996
2997 LLT WideTy = widenToNextPowerOf2(ValTy);
2998
2999 Register WideLoad;
3000 if (!WideTy.isVector()) {
3001 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3002 B.buildTrunc(ValReg, WideLoad).getReg(0);
3003 } else {
3004 // Extract the subvector.
3005
3006 if (isRegisterType(ValTy)) {
3007 // If this is a case where G_EXTRACT is legal, use it.
3008 // (e.g. <3 x s32> -> <4 x s32>)
3009 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3010 B.buildExtract(ValReg, WideLoad, 0);
3011 } else {
3012 // For cases where the widened type isn't a nice register value, unmerge
3013 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3014 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3015 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3016 }
3017 }
3018
3019 MI.eraseFromParent();
3020 return true;
3021 }
3022
3023 return false;
3024}
3025
3027 MachineInstr &MI) const {
3028 MachineIRBuilder &B = Helper.MIRBuilder;
3029 MachineRegisterInfo &MRI = *B.getMRI();
3030 GISelChangeObserver &Observer = Helper.Observer;
3031
3032 Register DataReg = MI.getOperand(0).getReg();
3033 LLT DataTy = MRI.getType(DataReg);
3034
3035 if (hasBufferRsrcWorkaround(DataTy)) {
3036 Observer.changingInstr(MI);
3038 Observer.changedInstr(MI);
3039 return true;
3040 }
3041 return false;
3042}
3043
3046 MachineIRBuilder &B) const {
3047 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3048 assert(Ty.isScalar());
3049
3050 MachineFunction &MF = B.getMF();
3052
3053 // TODO: Always legal with future ftz flag.
3054 // FIXME: Do we need just output?
3055 if (Ty == LLT::float32() &&
3057 return true;
3058 if (Ty == LLT::float16() &&
3060 return true;
3061
3062 MachineIRBuilder HelperBuilder(MI);
3063 GISelObserverWrapper DummyObserver;
3064 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3065 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3066}
3067
3070 Register DstReg = MI.getOperand(0).getReg();
3071 Register PtrReg = MI.getOperand(1).getReg();
3072 Register CmpVal = MI.getOperand(2).getReg();
3073 Register NewVal = MI.getOperand(3).getReg();
3074
3075 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3076 "this should not have been custom lowered");
3077
3078 LLT ValTy = MRI.getType(CmpVal);
3079 LLT VecTy = LLT::fixed_vector(2, ValTy);
3080
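  // G_AMDGPU_ATOMIC_CMPXCHG expects the new value and the compare value packed
  // into a single <2 x ty> operand, built here as {NewVal, CmpVal}.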
3081 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3082
3083 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3084 .addDef(DstReg)
3085 .addUse(PtrReg)
3086 .addUse(PackedVal)
3087 .setMemRefs(MI.memoperands());
3088
3089 MI.eraseFromParent();
3090 return true;
3091}
3092
3093/// Return true if it's known that \p Src can never be an f32 denormal value.
3095 Register Src) {
3096 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3097 switch (DefMI->getOpcode()) {
3098 case TargetOpcode::G_INTRINSIC: {
3099 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3100 case Intrinsic::amdgcn_frexp_mant:
3101 return true;
3102 default:
3103 break;
3104 }
3105
3106 break;
3107 }
3108 case TargetOpcode::G_FFREXP: {
3109 if (DefMI->getOperand(0).getReg() == Src)
3110 return true;
3111 break;
3112 }
3113 case TargetOpcode::G_FPEXT: {
3114 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3115 }
3116 default:
3117 return false;
3118 }
3119
3120 return false;
3121}
3122
3123static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3124 if (Flags & MachineInstr::FmAfn)
3125 return true;
3126 const auto &Options = MF.getTarget().Options;
3127 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3128}
3129
3131 unsigned Flags) {
3132 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3135}
3136
3137std::pair<Register, Register>
3139 unsigned Flags) const {
3140 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3141 return {};
3142
3143 const LLT F32 = LLT::scalar(32);
3144 auto SmallestNormal = B.buildFConstant(
3146 auto IsLtSmallestNormal =
3147 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3148
3149 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3150 auto One = B.buildFConstant(F32, 1.0);
3151 auto ScaleFactor =
3152 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3153 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3154
3155 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3156}
3157
3159 MachineIRBuilder &B) const {
3160 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3161 // If we have to handle denormals, scale up the input and adjust the result.
3162
3163 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3164 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
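  // i.e. for a denormal input, log2(x * 2^32) == log2(x) + 32, so subtracting
  // 32 from the scaled result recovers log2(x).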
3165
3166 Register Dst = MI.getOperand(0).getReg();
3167 Register Src = MI.getOperand(1).getReg();
3168 LLT Ty = B.getMRI()->getType(Dst);
3169 unsigned Flags = MI.getFlags();
3170
3171 if (Ty == LLT::scalar(16)) {
3172 const LLT F32 = LLT::scalar(32);
3173 // Nothing in half is a denormal when promoted to f32.
3174 auto Ext = B.buildFPExt(F32, Src, Flags);
3175 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3176 .addUse(Ext.getReg(0))
3177 .setMIFlags(Flags);
3178 B.buildFPTrunc(Dst, Log2, Flags);
3179 MI.eraseFromParent();
3180 return true;
3181 }
3182
3183 assert(Ty == LLT::scalar(32));
3184
3185 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3186 if (!ScaledInput) {
3187 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3188 .addUse(Src)
3189 .setMIFlags(Flags);
3190 MI.eraseFromParent();
3191 return true;
3192 }
3193
3194 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3195 .addUse(ScaledInput)
3196 .setMIFlags(Flags);
3197
3198 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3199 auto Zero = B.buildFConstant(Ty, 0.0);
3200 auto ResultOffset =
3201 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3202 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3203
3204 MI.eraseFromParent();
3205 return true;
3206}
3207
3209 Register Z, unsigned Flags) {
3210 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3211 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3212}
3213
3215 MachineIRBuilder &B) const {
3216 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3217 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3218
3219 MachineRegisterInfo &MRI = *B.getMRI();
3220 Register Dst = MI.getOperand(0).getReg();
3221 Register X = MI.getOperand(1).getReg();
3222 unsigned Flags = MI.getFlags();
3223 const LLT Ty = MRI.getType(X);
3224 MachineFunction &MF = B.getMF();
3225
3226 const LLT F32 = LLT::scalar(32);
3227 const LLT F16 = LLT::scalar(16);
3228
3229 const AMDGPUTargetMachine &TM =
3230 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3231
3232 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3233 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3234 if (Ty == F16 && !ST.has16BitInsts()) {
3235 Register LogVal = MRI.createGenericVirtualRegister(F32);
3236 auto PromoteSrc = B.buildFPExt(F32, X);
3237 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3238 B.buildFPTrunc(Dst, LogVal);
3239 } else {
3240 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3241 }
3242
3243 MI.eraseFromParent();
3244 return true;
3245 }
3246
3247 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3248 if (ScaledInput)
3249 X = ScaledInput;
3250
3251 auto Y =
3252 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3253
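  // Multiply log2(x) by ln(2) (or log10(2) for log10) in extended precision:
  // the constant is split into a head and a tail term so the low-order bits of
  // the product are not lost.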
3254 Register R;
3255 if (ST.hasFastFMAF32()) {
3256 // c+cc are ln(2)/ln(10) to more than 49 bits
3257 const float c_log10 = 0x1.344134p-2f;
3258 const float cc_log10 = 0x1.09f79ep-26f;
3259
3260 // c + cc is ln(2) to more than 49 bits
3261 const float c_log = 0x1.62e42ep-1f;
3262 const float cc_log = 0x1.efa39ep-25f;
3263
3264 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3265 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3266
3267 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3268 auto NegR = B.buildFNeg(Ty, R, Flags);
3269 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3270 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3271 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3272 } else {
3273 // ch+ct is ln(2)/ln(10) to more than 36 bits
3274 const float ch_log10 = 0x1.344000p-2f;
3275 const float ct_log10 = 0x1.3509f6p-18f;
3276
3277 // ch + ct is ln(2) to more than 36 bits
3278 const float ch_log = 0x1.62e000p-1f;
3279 const float ct_log = 0x1.0bfbe8p-15f;
3280
3281 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3282 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3283
3284 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3285 auto YH = B.buildAnd(Ty, Y, MaskConst);
3286 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3287 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3288
3289 Register Mad0 =
3290 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3291 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3292 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3293 }
3294
3295 const bool IsFiniteOnly =
3296 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3297 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3298
3299 if (!IsFiniteOnly) {
3300 // Expand isfinite(x) => fabs(x) < inf
3301 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3302 auto Fabs = B.buildFAbs(Ty, Y);
3303 auto IsFinite =
3304 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3305 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3306 }
3307
3308 if (ScaledInput) {
3309 auto Zero = B.buildFConstant(Ty, 0.0);
3310 auto ShiftK =
3311 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3312 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3313 B.buildFSub(Dst, R, Shift, Flags);
3314 } else {
3315 B.buildCopy(Dst, R);
3316 }
3317
3318 MI.eraseFromParent();
3319 return true;
3320}
3321
3323 Register Src, bool IsLog10,
3324 unsigned Flags) const {
3325 const double Log2BaseInverted =
3327
3328 LLT Ty = B.getMRI()->getType(Dst);
3329
3330 if (Ty == LLT::scalar(32)) {
3331 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3332 if (ScaledInput) {
3333 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3334 .addUse(Src)
3335 .setMIFlags(Flags);
3336 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3337 auto Zero = B.buildFConstant(Ty, 0.0);
3338 auto ResultOffset =
3339 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3340 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3341
3342 if (ST.hasFastFMAF32())
3343 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3344 else {
3345 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3346 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3347 }
3348
3349 return true;
3350 }
3351 }
3352
3353 auto Log2Operand = Ty == LLT::scalar(16)
3354 ? B.buildFLog2(Ty, Src, Flags)
3355 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3356 .addUse(Src)
3357 .setMIFlags(Flags);
3358 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3359 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3360 return true;
3361}
3362
3364 MachineIRBuilder &B) const {
3365 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3366 // If we have to handle denormals, scale up the input and adjust the result.
3367
3368 Register Dst = MI.getOperand(0).getReg();
3369 Register Src = MI.getOperand(1).getReg();
3370 unsigned Flags = MI.getFlags();
3371 LLT Ty = B.getMRI()->getType(Dst);
3372 const LLT F16 = LLT::scalar(16);
3373 const LLT F32 = LLT::scalar(32);
3374
3375 if (Ty == F16) {
3376 // Nothing in half is a denormal when promoted to f32.
3377 auto Ext = B.buildFPExt(F32, Src, Flags);
3378 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3379 .addUse(Ext.getReg(0))
3380 .setMIFlags(Flags);
3381 B.buildFPTrunc(Dst, Log2, Flags);
3382 MI.eraseFromParent();
3383 return true;
3384 }
3385
3386 assert(Ty == F32);
3387
3388 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3389 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3390 .addUse(Src)
3391 .setMIFlags(Flags);
3392 MI.eraseFromParent();
3393 return true;
3394 }
3395
3396 // bool needs_scaling = x < -0x1.f80000p+6f;
3397 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
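  // exp2 of an input below roughly -126 would produce a denormal result that
  // may be flushed to zero, so such inputs are shifted up by 64 and the result
  // is scaled back down by 2^-64.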
3398
3399 // -nextafter(128.0, -1)
3400 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3401 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3402 RangeCheckConst, Flags);
3403
3404 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3405 auto Zero = B.buildFConstant(Ty, 0.0);
3406 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3407 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3408
3409 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3410 .addUse(AddInput.getReg(0))
3411 .setMIFlags(Flags);
3412
3413 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3414 auto One = B.buildFConstant(Ty, 1.0);
3415 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3416 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3417 MI.eraseFromParent();
3418 return true;
3419}
3420
3422 Register X, unsigned Flags) const {
3423 LLT Ty = B.getMRI()->getType(Dst);
3424 LLT F32 = LLT::scalar(32);
3425
3426 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3427 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3428 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3429
3430 if (Ty == F32) {
3431 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3432 .addUse(Mul.getReg(0))
3433 .setMIFlags(Flags);
3434 } else {
3435 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3436 }
3437
3438 return true;
3439 }
3440
3441 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3442 auto NeedsScaling =
3443 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3444 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3445 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3446 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3447
3448 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3449 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3450
3451 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3452 .addUse(ExpInput.getReg(0))
3453 .setMIFlags(Flags);
3454
3455 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3456 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3457 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3458 return true;
3459}
3460
3462 MachineIRBuilder &B) const {
3463 Register Dst = MI.getOperand(0).getReg();
3464 Register X = MI.getOperand(1).getReg();
3465 const unsigned Flags = MI.getFlags();
3466 MachineFunction &MF = B.getMF();
3467 MachineRegisterInfo &MRI = *B.getMRI();
3468 LLT Ty = MRI.getType(Dst);
3469 const LLT F16 = LLT::scalar(16);
3470 const LLT F32 = LLT::scalar(32);
3471 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3472
3473 if (Ty == F16) {
3474 // v_exp_f16 (fmul x, log2e)
3475 if (allowApproxFunc(MF, Flags)) {
3476 // TODO: Does this really require fast?
3477 legalizeFExpUnsafe(B, Dst, X, Flags);
3478 MI.eraseFromParent();
3479 return true;
3480 }
3481
3482 // exp(f16 x) ->
3483 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3484
3485 // Nothing in half is a denormal when promoted to f32.
3486 auto Ext = B.buildFPExt(F32, X, Flags);
3487 Register Lowered = MRI.createGenericVirtualRegister(F32);
3488 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3489 B.buildFPTrunc(Dst, Lowered, Flags);
3490 MI.eraseFromParent();
3491 return true;
3492 }
3493
3494 assert(Ty == F32);
3495
3496 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3497 // library behavior. Also, is known-not-daz source sufficient?
3498 if (allowApproxFunc(MF, Flags)) {
3499 legalizeFExpUnsafe(B, Dst, X, Flags);
3500 MI.eraseFromParent();
3501 return true;
3502 }
3503
3504 // Algorithm:
3505 //
3506 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3507 //
3508 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3509 // n = 64*m + j, 0 <= j < 64
3510 //
3511 // e^x = 2^((64*m + j + f)/64)
3512 // = (2^m) * (2^(j/64)) * 2^(f/64)
3513 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3514 //
3515 // f = x*(64/ln(2)) - n
3516 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3517 //
3518 // e^x = (2^m) * (2^(j/64)) * e^r
3519 //
3520 // (2^(j/64)) is precomputed
3521 //
3522 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3523 // e^r = 1 + q
3524 //
3525 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3526 //
3527 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
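  //
  // The sequence emitted below follows the same idea but uses v_exp_f32 plus
  // ldexp instead of a 2^(j/64) table: x*log2(e) (or x*log2(10) for exp10) is
  // computed as an extended-precision pair PH + PL, E = roundeven(PH) supplies
  // the 2^E scaling via ldexp, and exp2 is evaluated on the small residual
  // (PH - E) + PL so little accuracy is lost to cancellation.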
3528 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3529 Register PH, PL;
3530
3531 if (ST.hasFastFMAF32()) {
3532 const float c_exp = numbers::log2ef;
3533 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3534 const float c_exp10 = 0x1.a934f0p+1f;
3535 const float cc_exp10 = 0x1.2f346ep-24f;
3536
3537 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3538 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3539 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3540 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3541
3542 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3543 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3544 } else {
3545 const float ch_exp = 0x1.714000p+0f;
3546 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3547
3548 const float ch_exp10 = 0x1.a92000p+1f;
3549 const float cl_exp10 = 0x1.4f0978p-11f;
3550
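    // Without fast FMA, split x at its low 12 mantissa bits so the product of
    // the high parts fits in an f32 significand; the dropped low parts are
    // folded back in with the two MADs below.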
3551 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3552 auto XH = B.buildAnd(Ty, X, MaskConst);
3553 auto XL = B.buildFSub(Ty, X, XH, Flags);
3554
3555 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3556 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3557
3558 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3559 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3560
3561 Register Mad0 =
3562 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3563 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3564 }
3565
3566 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3567
3568 // It is unsafe to contract this fsub into the PH multiply.
3569 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3570 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3571 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3572
3573 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3574 .addUse(A.getReg(0))
3575 .setMIFlags(Flags);
3576 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3577
3578 auto UnderflowCheckConst =
3579 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3580 auto Zero = B.buildFConstant(Ty, 0.0);
3581 auto Underflow =
3582 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3583
3584 R = B.buildSelect(Ty, Underflow, Zero, R);
3585
3586 const auto &Options = MF.getTarget().Options;
3587
3588 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3589 auto OverflowCheckConst =
3590 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3591
3592 auto Overflow =
3593 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3594 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3595 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3596 }
3597
3598 B.buildCopy(Dst, R);
3599 MI.eraseFromParent();
3600 return true;
3601}
3602
3603 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3604                                        MachineIRBuilder &B) const {
3605 Register Dst = MI.getOperand(0).getReg();
3606 Register Src0 = MI.getOperand(1).getReg();
3607 Register Src1 = MI.getOperand(2).getReg();
3608 unsigned Flags = MI.getFlags();
3609 LLT Ty = B.getMRI()->getType(Dst);
3610 const LLT F16 = LLT::float16();
3611 const LLT F32 = LLT::float32();
3612
3613 if (Ty == F32) {
3614 auto Log = B.buildFLog2(F32, Src0, Flags);
3615 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3616 .addUse(Log.getReg(0))
3617 .addUse(Src1)
3618 .setMIFlags(Flags);
3619 B.buildFExp2(Dst, Mul, Flags);
3620 } else if (Ty == F16) {
3621 // There's no f16 fmul_legacy, so we need to convert for it.
3622 auto Log = B.buildFLog2(F16, Src0, Flags);
3623 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3624 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3625 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3626 .addUse(Ext0.getReg(0))
3627 .addUse(Ext1.getReg(0))
3628 .setMIFlags(Flags);
3629 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3630 } else
3631 return false;
3632
3633 MI.eraseFromParent();
3634 return true;
3635}
3636
3637// Find a source register, ignoring any possible source modifiers.
3638 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3639  Register ModSrc = OrigSrc;
3640 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3641 ModSrc = SrcFNeg->getOperand(1).getReg();
3642 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3643 ModSrc = SrcFAbs->getOperand(1).getReg();
3644 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3645 ModSrc = SrcFAbs->getOperand(1).getReg();
3646 return ModSrc;
3647}
3648
3649 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3650                                          MachineRegisterInfo &MRI,
3651                                          MachineIRBuilder &B) const {
3652
3653 const LLT S1 = LLT::scalar(1);
3654 const LLT F64 = LLT::float64();
3655 Register Dst = MI.getOperand(0).getReg();
3656 Register OrigSrc = MI.getOperand(1).getReg();
3657 unsigned Flags = MI.getFlags();
3658 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3659 "this should not have been custom lowered");
3660
3661 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3662 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3663 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3664 // V_FRACT bug is:
3665 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3666 //
3667 // Convert floor(x) to (x - fract(x))
3668
3669 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3670 .addUse(OrigSrc)
3671 .setMIFlags(Flags);
3672
3673 // Give source modifier matching some assistance before obscuring a foldable
3674 // pattern.
3675
3676 // TODO: We can avoid the neg on the fract? The input sign to fract
3677 // shouldn't matter?
3678 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3679
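  // 0x3fefffffffffffff is the largest f64 strictly below 1.0, i.e. the
  // 0.99999999999999999 clamp from the workaround formula above.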
3680 auto Const =
3681 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3682
3683 Register Min = MRI.createGenericVirtualRegister(F64);
3684
3685 // We don't need to concern ourselves with the snan handling difference, so
3686 // use the one which will directly select.
3687 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3688 if (MFI->getMode().IEEE)
3689 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3690 else
3691 B.buildFMinNum(Min, Fract, Const, Flags);
3692
3693 Register CorrectedFract = Min;
3694 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3695 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3696 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3697 }
3698
3699 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3700 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3701
3702 MI.eraseFromParent();
3703 return true;
3704}
3705
3706// Turn an illegal packed v2s16 build vector into bit operations.
3707// TODO: This should probably be a bitcast action in LegalizerHelper.
3708 bool AMDGPULegalizerInfo::legalizeBuildVector(
3709     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3710  Register Dst = MI.getOperand(0).getReg();
3711 const LLT S32 = LLT::scalar(32);
3712 const LLT S16 = LLT::scalar(16);
3713 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3714
3715 Register Src0 = MI.getOperand(1).getReg();
3716 Register Src1 = MI.getOperand(2).getReg();
3717
3718 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3719 assert(MRI.getType(Src0) == S32);
3720 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3721 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3722 }
3723
3724 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3725 B.buildBitcast(Dst, Merge);
3726
3727 MI.eraseFromParent();
3728 return true;
3729}
3730
3731// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3732//
3733// Source and accumulation registers must all be 32-bits.
3734//
3735// TODO: When the multiply is uniform, we should produce a code sequence
3736// that is better suited to instruction selection on the SALU. Instead of
3737// the outer loop going over parts of the result, the outer loop should go
3738// over parts of one of the factors. This should result in instruction
3739// selection that makes full use of S_ADDC_U32 instructions.
3740 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3741                                         MutableArrayRef<Register> Accum,
3742                                         ArrayRef<Register> Src0,
3743 ArrayRef<Register> Src1,
3744 bool UsePartialMad64_32,
3745 bool SeparateOddAlignedProducts) const {
3746 // Use (possibly empty) vectors of S1 registers to represent the set of
3747 // carries from one pair of positions to the next.
3748 using Carry = SmallVector<Register, 2>;
3749
3750 MachineIRBuilder &B = Helper.MIRBuilder;
3751 GISelKnownBits &KB = *Helper.getKnownBits();
3752
3753 const LLT S1 = LLT::scalar(1);
3754 const LLT S32 = LLT::scalar(32);
3755 const LLT S64 = LLT::scalar(64);
3756
3757 Register Zero32;
3758 Register Zero64;
3759
3760 auto getZero32 = [&]() -> Register {
3761 if (!Zero32)
3762 Zero32 = B.buildConstant(S32, 0).getReg(0);
3763 return Zero32;
3764 };
3765 auto getZero64 = [&]() -> Register {
3766 if (!Zero64)
3767 Zero64 = B.buildConstant(S64, 0).getReg(0);
3768 return Zero64;
3769 };
3770
3771 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3772 for (unsigned i = 0; i < Src0.size(); ++i) {
3773 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3774 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3775 }
3776
3777 // Merge the given carries into the 32-bit LocalAccum, which is modified
3778 // in-place.
3779 //
3780 // Returns the carry-out, which is a single S1 register or null.
3781 auto mergeCarry =
3782 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3783 if (CarryIn.empty())
3784 return Register();
3785
3786 bool HaveCarryOut = true;
3787 Register CarryAccum;
3788 if (CarryIn.size() == 1) {
3789 if (!LocalAccum) {
3790 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3791 return Register();
3792 }
3793
3794 CarryAccum = getZero32();
3795 } else {
3796 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3797 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3798 CarryAccum =
3799 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3800 .getReg(0);
3801 }
3802
3803 if (!LocalAccum) {
3804 LocalAccum = getZero32();
3805 HaveCarryOut = false;
3806 }
3807 }
3808
3809 auto Add =
3810 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3811 LocalAccum = Add.getReg(0);
3812 return HaveCarryOut ? Add.getReg(1) : Register();
3813 };
3814
3815 // Build a multiply-add chain to compute
3816 //
3817 // LocalAccum + (partial products at DstIndex)
3818 // + (opportunistic subset of CarryIn)
3819 //
3820 // LocalAccum is an array of one or two 32-bit registers that are updated
3821 // in-place. The incoming registers may be null.
3822 //
3823 // In some edge cases, carry-ins can be consumed "for free". In that case,
3824 // the consumed carry bits are removed from CarryIn in-place.
3825 auto buildMadChain =
3826 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3827 -> Carry {
3828 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3829 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3830
3831 Carry CarryOut;
3832 unsigned j0 = 0;
3833
3834 // Use plain 32-bit multiplication for the most significant part of the
3835 // result by default.
3836 if (LocalAccum.size() == 1 &&
3837 (!UsePartialMad64_32 || !CarryIn.empty())) {
3838 do {
3839 // Skip multiplication if one of the operands is 0
3840 unsigned j1 = DstIndex - j0;
3841 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3842 ++j0;
3843 continue;
3844 }
3845 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3846 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3847 LocalAccum[0] = Mul.getReg(0);
3848 } else {
3849 if (CarryIn.empty()) {
3850 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3851 } else {
3852 LocalAccum[0] =
3853 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3854 .getReg(0);
3855 CarryIn.pop_back();
3856 }
3857 }
3858 ++j0;
3859 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3860 }
3861
3862 // Build full 64-bit multiplies.
3863 if (j0 <= DstIndex) {
3864 bool HaveSmallAccum = false;
3865 Register Tmp;
3866
3867 if (LocalAccum[0]) {
3868 if (LocalAccum.size() == 1) {
3869 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3870 HaveSmallAccum = true;
3871 } else if (LocalAccum[1]) {
3872 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3873 HaveSmallAccum = false;
3874 } else {
3875 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3876 HaveSmallAccum = true;
3877 }
3878 } else {
3879 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3880 Tmp = getZero64();
3881 HaveSmallAccum = true;
3882 }
3883
3884 do {
3885 unsigned j1 = DstIndex - j0;
3886 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3887 ++j0;
3888 continue;
3889 }
3890 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3891 {Src0[j0], Src1[j1], Tmp});
3892 Tmp = Mad.getReg(0);
3893 if (!HaveSmallAccum)
3894 CarryOut.push_back(Mad.getReg(1));
3895 HaveSmallAccum = false;
3896
3897 ++j0;
3898 } while (j0 <= DstIndex);
3899
3900 auto Unmerge = B.buildUnmerge(S32, Tmp);
3901 LocalAccum[0] = Unmerge.getReg(0);
3902 if (LocalAccum.size() > 1)
3903 LocalAccum[1] = Unmerge.getReg(1);
3904 }
3905
3906 return CarryOut;
3907 };
3908
3909 // Outer multiply loop, iterating over destination parts from least
3910 // significant to most significant parts.
3911 //
3912 // The columns of the following diagram correspond to the destination parts
3913 // affected by one iteration of the outer loop (ignoring boundary
3914 // conditions).
3915 //
3916 // Dest index relative to 2 * i: 1 0 -1
3917 // ------
3918 // Carries from previous iteration: e o
3919 // Even-aligned partial product sum: E E .
3920 // Odd-aligned partial product sum: O O
3921 //
3922 // 'o' is OddCarry, 'e' is EvenCarry.
3923 // EE and OO are computed from partial products via buildMadChain and use
3924 // accumulation where possible and appropriate.
3925 //
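  // For example, a 64 x 64 -> 64-bit multiply (two 32-bit parts) takes two
  // iterations: i == 0 forms Accum[0..1] from Src0[0]*Src1[0], and i == 1
  // folds the odd-aligned products Src0[0]*Src1[1] and Src0[1]*Src1[0] into
  // Accum[1], merging any leftover carries.
  //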
3926 Register SeparateOddCarry;
3927 Carry EvenCarry;
3928 Carry OddCarry;
3929
3930 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3931 Carry OddCarryIn = std::move(OddCarry);
3932 Carry EvenCarryIn = std::move(EvenCarry);
3933 OddCarry.clear();
3934 EvenCarry.clear();
3935
3936 // Partial products at offset 2 * i.
3937 if (2 * i < Accum.size()) {
3938 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3939 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3940 }
3941
3942 // Partial products at offset 2 * i - 1.
3943 if (i > 0) {
3944 if (!SeparateOddAlignedProducts) {
3945 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3946 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3947 } else {
3948 bool IsHighest = 2 * i >= Accum.size();
3949 Register SeparateOddOut[2];
3950 auto LocalAccum = MutableArrayRef(SeparateOddOut)
3951 .take_front(IsHighest ? 1 : 2);
3952 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3953
3954         MachineInstrBuilder Lo;
3955
3956 if (i == 1) {
3957 if (!IsHighest)
3958 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3959 else
3960 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3961 } else {
3962 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3963 SeparateOddCarry);
3964 }
3965 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3966
3967 if (!IsHighest) {
3968 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3969 Lo->getOperand(1).getReg());
3970 Accum[2 * i] = Hi.getReg(0);
3971 SeparateOddCarry = Hi.getReg(1);
3972 }
3973 }
3974 }
3975
3976 // Add in the carries from the previous iteration
3977 if (i > 0) {
3978 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3979 EvenCarryIn.push_back(CarryOut);
3980
3981 if (2 * i < Accum.size()) {
3982 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3983 OddCarry.push_back(CarryOut);
3984 }
3985 }
3986 }
3987}
3988
3989// Custom narrowing of wide multiplies using wide multiply-add instructions.
3990//
3991// TODO: If the multiply is followed by an addition, we should attempt to
3992// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3993 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3994                                       MachineInstr &MI) const {
3995 assert(ST.hasMad64_32());
3996 assert(MI.getOpcode() == TargetOpcode::G_MUL);
3997
3998 MachineIRBuilder &B = Helper.MIRBuilder;
3999 MachineRegisterInfo &MRI = *B.getMRI();
4000
4001 Register DstReg = MI.getOperand(0).getReg();
4002 Register Src0 = MI.getOperand(1).getReg();
4003 Register Src1 = MI.getOperand(2).getReg();
4004
4005 LLT Ty = MRI.getType(DstReg);
4006 assert(Ty.isScalar());
4007
4008 unsigned Size = Ty.getSizeInBits();
4009 unsigned NumParts = Size / 32;
4010 assert((Size % 32) == 0);
4011 assert(NumParts >= 2);
4012
4013 // Whether to use MAD_64_32 for partial products whose high half is
4014 // discarded. This avoids some ADD instructions but risks false dependency
4015 // stalls on some subtargets in some cases.
4016 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4017
4018 // Whether to compute odd-aligned partial products separately. This is
4019 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4020 // in an even-aligned VGPR.
4021 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4022
4023 LLT S32 = LLT::scalar(32);
4024 SmallVector<Register, 2> Src0Parts, Src1Parts;
4025 for (unsigned i = 0; i < NumParts; ++i) {
4026 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4027 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4028 }
4029 B.buildUnmerge(Src0Parts, Src0);
4030 B.buildUnmerge(Src1Parts, Src1);
4031
4032 SmallVector<Register, 2> AccumRegs(NumParts);
4033 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4034 SeparateOddAlignedProducts);
4035
4036 B.buildMergeLikeInstr(DstReg, AccumRegs);
4037 MI.eraseFromParent();
4038 return true;
4039}
4040
4041// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4042// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4043// case with a single min instruction instead of a compare+select.
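// For example, G_CTLZ of an s32 source becomes umin(ffbh(x), 32): the FFBH/FFBL
// instructions return -1 (all ones) for a zero input, so the unsigned min with
// the bit width folds the zero case in.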
4044 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4045                                             MachineRegisterInfo &MRI,
4046                                             MachineIRBuilder &B) const {
4047 Register Dst = MI.getOperand(0).getReg();
4048 Register Src = MI.getOperand(1).getReg();
4049 LLT DstTy = MRI.getType(Dst);
4050 LLT SrcTy = MRI.getType(Src);
4051
4052 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4053 ? AMDGPU::G_AMDGPU_FFBH_U32
4054 : AMDGPU::G_AMDGPU_FFBL_B32;
4055 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4056 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4057
4058 MI.eraseFromParent();
4059 return true;
4060}
4061
4062// Check that this is a G_XOR x, -1
4063static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4064 if (MI.getOpcode() != TargetOpcode::G_XOR)
4065 return false;
4066 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4067 return ConstVal && *ConstVal == -1;
4068}
4069
4070// Return the use branch instruction, otherwise null if the usage is invalid.
4071static MachineInstr *
4072 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4073                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4074 Register CondDef = MI.getOperand(0).getReg();
4075 if (!MRI.hasOneNonDBGUse(CondDef))
4076 return nullptr;
4077
4078 MachineBasicBlock *Parent = MI.getParent();
4079 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4080
4081 if (isNot(MRI, *UseMI)) {
4082 Register NegatedCond = UseMI->getOperand(0).getReg();
4083 if (!MRI.hasOneNonDBGUse(NegatedCond))
4084 return nullptr;
4085
4086 // We're deleting the def of this value, so we need to remove it.
4087 eraseInstr(*UseMI, MRI);
4088
4089 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4090 Negated = true;
4091 }
4092
4093 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4094 return nullptr;
4095
4096 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4097 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4098 if (Next == Parent->end()) {
4099 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4100 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4101 return nullptr;
4102 UncondBrTarget = &*NextMBB;
4103 } else {
4104 if (Next->getOpcode() != AMDGPU::G_BR)
4105 return nullptr;
4106 Br = &*Next;
4107 UncondBrTarget = Br->getOperand(0).getMBB();
4108 }
4109
4110 return UseMI;
4111}
4112
4113 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4114                                          const ArgDescriptor *Arg,
4115 const TargetRegisterClass *ArgRC,
4116 LLT ArgTy) const {
4117 MCRegister SrcReg = Arg->getRegister();
4118 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4119 assert(DstReg.isVirtual() && "Virtual register expected");
4120
4121 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4122 *ArgRC, B.getDebugLoc(), ArgTy);
4123 if (Arg->isMasked()) {
4124 // TODO: Should we try to emit this once in the entry block?
4125 const LLT S32 = LLT::scalar(32);
4126 const unsigned Mask = Arg->getMask();
4127 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4128
4129 Register AndMaskSrc = LiveIn;
4130
4131 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4132 // 0.
4133 if (Shift != 0) {
4134 auto ShiftAmt = B.buildConstant(S32, Shift);
4135 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4136 }
4137
4138 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4139 } else {
4140 B.buildCopy(DstReg, LiveIn);
4141 }
4142
4143 return true;
4144}
4145
4146 bool AMDGPULegalizerInfo::loadInputValue(
4147     Register DstReg, MachineIRBuilder &B,
4148     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4149  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4150 const ArgDescriptor *Arg;
4151 const TargetRegisterClass *ArgRC;
4152 LLT ArgTy;
4153 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4154
4155 if (!Arg) {
4156     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4157       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4158 // case the pointer argument may be missing and we use null.
4159 B.buildConstant(DstReg, 0);
4160 return true;
4161 }
4162
4163 // It's undefined behavior if a function marked with the amdgpu-no-*
4164 // attributes uses the corresponding intrinsic.
4165 B.buildUndef(DstReg);
4166 return true;
4167 }
4168
4169 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4170 return false; // TODO: Handle these
4171 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4172}
4173
4174 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4175     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4176     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4177  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4178 return false;
4179
4180 MI.eraseFromParent();
4181 return true;
4182}
4183
4184 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4185                                 int64_t C) {
4186 B.buildConstant(MI.getOperand(0).getReg(), C);
4187 MI.eraseFromParent();
4188 return true;
4189}
4190
4191 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4192     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4193     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4194 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4195 if (MaxID == 0)
4196 return replaceWithConstant(B, MI, 0);
4197
4198 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4199 const ArgDescriptor *Arg;
4200 const TargetRegisterClass *ArgRC;
4201 LLT ArgTy;
4202 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4203
4204 Register DstReg = MI.getOperand(0).getReg();
4205 if (!Arg) {
4206 // It's undefined behavior if a function marked with the amdgpu-no-*
4207 // attributes uses the corresponding intrinsic.
4208 B.buildUndef(DstReg);
4209 MI.eraseFromParent();
4210 return true;
4211 }
4212
4213 if (Arg->isMasked()) {
4214 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4215 // masking operations anyway.
4216 //
4217 // TODO: We could assert the top bit is 0 for the source copy.
4218 if (!loadInputValue(DstReg, B, ArgType))
4219 return false;
4220 } else {
4221 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4222 if (!loadInputValue(TmpReg, B, ArgType))
4223 return false;
4224 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4225 }
4226
4227 MI.eraseFromParent();
4228 return true;
4229}
4230
4231 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4232                                                      int64_t Offset) const {
4233  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4234  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4235
4236 // TODO: If we passed in the base kernel offset we could have a better
4237 // alignment than 4, but we don't really need it.
4238 if (!loadInputValue(KernArgReg, B,
4239                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4240    llvm_unreachable("failed to find kernarg segment ptr");
4241
4242 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4243 // TODO: Should get nuw
4244 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4245}
4246
4247/// Legalize a value that's loaded from kernel arguments. This is only used by
4248/// legacy intrinsics.
4249 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4250                                                       MachineIRBuilder &B,
4251                                                       uint64_t Offset,
4252                                                       Align Alignment) const {
4253 Register DstReg = MI.getOperand(0).getReg();
4254
4255 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4256 "unexpected kernarg parameter type");
4257
4258  Register Ptr = getKernargParameterPtr(B, Offset);
4259  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS, Offset);
4260  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4261              MachineMemOperand::MODereferenceable |
4262                  MachineMemOperand::MOInvariant);
4263  MI.eraseFromParent();
4264 return true;
4265}
4266
4267 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4268                                        MachineRegisterInfo &MRI,
4269                                        MachineIRBuilder &B) const {
4270 Register Dst = MI.getOperand(0).getReg();
4271 LLT DstTy = MRI.getType(Dst);
4272 LLT S16 = LLT::scalar(16);
4273 LLT S32 = LLT::scalar(32);
4274 LLT S64 = LLT::scalar(64);
4275
4276 if (DstTy == S16)
4277 return legalizeFDIV16(MI, MRI, B);
4278 if (DstTy == S32)
4279 return legalizeFDIV32(MI, MRI, B);
4280 if (DstTy == S64)
4281 return legalizeFDIV64(MI, MRI, B);
4282
4283 return false;
4284}
4285
4286 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4287                                                         Register DstDivReg,
4288 Register DstRemReg,
4289 Register X,
4290 Register Y) const {
4291 const LLT S1 = LLT::scalar(1);
4292 const LLT S32 = LLT::scalar(32);
4293
4294 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4295 // algorithm used here.
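  // In short: start from a float reciprocal of y scaled to a 32-bit
  // fixed-point estimate of 2^32/y, apply one Newton-Raphson step, form the
  // quotient as umulh(x, z) and the remainder as x - q*y, then correct both at
  // most twice.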
4296
4297 // Initial estimate of inv(y).
4298 auto FloatY = B.buildUITOFP(S32, Y);
4299 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4300 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4301 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4302 auto Z = B.buildFPTOUI(S32, ScaledY);
4303
4304 // One round of UNR.
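  // z <- z + umulh(z, -y * z), i.e. one Newton-Raphson step on the 2^32/y
  // fixed-point estimate.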
4305 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4306 auto NegYZ = B.buildMul(S32, NegY, Z);
4307 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4308
4309 // Quotient/remainder estimate.
4310 auto Q = B.buildUMulH(S32, X, Z);
4311 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4312
4313 // First quotient/remainder refinement.
4314 auto One = B.buildConstant(S32, 1);
4315 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4316 if (DstDivReg)
4317 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4318 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4319
4320 // Second quotient/remainder refinement.
4321 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4322 if (DstDivReg)
4323 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4324
4325 if (DstRemReg)
4326 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4327}
4328
4329// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4330//
4331// Return lo, hi of result
4332//
4333// %cvt.lo = G_UITOFP Val.lo
4334// %cvt.hi = G_UITOFP Val.hi
4335// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4336// %rcp = G_AMDGPU_RCP_IFLAG %mad
4337// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4338// %mul2 = G_FMUL %mul1, 2**(-32)
4339// %trunc = G_INTRINSIC_TRUNC %mul2
4340// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4341// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
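// In other words, roughly a 64-bit fixed-point estimate of 2^64 / Val,
// returned as {lo, hi} 32-bit halves; the caller refines it further.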
4342static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4343 Register Val) {
4344 const LLT S32 = LLT::scalar(32);
4345 auto Unmerge = B.buildUnmerge(S32, Val);
4346
4347 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4348 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4349
4350 auto Mad = B.buildFMAD(
4351 S32, CvtHi, // 2**32
4352 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4353
4354 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4355 auto Mul1 = B.buildFMul(
4356 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4357
4358 // 2**(-32)
4359 auto Mul2 = B.buildFMul(
4360 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4361 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4362
4363 // -(2**32)
4364 auto Mad2 = B.buildFMAD(
4365 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4366 Mul1);
4367
4368 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4369 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4370
4371 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4372}
4373
4374 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4375                                                         Register DstDivReg,
4376 Register DstRemReg,
4377 Register Numer,
4378 Register Denom) const {
4379 const LLT S32 = LLT::scalar(32);
4380 const LLT S64 = LLT::scalar(64);
4381 const LLT S1 = LLT::scalar(1);
4382 Register RcpLo, RcpHi;
4383
4384 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4385
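  // Refine the reciprocal estimate with two Newton-Raphson style iterations in
  // 64-bit fixed point, then take the quotient estimate as umulh(Numer, Rcp)
  // and correct the quotient/remainder at most twice (the C3/C6 selects below).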
4386 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4387
4388 auto Zero64 = B.buildConstant(S64, 0);
4389 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4390
4391 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4392 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4393
4394 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4395 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4396 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4397
4398 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4399 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4400 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4401
4402 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4403 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4404 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4405 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4406 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4407
4408 auto Zero32 = B.buildConstant(S32, 0);
4409 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4410 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4411 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4412
4413 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4414 Register NumerLo = UnmergeNumer.getReg(0);
4415 Register NumerHi = UnmergeNumer.getReg(1);
4416
4417 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4418 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4419 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4420 Register Mul3_Lo = UnmergeMul3.getReg(0);
4421 Register Mul3_Hi = UnmergeMul3.getReg(1);
4422 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4423 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4424 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4425 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4426
4427 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4428 Register DenomLo = UnmergeDenom.getReg(0);
4429 Register DenomHi = UnmergeDenom.getReg(1);
4430
4431 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4432 auto C1 = B.buildSExt(S32, CmpHi);
4433
4434 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4435 auto C2 = B.buildSExt(S32, CmpLo);
4436
4437 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4438 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4439
4440  // TODO: Here and below, portions of the code can be enclosed in if/endif.
4441  // Currently control flow is unconditional and we have 4 selects after the
4442  // potential endif to substitute PHIs.
4443
4444 // if C3 != 0 ...
4445 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4446 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4447 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4448 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4449
4450 auto One64 = B.buildConstant(S64, 1);
4451 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4452
4453 auto C4 =
4454 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4455 auto C5 =
4456 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4457 auto C6 = B.buildSelect(
4458 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4459
4460 // if (C6 != 0)
4461 auto Add4 = B.buildAdd(S64, Add3, One64);
4462 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4463
4464 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4465 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4466 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4467
4468 // endif C6
4469 // endif C3
4470
4471 if (DstDivReg) {
4472 auto Sel1 = B.buildSelect(
4473 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4474 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4475 Sel1, MulHi3);
4476 }
4477
4478 if (DstRemReg) {
4479 auto Sel2 = B.buildSelect(
4480 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4481 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4482 Sel2, Sub1);
4483 }
4484}
4485
4486 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4487                                                   MachineRegisterInfo &MRI,
4488                                                   MachineIRBuilder &B) const {
4489 Register DstDivReg, DstRemReg;
4490 switch (MI.getOpcode()) {
4491 default:
4492 llvm_unreachable("Unexpected opcode!");
4493 case AMDGPU::G_UDIV: {
4494 DstDivReg = MI.getOperand(0).getReg();
4495 break;
4496 }
4497 case AMDGPU::G_UREM: {
4498 DstRemReg = MI.getOperand(0).getReg();
4499 break;
4500 }
4501 case AMDGPU::G_UDIVREM: {
4502 DstDivReg = MI.getOperand(0).getReg();
4503 DstRemReg = MI.getOperand(1).getReg();
4504 break;
4505 }
4506 }
4507
4508 const LLT S64 = LLT::scalar(64);
4509 const LLT S32 = LLT::scalar(32);
4510 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4511 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4512 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4513 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4514
4515 if (Ty == S32)
4516 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4517 else if (Ty == S64)
4518 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4519 else
4520 return false;
4521
4522 MI.eraseFromParent();
4523 return true;
4524}
4525
4526 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4527                                                 MachineRegisterInfo &MRI,
4528                                                 MachineIRBuilder &B) const {
4529 const LLT S64 = LLT::scalar(64);
4530 const LLT S32 = LLT::scalar(32);
4531
4532 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4533 if (Ty != S32 && Ty != S64)
4534 return false;
4535
4536 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4537 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4538 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4539
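  // Reduce to the unsigned case: |x| = (x + (x >> bitwidth-1)) ^ (x >> bitwidth-1).
  // The quotient sign is LHSign ^ RHSign and the remainder takes LHS's sign;
  // both are restored after the unsigned division with the same xor/sub trick.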
4540 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4541 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4542 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4543
4544 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4545 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4546
4547 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4548 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4549
4550 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4551 switch (MI.getOpcode()) {
4552 default:
4553 llvm_unreachable("Unexpected opcode!");
4554 case AMDGPU::G_SDIV: {
4555 DstDivReg = MI.getOperand(0).getReg();
4556 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4557 break;
4558 }
4559 case AMDGPU::G_SREM: {
4560 DstRemReg = MI.getOperand(0).getReg();
4561 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4562 break;
4563 }
4564 case AMDGPU::G_SDIVREM: {
4565 DstDivReg = MI.getOperand(0).getReg();
4566 DstRemReg = MI.getOperand(1).getReg();
4567 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4568 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4569 break;
4570 }
4571 }
4572
4573 if (Ty == S32)
4574 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4575 else
4576 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4577
4578 if (DstDivReg) {
4579 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4580 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4581 B.buildSub(DstDivReg, SignXor, Sign);
4582 }
4583
4584 if (DstRemReg) {
4585 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4586 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4587 B.buildSub(DstRemReg, SignXor, Sign);
4588 }
4589
4590 MI.eraseFromParent();
4591 return true;
4592}
4593
4594 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4595                                                  MachineRegisterInfo &MRI,
4596                                                  MachineIRBuilder &B) const {
4597 Register Res = MI.getOperand(0).getReg();
4598 Register LHS = MI.getOperand(1).getReg();
4599 Register RHS = MI.getOperand(2).getReg();
4600 uint16_t Flags = MI.getFlags();
4601 LLT ResTy = MRI.getType(Res);
4602
4603 const MachineFunction &MF = B.getMF();
4604  bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4605                            MF.getTarget().Options.UnsafeFPMath;
4606
4607 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4608 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4609 return false;
4610
4611 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4612  // the CI documentation have a worst-case error of 1 ulp.
4613 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4614 // use it as long as we aren't trying to use denormals.
4615 //
4616 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4617
4618 // 1 / x -> RCP(x)
4619 if (CLHS->isExactlyValue(1.0)) {
4620 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4621 .addUse(RHS)
4622 .setMIFlags(Flags);
4623
4624 MI.eraseFromParent();
4625 return true;
4626 }
4627
4628 // -1 / x -> RCP( FNEG(x) )
4629 if (CLHS->isExactlyValue(-1.0)) {
4630 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4631 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4632 .addUse(FNeg.getReg(0))
4633 .setMIFlags(Flags);
4634
4635 MI.eraseFromParent();
4636 return true;
4637 }
4638 }
4639
4640 // For f16 require afn or arcp.
4641 // For f32 require afn.
4642 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4643 !MI.getFlag(MachineInstr::FmArcp)))
4644 return false;
4645
4646 // x / y -> x * (1.0 / y)
4647 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4648 .addUse(RHS)
4649 .setMIFlags(Flags);
4650 B.buildFMul(Res, LHS, RCP, Flags);
4651
4652 MI.eraseFromParent();
4653 return true;
4654}
4655
4656 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4657                                                    MachineRegisterInfo &MRI,
4658                                                    MachineIRBuilder &B) const {
4659 Register Res = MI.getOperand(0).getReg();
4660 Register X = MI.getOperand(1).getReg();
4661 Register Y = MI.getOperand(2).getReg();
4662 uint16_t Flags = MI.getFlags();
4663 LLT ResTy = MRI.getType(Res);
4664
4665 const MachineFunction &MF = B.getMF();
4666 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4667 MI.getFlag(MachineInstr::FmAfn);
4668
4669 if (!AllowInaccurateRcp)
4670 return false;
4671
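  // Refine rcp(y) with two Newton-Raphson steps, r <- r + r*(1 - y*r), then
  // correct the product once more: result = fma(fma(-y, x*r, x), r, x*r).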
4672 auto NegY = B.buildFNeg(ResTy, Y);
4673 auto One = B.buildFConstant(ResTy, 1.0);
4674
4675 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4676 .addUse(Y)
4677 .setMIFlags(Flags);
4678
4679 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4680 R = B.buildFMA(ResTy, Tmp0, R, R);
4681
4682 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4683 R = B.buildFMA(ResTy, Tmp1, R, R);
4684
4685 auto Ret = B.buildFMul(ResTy, X, R);
4686 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4687
4688 B.buildFMA(Res, Tmp2, R, Ret);
4689 MI.eraseFromParent();
4690 return true;
4691}
4692
4693 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4694                                          MachineRegisterInfo &MRI,
4695                                          MachineIRBuilder &B) const {
4696  if (legalizeFastUnsafeFDIV(MI, MRI, B))
4697    return true;
4698
4699 Register Res = MI.getOperand(0).getReg();
4700 Register LHS = MI.getOperand(1).getReg();
4701 Register RHS = MI.getOperand(2).getReg();
4702
4703 uint16_t Flags = MI.getFlags();
4704
4705 LLT S16 = LLT::scalar(16);
4706 LLT S32 = LLT::scalar(32);
4707
4708 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4709 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4710
4711 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4712 .addUse(RHSExt.getReg(0))
4713 .setMIFlags(Flags);
4714
4715 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4716 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4717
4718 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4719 .addUse(RDst.getReg(0))
4720 .addUse(RHS)
4721 .addUse(LHS)
4722 .setMIFlags(Flags);
4723
4724 MI.eraseFromParent();
4725 return true;
4726}
4727
4728static const unsigned SPDenormModeBitField =
4731
4732// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4733// to enable denorm mode. When 'Enable' is false, disable denorm mode.
4734 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4735                                const GCNSubtarget &ST,
4736                                SIModeRegisterDefaults Mode) {
4737  // Set SP denorm mode to this value.
4738 unsigned SPDenormMode =
4739 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4740
4741 if (ST.hasDenormModeInst()) {
4742 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4743 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4744
4745 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4746 B.buildInstr(AMDGPU::S_DENORM_MODE)
4747 .addImm(NewDenormModeValue);
4748
4749 } else {
4750 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4751 .addImm(SPDenormMode)
4752 .addImm(SPDenormModeBitField);
4753 }
4754}
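// legalizeFDIV32 below brackets its FMA refinement chain with these switches
// (or with an S_GETREG/S_SETREG pair when the denormal mode is dynamic) so the
// intermediate FMAs see denormals even if the function's FP32 mode flushes them.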
4755
4756 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4757                                          MachineRegisterInfo &MRI,
4758                                          MachineIRBuilder &B) const {
4759  if (legalizeFastUnsafeFDIV(MI, MRI, B))
4760    return true;
4761
4762 Register Res = MI.getOperand(0).getReg();
4763 Register LHS = MI.getOperand(1).getReg();
4764 Register RHS = MI.getOperand(2).getReg();
4765 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4766 SIModeRegisterDefaults Mode = MFI->getMode();
4767
4768 uint16_t Flags = MI.getFlags();
4769
4770 LLT S32 = LLT::scalar(32);
4771 LLT S1 = LLT::scalar(1);
4772
4773 auto One = B.buildFConstant(S32, 1.0f);
4774
4775 auto DenominatorScaled =
4776 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4777 .addUse(LHS)
4778 .addUse(RHS)
4779 .addImm(0)
4780 .setMIFlags(Flags);
4781 auto NumeratorScaled =
4782 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4783 .addUse(LHS)
4784 .addUse(RHS)
4785 .addImm(1)
4786 .setMIFlags(Flags);
4787
4788 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4789 .addUse(DenominatorScaled.getReg(0))
4790 .setMIFlags(Flags);
4791 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4792
4793 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4794 const bool HasDynamicDenormals =
4795 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4796 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4797
4798 Register SavedSPDenormMode;
4799 if (!PreservesDenormals) {
4800 if (HasDynamicDenormals) {
4801 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4802 B.buildInstr(AMDGPU::S_GETREG_B32)
4803 .addDef(SavedSPDenormMode)
4804 .addImm(SPDenormModeBitField);
4805 }
4806 toggleSPDenormMode(true, B, ST, Mode);
4807 }
4808
4809 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4810 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4811 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4812 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4813 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4814 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4815
4816 if (!PreservesDenormals) {
4817 if (HasDynamicDenormals) {
4818 assert(SavedSPDenormMode);
4819 B.buildInstr(AMDGPU::S_SETREG_B32)
4820 .addReg(SavedSPDenormMode)
4821 .addImm(SPDenormModeBitField);
4822 } else
4823 toggleSPDenormMode(false, B, ST, Mode);
4824 }
4825
4826 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4827 .addUse(Fma4.getReg(0))
4828 .addUse(Fma1.getReg(0))
4829 .addUse(Fma3.getReg(0))
4830 .addUse(NumeratorScaled.getReg(1))
4831 .setMIFlags(Flags);
4832
4833 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4834 .addUse(Fmas.getReg(0))
4835 .addUse(RHS)
4836 .addUse(LHS)
4837 .setMIFlags(Flags);
4838
4839 MI.eraseFromParent();
4840 return true;
4841}
4842
4843 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4844                                          MachineRegisterInfo &MRI,
4845                                          MachineIRBuilder &B) const {
4846  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4847    return true;
4848
4849 Register Res = MI.getOperand(0).getReg();
4850 Register LHS = MI.getOperand(1).getReg();
4851 Register RHS = MI.getOperand(2).getReg();
4852
4853 uint16_t Flags = MI.getFlags();
4854
4855 LLT S64 = LLT::scalar(64);
4856 LLT S1 = LLT::scalar(1);
4857
4858 auto One = B.buildFConstant(S64, 1.0);
4859
4860 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4861 .addUse(LHS)
4862 .addUse(RHS)
4863 .addImm(0)
4864 .setMIFlags(Flags);
4865
4866 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4867
4868 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4869 .addUse(DivScale0.getReg(0))
4870 .setMIFlags(Flags);
4871
4872 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4873 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4874 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4875
4876 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4877 .addUse(LHS)
4878 .addUse(RHS)
4879 .addImm(1)
4880 .setMIFlags(Flags);
4881
4882 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4883 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4884 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4885
4886 Register Scale;
4887  if (!ST.hasUsableDivScaleConditionOutput()) {
4888    // Workaround a hardware bug on SI where the condition output from div_scale
4889 // is not usable.
4890
4891 LLT S32 = LLT::scalar(32);
4892
4893 auto NumUnmerge = B.buildUnmerge(S32, LHS);
4894 auto DenUnmerge = B.buildUnmerge(S32, RHS);
4895 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4896 auto Scale1Unmerge =