1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
21#include "SIInstrInfo.h"
23#include "SIRegisterInfo.h"
25#include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(false),
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
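// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.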
56static LLT getPow2VectorType(LLT Ty) {
57  unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
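// e.g. s96 -> s128 and <3 x s16> (48 bits) -> s64; the result is always a scalar.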
63static LLT getPow2ScalarType(LLT Ty) {
64  unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
68
69/// \returns true if this is an odd sized vector which should widen by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
109
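// Reduce the element count so a piece fits in 64 bits; e.g. for <5 x s32>
// (160 bits) the suggested narrower type is <2 x s32>.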
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111  return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
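// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits), <5 x s16> (80 bits) -> <6 x s16> (96 bits).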
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142  return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160}
161
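// Scalar form used to stand in for a buffer resource (p8): s128, or a vector
// of s128 for a vector of resources.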
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163  if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
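// Register form used for a buffer resource: <4 x s32>, or <4*N x s32> for a
// vector of N resources.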
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170  if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177  const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196  return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202  };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231  const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
253// Any combination of 32 or 64-bit elements up to the maximum register size, and
254// multiples of v2s16.
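// e.g. s32, s64, s1024, v2s16, v2s32 and v16s64 qualify, while v3s16 (48 bits)
// and v2s8 (16 bits) do not.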
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269  };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(1);
283static const LLT S8 = LLT::scalar(8);
284static const LLT S16 = LLT::scalar(16);
285static const LLT S32 = LLT::scalar(32);
286static const LLT F32 = LLT::float32();
287static const LLT S64 = LLT::scalar(64);
288static const LLT F64 = LLT::float64();
289static const LLT S96 = LLT::scalar(96);
290static const LLT S128 = LLT::scalar(128);
291static const LLT S160 = LLT::scalar(160);
292static const LLT S224 = LLT::scalar(224);
293static const LLT S256 = LLT::scalar(256);
294static const LLT S512 = LLT::scalar(512);
296
297static const LLT V2S8 = LLT::fixed_vector(2, 8);
298static const LLT V2S16 = LLT::fixed_vector(2, 16);
299static const LLT V4S16 = LLT::fixed_vector(4, 16);
300static const LLT V6S16 = LLT::fixed_vector(6, 16);
301static const LLT V8S16 = LLT::fixed_vector(8, 16);
302static const LLT V10S16 = LLT::fixed_vector(10, 16);
303static const LLT V12S16 = LLT::fixed_vector(12, 16);
304static const LLT V16S16 = LLT::fixed_vector(16, 16);
305
306static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
307static const LLT V2BF16 = V2F16; // FIXME
308
309static const LLT V2S32 = LLT::fixed_vector(2, 32);
310static const LLT V3S32 = LLT::fixed_vector(3, 32);
311static const LLT V4S32 = LLT::fixed_vector(4, 32);
312static const LLT V5S32 = LLT::fixed_vector(5, 32);
313static const LLT V6S32 = LLT::fixed_vector(6, 32);
314static const LLT V7S32 = LLT::fixed_vector(7, 32);
315static const LLT V8S32 = LLT::fixed_vector(8, 32);
316static const LLT V9S32 = LLT::fixed_vector(9, 32);
317static const LLT V10S32 = LLT::fixed_vector(10, 32);
318static const LLT V11S32 = LLT::fixed_vector(11, 32);
319static const LLT V12S32 = LLT::fixed_vector(12, 32);
320static const LLT V16S32 = LLT::fixed_vector(16, 32);
321static const LLT V32S32 = LLT::fixed_vector(32, 32);
322
323static const LLT V2S64 = LLT::fixed_vector(2, 64);
324static const LLT V3S64 = LLT::fixed_vector(3, 64);
325static const LLT V4S64 = LLT::fixed_vector(4, 64);
326static const LLT V5S64 = LLT::fixed_vector(5, 64);
327static const LLT V6S64 = LLT::fixed_vector(6, 64);
328static const LLT V7S64 = LLT::fixed_vector(7, 64);
329static const LLT V8S64 = LLT::fixed_vector(8, 64);
330static const LLT V16S64 = LLT::fixed_vector(16, 64);
331
332static const LLT V2S128 = LLT::fixed_vector(2, 128);
333static const LLT V4S128 = LLT::fixed_vector(4, 128);
334
335static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
336 S160, S224, S256, S512};
337
338static std::initializer_list<LLT> AllS16Vectors{
339    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
340
341static std::initializer_list<LLT> AllS32Vectors = {
342    V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, V9S32,
343    V10S32, V11S32, V12S32, V16S32, V32S32};
344
345static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
346                                                    V6S64, V7S64, V8S64, V16S64};
347
348// Checks whether a type is in the list of legal register types.
349static bool isRegisterClassType(LLT Ty) {
352
355}
356
357static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358 return [TypeIdx](const LegalityQuery &Query) {
359 return isRegisterClassType(Query.Types[TypeIdx]);
360 };
361}
362
363// If we have a truncating store or an extending load with a data size larger
364// than 32-bits, we need to reduce to a 32-bit type.
365static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366  return [=](const LegalityQuery &Query) {
367 const LLT Ty = Query.Types[TypeIdx];
368 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
370 };
371}
372
373// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
374// handle some operations by just promoting the register during
375// selection. There are also d16 loads on GFX9+ which preserve the high bits.
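// For example, local (LDS) accesses are limited to 64 bits unless 128-bit DS
// operations are available, and private (scratch) accesses are limited to 32
// bits unless flat scratch is enabled.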
376static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377 bool IsLoad, bool IsAtomic) {
378 switch (AS) {
379  case AMDGPUAS::PRIVATE_ADDRESS:
380    // FIXME: Private element size.
381    return ST.enableFlatScratch() ? 128 : 32;
382  case AMDGPUAS::LOCAL_ADDRESS:
383    return ST.useDS128() ? 128 : 64;
384  case AMDGPUAS::GLOBAL_ADDRESS:
385  case AMDGPUAS::CONSTANT_ADDRESS:
386  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
387  case AMDGPUAS::BUFFER_RESOURCE:
388    // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
393 // kernel.
394 return IsLoad ? 512 : 128;
395 default:
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
400 }
401}
402
403static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404 const LegalityQuery &Query) {
405 const LLT Ty = Query.Types[0];
406
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
409
410 unsigned RegSize = Ty.getSizeInBits();
411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413 unsigned AS = Query.Types[1].getAddressSpace();
414
415 // All of these need to be custom lowered to cast the pointer operand.
417 return false;
418
419 // Do not handle extending vector loads.
420 if (Ty.isVector() && MemSize != RegSize)
421 return false;
422
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
425#if 0
426 // Accept widening loads based on alignment.
427 if (IsLoad && MemSize < Size)
428 MemSize = std::max(MemSize, Align);
429#endif
430
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize != RegSize && RegSize != 32)
433 return false;
434
435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436 Query.MMODescrs[0].Ordering !=
437 AtomicOrdering::NotAtomic))
438 return false;
439
440 switch (MemSize) {
441 case 8:
442 case 16:
443 case 32:
444 case 64:
445 case 128:
446 break;
447 case 96:
448 if (!ST.hasDwordx3LoadStores())
449 return false;
450 break;
451 case 256:
452 case 512:
453 // These may contextually need to be broken down.
454 break;
455 default:
456 return false;
457 }
458
459 assert(RegSize >= MemSize);
460
461 if (AlignBits < MemSize) {
462 const SITargetLowering *TLI = ST.getTargetLowering();
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
464 Align(AlignBits / 8)))
465 return false;
466 }
467
468 return true;
469}
470
471// The newer buffer intrinsic forms take their resource arguments as
472// pointers in address space 8, aka s128 values. However, in order to not break
473// SelectionDAG, the underlying operations have to continue to take v4i32
474// arguments. Therefore, we convert resource pointers, or vectors of them,
475// to integer values here.
476static bool hasBufferRsrcWorkaround(const LLT Ty) {
477  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478    return true;
479 if (Ty.isVector()) {
480 const LLT ElemTy = Ty.getElementType();
481 return hasBufferRsrcWorkaround(ElemTy);
482 }
483 return false;
484}
485
486// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
487// work around this. Eventually it should ignore the type for loads and only care
488// about the size. Return true in cases where we will workaround this for now by
489// bitcasting.
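// e.g. an <8 x s16> load is legalized as a <4 x s32> load followed by a bitcast.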
490static bool loadStoreBitcastWorkaround(const LLT Ty) {
491  if (EnableNewLegality)
492    return false;
493
494 const unsigned Size = Ty.getSizeInBits();
495 if (Size <= 64)
496 return false;
497 // Address space 8 pointers get their own workaround.
498  if (hasBufferRsrcWorkaround(Ty))
499    return false;
500 if (!Ty.isVector())
501 return true;
502
503 if (Ty.isPointerVector())
504 return true;
505
506 unsigned EltSize = Ty.getScalarSizeInBits();
507 return EltSize != 32 && EltSize != 64;
508}
509
510static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511 const LLT Ty = Query.Types[0];
512 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
514}
515
516/// Return true if a load or store of the type should be lowered with a bitcast
517/// to a different type.
518static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519 const LLT MemTy) {
520 const unsigned MemSizeInBits = MemTy.getSizeInBits();
521 const unsigned Size = Ty.getSizeInBits();
522 if (Size != MemSizeInBits)
523 return Size <= 32 && Ty.isVector();
524
525  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526    return true;
527
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530 (Size <= 32 || isRegisterSize(Size)) &&
531         !isRegisterVectorElementType(Ty.getElementType());
532}
533
534/// Return true if we should legalize a load by widening an odd sized memory
535/// access up to the alignment. Note this is the case where the memory access
536/// itself changes, not the size of the result register.
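/// For example, a 96-bit load with 128-bit alignment may be widened to a
/// 128-bit load on subtargets without dwordx3 load/stores.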
537static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538 uint64_t AlignInBits, unsigned AddrSpace,
539 unsigned Opcode) {
540 unsigned SizeInBits = MemoryTy.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(SizeInBits))
543 return false;
544
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549 return false;
550
551 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
552 return false;
553
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
555 // to it.
556 //
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize = NextPowerOf2(SizeInBits);
559 if (AlignInBits < RoundedSize)
560 return false;
561
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering *TLI = ST.getTargetLowering();
564 unsigned Fast = 0;
565  return TLI->allowsMisalignedMemoryAccessesImpl(
566             RoundedSize, AddrSpace, Align(AlignInBits / 8),
567             MachineMemOperand::MOLoad, &Fast) &&
568         Fast;
569}
570
571static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572 unsigned Opcode) {
573 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574 return false;
575
576 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
577 Query.MMODescrs[0].AlignInBits,
578 Query.Types[1].getAddressSpace(), Opcode);
579}
580
581/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
582/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
583/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
584static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
585                                   MachineRegisterInfo &MRI, unsigned Idx) {
586 MachineOperand &MO = MI.getOperand(Idx);
587
588 const LLT PointerTy = MRI.getType(MO.getReg());
589
590 // Paranoidly prevent us from doing this multiple times.
591  if (!hasBufferRsrcWorkaround(PointerTy))
592    return PointerTy;
593
594 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
595 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
596 if (!PointerTy.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
599 const LLT S32 = LLT::scalar(32);
600
601 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
602 std::array<Register, 4> VectorElems;
603 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
604 for (unsigned I = 0; I < NumParts; ++I)
605 VectorElems[I] =
606 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
607 B.buildMergeValues(MO, VectorElems);
608 MO.setReg(VectorReg);
609 return VectorTy;
610 }
611 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
612 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
613 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
614 B.buildIntToPtr(MO, Scalar);
615 MO.setReg(BitcastReg);
616
617 return VectorTy;
618}
619
620/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
621/// the form in which the value must be in order to be passed to the low-level
622/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
623/// needed in order to account for the fact that we can't define a register
624/// class for s128 without breaking SelectionDAG.
625static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626  MachineRegisterInfo &MRI = *B.getMRI();
627 const LLT PointerTy = MRI.getType(Pointer);
628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
630
631 if (!PointerTy.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector<Register, 4> PointerParts;
634 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
636 for (unsigned I = 0; I < NumParts; ++I)
637 PointerParts.push_back(Unmerged.getReg(I));
638 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
639 }
640 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
641 return B.buildBitcast(VectorTy, Scalar).getReg(0);
642}
643
644static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645                                     unsigned Idx) {
646 MachineOperand &MO = MI.getOperand(Idx);
647
648 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650  if (!hasBufferRsrcWorkaround(PointerTy))
651    return;
652  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
653}
654
655AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656                                         const GCNTargetMachine &TM)
657 : ST(ST_) {
658 using namespace TargetOpcode;
659
660 auto GetAddrSpacePtr = [&TM](unsigned AS) {
661 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
662 };
663
664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673 const LLT BufferStridedPtr =
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
675
676 const LLT CodePtr = FlatPtr;
677
678 const std::initializer_list<LLT> AddrSpaces64 = {
679 GlobalPtr, ConstantPtr, FlatPtr
680 };
681
682 const std::initializer_list<LLT> AddrSpaces32 = {
683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
684 };
685
686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
687
688 const std::initializer_list<LLT> FPTypesBase = {
689 S32, S64
690 };
691
692 const std::initializer_list<LLT> FPTypes16 = {
693 S32, S64, S16
694 };
695
696 const std::initializer_list<LLT> FPTypesPK16 = {
697 S32, S64, S16, V2S16
698 };
699
700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
701
702 // s1 for VCC branches, s32 for SCC branches.
703  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
704
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
707  getActionDefinitionsBuilder(G_PHI)
708      .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
709      .legalFor(AllS32Vectors)
710      .legalFor(AllS64Vectors)
711      .legalFor(AddrSpaces64)
712      .legalFor(AddrSpaces32)
713      .legalFor(AddrSpaces128)
714      .legalIf(isPointer(0))
715      .clampScalar(0, S16, S256)
716      .widenScalarToNextPow2(0, 32)
717      .clampMaxNumElements(0, S32, 16)
718      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
719      .scalarize(0);
720
721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder({G_ADD, G_SUB})
725 .legalFor({S64, S32, S16, V2S16})
726 .clampMaxNumElementsStrict(0, S16, 2)
727 .scalarize(0)
728 .minScalar(0, S16)
730 .maxScalar(0, S32);
731 } else {
732 getActionDefinitionsBuilder({G_ADD, G_SUB})
733 .legalFor({S32, S16, V2S16})
734 .clampMaxNumElementsStrict(0, S16, 2)
735 .scalarize(0)
736 .minScalar(0, S16)
738 .maxScalar(0, S32);
739 }
740
741 if (ST.hasScalarSMulU64()) {
743 .legalFor({S64, S32, S16, V2S16})
744 .clampMaxNumElementsStrict(0, S16, 2)
745 .scalarize(0)
746 .minScalar(0, S16)
748 .custom();
749 } else {
751 .legalFor({S32, S16, V2S16})
752 .clampMaxNumElementsStrict(0, S16, 2)
753 .scalarize(0)
754 .minScalar(0, S16)
756 .custom();
757 }
758 assert(ST.hasMad64_32());
759
760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761 .legalFor({S32, S16, V2S16}) // Clamp modifier
762 .minScalarOrElt(0, S16)
764 .scalarize(0)
766 .lower();
767 } else if (ST.has16BitInsts()) {
768 getActionDefinitionsBuilder({G_ADD, G_SUB})
769 .legalFor({S32, S16})
770 .minScalar(0, S16)
772 .maxScalar(0, S32)
773 .scalarize(0);
774
776 .legalFor({S32, S16})
777 .scalarize(0)
778 .minScalar(0, S16)
779 .widenScalarToNextMultipleOf(0, 32)
780 .custom();
781 assert(ST.hasMad64_32());
782
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
786 .legalFor({S32, S16}) // Clamp modifier
787 .minScalar(0, S16)
788 .scalarize(0)
790 .lower();
791
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
795 .minScalar(0, S16)
796 .scalarize(0)
797 .lower();
798 } else {
799 getActionDefinitionsBuilder({G_ADD, G_SUB})
800 .legalFor({S32})
801 .widenScalarToNextMultipleOf(0, 32)
802 .clampScalar(0, S32, S32)
803 .scalarize(0);
804
805 auto &Mul = getActionDefinitionsBuilder(G_MUL)
806 .legalFor({S32})
807 .scalarize(0)
808 .minScalar(0, S32)
809 .widenScalarToNextMultipleOf(0, 32);
810
811 if (ST.hasMad64_32())
812 Mul.custom();
813 else
814 Mul.maxScalar(0, S32);
815
816 if (ST.hasIntClamp()) {
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32}) // Clamp modifier.
819 .scalarize(0)
820 .minScalarOrElt(0, S32)
821 .lower();
822 } else {
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
825 .minScalar(0, S32)
826 .scalarize(0)
827 .lower();
828 }
829
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
833 .minScalar(0, S32)
834 .scalarize(0)
835 .lower();
836 }
837
839 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840 .customFor({S32, S64})
841 .clampScalar(0, S32, S64)
843 .scalarize(0);
844
845 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
846 .legalFor({S32})
847 .maxScalar(0, S32);
848
849 if (ST.hasVOP3PInsts()) {
850 Mulh
851 .clampMaxNumElements(0, S8, 2)
852 .lowerFor({V2S8});
853 }
854
855 Mulh
856 .scalarize(0)
857 .lower();
858
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
862 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
863 .clampScalar(0, S32, S64)
867 .scalarize(0);
868
870 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871 .legalFor({{S32, S1}, {S32, S32}})
872 .clampScalar(0, S32, S32)
873 .scalarize(0);
874
876 // Don't worry about the size constraint.
878 .lower();
879
880  getActionDefinitionsBuilder(G_CONSTANT)
881    .legalFor({S1, S32, S64, S16, GlobalPtr,
882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883 .legalIf(isPointer(0))
884 .clampScalar(0, S32, S64)
885    .widenScalarToNextPow2(0);
886
887 getActionDefinitionsBuilder(G_FCONSTANT)
888 .legalFor({S32, S64, S16})
889 .clampScalar(0, S16, S64);
890
891 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
892 .legalIf(isRegisterType(0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
895 .legalFor({S1, S16})
896 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
899 .clampMaxNumElements(0, S32, 16);
900
901 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
902
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
906 .legalFor({{PrivatePtr, S32}});
907
908 getActionDefinitionsBuilder(G_STACKSAVE)
909 .customFor({PrivatePtr});
910 getActionDefinitionsBuilder(G_STACKRESTORE)
911 .legalFor({PrivatePtr});
912
913 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
914
915 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
916 .customIf(typeIsNot(0, PrivatePtr));
917
918 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
919
920 auto &FPOpActions = getActionDefinitionsBuilder(
921 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923 .legalFor({S32, S64});
924 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
925 .customFor({S32, S64});
926 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
927 .customFor({S32, S64});
928
929 if (ST.has16BitInsts()) {
930 if (ST.hasVOP3PInsts())
931 FPOpActions.legalFor({S16, V2S16});
932 else
933 FPOpActions.legalFor({S16});
934
935 TrigActions.customFor({S16});
936 FDIVActions.customFor({S16});
937 }
938
939 if (ST.hasPackedFP32Ops()) {
940 FPOpActions.legalFor({V2S32});
941 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
942 }
943
944 auto &MinNumMaxNum = getActionDefinitionsBuilder({
945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
946
947 if (ST.hasVOP3PInsts()) {
948 MinNumMaxNum.customFor(FPTypesPK16)
951 .clampScalar(0, S16, S64)
952 .scalarize(0);
953 } else if (ST.has16BitInsts()) {
954 MinNumMaxNum.customFor(FPTypes16)
955 .clampScalar(0, S16, S64)
956 .scalarize(0);
957 } else {
958 MinNumMaxNum.customFor(FPTypesBase)
959 .clampScalar(0, S32, S64)
960 .scalarize(0);
961 }
962
963 if (ST.hasVOP3PInsts())
964 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
965
966 FPOpActions
967 .scalarize(0)
968 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
969
970 TrigActions
971 .scalarize(0)
972 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
973
974 FDIVActions
975 .scalarize(0)
976 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
977
978 getActionDefinitionsBuilder({G_FNEG, G_FABS})
979 .legalFor(FPTypesPK16)
981 .scalarize(0)
982 .clampScalar(0, S16, S64);
983
984 if (ST.has16BitInsts()) {
986 .legalFor({S16})
987 .customFor({S32, S64})
988 .scalarize(0)
989 .unsupported();
991 .legalFor({S32, S64, S16})
992 .scalarize(0)
993 .clampScalar(0, S16, S64);
994
995 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
996 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
997 .scalarize(0)
998 .maxScalarIf(typeIs(0, S16), 1, S16)
999 .clampScalar(1, S32, S32)
1000 .lower();
1001
1003 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004 .scalarize(0)
1005 .lower();
1006 } else {
1008 .customFor({S32, S64, S16})
1009 .scalarize(0)
1010 .unsupported();
1011
1012
1013 if (ST.hasFractBug()) {
1015 .customFor({S64})
1016 .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 } else {
1021 .legalFor({S32, S64})
1022 .scalarize(0)
1023 .clampScalar(0, S32, S64);
1024 }
1025
1026 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1027 .legalFor({{S32, S32}, {S64, S32}})
1028 .scalarize(0)
1029 .clampScalar(0, S32, S64)
1030 .clampScalar(1, S32, S32)
1031 .lower();
1032
1034 .customFor({{S32, S32}, {S64, S32}})
1035 .scalarize(0)
1036 .minScalar(0, S32)
1037 .clampScalar(1, S32, S32)
1038 .lower();
1039 }
1040
1042 .legalFor({{S32, S64}, {S16, S32}})
1043 .scalarize(0)
1044 .lower();
1045
1047 .legalFor({{S64, S32}, {S32, S16}})
1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1049 .scalarize(0);
1050
1051 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1052 if (ST.has16BitInsts()) {
1053 FSubActions
1054 // Use actual fsub instruction
1055 .legalFor({S32, S16})
1056 // Must use fadd + fneg
1057 .lowerFor({S64, V2S16});
1058 } else {
1059 FSubActions
1060 // Use actual fsub instruction
1061 .legalFor({S32})
1062 // Must use fadd + fneg
1063 .lowerFor({S64, S16, V2S16});
1064 }
1065
1066 FSubActions
1067 .scalarize(0)
1068 .clampScalar(0, S32, S64);
1069
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073 FMad.customFor({S32, S16});
1074 else if (ST.hasMadMacF32Insts())
1075 FMad.customFor({S32});
1076 else if (ST.hasMadF16())
1077 FMad.customFor({S16});
1078 FMad.scalarize(0)
1079 .lower();
1080
1081 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1082 if (ST.has16BitInsts()) {
1083 FRem.customFor({S16, S32, S64});
1084 } else {
1085 FRem.minScalar(0, S32)
1086 .customFor({S32, S64});
1087 }
1088 FRem.scalarize(0);
1089
1090 // TODO: Do we need to clamp maximum bitwidth?
1092 .legalIf(isScalar(0))
1093 .legalFor({{V2S16, V2S32}})
1094 .clampMaxNumElements(0, S16, 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1099 .alwaysLegal();
1100
1101 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1102 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1103 {S32, S1}, {S64, S1}, {S16, S1}})
1104 .scalarize(0)
1105 .clampScalar(0, S32, S64)
1106 .widenScalarToNextPow2(1, 32);
1107
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1110 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1111 .lowerIf(typeIs(1, S1))
1112 .customFor({{S32, S64}, {S64, S64}});
1113 if (ST.has16BitInsts())
1114 IToFP.legalFor({{S16, S16}});
1115 IToFP.clampScalar(1, S32, S64)
1116 .minScalar(0, S32)
1117 .scalarize(0)
1119
1120 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1121 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1122 .customFor({{S64, S32}, {S64, S64}})
1123 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1124 if (ST.has16BitInsts())
1125 FPToI.legalFor({{S16, S16}});
1126 else
1127 FPToI.minScalar(1, S32);
1128
1129 FPToI.minScalar(0, S32)
1130 .widenScalarToNextPow2(0, 32)
1131 .scalarize(0)
1132 .lower();
1133
1134 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1135 .clampScalar(0, S16, S64)
1136 .scalarize(0)
1137 .lower();
1138
1139 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1140 .customFor({S16, S32})
1141 .scalarize(0)
1142 .lower();
1143
1144 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1145 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1146 .scalarize(0)
1147 .lower();
1148
1149 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1150 .clampScalar(0, S16, S64)
1151 .scalarize(0)
1152 .lower();
1153
1154 if (ST.has16BitInsts()) {
1155 getActionDefinitionsBuilder(
1156 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1157 .legalFor({S16, S32, S64})
1158 .clampScalar(0, S16, S64)
1159 .scalarize(0);
1160 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1161 getActionDefinitionsBuilder(
1162 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1163 .legalFor({S32, S64})
1164 .clampScalar(0, S32, S64)
1165 .scalarize(0);
1166 } else {
1167 getActionDefinitionsBuilder(
1168 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1169 .legalFor({S32})
1170 .customFor({S64})
1171 .clampScalar(0, S32, S64)
1172 .scalarize(0);
1173 }
1174
1175 getActionDefinitionsBuilder(G_PTR_ADD)
1176 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1177 .legalIf(all(isPointer(0), sameSize(0, 1)))
1178 .scalarize(0)
1179 .scalarSameSizeAs(1, 0);
1180
1181 getActionDefinitionsBuilder(G_PTRMASK)
1182 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1183 .scalarSameSizeAs(1, 0)
1184 .scalarize(0);
1185
1186 auto &CmpBuilder =
1187 getActionDefinitionsBuilder(G_ICMP)
1188 // The compare output type differs based on the register bank of the output,
1189 // so make both s1 and s32 legal.
1190 //
1191 // Scalar compares producing output in scc will be promoted to s32, as that
1192 // is the allocatable register type that will be needed for the copy from
1193 // scc. This will be promoted during RegBankSelect, and we assume something
1194 // before that won't try to use s32 result types.
1195 //
1196 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1197 // bank.
1198 .legalForCartesianProduct(
1199 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1200 .legalForCartesianProduct(
1201 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1202 if (ST.has16BitInsts()) {
1203 CmpBuilder.legalFor({{S1, S16}});
1204 }
1205
1206 CmpBuilder
1207 .widenScalarToNextPow2(1)
1208 .clampScalar(1, S32, S64)
1209 .scalarize(0)
1210 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1211
1212 auto &FCmpBuilder =
1213 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1214 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1215
1216 if (ST.hasSALUFloatInsts())
1217 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1218
1219 FCmpBuilder
1220 .widenScalarToNextPow2(1)
1221 .clampScalar(1, S32, S64)
1222 .scalarize(0);
1223
1224 // FIXME: fpow has a selection pattern that should move to custom lowering.
1225 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1226 if (ST.has16BitInsts())
1227 ExpOps.customFor({{S32}, {S16}});
1228 else
1229 ExpOps.customFor({S32});
1230 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1231 .scalarize(0);
1232
1233 getActionDefinitionsBuilder(G_FPOWI)
1234 .clampScalar(0, MinScalarFPTy, S32)
1235 .lower();
1236
1237 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1238 Log2Ops.customFor({S32});
1239 if (ST.has16BitInsts())
1240 Log2Ops.legalFor({S16});
1241 else
1242 Log2Ops.customFor({S16});
1243 Log2Ops.scalarize(0)
1244 .lower();
1245
1246 auto &LogOps =
1247 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1248 LogOps.customFor({S32, S16});
1249 LogOps.clampScalar(0, MinScalarFPTy, S32)
1250 .scalarize(0);
1251
1252 // The 64-bit versions produce 32-bit results, but only on the SALU.
1253 getActionDefinitionsBuilder(G_CTPOP)
1254 .legalFor({{S32, S32}, {S32, S64}})
1255 .clampScalar(0, S32, S32)
1256 .widenScalarToNextPow2(1, 32)
1257 .clampScalar(1, S32, S64)
1258 .scalarize(0)
1259 .widenScalarToNextPow2(0, 32);
1260
1261 // If no 16 bit instr is available, lower into different instructions.
1262 if (ST.has16BitInsts())
1263 getActionDefinitionsBuilder(G_IS_FPCLASS)
1264 .legalForCartesianProduct({S1}, FPTypes16)
1265 .widenScalarToNextPow2(1)
1266 .scalarize(0)
1267 .lower();
1268 else
1269 getActionDefinitionsBuilder(G_IS_FPCLASS)
1270 .legalForCartesianProduct({S1}, FPTypesBase)
1271 .lowerFor({S1, S16})
1272 .widenScalarToNextPow2(1)
1273 .scalarize(0)
1274 .lower();
1275
1276 // The hardware instructions return a different result on 0 than the generic
1277 // instructions expect. The hardware produces -1, but these produce the
1278 // bitwidth.
1279 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1280 .scalarize(0)
1281 .clampScalar(0, S32, S32)
1282 .clampScalar(1, S32, S64)
1283 .widenScalarToNextPow2(0, 32)
1284 .widenScalarToNextPow2(1, 32)
1285 .custom();
1286
1287 // The 64-bit versions produce 32-bit results, but only on the SALU.
1288 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1289 .legalFor({{S32, S32}, {S32, S64}})
1290 .customIf(scalarNarrowerThan(1, 32))
1291 .clampScalar(0, S32, S32)
1292 .clampScalar(1, S32, S64)
1293 .scalarize(0)
1294 .widenScalarToNextPow2(0, 32)
1295 .widenScalarToNextPow2(1, 32);
1296
1297 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1298 .legalFor({{S32, S32}, {S32, S64}})
1299 .clampScalar(0, S32, S32)
1300 .clampScalar(1, S32, S64)
1301 .scalarize(0)
1302 .widenScalarToNextPow2(0, 32)
1303 .widenScalarToNextPow2(1, 32);
1304
1305 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1306 // RegBankSelect.
1307 getActionDefinitionsBuilder(G_BITREVERSE)
1308 .legalFor({S32, S64})
1309 .clampScalar(0, S32, S64)
1310 .scalarize(0)
1311 .widenScalarToNextPow2(0);
1312
1313 if (ST.has16BitInsts()) {
1314 getActionDefinitionsBuilder(G_BSWAP)
1315 .legalFor({S16, S32, V2S16})
1316 .clampMaxNumElementsStrict(0, S16, 2)
1317      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1318 // narrowScalar limitation.
1319 .widenScalarToNextPow2(0)
1320 .clampScalar(0, S16, S32)
1321 .scalarize(0);
1322
1323 if (ST.hasVOP3PInsts()) {
1324 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1325 .legalFor({S32, S16, V2S16})
1326 .clampMaxNumElements(0, S16, 2)
1327 .minScalar(0, S16)
1328 .widenScalarToNextPow2(0)
1329 .scalarize(0)
1330 .lower();
1331 } else {
1332 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1333 .legalFor({S32, S16})
1334 .widenScalarToNextPow2(0)
1335 .minScalar(0, S16)
1336 .scalarize(0)
1337 .lower();
1338 }
1339 } else {
1340 // TODO: Should have same legality without v_perm_b32
1341 getActionDefinitionsBuilder(G_BSWAP)
1342 .legalFor({S32})
1343 .lowerIf(scalarNarrowerThan(0, 32))
1344      // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1345 // narrowScalar limitation.
1346 .widenScalarToNextPow2(0)
1347 .maxScalar(0, S32)
1348 .scalarize(0)
1349 .lower();
1350
1351 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1352 .legalFor({S32})
1353 .minScalar(0, S32)
1354 .widenScalarToNextPow2(0)
1355 .scalarize(0)
1356 .lower();
1357 }
1358
1359 getActionDefinitionsBuilder(G_INTTOPTR)
1360 // List the common cases
1361 .legalForCartesianProduct(AddrSpaces64, {S64})
1362 .legalForCartesianProduct(AddrSpaces32, {S32})
1363 .scalarize(0)
1364 // Accept any address space as long as the size matches
1365 .legalIf(sameSize(0, 1))
1366 .widenScalarIf(smallerThan(1, 0),
1367 [](const LegalityQuery &Query) {
1368 return std::pair(
1369 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1370 })
1371 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1372 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1373 });
1374
1375 getActionDefinitionsBuilder(G_PTRTOINT)
1376 // List the common cases
1377 .legalForCartesianProduct(AddrSpaces64, {S64})
1378 .legalForCartesianProduct(AddrSpaces32, {S32})
1379 .scalarize(0)
1380 // Accept any address space as long as the size matches
1381 .legalIf(sameSize(0, 1))
1382 .widenScalarIf(smallerThan(0, 1),
1383 [](const LegalityQuery &Query) {
1384 return std::pair(
1385 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1386 })
1387 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1388 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1389 });
1390
1391 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1392 .scalarize(0)
1393 .custom();
1394
1395 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1396 bool IsLoad) -> bool {
1397 const LLT DstTy = Query.Types[0];
1398
1399 // Split vector extloads.
1400 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1401
1402 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1403 return true;
1404
1405 const LLT PtrTy = Query.Types[1];
1406 unsigned AS = PtrTy.getAddressSpace();
1407 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1408 Query.MMODescrs[0].Ordering !=
1410 return true;
1411
1412 // Catch weird sized loads that don't evenly divide into the access sizes
1413 // TODO: May be able to widen depending on alignment etc.
1414 unsigned NumRegs = (MemSize + 31) / 32;
1415 if (NumRegs == 3) {
1416 if (!ST.hasDwordx3LoadStores())
1417 return true;
1418 } else {
1419 // If the alignment allows, these should have been widened.
1420 if (!isPowerOf2_32(NumRegs))
1421 return true;
1422 }
1423
1424 return false;
1425 };
1426
1427 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1428 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1429 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1430
1431 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1432 // LDS
1433 // TODO: Unsupported flat for SI.
1434
1435 for (unsigned Op : {G_LOAD, G_STORE}) {
1436 const bool IsStore = Op == G_STORE;
1437
1438 auto &Actions = getActionDefinitionsBuilder(Op);
1439 // Explicitly list some common cases.
1440 // TODO: Does this help compile time at all?
1441 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1442 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1443 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1444 {S64, GlobalPtr, S64, GlobalAlign32},
1445 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1446 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1447 {S32, GlobalPtr, S8, GlobalAlign8},
1448 {S32, GlobalPtr, S16, GlobalAlign16},
1449
1450 {S32, LocalPtr, S32, 32},
1451 {S64, LocalPtr, S64, 32},
1452 {V2S32, LocalPtr, V2S32, 32},
1453 {S32, LocalPtr, S8, 8},
1454 {S32, LocalPtr, S16, 16},
1455 {V2S16, LocalPtr, S32, 32},
1456
1457 {S32, PrivatePtr, S32, 32},
1458 {S32, PrivatePtr, S8, 8},
1459 {S32, PrivatePtr, S16, 16},
1460 {V2S16, PrivatePtr, S32, 32},
1461
1462 {S32, ConstantPtr, S32, GlobalAlign32},
1463 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1464 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1465 {S64, ConstantPtr, S64, GlobalAlign32},
1466 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1467 Actions.legalIf(
1468 [=](const LegalityQuery &Query) -> bool {
1469 return isLoadStoreLegal(ST, Query);
1470 });
1471
1472 // The custom pointers (fat pointers, buffer resources) don't work with load
1473 // and store at this level. Fat pointers should have been lowered to
1474 // intrinsics before the translation to MIR.
1475 Actions.unsupportedIf(
1476 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1477
1478 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1479 // ptrtoint. This is needed to account for the fact that we can't have i128
1480 // as a register class for SelectionDAG reasons.
1481 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1482 return hasBufferRsrcWorkaround(Query.Types[0]);
1483 });
1484
1485 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1486 // 64-bits.
1487 //
1488 // TODO: Should generalize bitcast action into coerce, which will also cover
1489 // inserting addrspacecasts.
1490 Actions.customIf(typeIs(1, Constant32Ptr));
1491
1492 // Turn any illegal element vectors into something easier to deal
1493 // with. These will ultimately produce 32-bit scalar shifts to extract the
1494 // parts anyway.
1495 //
1496 // For odd 16-bit element vectors, prefer to split those into pieces with
1497 // 16-bit vector parts.
1498 Actions.bitcastIf(
1499 [=](const LegalityQuery &Query) -> bool {
1500 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1501 Query.MMODescrs[0].MemoryTy);
1502 }, bitcastToRegisterType(0));
1503
1504 if (!IsStore) {
1505 // Widen suitably aligned loads by loading extra bytes. The standard
1506 // legalization actions can't properly express widening memory operands.
1507 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1508 return shouldWidenLoad(ST, Query, G_LOAD);
1509 });
1510 }
1511
1512 // FIXME: load/store narrowing should be moved to lower action
1513 Actions
1514 .narrowScalarIf(
1515 [=](const LegalityQuery &Query) -> bool {
1516 return !Query.Types[0].isVector() &&
1517 needToSplitMemOp(Query, Op == G_LOAD);
1518 },
1519 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1520 const LLT DstTy = Query.Types[0];
1521 const LLT PtrTy = Query.Types[1];
1522
1523 const unsigned DstSize = DstTy.getSizeInBits();
1524 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1525
1526 // Split extloads.
1527 if (DstSize > MemSize)
1528 return std::pair(0, LLT::scalar(MemSize));
1529
1530 unsigned MaxSize = maxSizeForAddrSpace(
1531 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1532 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1533 if (MemSize > MaxSize)
1534 return std::pair(0, LLT::scalar(MaxSize));
1535
1536 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1537 return std::pair(0, LLT::scalar(Align));
1538 })
1539 .fewerElementsIf(
1540 [=](const LegalityQuery &Query) -> bool {
1541 return Query.Types[0].isVector() &&
1542 needToSplitMemOp(Query, Op == G_LOAD);
1543 },
1544 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1545 const LLT DstTy = Query.Types[0];
1546 const LLT PtrTy = Query.Types[1];
1547
1548 LLT EltTy = DstTy.getElementType();
1549 unsigned MaxSize = maxSizeForAddrSpace(
1550 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1551 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1552
1553 // FIXME: Handle widened to power of 2 results better. This ends
1554 // up scalarizing.
1555 // FIXME: 3 element stores scalarized on SI
1556
1557 // Split if it's too large for the address space.
1558 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1559 if (MemSize > MaxSize) {
1560 unsigned NumElts = DstTy.getNumElements();
1561 unsigned EltSize = EltTy.getSizeInBits();
1562
1563 if (MaxSize % EltSize == 0) {
1564 return std::pair(
1565                  0, LLT::scalarOrVector(
1566                         ElementCount::getFixed(MaxSize / EltSize), EltTy));
1567 }
1568
1569 unsigned NumPieces = MemSize / MaxSize;
1570
1571 // FIXME: Refine when odd breakdowns handled
1572 // The scalars will need to be re-legalized.
1573 if (NumPieces == 1 || NumPieces >= NumElts ||
1574 NumElts % NumPieces != 0)
1575 return std::pair(0, EltTy);
1576
1577 return std::pair(0,
1578 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1579 }
1580
1581 // FIXME: We could probably handle weird extending loads better.
1582 if (DstTy.getSizeInBits() > MemSize)
1583 return std::pair(0, EltTy);
1584
1585 unsigned EltSize = EltTy.getSizeInBits();
1586 unsigned DstSize = DstTy.getSizeInBits();
1587 if (!isPowerOf2_32(DstSize)) {
1588 // We're probably decomposing an odd sized store. Try to split
1589 // to the widest type. TODO: Account for alignment. As-is it
1590 // should be OK, since the new parts will be further legalized.
1591 unsigned FloorSize = llvm::bit_floor(DstSize);
1592 return std::pair(
1593                0, LLT::scalarOrVector(
1594                       ElementCount::getFixed(FloorSize / EltSize), EltTy));
1595 }
1596
1597 // May need relegalization for the scalars.
1598 return std::pair(0, EltTy);
1599 })
1600 .minScalar(0, S32)
1601 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1602 .widenScalarToNextPow2(0)
1603 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1604 .lower();
1605 }
1606
1607 // FIXME: Unaligned accesses not lowered.
1608 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1609 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1610 {S32, GlobalPtr, S16, 2 * 8},
1611 {S32, LocalPtr, S8, 8},
1612 {S32, LocalPtr, S16, 16},
1613 {S32, PrivatePtr, S8, 8},
1614 {S32, PrivatePtr, S16, 16},
1615 {S32, ConstantPtr, S8, 8},
1616 {S32, ConstantPtr, S16, 2 * 8}})
1617 .legalIf(
1618 [=](const LegalityQuery &Query) -> bool {
1619 return isLoadStoreLegal(ST, Query);
1620 });
1621
1622 if (ST.hasFlatAddressSpace()) {
1623 ExtLoads.legalForTypesWithMemDesc(
1624 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1625 }
1626
1627 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1628 // 64-bits.
1629 //
1630 // TODO: Should generalize bitcast action into coerce, which will also cover
1631 // inserting addrspacecasts.
1632 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1633
1634 ExtLoads.clampScalar(0, S32, S32)
1635 .widenScalarToNextPow2(0)
1636 .lower();
1637
1638 auto &Atomics = getActionDefinitionsBuilder(
1639 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1640 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1641 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1642 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1643 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1644 {S64, GlobalPtr}, {S64, LocalPtr},
1645 {S32, RegionPtr}, {S64, RegionPtr}});
1646 if (ST.hasFlatAddressSpace()) {
1647 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1648 }
1649
1650 // TODO: v2bf16 operations, and fat buffer pointer support.
1651 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1652 if (ST.hasLDSFPAtomicAddF32()) {
1653 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1654 if (ST.hasLdsAtomicAddF64())
1655 Atomic.legalFor({{S64, LocalPtr}});
1656 if (ST.hasAtomicDsPkAdd16Insts())
1657 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1658 }
1659 if (ST.hasAtomicFaddInsts())
1660 Atomic.legalFor({{S32, GlobalPtr}});
1661 if (ST.hasFlatAtomicFaddF32Inst())
1662 Atomic.legalFor({{S32, FlatPtr}});
1663
1664 if (ST.hasGFX90AInsts()) {
1665 // These are legal with some caveats, and should have undergone expansion in
1666 // the IR in most situations
1667 // TODO: Move atomic expansion into legalizer
1668 Atomic.legalFor({
1669 {S32, GlobalPtr},
1670 {S64, GlobalPtr},
1671 {S64, FlatPtr}
1672 });
1673 }
1674
1675 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1676 ST.hasAtomicBufferGlobalPkAddF16Insts())
1677 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1678 if (ST.hasAtomicGlobalPkAddBF16Inst())
1679 Atomic.legalFor({{V2BF16, GlobalPtr}});
1680 if (ST.hasAtomicFlatPkAdd16Insts())
1681 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1682
1683
1684 // Most of the legalization work here is done by AtomicExpand. We could
1685 // probably use a simpler legality rule that just assumes anything is OK.
1686 auto &AtomicFMinFMax =
1687 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1688 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1689
1690 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1691 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1692 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1693 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1694 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1695 AtomicFMinFMax.legalFor({F32, FlatPtr});
1696 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1697 AtomicFMinFMax.legalFor({F64, FlatPtr});
1698
1699  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1700 // demarshalling
1701 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1702 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1703 {S32, FlatPtr}, {S64, FlatPtr}})
1704 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1705 {S32, RegionPtr}, {S64, RegionPtr}});
1706 // TODO: Pointer types, any 32-bit or 64-bit vector
1707
1708 // Condition should be s32 for scalar, s1 for vector.
1709 getActionDefinitionsBuilder(G_SELECT)
1710 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1711 LocalPtr, FlatPtr, PrivatePtr,
1712 LLT::fixed_vector(2, LocalPtr),
1713 LLT::fixed_vector(2, PrivatePtr)},
1714 {S1, S32})
1715 .clampScalar(0, S16, S64)
1716 .scalarize(1)
1717 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1718 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1719 .clampMaxNumElements(0, S32, 2)
1720 .clampMaxNumElements(0, LocalPtr, 2)
1721 .clampMaxNumElements(0, PrivatePtr, 2)
1722 .scalarize(0)
1723 .widenScalarToNextPow2(0)
1724 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1725
1726 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1727 // be more flexible with the shift amount type.
1728 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1729 .legalFor({{S32, S32}, {S64, S32}});
1730 if (ST.has16BitInsts()) {
1731 if (ST.hasVOP3PInsts()) {
1732 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1733 .clampMaxNumElements(0, S16, 2);
1734 } else
1735 Shifts.legalFor({{S16, S16}});
1736
1737 // TODO: Support 16-bit shift amounts for all types
1738 Shifts.widenScalarIf(
1739 [=](const LegalityQuery &Query) {
1740 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1741 // 32-bit amount.
1742 const LLT ValTy = Query.Types[0];
1743 const LLT AmountTy = Query.Types[1];
1744 return ValTy.getSizeInBits() <= 16 &&
1745 AmountTy.getSizeInBits() < 16;
1746 }, changeTo(1, S16));
1747 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1748 Shifts.clampScalar(1, S32, S32);
1749 Shifts.widenScalarToNextPow2(0, 16);
1750 Shifts.clampScalar(0, S16, S64);
1751
1752 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1753 .minScalar(0, S16)
1754 .scalarize(0)
1755 .lower();
1756 } else {
1757 // Make sure we legalize the shift amount type first, as the general
1758 // expansion for the shifted type will produce much worse code if it hasn't
1759 // been truncated already.
1760 Shifts.clampScalar(1, S32, S32);
1761 Shifts.widenScalarToNextPow2(0, 32);
1762 Shifts.clampScalar(0, S32, S64);
1763
1764 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1765 .minScalar(0, S32)
1766 .scalarize(0)
1767 .lower();
1768 }
1769 Shifts.scalarize(0);
1770
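  // Illustration of the TODO above about observed shift-amount bits: the
  // hardware shifts only look at the low 4/5/6 bits of the amount, so e.g. a
  // 32-bit shift by 37 behaves like a shift by 37 & 31 = 5. That is why a
  // 32-bit amount type is always sufficient here.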
1771 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1772 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1773 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1774 unsigned IdxTypeIdx = 2;
1775
1776 getActionDefinitionsBuilder(Op)
1777 .customIf([=](const LegalityQuery &Query) {
1778 const LLT EltTy = Query.Types[EltTypeIdx];
1779 const LLT VecTy = Query.Types[VecTypeIdx];
1780 const LLT IdxTy = Query.Types[IdxTypeIdx];
1781 const unsigned EltSize = EltTy.getSizeInBits();
1782 const bool isLegalVecType =
1784 // Address space 8 pointers are 128-bit wide values, but the logic
1785 // below will try to bitcast them to 2N x s64, which will fail.
1786           // Therefore, as an intermediate step, wrap the extract/insert by
1787           // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1788 // extraction result) in order to produce a vector operation that can
1789 // be handled by the logic below.
1790 if (EltTy.isPointer() && EltSize > 64)
1791 return true;
1792 return (EltSize == 32 || EltSize == 64) &&
1793 VecTy.getSizeInBits() % 32 == 0 &&
1794 VecTy.getSizeInBits() <= MaxRegisterSize &&
1795 IdxTy.getSizeInBits() == 32 &&
1796 isLegalVecType;
1797 })
1798 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1799 bitcastToVectorElement32(VecTypeIdx))
1800 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1801 .bitcastIf(
1802 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1803 [=](const LegalityQuery &Query) {
1804 // For > 64-bit element types, try to turn this into a 64-bit
1805 // element vector since we may be able to do better indexing
1806 // if this is scalar. If not, fall back to 32.
1807 const LLT EltTy = Query.Types[EltTypeIdx];
1808 const LLT VecTy = Query.Types[VecTypeIdx];
1809 const unsigned DstEltSize = EltTy.getSizeInBits();
1810 const unsigned VecSize = VecTy.getSizeInBits();
1811
1812 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1813 return std::pair(
1814 VecTypeIdx,
1815 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1816 })
1817 .clampScalar(EltTypeIdx, S32, S64)
1818 .clampScalar(VecTypeIdx, S32, S64)
1819 .clampScalar(IdxTypeIdx, S32, S32)
1820 .clampMaxNumElements(VecTypeIdx, S32, 32)
1821 // TODO: Clamp elements for 64-bit vectors?
1822 .moreElementsIf(
1823 isIllegalRegisterType(VecTypeIdx),
1825 // It should only be necessary with variable indexes.
1826 // As a last resort, lower to the stack
1827 .lower();
1828 }
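  // Worked example for the second bitcastIf above (illustrative): a <3 x s96>
  // vector (288 bits, element size not a multiple of 64) is retyped as
  // <9 x s32>, while a <2 x s128> vector (256 bits) becomes <4 x s64> since
  // 128 is a multiple of 64.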
1829
1830 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1831 .unsupportedIf([=](const LegalityQuery &Query) {
1832 const LLT &EltTy = Query.Types[1].getElementType();
1833 return Query.Types[0] != EltTy;
1834 });
1835
1836 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1837 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1838 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1839
1840 // FIXME: Doesn't handle extract of illegal sizes.
1841 getActionDefinitionsBuilder(Op)
1842 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1843 .lowerIf([=](const LegalityQuery &Query) {
1844          // Sub-vector (or single element) insert and extract.
1845 // TODO: verify immediate offset here since lower only works with
1846 // whole elements.
1847 const LLT BigTy = Query.Types[BigTyIdx];
1848 return BigTy.isVector();
1849 })
1850 // FIXME: Multiples of 16 should not be legal.
1851 .legalIf([=](const LegalityQuery &Query) {
1852 const LLT BigTy = Query.Types[BigTyIdx];
1853 const LLT LitTy = Query.Types[LitTyIdx];
1854 return (BigTy.getSizeInBits() % 32 == 0) &&
1855 (LitTy.getSizeInBits() % 16 == 0);
1856 })
1857 .widenScalarIf(
1858 [=](const LegalityQuery &Query) {
1859 const LLT BigTy = Query.Types[BigTyIdx];
1860 return (BigTy.getScalarSizeInBits() < 16);
1861 },
1863 .widenScalarIf(
1864 [=](const LegalityQuery &Query) {
1865 const LLT LitTy = Query.Types[LitTyIdx];
1866 return (LitTy.getScalarSizeInBits() < 16);
1867 },
1869 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1870 .widenScalarToNextPow2(BigTyIdx, 32);
1871
1872 }
1873
1874 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1875 .legalForCartesianProduct(AllS32Vectors, {S32})
1876 .legalForCartesianProduct(AllS64Vectors, {S64})
1877 .clampNumElements(0, V16S32, V32S32)
1878 .clampNumElements(0, V2S64, V16S64)
1879 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1880 .moreElementsIf(
1883
1884 if (ST.hasScalarPackInsts()) {
1885 BuildVector
1886 // FIXME: Should probably widen s1 vectors straight to s32
1887 .minScalarOrElt(0, S16)
1888 .minScalar(1, S16);
1889
1890 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1891 .legalFor({V2S16, S32})
1892 .lower();
1893 } else {
1894 BuildVector.customFor({V2S16, S16});
1895 BuildVector.minScalarOrElt(0, S32);
1896
1897 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1898 .customFor({V2S16, S32})
1899 .lower();
1900 }
1901
1902 BuildVector.legalIf(isRegisterType(0));
1903
1904 // FIXME: Clamp maximum size
1905 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1906 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1907 .clampMaxNumElements(0, S32, 32)
1908 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1909 .clampMaxNumElements(0, S16, 64);
1910
1911 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1912
1913 // Merge/Unmerge
1914 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1915 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1916 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1917
1918 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1919 const LLT Ty = Query.Types[TypeIdx];
1920 if (Ty.isVector()) {
1921 const LLT &EltTy = Ty.getElementType();
1922 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1923 return true;
1924 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1925 return true;
1926 }
1927 return false;
1928 };
1929
1930 auto &Builder = getActionDefinitionsBuilder(Op)
1931 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1932 .lowerFor({{S16, V2S16}})
1933 .lowerIf([=](const LegalityQuery &Query) {
1934 const LLT BigTy = Query.Types[BigTyIdx];
1935 return BigTy.getSizeInBits() == 32;
1936 })
1937 // Try to widen to s16 first for small types.
1938 // TODO: Only do this on targets with legal s16 shifts
1939 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1940 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1941 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1942 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1943 elementTypeIs(1, S16)),
1944 changeTo(1, V2S16))
1945 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1946 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1947 // valid.
1948 .clampScalar(LitTyIdx, S32, S512)
1949 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1950 // Break up vectors with weird elements into scalars
1951 .fewerElementsIf(
1952 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1953 scalarize(0))
1954 .fewerElementsIf(
1955 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1956 scalarize(1))
1957 .clampScalar(BigTyIdx, S32, MaxScalar);
1958
1959 if (Op == G_MERGE_VALUES) {
1960 Builder.widenScalarIf(
1961 // TODO: Use 16-bit shifts if legal for 8-bit values?
1962 [=](const LegalityQuery &Query) {
1963 const LLT Ty = Query.Types[LitTyIdx];
1964 return Ty.getSizeInBits() < 32;
1965 },
1966 changeTo(LitTyIdx, S32));
1967 }
1968
1969 Builder.widenScalarIf(
1970 [=](const LegalityQuery &Query) {
1971 const LLT Ty = Query.Types[BigTyIdx];
1972 return Ty.getSizeInBits() % 16 != 0;
1973 },
1974 [=](const LegalityQuery &Query) {
1975           // Pick the next power of 2, or a multiple of 64 over 128,
1976           // whichever is smaller.
1977 const LLT &Ty = Query.Types[BigTyIdx];
1978 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1979 if (NewSizeInBits >= 256) {
1980 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1981 if (RoundedTo < NewSizeInBits)
1982 NewSizeInBits = RoundedTo;
1983 }
1984 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1985 })
1986 // Any vectors left are the wrong size. Scalarize them.
1987 .scalarize(0)
1988 .scalarize(1);
1989 }
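  // A minimal standalone sketch (illustrative only, not used by the rules
  // above) of the widening mutation for the big type: round up to the next
  // power of two, but from 256 bits on prefer the next multiple of 64 when
  // that is smaller, e.g. 96 -> 128 and 272 -> 320 (rather than 512).
  auto widenBigScalarSketch = [](unsigned SizeInBits) {
    unsigned NextPow2 = 1;
    while (NextPow2 <= SizeInBits)
      NextPow2 *= 2;                                   // smallest power of 2 > SizeInBits
    if (NextPow2 >= 256) {
      unsigned RoundedTo = (SizeInBits / 64 + 1) * 64; // alignTo<64>(SizeInBits + 1)
      if (RoundedTo < NextPow2)
        return RoundedTo;
    }
    return NextPow2;
  };
  (void)widenBigScalarSketch; // e.g. widenBigScalarSketch(272) == 320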
1990
1991 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1992 // RegBankSelect.
1993 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1994 .legalFor({{S32}, {S64}});
1995
1996 if (ST.hasVOP3PInsts()) {
1997 SextInReg.lowerFor({{V2S16}})
1998 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1999 // get more vector shift opportunities, since we'll get those when
2000 // expanded.
2001 .clampMaxNumElementsStrict(0, S16, 2);
2002 } else if (ST.has16BitInsts()) {
2003 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2004 } else {
2005 // Prefer to promote to s32 before lowering if we don't have 16-bit
2006     // shifts. This avoids a lot of intermediate truncate and extend operations.
2007 SextInReg.lowerFor({{S32}, {S64}});
2008 }
2009
2010 SextInReg
2011 .scalarize(0)
2012 .clampScalar(0, S32, S64)
2013 .lower();
2014
2015 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2016 .scalarize(0)
2017 .lower();
2018
2019   // TODO: Only try to form v2s16 with legal packed instructions.
2020 getActionDefinitionsBuilder(G_FSHR)
2021 .legalFor({{S32, S32}})
2022 .lowerFor({{V2S16, V2S16}})
2023 .clampMaxNumElementsStrict(0, S16, 2)
2024 .scalarize(0)
2025 .lower();
2026
2027 if (ST.hasVOP3PInsts()) {
2028 getActionDefinitionsBuilder(G_FSHL)
2029 .lowerFor({{V2S16, V2S16}})
2030 .clampMaxNumElementsStrict(0, S16, 2)
2031 .scalarize(0)
2032 .lower();
2033 } else {
2034 getActionDefinitionsBuilder(G_FSHL)
2035 .scalarize(0)
2036 .lower();
2037 }
2038
2039 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2040 .legalFor({S64});
2041
2042 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2043
2044 getActionDefinitionsBuilder(G_FENCE)
2045 .alwaysLegal();
2046
2047 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2048 .scalarize(0)
2049 .minScalar(0, S32)
2050 .lower();
2051
2052 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2053 .legalFor({{S32, S32}, {S64, S32}})
2054 .clampScalar(1, S32, S32)
2055 .clampScalar(0, S32, S64)
2056 .widenScalarToNextPow2(0)
2057 .scalarize(0);
2058
2059 getActionDefinitionsBuilder(
2060 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2061 G_FCOPYSIGN,
2062
2063 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2064 G_READ_REGISTER, G_WRITE_REGISTER,
2065
2066 G_SADDO, G_SSUBO})
2067 .lower();
2068
2069 if (ST.hasIEEEMinMax()) {
2070 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2071 .legalFor(FPTypesPK16)
2072 .clampMaxNumElements(0, S16, 2)
2073 .scalarize(0);
2074 } else {
2075 // TODO: Implement
2076 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2077 }
2078
2079 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2080 .lower();
2081
2082 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2083
2084 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2085 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2086 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2087 .unsupported();
2088
2089 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2090
2091 getLegacyLegalizerInfo().computeTables();
2092 verify(*ST.getInstrInfo());
2093}
2094
2097 LostDebugLocObserver &LocObserver) const {
2098 MachineIRBuilder &B = Helper.MIRBuilder;
2099 MachineRegisterInfo &MRI = *B.getMRI();
2100
2101 switch (MI.getOpcode()) {
2102 case TargetOpcode::G_ADDRSPACE_CAST:
2103 return legalizeAddrSpaceCast(MI, MRI, B);
2104 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2105 return legalizeFroundeven(MI, MRI, B);
2106 case TargetOpcode::G_FCEIL:
2107 return legalizeFceil(MI, MRI, B);
2108 case TargetOpcode::G_FREM:
2109 return legalizeFrem(MI, MRI, B);
2110 case TargetOpcode::G_INTRINSIC_TRUNC:
2111 return legalizeIntrinsicTrunc(MI, MRI, B);
2112 case TargetOpcode::G_SITOFP:
2113 return legalizeITOFP(MI, MRI, B, true);
2114 case TargetOpcode::G_UITOFP:
2115 return legalizeITOFP(MI, MRI, B, false);
2116 case TargetOpcode::G_FPTOSI:
2117 return legalizeFPTOI(MI, MRI, B, true);
2118 case TargetOpcode::G_FPTOUI:
2119 return legalizeFPTOI(MI, MRI, B, false);
2120 case TargetOpcode::G_FMINNUM:
2121 case TargetOpcode::G_FMAXNUM:
2122 case TargetOpcode::G_FMINNUM_IEEE:
2123 case TargetOpcode::G_FMAXNUM_IEEE:
2124 return legalizeMinNumMaxNum(Helper, MI);
2125 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2126 return legalizeExtractVectorElt(MI, MRI, B);
2127 case TargetOpcode::G_INSERT_VECTOR_ELT:
2128 return legalizeInsertVectorElt(MI, MRI, B);
2129 case TargetOpcode::G_FSIN:
2130 case TargetOpcode::G_FCOS:
2131 return legalizeSinCos(MI, MRI, B);
2132 case TargetOpcode::G_GLOBAL_VALUE:
2133 return legalizeGlobalValue(MI, MRI, B);
2134 case TargetOpcode::G_LOAD:
2135 case TargetOpcode::G_SEXTLOAD:
2136 case TargetOpcode::G_ZEXTLOAD:
2137 return legalizeLoad(Helper, MI);
2138 case TargetOpcode::G_STORE:
2139 return legalizeStore(Helper, MI);
2140 case TargetOpcode::G_FMAD:
2141 return legalizeFMad(MI, MRI, B);
2142 case TargetOpcode::G_FDIV:
2143 return legalizeFDIV(MI, MRI, B);
2144 case TargetOpcode::G_FFREXP:
2145 return legalizeFFREXP(MI, MRI, B);
2146 case TargetOpcode::G_FSQRT:
2147 return legalizeFSQRT(MI, MRI, B);
2148 case TargetOpcode::G_UDIV:
2149 case TargetOpcode::G_UREM:
2150 case TargetOpcode::G_UDIVREM:
2151 return legalizeUnsignedDIV_REM(MI, MRI, B);
2152 case TargetOpcode::G_SDIV:
2153 case TargetOpcode::G_SREM:
2154 case TargetOpcode::G_SDIVREM:
2155 return legalizeSignedDIV_REM(MI, MRI, B);
2156 case TargetOpcode::G_ATOMIC_CMPXCHG:
2157 return legalizeAtomicCmpXChg(MI, MRI, B);
2158 case TargetOpcode::G_FLOG2:
2159 return legalizeFlog2(MI, B);
2160 case TargetOpcode::G_FLOG:
2161 case TargetOpcode::G_FLOG10:
2162 return legalizeFlogCommon(MI, B);
2163 case TargetOpcode::G_FEXP2:
2164 return legalizeFExp2(MI, B);
2165 case TargetOpcode::G_FEXP:
2166 case TargetOpcode::G_FEXP10:
2167 return legalizeFExp(MI, B);
2168 case TargetOpcode::G_FPOW:
2169 return legalizeFPow(MI, B);
2170 case TargetOpcode::G_FFLOOR:
2171 return legalizeFFloor(MI, MRI, B);
2172 case TargetOpcode::G_BUILD_VECTOR:
2173 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2174 return legalizeBuildVector(MI, MRI, B);
2175 case TargetOpcode::G_MUL:
2176 return legalizeMul(Helper, MI);
2177 case TargetOpcode::G_CTLZ:
2178 case TargetOpcode::G_CTTZ:
2179 return legalizeCTLZ_CTTZ(MI, MRI, B);
2180 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2181 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2182 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2183 return legalizeFPTruncRound(MI, B);
2184 case TargetOpcode::G_STACKSAVE:
2185 return legalizeStackSave(MI, B);
2186 case TargetOpcode::G_GET_FPENV:
2187 return legalizeGetFPEnv(MI, MRI, B);
2188 case TargetOpcode::G_SET_FPENV:
2189 return legalizeSetFPEnv(MI, MRI, B);
2190 case TargetOpcode::G_TRAP:
2191 return legalizeTrap(MI, MRI, B);
2192 case TargetOpcode::G_DEBUGTRAP:
2193 return legalizeDebugTrap(MI, MRI, B);
2194 default:
2195 return false;
2196 }
2197
2198 llvm_unreachable("expected switch to return");
2199}
2200
2202 unsigned AS,
2204 MachineIRBuilder &B) const {
2205 MachineFunction &MF = B.getMF();
2206 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2207 const LLT S32 = LLT::scalar(32);
2208 const LLT S64 = LLT::scalar(64);
2209
2211
2212 if (ST.hasApertureRegs()) {
2213 // Note: this register is somewhat broken. When used as a 32-bit operand,
2214 // it only returns zeroes. The real value is in the upper 32 bits.
2215     // Thus, we must extract the high 32 bits.
2216 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2217 ? AMDGPU::SRC_SHARED_BASE
2218 : AMDGPU::SRC_PRIVATE_BASE;
2219 // FIXME: It would be more natural to emit a COPY here, but then copy
2220 // coalescing would kick in and it would think it's okay to use the "HI"
2221 // subregister (instead of extracting the HI 32 bits) which is an artificial
2222 // (unusable) register.
2223 // Register TableGen definitions would need an overhaul to get rid of the
2224 // artificial "HI" aperture registers and prevent this kind of issue from
2225 // happening.
2226 Register Dst = MRI.createGenericVirtualRegister(S64);
2227 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2228 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2229 return B.buildUnmerge(S32, Dst).getReg(1);
2230 }
2231
2232 // TODO: can we be smarter about machine pointer info?
2234 Register LoadAddr = MRI.createGenericVirtualRegister(
2236 // For code object version 5, private_base and shared_base are passed through
2237 // implicit kernargs.
2244 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2245
2246 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2248
2249 if (!loadInputValue(KernargPtrReg, B,
2251 return Register();
2252
2254 PtrInfo,
2258
2259 // Pointer address
2260 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2261 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2262 // Load address
2263 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2264 }
2265
2266 Register QueuePtr = MRI.createGenericVirtualRegister(
2268
2270 return Register();
2271
2272 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2273 // private_segment_aperture_base_hi.
2274 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2275
2277 PtrInfo,
2280 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2281
2282 B.buildPtrAdd(LoadAddr, QueuePtr,
2283 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2284 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2285}
2286
2287/// Return true if the value is a known valid address, such that a null check is
2288/// not necessary.
2290 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2291 MachineInstr *Def = MRI.getVRegDef(Val);
2292 switch (Def->getOpcode()) {
2293 case AMDGPU::G_FRAME_INDEX:
2294 case AMDGPU::G_GLOBAL_VALUE:
2295 case AMDGPU::G_BLOCK_ADDR:
2296 return true;
2297 case AMDGPU::G_CONSTANT: {
2298 const ConstantInt *CI = Def->getOperand(1).getCImm();
2299 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2300 }
2301 default:
2302 return false;
2303 }
2304
2305 return false;
2306}
2307
2310 MachineIRBuilder &B) const {
2311 MachineFunction &MF = B.getMF();
2312
2313 // MI can either be a G_ADDRSPACE_CAST or a
2314 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2315 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2316 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2317 Intrinsic::amdgcn_addrspacecast_nonnull));
2318
2319 const LLT S32 = LLT::scalar(32);
2320 Register Dst = MI.getOperand(0).getReg();
2321 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2322 : MI.getOperand(1).getReg();
2323 LLT DstTy = MRI.getType(Dst);
2324 LLT SrcTy = MRI.getType(Src);
2325 unsigned DestAS = DstTy.getAddressSpace();
2326 unsigned SrcAS = SrcTy.getAddressSpace();
2327
2328 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2329 // vector element.
2330 assert(!DstTy.isVector());
2331
2332 const AMDGPUTargetMachine &TM
2333 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2334
2335 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2336 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2337 return true;
2338 }
2339
2340 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2341 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2342 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2343 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2344 // G_ADDRSPACE_CAST we need to guess.
2345 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2346 // Extract low 32-bits of the pointer.
2347 B.buildExtract(Dst, Src, 0);
2348 MI.eraseFromParent();
2349 return true;
2350 }
2351
2352 unsigned NullVal = TM.getNullPointerValue(DestAS);
2353
2354 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2355 auto FlatNull = B.buildConstant(SrcTy, 0);
2356
2357 // Extract low 32-bits of the pointer.
2358 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2359
2360 auto CmpRes =
2361 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2362 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2363
2364 MI.eraseFromParent();
2365 return true;
2366 }
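  // Illustrative shape of the guarded path above when casting a possibly-null
  // flat pointer %src to LDS/private (names hypothetical):
  //   %lo:_(p3)  = G_EXTRACT %src(p0), 0            ; low 32 bits
  //   %nz:_(s1)  = G_ICMP ne, %src(p0), flat-null
  //   %dst:_(p3) = G_SELECT %nz, %lo, segment-null
  // where segment-null is TM.getNullPointerValue(DestAS).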
2367
2368 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2369 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2370 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2371 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2372 if (!ApertureReg.isValid())
2373 return false;
2374
2375 // Coerce the type of the low half of the result so we can use merge_values.
2376 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2377
2378 // TODO: Should we allow mismatched types but matching sizes in merges to
2379 // avoid the ptrtoint?
2380 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2381
2382 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2383 // G_ADDRSPACE_CAST we need to guess.
2384 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2385 B.buildCopy(Dst, BuildPtr);
2386 MI.eraseFromParent();
2387 return true;
2388 }
2389
2390 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2391 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2392
2393 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2394 SegmentNull.getReg(0));
2395
2396 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2397
2398 MI.eraseFromParent();
2399 return true;
2400 }
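  // Illustrative shape of the segment-to-flat path above (names hypothetical):
  // the 32-bit pointer becomes the low half and the aperture base the high
  // half of the 64-bit flat pointer:
  //   %lo:_(s32) = G_PTRTOINT %src(p3)
  //   %dst:_(p0) = G_MERGE_VALUES %lo(s32), %aperture(s32)
  // followed, for a plain G_ADDRSPACE_CAST, by the null compare/select shown
  // above for the flat-to-segment direction.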
2401
2402 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2403 SrcTy.getSizeInBits() == 64) {
2404 // Truncate.
2405 B.buildExtract(Dst, Src, 0);
2406 MI.eraseFromParent();
2407 return true;
2408 }
2409
2410 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2411 DstTy.getSizeInBits() == 64) {
2413 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2414 auto PtrLo = B.buildPtrToInt(S32, Src);
2415 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2416 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2417 MI.eraseFromParent();
2418 return true;
2419 }
2420
2421 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2422 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2423
2424 LLVMContext &Ctx = MF.getFunction().getContext();
2425 Ctx.diagnose(InvalidAddrSpaceCast);
2426 B.buildUndef(Dst);
2427 MI.eraseFromParent();
2428 return true;
2429}
2430
2433 MachineIRBuilder &B) const {
2434 Register Src = MI.getOperand(1).getReg();
2435 LLT Ty = MRI.getType(Src);
2436 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2437
2438 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2439 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2440
2441 auto C1 = B.buildFConstant(Ty, C1Val);
2442 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2443
2444 // TODO: Should this propagate fast-math-flags?
2445 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2446 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2447
2448 auto C2 = B.buildFConstant(Ty, C2Val);
2449 auto Fabs = B.buildFAbs(Ty, Src);
2450
2451 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2452 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2453 MI.eraseFromParent();
2454 return true;
2455}
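// Standalone sketch of the magic-number trick above (plain C++, assuming
// IEEE-754 doubles, round-to-nearest-even mode, and that <cmath> is
// available; illustrative only, not part of the legalizer):
static double roundEvenSketch(double X) {
  const double C1 = 0x1.0p+52;             // adding 2^52 discards all fraction bits
  const double C2 = 0x1.fffffffffffffp+51; // largest magnitude that still has fraction bits
  double S = std::copysign(C1, X);
  double R = (X + S) - S;                  // rounds X to an integer in the FP adder
  return std::fabs(X) > C2 ? X : R;        // larger values are already integers
}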
2456
2459 MachineIRBuilder &B) const {
2460
2461 const LLT S1 = LLT::scalar(1);
2462 const LLT S64 = LLT::scalar(64);
2463
2464 Register Src = MI.getOperand(1).getReg();
2465 assert(MRI.getType(Src) == S64);
2466
2467 // result = trunc(src)
2468 // if (src > 0.0 && src != result)
2469 // result += 1.0
2470
2471 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2472
2473 const auto Zero = B.buildFConstant(S64, 0.0);
2474 const auto One = B.buildFConstant(S64, 1.0);
2475 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2476 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2477 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2478 auto Add = B.buildSelect(S64, And, One, Zero);
2479
2480 // TODO: Should this propagate fast-math-flags?
2481 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2482 MI.eraseFromParent();
2483 return true;
2484}
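// Standalone sketch of the expansion above (plain C++, assuming <cmath>;
// illustrative only): ceil(x) = trunc(x) + (x > 0 && x != trunc(x) ? 1 : 0).
static double ceilSketch(double Src) {
  double T = std::trunc(Src);
  return T + ((Src > 0.0 && Src != T) ? 1.0 : 0.0);
}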
2485
2488 MachineIRBuilder &B) const {
2489 Register DstReg = MI.getOperand(0).getReg();
2490 Register Src0Reg = MI.getOperand(1).getReg();
2491 Register Src1Reg = MI.getOperand(2).getReg();
2492 auto Flags = MI.getFlags();
2493 LLT Ty = MRI.getType(DstReg);
2494
2495 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2496 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2497 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2498 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2499 MI.eraseFromParent();
2500 return true;
2501}
2502
2505 const unsigned FractBits = 52;
2506 const unsigned ExpBits = 11;
2507 LLT S32 = LLT::scalar(32);
2508
2509 auto Const0 = B.buildConstant(S32, FractBits - 32);
2510 auto Const1 = B.buildConstant(S32, ExpBits);
2511
2512 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2513 .addUse(Hi)
2514 .addUse(Const0.getReg(0))
2515 .addUse(Const1.getReg(0));
2516
2517 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2518}
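// Worked example (illustrative): for Src = 8.0 the high 32 bits are
// 0x40200000, the 11-bit field starting at bit 20 is 1026, and subtracting
// the bias 1023 yields the unbiased exponent 3.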
2519
2522 MachineIRBuilder &B) const {
2523 const LLT S1 = LLT::scalar(1);
2524 const LLT S32 = LLT::scalar(32);
2525 const LLT S64 = LLT::scalar(64);
2526
2527 Register Src = MI.getOperand(1).getReg();
2528 assert(MRI.getType(Src) == S64);
2529
2530 // TODO: Should this use extract since the low half is unused?
2531 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2532 Register Hi = Unmerge.getReg(1);
2533
2534 // Extract the upper half, since this is where we will find the sign and
2535 // exponent.
2536 auto Exp = extractF64Exponent(Hi, B);
2537
2538 const unsigned FractBits = 52;
2539
2540 // Extract the sign bit.
2541 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2542 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2543
2544 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2545
2546 const auto Zero32 = B.buildConstant(S32, 0);
2547
2548 // Extend back to 64-bits.
2549 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2550
2551 auto Shr = B.buildAShr(S64, FractMask, Exp);
2552 auto Not = B.buildNot(S64, Shr);
2553 auto Tmp0 = B.buildAnd(S64, Src, Not);
2554 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2555
2556 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2557 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2558
2559 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2560 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2561 MI.eraseFromParent();
2562 return true;
2563}
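// Worked example (illustrative): for an exponent of 3, Shr keeps the low
// 52 - 3 = 49 fraction-bit positions set, so the And above clears exactly
// those bits and leaves the sign, the exponent, and the top 3 fraction bits,
// i.e. the truncated value. A negative exponent selects the signed zero, and
// an exponent above 51 returns Src unchanged.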
2564
2567 MachineIRBuilder &B, bool Signed) const {
2568
2569 Register Dst = MI.getOperand(0).getReg();
2570 Register Src = MI.getOperand(1).getReg();
2571
2572 const LLT S64 = LLT::scalar(64);
2573 const LLT S32 = LLT::scalar(32);
2574
2575 assert(MRI.getType(Src) == S64);
2576
2577 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2578 auto ThirtyTwo = B.buildConstant(S32, 32);
2579
2580 if (MRI.getType(Dst) == S64) {
2581 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2582 : B.buildUITOFP(S64, Unmerge.getReg(1));
2583
2584 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2585 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2586
2587 // TODO: Should this propagate fast-math-flags?
2588 B.buildFAdd(Dst, LdExp, CvtLo);
2589 MI.eraseFromParent();
2590 return true;
2591 }
2592
2593 assert(MRI.getType(Dst) == S32);
2594
2595 auto One = B.buildConstant(S32, 1);
2596
2597 MachineInstrBuilder ShAmt;
2598 if (Signed) {
2599 auto ThirtyOne = B.buildConstant(S32, 31);
2600 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2601 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2602 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2603 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2604 .addUse(Unmerge.getReg(1));
2605 auto LS2 = B.buildSub(S32, LS, One);
2606 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2607 } else
2608 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2609 auto Norm = B.buildShl(S64, Src, ShAmt);
2610 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2611 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2612 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2613 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2614 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2615 B.buildFLdexp(Dst, FVal, Scale);
2616 MI.eraseFromParent();
2617 return true;
2618}
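// Standalone sketch of the S64 -> F64 path above (plain C++, assuming
// <cmath> and <cstdint>; illustrative only): convert the two halves
// separately and recombine with ldexp.
static double sitofpS64Sketch(int64_t V) {
  uint32_t Lo = (uint32_t)V;        // unsigned low 32 bits
  int32_t Hi = (int32_t)(V >> 32);  // signed high 32 bits
  return std::ldexp((double)Hi, 32) + (double)Lo;
}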
2619
2620// TODO: Copied from DAG implementation. Verify logic and document how this
2621// actually works.
2625 bool Signed) const {
2626
2627 Register Dst = MI.getOperand(0).getReg();
2628 Register Src = MI.getOperand(1).getReg();
2629
2630 const LLT S64 = LLT::scalar(64);
2631 const LLT S32 = LLT::scalar(32);
2632
2633 const LLT SrcLT = MRI.getType(Src);
2634 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2635
2636 unsigned Flags = MI.getFlags();
2637
2638 // The basic idea of converting a floating point number into a pair of 32-bit
2639 // integers is illustrated as follows:
2640 //
2641 // tf := trunc(val);
2642 // hif := floor(tf * 2^-32);
2643 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2644 // hi := fptoi(hif);
2645 // lo := fptoi(lof);
2646 //
2647 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2649 if (Signed && SrcLT == S32) {
2650     // However, a 32-bit floating point number has only a 23-bit mantissa and
2651     // that is not enough to hold all the significant bits of `lof` if val is
2652     // negative. To avoid the loss of precision, we need to take the absolute
2653 // value after truncating and flip the result back based on the original
2654 // signedness.
2655 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2656 Trunc = B.buildFAbs(S32, Trunc, Flags);
2657 }
2658 MachineInstrBuilder K0, K1;
2659 if (SrcLT == S64) {
2660 K0 = B.buildFConstant(
2661 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2662 K1 = B.buildFConstant(
2663 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2664 } else {
2665 K0 = B.buildFConstant(
2666 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2667 K1 = B.buildFConstant(
2668 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2669 }
2670
2671 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2672 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2673 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2674
2675 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2676 : B.buildFPTOUI(S32, FloorMul);
2677 auto Lo = B.buildFPTOUI(S32, Fma);
2678
2679 if (Signed && SrcLT == S32) {
2680 // Flip the result based on the signedness, which is either all 0s or 1s.
2681 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2682 // r := xor({lo, hi}, sign) - sign;
2683 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2684 Sign);
2685 } else
2686 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2687 MI.eraseFromParent();
2688
2689 return true;
2690}
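// Standalone sketch of the unsigned f64 split above (plain C++, assuming
// <cmath> and <cstdint>; illustrative only):
//   hi = floor(trunc(v) * 2^-32), lo = fma(hi, -2^32, trunc(v)).
static uint64_t fptoui64Sketch(double V) {
  double T = std::trunc(V);
  double Hi = std::floor(T * 0x1.0p-32);
  double Lo = std::fma(Hi, -0x1.0p+32, T); // T - Hi * 2^32, always non-negative
  return ((uint64_t)(uint32_t)Hi << 32) | (uint64_t)(uint32_t)Lo;
}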
2691
2693 MachineInstr &MI) const {
2694 MachineFunction &MF = Helper.MIRBuilder.getMF();
2696
2697 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2698 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2699
2700 // With ieee_mode disabled, the instructions have the correct behavior
2701 // already for G_FMINNUM/G_FMAXNUM
2702 if (!MFI->getMode().IEEE)
2703 return !IsIEEEOp;
2704
2705 if (IsIEEEOp)
2706 return true;
2707
2709}
2710
2713 MachineIRBuilder &B) const {
2714 // TODO: Should move some of this into LegalizerHelper.
2715
2716 // TODO: Promote dynamic indexing of s16 to s32
2717
2718 Register Dst = MI.getOperand(0).getReg();
2719 Register Vec = MI.getOperand(1).getReg();
2720
2721 LLT VecTy = MRI.getType(Vec);
2722 LLT EltTy = VecTy.getElementType();
2723 assert(EltTy == MRI.getType(Dst));
2724
2725 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2726  // but we can't go directly to that logic because you can't bitcast a vector
2727 // of pointers to a vector of integers. Therefore, introduce an intermediate
2728 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2729 // drive the legalization forward.
2730 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2731 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2732 LLT IntVecTy = VecTy.changeElementType(IntTy);
2733
2734 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2735 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2736 B.buildIntToPtr(Dst, IntElt);
2737
2738 MI.eraseFromParent();
2739 return true;
2740 }
2741
2742 // FIXME: Artifact combiner probably should have replaced the truncated
2743 // constant before this, so we shouldn't need
2744 // getIConstantVRegValWithLookThrough.
2745 std::optional<ValueAndVReg> MaybeIdxVal =
2746 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2747 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2748 return true;
2749 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2750
2751 if (IdxVal < VecTy.getNumElements()) {
2752 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2753 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2754 } else {
2755 B.buildUndef(Dst);
2756 }
2757
2758 MI.eraseFromParent();
2759 return true;
2760}
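// Illustrative MIR shape for the constant-index case above (names
// hypothetical):
//   %e:_(s32) = G_EXTRACT_VECTOR_ELT %v(<4 x s32>), %two(s32)
// becomes
//   %a:_(s32), %b:_(s32), %c:_(s32), %d:_(s32) = G_UNMERGE_VALUES %v(<4 x s32>)
//   %e:_(s32) = COPY %c(s32)
// An out-of-range constant index instead produces G_IMPLICIT_DEF.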
2761
2764 MachineIRBuilder &B) const {
2765 // TODO: Should move some of this into LegalizerHelper.
2766
2767 // TODO: Promote dynamic indexing of s16 to s32
2768
2769 Register Dst = MI.getOperand(0).getReg();
2770 Register Vec = MI.getOperand(1).getReg();
2771 Register Ins = MI.getOperand(2).getReg();
2772
2773 LLT VecTy = MRI.getType(Vec);
2774 LLT EltTy = VecTy.getElementType();
2775 assert(EltTy == MRI.getType(Ins));
2776
2777 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2778  // but we can't go directly to that logic because you can't bitcast a vector
2779 // of pointers to a vector of integers. Therefore, make the pointer vector
2780 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2781 // new value, and then inttoptr the result vector back. This will then allow
2782 // the rest of legalization to take over.
2783 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2784 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2785 LLT IntVecTy = VecTy.changeElementType(IntTy);
2786
2787 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2788 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2789 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2790 MI.getOperand(3));
2791 B.buildIntToPtr(Dst, IntVecDest);
2792 MI.eraseFromParent();
2793 return true;
2794 }
2795
2796 // FIXME: Artifact combiner probably should have replaced the truncated
2797 // constant before this, so we shouldn't need
2798 // getIConstantVRegValWithLookThrough.
2799 std::optional<ValueAndVReg> MaybeIdxVal =
2800 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2801 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2802 return true;
2803
2804 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2805
2806 unsigned NumElts = VecTy.getNumElements();
2807 if (IdxVal < NumElts) {
2809 for (unsigned i = 0; i < NumElts; ++i)
2810 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2811 B.buildUnmerge(SrcRegs, Vec);
2812
2813 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2814 B.buildMergeLikeInstr(Dst, SrcRegs);
2815 } else {
2816 B.buildUndef(Dst);
2817 }
2818
2819 MI.eraseFromParent();
2820 return true;
2821}
2822
2825 MachineIRBuilder &B) const {
2826
2827 Register DstReg = MI.getOperand(0).getReg();
2828 Register SrcReg = MI.getOperand(1).getReg();
2829 LLT Ty = MRI.getType(DstReg);
2830 unsigned Flags = MI.getFlags();
2831
2832 Register TrigVal;
2833 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2834 if (ST.hasTrigReducedRange()) {
2835 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2836 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2837 .addUse(MulVal.getReg(0))
2838 .setMIFlags(Flags)
2839 .getReg(0);
2840 } else
2841 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2842
2843 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2844 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2845 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2846 .addUse(TrigVal)
2847 .setMIFlags(Flags);
2848 MI.eraseFromParent();
2849 return true;
2850}
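// Illustrative note: the hardware sin/cos intrinsics take their input as a
// fraction of a full turn, so sin(x) is computed roughly as
// amdgcn_sin(x * 1/(2*pi)), with an extra amdgcn_fract range reduction on
// subtargets where the intrinsic's input range is limited.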
2851
2854 const GlobalValue *GV,
2855 int64_t Offset,
2856 unsigned GAFlags) const {
2857 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2858 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2859 // to the following code sequence:
2860 //
2861 // For constant address space:
2862 // s_getpc_b64 s[0:1]
2863 // s_add_u32 s0, s0, $symbol
2864 // s_addc_u32 s1, s1, 0
2865 //
2866 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2867 // a fixup or relocation is emitted to replace $symbol with a literal
2868 // constant, which is a pc-relative offset from the encoding of the $symbol
2869 // operand to the global variable.
2870 //
2871 // For global address space:
2872 // s_getpc_b64 s[0:1]
2873 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2874 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2875 //
2876 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2877 // fixups or relocations are emitted to replace $symbol@*@lo and
2878 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2879 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2880 // operand to the global variable.
2881
2883
2884 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2885 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2886
2887 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2888 .addDef(PCReg);
2889
2890 MIB.addGlobalAddress(GV, Offset, GAFlags);
2891 if (GAFlags == SIInstrInfo::MO_NONE)
2892 MIB.addImm(0);
2893 else
2894 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2895
2896 if (!B.getMRI()->getRegClassOrNull(PCReg))
2897 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2898
2899 if (PtrTy.getSizeInBits() == 32)
2900 B.buildExtract(DstReg, PCReg, 0);
2901 return true;
2902}
2903
2904// Emit an ABS32_LO / ABS32_HI relocation stub.
2906 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2907 MachineRegisterInfo &MRI) const {
2908 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2909
2910 LLT S32 = LLT::scalar(32);
2911
2912   // Use the destination directly, if and only if we only store the lower
2913   // address part and no register class has been set.
2914 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2915 ? DstReg
2916 : MRI.createGenericVirtualRegister(S32);
2917
2918 if (!MRI.getRegClassOrNull(AddrLo))
2919 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2920
2921 // Write the lower half.
2922 B.buildInstr(AMDGPU::S_MOV_B32)
2923 .addDef(AddrLo)
2924 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2925
2926 // If required, write the upper half as well.
2927 if (RequiresHighHalf) {
2928 assert(PtrTy.getSizeInBits() == 64 &&
2929 "Must provide a 64-bit pointer type!");
2930
2931 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2932 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2933
2934 B.buildInstr(AMDGPU::S_MOV_B32)
2935 .addDef(AddrHi)
2936 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2937
2938 // Use the destination directly, if and only if we don't have a register
2939 // class being set.
2940 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2941 ? DstReg
2942 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2943
2944 if (!MRI.getRegClassOrNull(AddrDst))
2945 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2946
2947 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2948
2949 // If we created a new register for the destination, cast the result into
2950 // the final output.
2951 if (AddrDst != DstReg)
2952 B.buildCast(DstReg, AddrDst);
2953 } else if (AddrLo != DstReg) {
2954 // If we created a new register for the destination, cast the result into
2955 // the final output.
2956 B.buildCast(DstReg, AddrLo);
2957 }
2958}
2959
2962 MachineIRBuilder &B) const {
2963 Register DstReg = MI.getOperand(0).getReg();
2964 LLT Ty = MRI.getType(DstReg);
2965 unsigned AS = Ty.getAddressSpace();
2966
2967 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2968 MachineFunction &MF = B.getMF();
2970
2972 if (!MFI->isModuleEntryFunction() &&
2973 GV->getName() != "llvm.amdgcn.module.lds") {
2974 const Function &Fn = MF.getFunction();
2975 DiagnosticInfoUnsupported BadLDSDecl(
2976 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2977 DS_Warning);
2978 Fn.getContext().diagnose(BadLDSDecl);
2979
2980 // We currently don't have a way to correctly allocate LDS objects that
2981 // aren't directly associated with a kernel. We do force inlining of
2982 // functions that use local objects. However, if these dead functions are
2983 // not eliminated, we don't want a compile time error. Just emit a warning
2984 // and a trap, since there should be no callable path here.
2985 B.buildTrap();
2986 B.buildUndef(DstReg);
2987 MI.eraseFromParent();
2988 return true;
2989 }
2990
2991 // TODO: We could emit code to handle the initialization somewhere.
2992 // We ignore the initializer for now and legalize it to allow selection.
2993   // The initializer will be diagnosed as an error during assembly emission anyway.
2994 const SITargetLowering *TLI = ST.getTargetLowering();
2995 if (!TLI->shouldUseLDSConstAddress(GV)) {
2996 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2997 return true; // Leave in place;
2998 }
2999
3000 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3001 Type *Ty = GV->getValueType();
3002 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3003 // zero-sized type in other languages to declare the dynamic shared
3004     // memory whose size is not known at compile time. They will be
3005     // allocated by the runtime and placed directly after the statically
3006 // allocated ones. They all share the same offset.
3007 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3008 // Adjust alignment for that dynamic shared memory array.
3009 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3010 LLT S32 = LLT::scalar(32);
3011 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3012 B.buildIntToPtr(DstReg, Sz);
3013 MI.eraseFromParent();
3014 return true;
3015 }
3016 }
3017
3018 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3019 *cast<GlobalVariable>(GV)));
3020 MI.eraseFromParent();
3021 return true;
3022 }
3023
3024 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3025 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3026 MI.eraseFromParent();
3027 return true;
3028 }
3029
3030 const SITargetLowering *TLI = ST.getTargetLowering();
3031
3032 if (TLI->shouldEmitFixup(GV)) {
3033 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3034 MI.eraseFromParent();
3035 return true;
3036 }
3037
3038 if (TLI->shouldEmitPCReloc(GV)) {
3039 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3040 MI.eraseFromParent();
3041 return true;
3042 }
3043
3045 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3046
3047 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3052 LoadTy, Align(8));
3053
3054 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3055
3056 if (Ty.getSizeInBits() == 32) {
3057 // Truncate if this is a 32-bit constant address.
3058 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3059 B.buildExtract(DstReg, Load, 0);
3060 } else
3061 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3062
3063 MI.eraseFromParent();
3064 return true;
3065}
3066
3068 if (Ty.isVector())
3069 return Ty.changeElementCount(
3072}
3073
3075 MachineInstr &MI) const {
3076 MachineIRBuilder &B = Helper.MIRBuilder;
3077 MachineRegisterInfo &MRI = *B.getMRI();
3078 GISelChangeObserver &Observer = Helper.Observer;
3079
3080 Register PtrReg = MI.getOperand(1).getReg();
3081 LLT PtrTy = MRI.getType(PtrReg);
3082 unsigned AddrSpace = PtrTy.getAddressSpace();
3083
3084 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3086 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3087 Observer.changingInstr(MI);
3088 MI.getOperand(1).setReg(Cast.getReg(0));
3089 Observer.changedInstr(MI);
3090 return true;
3091 }
3092
3093 if (MI.getOpcode() != AMDGPU::G_LOAD)
3094 return false;
3095
3096 Register ValReg = MI.getOperand(0).getReg();
3097 LLT ValTy = MRI.getType(ValReg);
3098
3099 if (hasBufferRsrcWorkaround(ValTy)) {
3100 Observer.changingInstr(MI);
3102 Observer.changedInstr(MI);
3103 return true;
3104 }
3105
3106 MachineMemOperand *MMO = *MI.memoperands_begin();
3107 const unsigned ValSize = ValTy.getSizeInBits();
3108 const LLT MemTy = MMO->getMemoryType();
3109 const Align MemAlign = MMO->getAlign();
3110 const unsigned MemSize = MemTy.getSizeInBits();
3111 const uint64_t AlignInBits = 8 * MemAlign.value();
3112
3113 // Widen non-power-of-2 loads to the alignment if needed
3114 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3115 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3116
3117 // This was already the correct extending load result type, so just adjust
3118 // the memory type.
3119 if (WideMemSize == ValSize) {
3120 MachineFunction &MF = B.getMF();
3121
3122 MachineMemOperand *WideMMO =
3123 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3124 Observer.changingInstr(MI);
3125 MI.setMemRefs(MF, {WideMMO});
3126 Observer.changedInstr(MI);
3127 return true;
3128 }
3129
3130     // Don't bother handling an edge case that should probably never be produced.
3131 if (ValSize > WideMemSize)
3132 return false;
3133
3134 LLT WideTy = widenToNextPowerOf2(ValTy);
3135
3136 Register WideLoad;
3137 if (!WideTy.isVector()) {
3138 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3139 B.buildTrunc(ValReg, WideLoad).getReg(0);
3140 } else {
3141 // Extract the subvector.
3142
3143 if (isRegisterType(ValTy)) {
3144       // If this is a case where G_EXTRACT is legal, use it.
3145 // (e.g. <3 x s32> -> <4 x s32>)
3146 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3147 B.buildExtract(ValReg, WideLoad, 0);
3148 } else {
3149 // For cases where the widened type isn't a nice register value, unmerge
3150 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3151 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3152 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3153 }
3154 }
3155
3156 MI.eraseFromParent();
3157 return true;
3158 }
3159
3160 return false;
3161}
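// Worked example (illustrative): a 96-bit <3 x s32> load whose alignment
// allows a 128-bit access is rewritten as a <4 x s32> load followed by a
// G_EXTRACT of the original <3 x s32> value (or, for types like <3 x s16>,
// by dropping the trailing element of the wider result).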
3162
3164 MachineInstr &MI) const {
3165 MachineIRBuilder &B = Helper.MIRBuilder;
3166 MachineRegisterInfo &MRI = *B.getMRI();
3167 GISelChangeObserver &Observer = Helper.Observer;
3168
3169 Register DataReg = MI.getOperand(0).getReg();
3170 LLT DataTy = MRI.getType(DataReg);
3171
3172 if (hasBufferRsrcWorkaround(DataTy)) {
3173 Observer.changingInstr(MI);
3175 Observer.changedInstr(MI);
3176 return true;
3177 }
3178 return false;
3179}
3180
3183 MachineIRBuilder &B) const {
3184 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3185 assert(Ty.isScalar());
3186
3187 MachineFunction &MF = B.getMF();
3189
3190 // TODO: Always legal with future ftz flag.
3191 // FIXME: Do we need just output?
3192 if (Ty == LLT::float32() &&
3194 return true;
3195 if (Ty == LLT::float16() &&
3197 return true;
3198
3199 MachineIRBuilder HelperBuilder(MI);
3200 GISelObserverWrapper DummyObserver;
3201 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3202 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3203}
3204
3207 Register DstReg = MI.getOperand(0).getReg();
3208 Register PtrReg = MI.getOperand(1).getReg();
3209 Register CmpVal = MI.getOperand(2).getReg();
3210 Register NewVal = MI.getOperand(3).getReg();
3211
3212 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3213 "this should not have been custom lowered");
3214
3215 LLT ValTy = MRI.getType(CmpVal);
3216 LLT VecTy = LLT::fixed_vector(2, ValTy);
3217
3218 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3219
3220 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3221 .addDef(DstReg)
3222 .addUse(PtrReg)
3223 .addUse(PackedVal)
3224 .setMemRefs(MI.memoperands());
3225
3226 MI.eraseFromParent();
3227 return true;
3228}
3229
3230/// Return true if it's known that \p Src can never be an f32 denormal value.
3232 Register Src) {
3233 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3234 switch (DefMI->getOpcode()) {
3235 case TargetOpcode::G_INTRINSIC: {
3236 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3237 case Intrinsic::amdgcn_frexp_mant:
3238 return true;
3239 default:
3240 break;
3241 }
3242
3243 break;
3244 }
3245 case TargetOpcode::G_FFREXP: {
3246 if (DefMI->getOperand(0).getReg() == Src)
3247 return true;
3248 break;
3249 }
3250 case TargetOpcode::G_FPEXT: {
3251 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3252 }
3253 default:
3254 return false;
3255 }
3256
3257 return false;
3258}
3259
3260static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3261 if (Flags & MachineInstr::FmAfn)
3262 return true;
3263 const auto &Options = MF.getTarget().Options;
3264 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3265}
3266
3268 unsigned Flags) {
3269 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3272}
3273
3274std::pair<Register, Register>
3276 unsigned Flags) const {
3277 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3278 return {};
3279
3280 const LLT F32 = LLT::scalar(32);
3281 auto SmallestNormal = B.buildFConstant(
3283 auto IsLtSmallestNormal =
3284 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3285
3286 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3287 auto One = B.buildFConstant(F32, 1.0);
3288 auto ScaleFactor =
3289 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3290 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3291
3292 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3293}
3294
3296 MachineIRBuilder &B) const {
3297 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3298 // If we have to handle denormals, scale up the input and adjust the result.
3299
3300 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3301 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3302
3303 Register Dst = MI.getOperand(0).getReg();
3304 Register Src = MI.getOperand(1).getReg();
3305 LLT Ty = B.getMRI()->getType(Dst);
3306 unsigned Flags = MI.getFlags();
3307
3308 if (Ty == LLT::scalar(16)) {
3309 const LLT F32 = LLT::scalar(32);
3310 // Nothing in half is a denormal when promoted to f32.
3311 auto Ext = B.buildFPExt(F32, Src, Flags);
3312 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3313 .addUse(Ext.getReg(0))
3314 .setMIFlags(Flags);
3315 B.buildFPTrunc(Dst, Log2, Flags);
3316 MI.eraseFromParent();
3317 return true;
3318 }
3319
3320 assert(Ty == LLT::scalar(32));
3321
3322 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3323 if (!ScaledInput) {
3324 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3325 .addUse(Src)
3326 .setMIFlags(Flags);
3327 MI.eraseFromParent();
3328 return true;
3329 }
3330
3331 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3332 .addUse(ScaledInput)
3333 .setMIFlags(Flags);
3334
3335 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3336 auto Zero = B.buildFConstant(Ty, 0.0);
3337 auto ResultOffset =
3338 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3339 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3340
3341 MI.eraseFromParent();
3342 return true;
3343}
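// Worked example (illustrative): for a denormal f32 input x, the code above
// evaluates log2(x) as amdgcn_log(x * 0x1.0p+32) - 32.0, since scaling by
// 2^32 brings x into the normal range and only shifts the logarithm by 32.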
3344
3346 Register Z, unsigned Flags) {
3347 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3348 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3349}
3350
3352 MachineIRBuilder &B) const {
3353 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3354 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3355
3356 MachineRegisterInfo &MRI = *B.getMRI();
3357 Register Dst = MI.getOperand(0).getReg();
3358 Register X = MI.getOperand(1).getReg();
3359 unsigned Flags = MI.getFlags();
3360 const LLT Ty = MRI.getType(X);
3361 MachineFunction &MF = B.getMF();
3362
3363 const LLT F32 = LLT::scalar(32);
3364 const LLT F16 = LLT::scalar(16);
3365
3366 const AMDGPUTargetMachine &TM =
3367 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3368
3369 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3370 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3371 if (Ty == F16 && !ST.has16BitInsts()) {
3372 Register LogVal = MRI.createGenericVirtualRegister(F32);
3373 auto PromoteSrc = B.buildFPExt(F32, X);
3374 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3375 B.buildFPTrunc(Dst, LogVal);
3376 } else {
3377 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3378 }
3379
3380 MI.eraseFromParent();
3381 return true;
3382 }
3383
3384 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3385 if (ScaledInput)
3386 X = ScaledInput;
3387
3388 auto Y =
3389 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3390
3391 Register R;
3392 if (ST.hasFastFMAF32()) {
3393 // c+cc are ln(2)/ln(10) to more than 49 bits
3394 const float c_log10 = 0x1.344134p-2f;
3395 const float cc_log10 = 0x1.09f79ep-26f;
3396
3397 // c + cc is ln(2) to more than 49 bits
3398 const float c_log = 0x1.62e42ep-1f;
3399 const float cc_log = 0x1.efa39ep-25f;
3400
3401 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3402 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3403
3404 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3405 auto NegR = B.buildFNeg(Ty, R, Flags);
3406 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3407 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3408 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3409 } else {
3410 // ch+ct is ln(2)/ln(10) to more than 36 bits
3411 const float ch_log10 = 0x1.344000p-2f;
3412 const float ct_log10 = 0x1.3509f6p-18f;
3413
3414 // ch + ct is ln(2) to more than 36 bits
3415 const float ch_log = 0x1.62e000p-1f;
3416 const float ct_log = 0x1.0bfbe8p-15f;
3417
3418 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3419 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3420
3421 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3422 auto YH = B.buildAnd(Ty, Y, MaskConst);
3423 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3424 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3425
3426 Register Mad0 =
3427 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3428 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3429 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3430 }
3431
3432 const bool IsFiniteOnly =
3433 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3434 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3435
3436 if (!IsFiniteOnly) {
3437 // Expand isfinite(x) => fabs(x) < inf
3438 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3439 auto Fabs = B.buildFAbs(Ty, Y);
3440 auto IsFinite =
3441 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3442 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3443 }
3444
3445 if (ScaledInput) {
3446 auto Zero = B.buildFConstant(Ty, 0.0);
3447 auto ShiftK =
3448 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3449 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3450 B.buildFSub(Dst, R, Shift, Flags);
3451 } else {
3452 B.buildCopy(Dst, R);
3453 }
3454
3455 MI.eraseFromParent();
3456 return true;
3457}
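// Note on the fast-FMA path above (illustrative): the conversion factor is
// split as c + cc so that r = y*c can be corrected by adding
// fma(y, cc, fma(y, c, -r)), recovering the low-order bits that a single
// f32 multiply would discard.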
3458
3460 Register Src, bool IsLog10,
3461 unsigned Flags) const {
3462 const double Log2BaseInverted =
3464
3465 LLT Ty = B.getMRI()->getType(Dst);
3466
3467 if (Ty == LLT::scalar(32)) {
3468 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3469 if (ScaledInput) {
3470 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3471 .addUse(Src)
3472 .setMIFlags(Flags);
3473 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3474 auto Zero = B.buildFConstant(Ty, 0.0);
3475 auto ResultOffset =
3476 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3477 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3478
3479 if (ST.hasFastFMAF32())
3480 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3481 else {
3482 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3483 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3484 }
3485
3486 return true;
3487 }
3488 }
3489
3490 auto Log2Operand = Ty == LLT::scalar(16)
3491 ? B.buildFLog2(Ty, Src, Flags)
3492 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3493 .addUse(Src)
3494 .setMIFlags(Flags);
3495 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3496 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3497 return true;
3498}
3499
3500bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3501 MachineIRBuilder &B) const {
3502 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3503 // If we have to handle denormals, scale up the input and adjust the result.
3504
3505 Register Dst = MI.getOperand(0).getReg();
3506 Register Src = MI.getOperand(1).getReg();
3507 unsigned Flags = MI.getFlags();
3508 LLT Ty = B.getMRI()->getType(Dst);
3509 const LLT F16 = LLT::scalar(16);
3510 const LLT F32 = LLT::scalar(32);
3511
3512 if (Ty == F16) {
3513 // Nothing in half is a denormal when promoted to f32.
3514 auto Ext = B.buildFPExt(F32, Src, Flags);
3515 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3516 .addUse(Ext.getReg(0))
3517 .setMIFlags(Flags);
3518 B.buildFPTrunc(Dst, Exp2, Flags);
3519 MI.eraseFromParent();
3520 return true;
3521 }
3522
3523 assert(Ty == F32);
3524
3525 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3526 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3527 .addUse(Src)
3528 .setMIFlags(Flags);
3529 MI.eraseFromParent();
3530 return true;
3531 }
3532
3533 // bool needs_scaling = x < -0x1.f80000p+6f;
3534 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
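 // For example, with x = -130.0f the direct result 2^-130 would be flushed
 // by v_exp_f32, but 2^(-130 + 64) = 2^-66 is a normal value, and the final
 // multiply by 2^-64 produces the correctly scaled (denormal) result.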
3535
3536 // -nextafter(128.0, -1)
3537 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3538 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3539 RangeCheckConst, Flags);
3540
3541 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3542 auto Zero = B.buildFConstant(Ty, 0.0);
3543 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3544 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3545
3546 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3547 .addUse(AddInput.getReg(0))
3548 .setMIFlags(Flags);
3549
3550 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3551 auto One = B.buildFConstant(Ty, 1.0);
3552 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3553 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3554 MI.eraseFromParent();
3555 return true;
3556}
3557
3558bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3559 Register X, unsigned Flags) const {
3560 LLT Ty = B.getMRI()->getType(Dst);
3561 LLT F32 = LLT::scalar(32);
3562
3563 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3564 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3565 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3566
3567 if (Ty == F32) {
3568 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3569 .addUse(Mul.getReg(0))
3570 .setMIFlags(Flags);
3571 } else {
3572 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3573 }
3574
3575 return true;
3576 }
3577
3578 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3579 auto NeedsScaling =
3580 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3581 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3582 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3583 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3584
3585 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3586 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3587
3588 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3589 .addUse(ExpInput.getReg(0))
3590 .setMIFlags(Flags);
3591
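 // 0x1.969d48p-93f is approximately e^-64, undoing the +64 offset added to X
 // above (exp(X + 64) * exp(-64) == exp(X)).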
3592 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3593 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3594 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3595 return true;
3596}
3597
3598bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3599 MachineIRBuilder &B) const {
3600 Register Dst = MI.getOperand(0).getReg();
3601 Register X = MI.getOperand(1).getReg();
3602 const unsigned Flags = MI.getFlags();
3603 MachineFunction &MF = B.getMF();
3604 MachineRegisterInfo &MRI = *B.getMRI();
3605 LLT Ty = MRI.getType(Dst);
3606 const LLT F16 = LLT::scalar(16);
3607 const LLT F32 = LLT::scalar(32);
3608 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3609
3610 if (Ty == F16) {
3611 // v_exp_f16 (fmul x, log2e)
3612 if (allowApproxFunc(MF, Flags)) {
3613 // TODO: Does this really require fast?
3614 legalizeFExpUnsafe(B, Dst, X, Flags);
3615 MI.eraseFromParent();
3616 return true;
3617 }
3618
3619 // exp(f16 x) ->
3620 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3621
3622 // Nothing in half is a denormal when promoted to f32.
3623 auto Ext = B.buildFPExt(F32, X, Flags);
3624 Register Lowered = MRI.createGenericVirtualRegister(F32);
3625 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3626 B.buildFPTrunc(Dst, Lowered, Flags);
3627 MI.eraseFromParent();
3628 return true;
3629 }
3630
3631 assert(Ty == F32);
3632
3633 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3634 // library behavior. Also, is known-not-daz source sufficient?
3635 if (allowApproxFunc(MF, Flags)) {
3636 legalizeFExpUnsafe(B, Dst, X, Flags);
3637 MI.eraseFromParent();
3638 return true;
3639 }
3640
3641 // Algorithm:
3642 //
3643 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3644 //
3645 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3646 // n = 64*m + j, 0 <= j < 64
3647 //
3648 // e^x = 2^((64*m + j + f)/64)
3649 // = (2^m) * (2^(j/64)) * 2^(f/64)
3650 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3651 //
3652 // f = x*(64/ln(2)) - n
3653 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3654 //
3655 // e^x = (2^m) * (2^(j/64)) * e^r
3656 //
3657 // (2^(j/64)) is precomputed
3658 //
3659 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3660 // e^r = 1 + q
3661 //
3662 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3663 //
3664 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
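 //
 // The expansion below relies on the hardware exp2 rather than a 2^(j/64)
 // table: PH + PL approximates x * log2(e) (or x * log2(10) for exp10) in
 // extended precision, E = roundeven(PH) supplies the 2^m factor via ldexp,
 // and v_exp_f32 evaluates 2^(PH - E + PL).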
3665 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3666 Register PH, PL;
3667
3668 if (ST.hasFastFMAF32()) {
3669 const float c_exp = numbers::log2ef;
3670 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3671 const float c_exp10 = 0x1.a934f0p+1f;
3672 const float cc_exp10 = 0x1.2f346ep-24f;
3673
3674 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3675 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3676 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3677 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3678
3679 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3680 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3681 } else {
3682 const float ch_exp = 0x1.714000p+0f;
3683 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3684
3685 const float ch_exp10 = 0x1.a92000p+1f;
3686 const float cl_exp10 = 0x1.4f0978p-11f;
3687
3688 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3689 auto XH = B.buildAnd(Ty, X, MaskConst);
3690 auto XL = B.buildFSub(Ty, X, XH, Flags);
3691
3692 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3693 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3694
3695 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3696 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3697
3698 Register Mad0 =
3699 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3700 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3701 }
3702
3703 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3704
3705 // It is unsafe to contract this fsub into the PH multiply.
3706 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3707 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3708 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3709
3710 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3711 .addUse(A.getReg(0))
3712 .setMIFlags(Flags);
3713 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3714
3715 auto UnderflowCheckConst =
3716 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3717 auto Zero = B.buildFConstant(Ty, 0.0);
3718 auto Underflow =
3719 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3720
3721 R = B.buildSelect(Ty, Underflow, Zero, R);
3722
3723 const auto &Options = MF.getTarget().Options;
3724
3725 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3726 auto OverflowCheckConst =
3727 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3728
3729 auto Overflow =
3730 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3731 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3732 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3733 }
3734
3735 B.buildCopy(Dst, R);
3736 MI.eraseFromParent();
3737 return true;
3738}
3739
3740bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3741 MachineIRBuilder &B) const {
3742 Register Dst = MI.getOperand(0).getReg();
3743 Register Src0 = MI.getOperand(1).getReg();
3744 Register Src1 = MI.getOperand(2).getReg();
3745 unsigned Flags = MI.getFlags();
3746 LLT Ty = B.getMRI()->getType(Dst);
3747 const LLT F16 = LLT::float16();
3748 const LLT F32 = LLT::float32();
3749
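 // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses fmul_legacy,
 // which returns 0 when either operand is 0 even if the other is infinite or
 // NaN, so e.g. y == 0 still yields exp2(0) == 1 when log2(x) overflows.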
3750 if (Ty == F32) {
3751 auto Log = B.buildFLog2(F32, Src0, Flags);
3752 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3753 .addUse(Log.getReg(0))
3754 .addUse(Src1)
3755 .setMIFlags(Flags);
3756 B.buildFExp2(Dst, Mul, Flags);
3757 } else if (Ty == F16) {
3758 // There's no f16 fmul_legacy, so we need to convert for it.
3759 auto Log = B.buildFLog2(F16, Src0, Flags);
3760 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3761 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3762 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3763 .addUse(Ext0.getReg(0))
3764 .addUse(Ext1.getReg(0))
3765 .setMIFlags(Flags);
3766 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3767 } else
3768 return false;
3769
3770 MI.eraseFromParent();
3771 return true;
3772}
3773
3774// Find a source register, ignoring any possible source modifiers.
3775static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3776 Register ModSrc = OrigSrc;
3777 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3778 ModSrc = SrcFNeg->getOperand(1).getReg();
3779 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3780 ModSrc = SrcFAbs->getOperand(1).getReg();
3781 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3782 ModSrc = SrcFAbs->getOperand(1).getReg();
3783 return ModSrc;
3784}
3785
3786bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3787 MachineRegisterInfo &MRI,
3788 MachineIRBuilder &B) const {
3789
3790 const LLT S1 = LLT::scalar(1);
3791 const LLT F64 = LLT::float64();
3792 Register Dst = MI.getOperand(0).getReg();
3793 Register OrigSrc = MI.getOperand(1).getReg();
3794 unsigned Flags = MI.getFlags();
3795 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3796 "this should not have been custom lowered");
3797
3798 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3799 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3800 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3801 // V_FRACT bug is:
3802 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3803 //
3804 // Convert floor(x) to (x - fract(x))
3805
3806 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3807 .addUse(OrigSrc)
3808 .setMIFlags(Flags);
3809
3810 // Give source modifier matching some assistance before obscuring a foldable
3811 // pattern.
3812
3813 // TODO: We can avoid the neg on the fract? The input sign to fract
3814 // shouldn't matter?
3815 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3816
3817 auto Const =
3818 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3819
3820 Register Min = MRI.createGenericVirtualRegister(F64);
3821
3822 // We don't need to concern ourselves with the snan handling difference, so
3823 // use the one which will directly select.
3824 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3825 if (MFI->getMode().IEEE)
3826 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3827 else
3828 B.buildFMinNum(Min, Fract, Const, Flags);
3829
3830 Register CorrectedFract = Min;
3831 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3832 auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
3833 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3834 }
3835
3836 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3837 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3838
3839 MI.eraseFromParent();
3840 return true;
3841}
3842
3843// Turn an illegal packed v2s16 build vector into bit operations.
3844// TODO: This should probably be a bitcast action in LegalizerHelper.
3845bool AMDGPULegalizerInfo::legalizeBuildVector(
3846 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3847 Register Dst = MI.getOperand(0).getReg();
3848 const LLT S32 = LLT::scalar(32);
3849 const LLT S16 = LLT::scalar(16);
3850 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3851
3852 Register Src0 = MI.getOperand(1).getReg();
3853 Register Src1 = MI.getOperand(2).getReg();
3854
3855 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3856 assert(MRI.getType(Src0) == S32);
3857 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3858 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3859 }
3860
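 // G_MERGE_VALUES packs Src0 into the low 16 bits and Src1 into the high 16
 // bits of an s32, which bitcasts to the packed <2 x s16> layout.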
3861 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3862 B.buildBitcast(Dst, Merge);
3863
3864 MI.eraseFromParent();
3865 return true;
3866}
3867
3868// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3869//
3870// Source and accumulation registers must all be 32-bits.
3871//
3872// TODO: When the multiply is uniform, we should produce a code sequence
3873// that is better suited to instruction selection on the SALU. Instead of
3874// the outer loop going over parts of the result, the outer loop should go
3875// over parts of one of the factors. This should result in instruction
3876// selection that makes full use of S_ADDC_U32 instructions.
3877void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3878 MutableArrayRef<Register> Accum,
3879 ArrayRef<Register> Src0,
3880 ArrayRef<Register> Src1,
3881 bool UsePartialMad64_32,
3882 bool SeparateOddAlignedProducts) const {
3883 // Use (possibly empty) vectors of S1 registers to represent the set of
3884 // carries from one pair of positions to the next.
3885 using Carry = SmallVector<Register, 2>;
3886
3887 MachineIRBuilder &B = Helper.MIRBuilder;
3888 GISelKnownBits &KB = *Helper.getKnownBits();
3889
3890 const LLT S1 = LLT::scalar(1);
3891 const LLT S32 = LLT::scalar(32);
3892 const LLT S64 = LLT::scalar(64);
3893
3894 Register Zero32;
3895 Register Zero64;
3896
3897 auto getZero32 = [&]() -> Register {
3898 if (!Zero32)
3899 Zero32 = B.buildConstant(S32, 0).getReg(0);
3900 return Zero32;
3901 };
3902 auto getZero64 = [&]() -> Register {
3903 if (!Zero64)
3904 Zero64 = B.buildConstant(S64, 0).getReg(0);
3905 return Zero64;
3906 };
3907
3908 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3909 for (unsigned i = 0; i < Src0.size(); ++i) {
3910 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3911 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3912 }
3913
3914 // Merge the given carries into the 32-bit LocalAccum, which is modified
3915 // in-place.
3916 //
3917 // Returns the carry-out, which is a single S1 register or null.
3918 auto mergeCarry =
3919 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3920 if (CarryIn.empty())
3921 return Register();
3922
3923 bool HaveCarryOut = true;
3924 Register CarryAccum;
3925 if (CarryIn.size() == 1) {
3926 if (!LocalAccum) {
3927 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3928 return Register();
3929 }
3930
3931 CarryAccum = getZero32();
3932 } else {
3933 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3934 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3935 CarryAccum =
3936 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3937 .getReg(0);
3938 }
3939
3940 if (!LocalAccum) {
3941 LocalAccum = getZero32();
3942 HaveCarryOut = false;
3943 }
3944 }
3945
3946 auto Add =
3947 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3948 LocalAccum = Add.getReg(0);
3949 return HaveCarryOut ? Add.getReg(1) : Register();
3950 };
3951
3952 // Build a multiply-add chain to compute
3953 //
3954 // LocalAccum + (partial products at DstIndex)
3955 // + (opportunistic subset of CarryIn)
3956 //
3957 // LocalAccum is an array of one or two 32-bit registers that are updated
3958 // in-place. The incoming registers may be null.
3959 //
3960 // In some edge cases, carry-ins can be consumed "for free". In that case,
3961 // the consumed carry bits are removed from CarryIn in-place.
3962 auto buildMadChain =
3963 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3964 -> Carry {
3965 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3966 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3967
3968 Carry CarryOut;
3969 unsigned j0 = 0;
3970
3971 // Use plain 32-bit multiplication for the most significant part of the
3972 // result by default.
3973 if (LocalAccum.size() == 1 &&
3974 (!UsePartialMad64_32 || !CarryIn.empty())) {
3975 do {
3976 // Skip multiplication if one of the operands is 0
3977 unsigned j1 = DstIndex - j0;
3978 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3979 ++j0;
3980 continue;
3981 }
3982 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3983 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3984 LocalAccum[0] = Mul.getReg(0);
3985 } else {
3986 if (CarryIn.empty()) {
3987 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3988 } else {
3989 LocalAccum[0] =
3990 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3991 .getReg(0);
3992 CarryIn.pop_back();
3993 }
3994 }
3995 ++j0;
3996 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3997 }
3998
3999 // Build full 64-bit multiplies.
4000 if (j0 <= DstIndex) {
4001 bool HaveSmallAccum = false;
4002 Register Tmp;
4003
4004 if (LocalAccum[0]) {
4005 if (LocalAccum.size() == 1) {
4006 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4007 HaveSmallAccum = true;
4008 } else if (LocalAccum[1]) {
4009 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4010 HaveSmallAccum = false;
4011 } else {
4012 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4013 HaveSmallAccum = true;
4014 }
4015 } else {
4016 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4017 Tmp = getZero64();
4018 HaveSmallAccum = true;
4019 }
4020
4021 do {
4022 unsigned j1 = DstIndex - j0;
4023 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4024 ++j0;
4025 continue;
4026 }
4027 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4028 {Src0[j0], Src1[j1], Tmp});
4029 Tmp = Mad.getReg(0);
4030 if (!HaveSmallAccum)
4031 CarryOut.push_back(Mad.getReg(1));
4032 HaveSmallAccum = false;
4033
4034 ++j0;
4035 } while (j0 <= DstIndex);
4036
4037 auto Unmerge = B.buildUnmerge(S32, Tmp);
4038 LocalAccum[0] = Unmerge.getReg(0);
4039 if (LocalAccum.size() > 1)
4040 LocalAccum[1] = Unmerge.getReg(1);
4041 }
4042
4043 return CarryOut;
4044 };
4045
4046 // Outer multiply loop, iterating over destination parts from least
4047 // significant to most significant parts.
4048 //
4049 // The columns of the following diagram correspond to the destination parts
4050 // affected by one iteration of the outer loop (ignoring boundary
4051 // conditions).
4052 //
4053 // Dest index relative to 2 * i: 1 0 -1
4054 // ------
4055 // Carries from previous iteration: e o
4056 // Even-aligned partial product sum: E E .
4057 // Odd-aligned partial product sum: O O
4058 //
4059 // 'o' is OddCarry, 'e' is EvenCarry.
4060 // EE and OO are computed from partial products via buildMadChain and use
4061 // accumulation where possible and appropriate.
4062 //
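 // For example, a 64 x 64 -> 64 bit multiply (Accum.size() == 2) computes
 // Accum[0..1] from Src0[0] * Src1[0] and then adds the low halves of
 // Src0[0] * Src1[1] and Src0[1] * Src1[0] into Accum[1].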
4063 Register SeparateOddCarry;
4064 Carry EvenCarry;
4065 Carry OddCarry;
4066
4067 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4068 Carry OddCarryIn = std::move(OddCarry);
4069 Carry EvenCarryIn = std::move(EvenCarry);
4070 OddCarry.clear();
4071 EvenCarry.clear();
4072
4073 // Partial products at offset 2 * i.
4074 if (2 * i < Accum.size()) {
4075 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4076 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4077 }
4078
4079 // Partial products at offset 2 * i - 1.
4080 if (i > 0) {
4081 if (!SeparateOddAlignedProducts) {
4082 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4083 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4084 } else {
4085 bool IsHighest = 2 * i >= Accum.size();
4086 Register SeparateOddOut[2];
4087 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4088 .take_front(IsHighest ? 1 : 2);
4089 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4090
4091 MachineInstr *Lo;
4092
4093 if (i == 1) {
4094 if (!IsHighest)
4095 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4096 else
4097 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4098 } else {
4099 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4100 SeparateOddCarry);
4101 }
4102 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4103
4104 if (!IsHighest) {
4105 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4106 Lo->getOperand(1).getReg());
4107 Accum[2 * i] = Hi.getReg(0);
4108 SeparateOddCarry = Hi.getReg(1);
4109 }
4110 }
4111 }
4112
4113 // Add in the carries from the previous iteration
4114 if (i > 0) {
4115 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4116 EvenCarryIn.push_back(CarryOut);
4117
4118 if (2 * i < Accum.size()) {
4119 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4120 OddCarry.push_back(CarryOut);
4121 }
4122 }
4123 }
4124}
4125
4126// Custom narrowing of wide multiplies using wide multiply-add instructions.
4127//
4128// TODO: If the multiply is followed by an addition, we should attempt to
4129// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4130bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4131 MachineInstr &MI) const {
4132 assert(ST.hasMad64_32());
4133 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4134
4135 MachineIRBuilder &B = Helper.MIRBuilder;
4136 MachineRegisterInfo &MRI = *B.getMRI();
4137
4138 Register DstReg = MI.getOperand(0).getReg();
4139 Register Src0 = MI.getOperand(1).getReg();
4140 Register Src1 = MI.getOperand(2).getReg();
4141
4142 LLT Ty = MRI.getType(DstReg);
4143 assert(Ty.isScalar());
4144
4145 unsigned Size = Ty.getSizeInBits();
4146 unsigned NumParts = Size / 32;
4147 assert((Size % 32) == 0);
4148 assert(NumParts >= 2);
4149
4150 // Whether to use MAD_64_32 for partial products whose high half is
4151 // discarded. This avoids some ADD instructions but risks false dependency
4152 // stalls on some subtargets in some cases.
4153 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4154
4155 // Whether to compute odd-aligned partial products separately. This is
4156 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4157 // in an even-aligned VGPR.
4158 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4159
4160 LLT S32 = LLT::scalar(32);
4161 SmallVector<Register, 2> Src0Parts, Src1Parts;
4162 for (unsigned i = 0; i < NumParts; ++i) {
4163 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4164 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4165 }
4166 B.buildUnmerge(Src0Parts, Src0);
4167 B.buildUnmerge(Src1Parts, Src1);
4168
4169 SmallVector<Register, 2> AccumRegs(NumParts);
4170 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4171 SeparateOddAlignedProducts);
4172
4173 B.buildMergeLikeInstr(DstReg, AccumRegs);
4174 MI.eraseFromParent();
4175 return true;
4176}
4177
4178// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4179// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4180// case with a single min instruction instead of a compare+select.
4181bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4182 MachineRegisterInfo &MRI,
4183 MachineIRBuilder &B) const {
4184 Register Dst = MI.getOperand(0).getReg();
4185 Register Src = MI.getOperand(1).getReg();
4186 LLT DstTy = MRI.getType(Dst);
4187 LLT SrcTy = MRI.getType(Src);
4188
4189 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4190 ? AMDGPU::G_AMDGPU_FFBH_U32
4191 : AMDGPU::G_AMDGPU_FFBL_B32;
4192 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
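 // ffbh/ffbl return -1 (all ones) for a zero input, so clamping with the bit
 // width via umin yields the defined ctlz/cttz result (e.g. 32 for an s32
 // zero).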
4193 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4194
4195 MI.eraseFromParent();
4196 return true;
4197}
4198
4199bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4200 MachineRegisterInfo &MRI,
4201 MachineIRBuilder &B) const {
4202 Register Dst = MI.getOperand(0).getReg();
4203 Register Src = MI.getOperand(1).getReg();
4204 LLT SrcTy = MRI.getType(Src);
4205 TypeSize NumBits = SrcTy.getSizeInBits();
4206
4207 assert(NumBits < 32u);
4208
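 // Shift the narrow value into the top bits of a 32-bit register so the
 // 32-bit ffbh sees the same number of leading zeros; e.g. a 16-bit source is
 // shifted left by 16. A zero input is undefined here, so no clamp is needed.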
4209 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4210 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4211 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4212 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4213 B.buildTrunc(Dst, Ctlz);
4214 MI.eraseFromParent();
4215 return true;
4216}
4217
4218// Check that this is a G_XOR x, -1
4219static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4220 if (MI.getOpcode() != TargetOpcode::G_XOR)
4221 return false;
4222 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4223 return ConstVal && *ConstVal == -1;
4224}
4225
4226// Return the use branch instruction, otherwise null if the usage is invalid.
4227static MachineInstr *
4228verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4229 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4230 Register CondDef = MI.getOperand(0).getReg();
4231 if (!MRI.hasOneNonDBGUse(CondDef))
4232 return nullptr;
4233
4234 MachineBasicBlock *Parent = MI.getParent();
4235 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4236
4237 if (isNot(MRI, *UseMI)) {
4238 Register NegatedCond = UseMI->getOperand(0).getReg();
4239 if (!MRI.hasOneNonDBGUse(NegatedCond))
4240 return nullptr;
4241
4242 // We're deleting the def of this value, so we need to remove it.
4243 eraseInstr(*UseMI, MRI);
4244
4245 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4246 Negated = true;
4247 }
4248
4249 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4250 return nullptr;
4251
4252 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4253 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4254 if (Next == Parent->end()) {
4255 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4256 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4257 return nullptr;
4258 UncondBrTarget = &*NextMBB;
4259 } else {
4260 if (Next->getOpcode() != AMDGPU::G_BR)
4261 return nullptr;
4262 Br = &*Next;
4263 UncondBrTarget = Br->getOperand(0).getMBB();
4264 }
4265
4266 return UseMI;
4267}
4268
4269bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4270 const ArgDescriptor *Arg,
4271 const TargetRegisterClass *ArgRC,
4272 LLT ArgTy) const {
4273 MCRegister SrcReg = Arg->getRegister();
4274 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4275 assert(DstReg.isVirtual() && "Virtual register expected");
4276
4277 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4278 *ArgRC, B.getDebugLoc(), ArgTy);
4279 if (Arg->isMasked()) {
4280 // TODO: Should we try to emit this once in the entry block?
4281 const LLT S32 = LLT::scalar(32);
4282 const unsigned Mask = Arg->getMask();
4283 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4284
4285 Register AndMaskSrc = LiveIn;
4286
4287 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4288 // 0.
4289 if (Shift != 0) {
4290 auto ShiftAmt = B.buildConstant(S32, Shift);
4291 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4292 }
4293
4294 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4295 } else {
4296 B.buildCopy(DstReg, LiveIn);
4297 }
4298
4299 return true;
4300}
4301
4302bool AMDGPULegalizerInfo::loadInputValue(
4303 Register DstReg, MachineIRBuilder &B,
4304 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4305 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4306 const ArgDescriptor *Arg = nullptr;
4307 const TargetRegisterClass *ArgRC;
4308 LLT ArgTy;
4309
4310 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4311 const ArgDescriptor WorkGroupIDX =
4312 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4313 // If GridZ is not programmed in an entry function then the hardware will set
4314 // it to all zeros, so there is no need to mask the GridY value in the low
4315 // order bits.
4316 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4317 AMDGPU::TTMP7,
4318 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4319 const ArgDescriptor WorkGroupIDZ =
4320 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
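 // With architected SGPRs, TTMP9 holds the workgroup ID X while TTMP7 packs
 // Y in its low 16 bits and Z in its high 16 bits, hence the masked
 // descriptors above.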
4321 if (ST.hasArchitectedSGPRs() &&
4322 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4323 switch (ArgType) {
4324 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4325 Arg = &WorkGroupIDX;
4326 ArgRC = &AMDGPU::SReg_32RegClass;
4327 ArgTy = LLT::scalar(32);
4328 break;
4329 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4330 Arg = &WorkGroupIDY;
4331 ArgRC = &AMDGPU::SReg_32RegClass;
4332 ArgTy = LLT::scalar(32);
4333 break;
4334 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4335 Arg = &WorkGroupIDZ;
4336 ArgRC = &AMDGPU::SReg_32RegClass;
4337 ArgTy = LLT::scalar(32);
4338 break;
4339 default:
4340 break;
4341 }
4342 }
4343
4344 if (!Arg)
4345 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4346
4347 if (!Arg) {
4348 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4349 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4350 // case the pointer argument may be missing and we use null.
4351 B.buildConstant(DstReg, 0);
4352 return true;
4353 }
4354
4355 // It's undefined behavior if a function marked with the amdgpu-no-*
4356 // attributes uses the corresponding intrinsic.
4357 B.buildUndef(DstReg);
4358 return true;
4359 }
4360
4361 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4362 return false; // TODO: Handle these
4363 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4364}
4365
4366bool AMDGPULegalizerInfo::legalizePreloadedArgIntrinsic(
4367 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4368 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4369 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4370 return false;
4371
4372 MI.eraseFromParent();
4373 return true;
4374}
4375
4376static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4377 int64_t C) {
4378 B.buildConstant(MI.getOperand(0).getReg(), C);
4379 MI.eraseFromParent();
4380 return true;
4381}
4382
4383bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4384 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4385 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4386 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4387 if (MaxID == 0)
4388 return replaceWithConstant(B, MI, 0);
4389
4390 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4391 const ArgDescriptor *Arg;
4392 const TargetRegisterClass *ArgRC;
4393 LLT ArgTy;
4394 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4395
4396 Register DstReg = MI.getOperand(0).getReg();
4397 if (!Arg) {
4398 // It's undefined behavior if a function marked with the amdgpu-no-*
4399 // attributes uses the corresponding intrinsic.
4400 B.buildUndef(DstReg);
4401 MI.eraseFromParent();
4402 return true;
4403 }
4404
4405 if (Arg->isMasked()) {
4406 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4407 // masking operations anyway.
4408 //
4409 // TODO: We could assert the top bit is 0 for the source copy.
4410 if (!loadInputValue(DstReg, B, ArgType))
4411 return false;
4412 } else {
4413 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4414 if (!loadInputValue(TmpReg, B, ArgType))
4415 return false;
4416 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4417 }
4418
4419 MI.eraseFromParent();
4420 return true;
4421}
4422
4423Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4424 int64_t Offset) const {
4425 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4426 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4427
4428 // TODO: If we passed in the base kernel offset we could have a better
4429 // alignment than 4, but we don't really need it.
4430 if (!loadInputValue(KernArgReg, B,
4431 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4432 llvm_unreachable("failed to find kernarg segment ptr");
4433
4434 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4435 // TODO: Should get nuw
4436 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4437}
4438
4439/// Legalize a value that's loaded from kernel arguments. This is only used by
4440/// legacy intrinsics.
4441bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4442 MachineIRBuilder &B,
4443 uint64_t Offset,
4444 Align Alignment) const {
4445 Register DstReg = MI.getOperand(0).getReg();
4446
4447 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4448 "unexpected kernarg parameter type");
4449
4450 Register Ptr = getKernargParameterPtr(B, Offset);
4451 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4452 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4453 MachineMemOperand::MODereferenceable |
4454 MachineMemOperand::MOInvariant);
4455 MI.eraseFromParent();
4456 return true;
4457}
4458
4459bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4460 MachineRegisterInfo &MRI,
4461 MachineIRBuilder &B) const {
4462 Register Dst = MI.getOperand(0).getReg();
4463 LLT DstTy =