1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
21#include "SIInstrInfo.h"
23#include "SIRegisterInfo.h"
25#include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use "
49 "rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(const LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(const LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
68
69/// \returns true if this is an odd sized vector which should widen by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
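/// For example, <3 x s16> (48 bits, odd element count) satisfies this
/// predicate and is widened to <4 x s16>, while <3 x s32> does not, since its
/// 32-bit elements already fill whole registers.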
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
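// The mutation above turns, for example, a <5 x s16> (80 bits) into <3 x s16>,
// splitting the value into pieces of at most 64 bits each.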
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
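// For example, <3 x s8> (24 bits) is padded out to <4 x s8> (32 bits), and
// <5 x s16> (80 bits) becomes <6 x s16> (96 bits).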
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160}
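// For example, a <13 x s32> (416 bits) grows one element at a time until its
// total width matches an existing SGPR register class.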
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
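// Types wider than 32 bits are mapped to vectors of s32, e.g. a 96-bit type
// such as <6 x s16> becomes <3 x s32>.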
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
253// Any combination of 32 or 64-bit elements up to the maximum register size, and
254// multiples of v2s16.
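// For example, s32, s96, <2 x s64> and <4 x s16> are register types, while s48
// (not a multiple of 32 bits) and <3 x s16> (odd number of 16-bit elements)
// are not.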
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269 };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(1);
283static const LLT S8 = LLT::scalar(8);
284static const LLT S16 = LLT::scalar(16);
285static const LLT S32 = LLT::scalar(32);
286static const LLT F32 = LLT::float32();
287static const LLT S64 = LLT::scalar(64);
288static const LLT F64 = LLT::float64();
289static const LLT S96 = LLT::scalar(96);
290static const LLT S128 = LLT::scalar(128);
291static const LLT S160 = LLT::scalar(160);
292static const LLT S224 = LLT::scalar(224);
293static const LLT S256 = LLT::scalar(256);
294static const LLT S512 = LLT::scalar(512);
296
297static const LLT V2S8 = LLT::fixed_vector(2, 8);
298static const LLT V2S16 = LLT::fixed_vector(2, 16);
299static const LLT V4S16 = LLT::fixed_vector(4, 16);
300static const LLT V6S16 = LLT::fixed_vector(6, 16);
301static const LLT V8S16 = LLT::fixed_vector(8, 16);
302static const LLT V10S16 = LLT::fixed_vector(10, 16);
303static const LLT V12S16 = LLT::fixed_vector(12, 16);
304static const LLT V16S16 = LLT::fixed_vector(16, 16);
305
306static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
307static const LLT V2BF16 = V2F16; // FIXME
308
309static const LLT V2S32 = LLT::fixed_vector(2, 32);
310static const LLT V3S32 = LLT::fixed_vector(3, 32);
311static const LLT V4S32 = LLT::fixed_vector(4, 32);
312static const LLT V5S32 = LLT::fixed_vector(5, 32);
313static const LLT V6S32 = LLT::fixed_vector(6, 32);
314static const LLT V7S32 = LLT::fixed_vector(7, 32);
315static const LLT V8S32 = LLT::fixed_vector(8, 32);
316static const LLT V9S32 = LLT::fixed_vector(9, 32);
317static const LLT V10S32 = LLT::fixed_vector(10, 32);
318static const LLT V11S32 = LLT::fixed_vector(11, 32);
319static const LLT V12S32 = LLT::fixed_vector(12, 32);
320static const LLT V16S32 = LLT::fixed_vector(16, 32);
321static const LLT V32S32 = LLT::fixed_vector(32, 32);
322
323static const LLT V2S64 = LLT::fixed_vector(2, 64);
324static const LLT V3S64 = LLT::fixed_vector(3, 64);
325static const LLT V4S64 = LLT::fixed_vector(4, 64);
326static const LLT V5S64 = LLT::fixed_vector(5, 64);
327static const LLT V6S64 = LLT::fixed_vector(6, 64);
328static const LLT V7S64 = LLT::fixed_vector(7, 64);
329static const LLT V8S64 = LLT::fixed_vector(8, 64);
330static const LLT V16S64 = LLT::fixed_vector(16, 64);
331
332static const LLT V2S128 = LLT::fixed_vector(2, 128);
333static const LLT V4S128 = LLT::fixed_vector(4, 128);
334
335static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
336 S160, S224, S256, S512};
337
338static std::initializer_list<LLT> AllS16Vectors{
339 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
340
341static std::initializer_list<LLT> AllS32Vectors = {
342 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
343 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
344
345static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
346 V6S64, V7S64, V8S64, V16S64};
347
348// Checks whether a type is in the list of legal register types.
349static bool isRegisterClassType(LLT Ty) {
350 if (Ty.isPointerOrPointerVector())
351 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
352
353 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
354 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
355}
356
357static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358 return [TypeIdx](const LegalityQuery &Query) {
359 return isRegisterClassType(Query.Types[TypeIdx]);
360 };
361}
362
363// If we have a truncating store or an extending load with a data size larger
364// than 32-bits, we need to reduce to a 32-bit type.
365static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366 return [=](const LegalityQuery &Query) {
367 const LLT Ty = Query.Types[TypeIdx];
368 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
370 };
371}
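// For example, an extending load that produces s64 from 32-bit memory (or an
// s64 truncating store to 32-bit memory) matches and is narrowed to s32 first.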
372
373// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
374// handle some operations by just promoting the register during
375// selection. There are also d16 loads on GFX9+ which preserve the high bits.
376static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377 bool IsLoad, bool IsAtomic) {
378 switch (AS) {
379 case AMDGPUAS::PRIVATE_ADDRESS:
380 // FIXME: Private element size.
381 return ST.enableFlatScratch() ? 128 : 32;
382 case AMDGPUAS::LOCAL_ADDRESS:
383 return ST.useDS128() ? 128 : 64;
384 case AMDGPUAS::GLOBAL_ADDRESS:
385 case AMDGPUAS::CONSTANT_ADDRESS:
386 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
387 case AMDGPUAS::BUFFER_RESOURCE:
388 // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
393 // kernel.
394 return IsLoad ? 512 : 128;
395 default:
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
400 }
401}
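// For example, private (scratch) accesses are capped at 32 bits unless flat
// scratch is enabled, LDS accesses at 64 bits unless ds128 is usable, and
// global/constant loads at 512 bits.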
402
403static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404 const LegalityQuery &Query) {
405 const LLT Ty = Query.Types[0];
406
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
409
410 unsigned RegSize = Ty.getSizeInBits();
411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413 unsigned AS = Query.Types[1].getAddressSpace();
414
415 // All of these need to be custom lowered to cast the pointer operand.
416 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
417 return false;
418
419 // Do not handle extending vector loads.
420 if (Ty.isVector() && MemSize != RegSize)
421 return false;
422
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
425#if 0
426 // Accept widening loads based on alignment.
427 if (IsLoad && MemSize < Size)
428 MemSize = std::max(MemSize, Align);
429#endif
430
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize != RegSize && RegSize != 32)
433 return false;
434
435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436 Query.MMODescrs[0].Ordering !=
437 AtomicOrdering::NotAtomic))
438 return false;
439
440 switch (MemSize) {
441 case 8:
442 case 16:
443 case 32:
444 case 64:
445 case 128:
446 break;
447 case 96:
448 if (!ST.hasDwordx3LoadStores())
449 return false;
450 break;
451 case 256:
452 case 512:
453 // These may contextually need to be broken down.
454 break;
455 default:
456 return false;
457 }
458
459 assert(RegSize >= MemSize);
460
461 if (AlignBits < MemSize) {
462 const SITargetLowering *TLI = ST.getTargetLowering();
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
464 Align(AlignBits / 8)))
465 return false;
466 }
467
468 return true;
469}
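// For example, a 96-bit access is only accepted when the subtarget has
// dwordx3 load/stores, and extending loads are only accepted when they widen
// 8- or 16-bit memory into a 32-bit register.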
470
471// The newer buffer intrinsic forms take their resource arguments as
472// pointers in address space 8, aka s128 values. However, in order to not break
473// SelectionDAG, the underlying operations have to continue to take v4i32
474// arguments. Therefore, we convert resource pointers - or vectors of them
475// to integer values here.
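// For example, a p8 value is carried as <4 x s32> plus bitcast/inttoptr glue,
// and a <2 x p8> operand is rewritten in terms of <8 x s32>.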
476static bool hasBufferRsrcWorkaround(const LLT Ty) {
477 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478 return true;
479 if (Ty.isVector()) {
480 const LLT ElemTy = Ty.getElementType();
481 return hasBufferRsrcWorkaround(ElemTy);
482 }
483 return false;
484}
485
486// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
487// workaround this. Eventually it should ignore the type for loads and only care
488// about the size. Return true in cases where we will workaround this for now by
489// bitcasting.
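// For example, an <8 x s16> load (128 bits, 16-bit elements) is bitcast to
// <4 x s32> before selection, while <4 x s32> itself is left untouched.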
490static bool loadStoreBitcastWorkaround(const LLT Ty) {
491 if (EnableNewLegality)
492 return false;
493
494 const unsigned Size = Ty.getSizeInBits();
495 if (Size <= 64)
496 return false;
497 // Address space 8 pointers get their own workaround.
498 if (hasBufferRsrcWorkaround(Ty))
499 return false;
500 if (!Ty.isVector())
501 return true;
502
503 if (Ty.isPointerVector())
504 return true;
505
506 unsigned EltSize = Ty.getScalarSizeInBits();
507 return EltSize != 32 && EltSize != 64;
508}
509
510static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511 const LLT Ty = Query.Types[0];
512 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
514}
515
516/// Return true if a load or store of the type should be lowered with a bitcast
517/// to a different type.
518static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519 const LLT MemTy) {
520 const unsigned MemSizeInBits = MemTy.getSizeInBits();
521 const unsigned Size = Ty.getSizeInBits();
522 if (Size != MemSizeInBits)
523 return Size <= 32 && Ty.isVector();
524
525 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526 return true;
527
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530 (Size <= 32 || isRegisterSize(Size)) &&
531 !isRegisterVectorElementType(Ty.getElementType());
532}
533
534/// Return true if we should legalize a load by widening an odd sized memory
535/// access up to the alignment. Note that in this case the memory access itself
536/// changes, not the size of the result register.
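/// For example, a 96-bit load that is known to be 128-bit aligned may be
/// widened to a 128-bit load on subtargets without 96-bit memory operations,
/// provided the resulting unaligned access is not slow.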
537static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538 uint64_t AlignInBits, unsigned AddrSpace,
539 unsigned Opcode) {
540 unsigned SizeInBits = MemoryTy.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(SizeInBits))
543 return false;
544
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549 return false;
550
551 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
552 return false;
553
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
555 // to it.
556 //
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize = NextPowerOf2(SizeInBits);
559 if (AlignInBits < RoundedSize)
560 return false;
561
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering *TLI = ST.getTargetLowering();
564 unsigned Fast = 0;
565 return TLI->allowsMisalignedMemoryAccessesImpl(
566 RoundedSize, AddrSpace, Align(AlignInBits / 8),
567 MachineMemOperand::MOLoad, &Fast) &&
568 Fast;
569}
570
571static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572 unsigned Opcode) {
573 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574 return false;
575
576 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
577 Query.MMODescrs[0].AlignInBits,
578 Query.Types[1].getAddressSpace(), Opcode);
579}
580
581/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
582/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
583/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
584static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
585 MachineRegisterInfo &MRI, unsigned Idx) {
586 MachineOperand &MO = MI.getOperand(Idx);
587
588 const LLT PointerTy = MRI.getType(MO.getReg());
589
590 // Paranoidly prevent us from doing this multiple times.
591 if (!hasBufferRsrcWorkaround(PointerTy))
592 return PointerTy;
593
594 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
595 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
596 if (!PointerTy.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
599 const LLT S32 = LLT::scalar(32);
600
601 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
602 std::array<Register, 4> VectorElems;
603 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
604 for (unsigned I = 0; I < NumParts; ++I)
605 VectorElems[I] =
606 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
607 B.buildMergeValues(MO, VectorElems);
608 MO.setReg(VectorReg);
609 return VectorTy;
610 }
611 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
612 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
613 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
614 B.buildIntToPtr(MO, Scalar);
615 MO.setReg(BitcastReg);
616
617 return VectorTy;
618}
619
620/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
621/// the form in which the value must be in order to be passed to the low-level
622/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
623/// needed in order to account for the fact that we can't define a register
624/// class for s128 without breaking SelectionDAG.
625static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626 MachineRegisterInfo &MRI = *B.getMRI();
627 const LLT PointerTy = MRI.getType(Pointer);
628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
630
631 if (!PointerTy.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector<Register, 4> PointerParts;
634 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
636 for (unsigned I = 0; I < NumParts; ++I)
637 PointerParts.push_back(Unmerged.getReg(I));
638 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
639 }
640 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
641 return B.buildBitcast(VectorTy, Scalar).getReg(0);
642}
643
644static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645 unsigned Idx) {
646 MachineOperand &MO = MI.getOperand(Idx);
647
648 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650 if (!hasBufferRsrcWorkaround(PointerTy))
651 return;
652 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
653}
654
655AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656 const GCNTargetMachine &TM)
657 : ST(ST_) {
658 using namespace TargetOpcode;
659
660 auto GetAddrSpacePtr = [&TM](unsigned AS) {
661 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
662 };
663
664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673 const LLT BufferStridedPtr =
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
675
676 const LLT CodePtr = FlatPtr;
677
678 const std::initializer_list<LLT> AddrSpaces64 = {
679 GlobalPtr, ConstantPtr, FlatPtr
680 };
681
682 const std::initializer_list<LLT> AddrSpaces32 = {
683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
684 };
685
686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
687
688 const std::initializer_list<LLT> FPTypesBase = {
689 S32, S64
690 };
691
692 const std::initializer_list<LLT> FPTypes16 = {
693 S32, S64, S16
694 };
695
696 const std::initializer_list<LLT> FPTypesPK16 = {
697 S32, S64, S16, V2S16
698 };
699
700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
701
702 // s1 for VCC branches, s32 for SCC branches.
703 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
704
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
709 .legalFor(AllS32Vectors)
711 .legalFor(AddrSpaces64)
712 .legalFor(AddrSpaces32)
713 .legalFor(AddrSpaces128)
714 .legalIf(isPointer(0))
715 .clampScalar(0, S16, S256)
717 .clampMaxNumElements(0, S32, 16)
719 .scalarize(0);
720
721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder({G_ADD, G_SUB})
725 .legalFor({S64, S32, S16, V2S16})
726 .clampMaxNumElementsStrict(0, S16, 2)
727 .scalarize(0)
728 .minScalar(0, S16)
730 .maxScalar(0, S32);
731 } else {
732 getActionDefinitionsBuilder({G_ADD, G_SUB})
733 .legalFor({S32, S16, V2S16})
734 .clampMaxNumElementsStrict(0, S16, 2)
735 .scalarize(0)
736 .minScalar(0, S16)
738 .maxScalar(0, S32);
739 }
740
741 if (ST.hasScalarSMulU64()) {
743 .legalFor({S64, S32, S16, V2S16})
744 .clampMaxNumElementsStrict(0, S16, 2)
745 .scalarize(0)
746 .minScalar(0, S16)
748 .custom();
749 } else {
751 .legalFor({S32, S16, V2S16})
752 .clampMaxNumElementsStrict(0, S16, 2)
753 .scalarize(0)
754 .minScalar(0, S16)
756 .custom();
757 }
758 assert(ST.hasMad64_32());
759
760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761 .legalFor({S32, S16, V2S16}) // Clamp modifier
762 .minScalarOrElt(0, S16)
764 .scalarize(0)
766 .lower();
767 } else if (ST.has16BitInsts()) {
768 getActionDefinitionsBuilder({G_ADD, G_SUB})
769 .legalFor({S32, S16})
770 .minScalar(0, S16)
772 .maxScalar(0, S32)
773 .scalarize(0);
774
776 .legalFor({S32, S16})
777 .scalarize(0)
778 .minScalar(0, S16)
779 .widenScalarToNextMultipleOf(0, 32)
780 .custom();
781 assert(ST.hasMad64_32());
782
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
786 .legalFor({S32, S16}) // Clamp modifier
787 .minScalar(0, S16)
788 .scalarize(0)
790 .lower();
791
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
795 .minScalar(0, S16)
796 .scalarize(0)
797 .lower();
798 } else {
799 getActionDefinitionsBuilder({G_ADD, G_SUB})
800 .legalFor({S32})
801 .widenScalarToNextMultipleOf(0, 32)
802 .clampScalar(0, S32, S32)
803 .scalarize(0);
804
805 auto &Mul = getActionDefinitionsBuilder(G_MUL)
806 .legalFor({S32})
807 .scalarize(0)
808 .minScalar(0, S32)
809 .widenScalarToNextMultipleOf(0, 32);
810
811 if (ST.hasMad64_32())
812 Mul.custom();
813 else
814 Mul.maxScalar(0, S32);
815
816 if (ST.hasIntClamp()) {
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32}) // Clamp modifier.
819 .scalarize(0)
820 .minScalarOrElt(0, S32)
821 .lower();
822 } else {
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
825 .minScalar(0, S32)
826 .scalarize(0)
827 .lower();
828 }
829
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
833 .minScalar(0, S32)
834 .scalarize(0)
835 .lower();
836 }
837
839 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840 .customFor({S32, S64})
841 .clampScalar(0, S32, S64)
843 .scalarize(0);
844
845 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
846 .legalFor({S32})
847 .maxScalar(0, S32);
848
849 if (ST.hasVOP3PInsts()) {
850 Mulh
851 .clampMaxNumElements(0, S8, 2)
852 .lowerFor({V2S8});
853 }
854
855 Mulh
856 .scalarize(0)
857 .lower();
858
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
862 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
863 .clampScalar(0, S32, S64)
867 .scalarize(0);
868
870 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871 .legalFor({{S32, S1}, {S32, S32}})
872 .clampScalar(0, S32, S32)
873 .scalarize(0);
874
876 // Don't worry about the size constraint.
878 .lower();
879
881 .legalFor({S1, S32, S64, S16, GlobalPtr,
882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883 .legalIf(isPointer(0))
884 .clampScalar(0, S32, S64)
886
887 getActionDefinitionsBuilder(G_FCONSTANT)
888 .legalFor({S32, S64, S16})
889 .clampScalar(0, S16, S64);
890
891 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
892 .legalIf(isRegisterType(0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
895 .legalFor({S1, S16})
896 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
899 .clampMaxNumElements(0, S32, 16);
900
901 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
902
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
906 .legalFor({{PrivatePtr, S32}});
907
908 getActionDefinitionsBuilder(G_STACKSAVE)
909 .customFor({PrivatePtr});
910 getActionDefinitionsBuilder(G_STACKRESTORE)
911 .legalFor({PrivatePtr});
912
913 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
914
915 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
916 .customIf(typeIsNot(0, PrivatePtr));
917
918 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
919
920 auto &FPOpActions = getActionDefinitionsBuilder(
921 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923 .legalFor({S32, S64});
924 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
925 .customFor({S32, S64});
926 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
927 .customFor({S32, S64});
928
929 if (ST.has16BitInsts()) {
930 if (ST.hasVOP3PInsts())
931 FPOpActions.legalFor({S16, V2S16});
932 else
933 FPOpActions.legalFor({S16});
934
935 TrigActions.customFor({S16});
936 FDIVActions.customFor({S16});
937 }
938
939 if (ST.hasPackedFP32Ops()) {
940 FPOpActions.legalFor({V2S32});
941 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
942 }
943
944 auto &MinNumMaxNum = getActionDefinitionsBuilder({
945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
946
947 if (ST.hasVOP3PInsts()) {
948 MinNumMaxNum.customFor(FPTypesPK16)
951 .clampScalar(0, S16, S64)
952 .scalarize(0);
953 } else if (ST.has16BitInsts()) {
954 MinNumMaxNum.customFor(FPTypes16)
955 .clampScalar(0, S16, S64)
956 .scalarize(0);
957 } else {
958 MinNumMaxNum.customFor(FPTypesBase)
959 .clampScalar(0, S32, S64)
960 .scalarize(0);
961 }
962
963 if (ST.hasVOP3PInsts())
964 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
965
966 FPOpActions
967 .scalarize(0)
968 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
969
970 TrigActions
971 .scalarize(0)
972 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
973
974 FDIVActions
975 .scalarize(0)
976 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
977
978 getActionDefinitionsBuilder({G_FNEG, G_FABS})
979 .legalFor(FPTypesPK16)
981 .scalarize(0)
982 .clampScalar(0, S16, S64);
983
984 if (ST.has16BitInsts()) {
986 .legalFor({S16})
987 .customFor({S32, S64})
988 .scalarize(0)
989 .unsupported();
991 .legalFor({S32, S64, S16})
992 .scalarize(0)
993 .clampScalar(0, S16, S64);
994
995 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
996 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
997 .scalarize(0)
998 .maxScalarIf(typeIs(0, S16), 1, S16)
999 .clampScalar(1, S32, S32)
1000 .lower();
1001
1003 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004 .scalarize(0)
1005 .lower();
1006 } else {
1008 .customFor({S32, S64, S16})
1009 .scalarize(0)
1010 .unsupported();
1011
1012
1013 if (ST.hasFractBug()) {
1015 .customFor({S64})
1016 .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 } else {
1021 .legalFor({S32, S64})
1022 .scalarize(0)
1023 .clampScalar(0, S32, S64);
1024 }
1025
1026 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1027 .legalFor({{S32, S32}, {S64, S32}})
1028 .scalarize(0)
1029 .clampScalar(0, S32, S64)
1030 .clampScalar(1, S32, S32)
1031 .lower();
1032
1034 .customFor({{S32, S32}, {S64, S32}})
1035 .scalarize(0)
1036 .minScalar(0, S32)
1037 .clampScalar(1, S32, S32)
1038 .lower();
1039 }
1040
1042 .legalFor({{S32, S64}, {S16, S32}})
1043 .scalarize(0)
1044 .lower();
1045
1047 .legalFor({{S64, S32}, {S32, S16}})
1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1049 .scalarize(0);
1050
1051 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1052 if (ST.has16BitInsts()) {
1053 FSubActions
1054 // Use actual fsub instruction
1055 .legalFor({S32, S16})
1056 // Must use fadd + fneg
1057 .lowerFor({S64, V2S16});
1058 } else {
1059 FSubActions
1060 // Use actual fsub instruction
1061 .legalFor({S32})
1062 // Must use fadd + fneg
1063 .lowerFor({S64, S16, V2S16});
1064 }
1065
1066 FSubActions
1067 .scalarize(0)
1068 .clampScalar(0, S32, S64);
1069
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073 FMad.customFor({S32, S16});
1074 else if (ST.hasMadMacF32Insts())
1075 FMad.customFor({S32});
1076 else if (ST.hasMadF16())
1077 FMad.customFor({S16});
1078 FMad.scalarize(0)
1079 .lower();
1080
1081 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1082 if (ST.has16BitInsts()) {
1083 FRem.customFor({S16, S32, S64});
1084 } else {
1085 FRem.minScalar(0, S32)
1086 .customFor({S32, S64});
1087 }
1088 FRem.scalarize(0);
1089
1090 // TODO: Do we need to clamp maximum bitwidth?
1092 .legalIf(isScalar(0))
1093 .legalFor({{V2S16, V2S32}})
1094 .clampMaxNumElements(0, S16, 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1099 .alwaysLegal();
1100
1101 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1102 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1103 {S32, S1}, {S64, S1}, {S16, S1}})
1104 .scalarize(0)
1105 .clampScalar(0, S32, S64)
1106 .widenScalarToNextPow2(1, 32);
1107
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1110 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1111 .lowerIf(typeIs(1, S1))
1112 .customFor({{S32, S64}, {S64, S64}});
1113 if (ST.has16BitInsts())
1114 IToFP.legalFor({{S16, S16}});
1115 IToFP.clampScalar(1, S32, S64)
1116 .minScalar(0, S32)
1117 .scalarize(0)
1118 .widenScalarToNextPow2(1);
1119
1120 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1121 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1122 .customFor({{S64, S32}, {S64, S64}})
1123 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1124 if (ST.has16BitInsts())
1125 FPToI.legalFor({{S16, S16}});
1126 else
1127 FPToI.minScalar(1, S32);
1128
1129 FPToI.minScalar(0, S32)
1130 .widenScalarToNextPow2(0, 32)
1131 .scalarize(0)
1132 .lower();
1133
1134 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1135 .clampScalar(0, S16, S64)
1136 .scalarize(0)
1137 .lower();
1138
1139 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1140 .customFor({S16, S32})
1141 .scalarize(0)
1142 .lower();
1143
1144 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1145 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1146 .scalarize(0)
1147 .lower();
1148
1149 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1150 .clampScalar(0, S16, S64)
1151 .scalarize(0)
1152 .lower();
1153
1154 if (ST.has16BitInsts()) {
1155 getActionDefinitionsBuilder(
1156 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1157 .legalFor({S16, S32, S64})
1158 .clampScalar(0, S16, S64)
1159 .scalarize(0);
1160 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1161 getActionDefinitionsBuilder(
1162 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1163 .legalFor({S32, S64})
1164 .clampScalar(0, S32, S64)
1165 .scalarize(0);
1166 } else {
1167 getActionDefinitionsBuilder(
1168 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1169 .legalFor({S32})
1170 .customFor({S64})
1171 .clampScalar(0, S32, S64)
1172 .scalarize(0);
1173 }
1174
1175 getActionDefinitionsBuilder(G_PTR_ADD)
1176 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1177 .legalIf(all(isPointer(0), sameSize(0, 1)))
1178 .scalarize(0)
1179 .scalarSameSizeAs(1, 0);
1180
1181 getActionDefinitionsBuilder(G_PTRMASK)
1182 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1183 .scalarSameSizeAs(1, 0)
1184 .scalarize(0);
1185
1186 auto &CmpBuilder =
1187 getActionDefinitionsBuilder(G_ICMP)
1188 // The compare output type differs based on the register bank of the output,
1189 // so make both s1 and s32 legal.
1190 //
1191 // Scalar compares producing output in scc will be promoted to s32, as that
1192 // is the allocatable register type that will be needed for the copy from
1193 // scc. This will be promoted during RegBankSelect, and we assume something
1194 // before that won't try to use s32 result types.
1195 //
1196 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1197 // bank.
1198 .legalForCartesianProduct(
1199 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1200 .legalForCartesianProduct(
1201 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1202 if (ST.has16BitInsts()) {
1203 CmpBuilder.legalFor({{S1, S16}});
1204 }
1205
1206 CmpBuilder
1207 .widenScalarToNextPow2(1)
1208 .clampScalar(1, S32, S64)
1209 .scalarize(0)
1210 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1211
1212 auto &FCmpBuilder =
1213 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1214 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1215
1216 if (ST.hasSALUFloatInsts())
1217 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1218
1219 FCmpBuilder
1220 .widenScalarToNextPow2(1)
1221 .clampScalar(1, S32, S64)
1222 .scalarize(0);
1223
1224 // FIXME: fpow has a selection pattern that should move to custom lowering.
1225 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1226 if (ST.has16BitInsts())
1227 ExpOps.customFor({{S32}, {S16}});
1228 else
1229 ExpOps.customFor({S32});
1230 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1231 .scalarize(0);
1232
1233 getActionDefinitionsBuilder(G_FPOWI)
1234 .clampScalar(0, MinScalarFPTy, S32)
1235 .lower();
1236
1237 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1238 Log2Ops.customFor({S32});
1239 if (ST.has16BitInsts())
1240 Log2Ops.legalFor({S16});
1241 else
1242 Log2Ops.customFor({S16});
1243 Log2Ops.scalarize(0)
1244 .lower();
1245
1246 auto &LogOps =
1247 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1248 LogOps.customFor({S32, S16});
1249 LogOps.clampScalar(0, MinScalarFPTy, S32)
1250 .scalarize(0);
1251
1252 // The 64-bit versions produce 32-bit results, but only on the SALU.
1253 getActionDefinitionsBuilder(G_CTPOP)
1254 .legalFor({{S32, S32}, {S32, S64}})
1255 .clampScalar(0, S32, S32)
1256 .widenScalarToNextPow2(1, 32)
1257 .clampScalar(1, S32, S64)
1258 .scalarize(0)
1259 .widenScalarToNextPow2(0, 32);
1260
1261 // If no 16 bit instr is available, lower into different instructions.
1262 if (ST.has16BitInsts())
1263 getActionDefinitionsBuilder(G_IS_FPCLASS)
1264 .legalForCartesianProduct({S1}, FPTypes16)
1265 .widenScalarToNextPow2(1)
1266 .scalarize(0)
1267 .lower();
1268 else
1269 getActionDefinitionsBuilder(G_IS_FPCLASS)
1270 .legalForCartesianProduct({S1}, FPTypesBase)
1271 .lowerFor({S1, S16})
1272 .widenScalarToNextPow2(1)
1273 .scalarize(0)
1274 .lower();
1275
1276 // The hardware instructions return a different result on 0 than the generic
1277 // instructions expect. The hardware produces -1, but these produce the
1278 // bitwidth.
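 // For example, G_CTLZ of a zero s32 must produce 32, so the custom lowering
 // has to account for the -1 the underlying instructions return on zero input.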
1279 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1280 .scalarize(0)
1281 .clampScalar(0, S32, S32)
1282 .clampScalar(1, S32, S64)
1283 .widenScalarToNextPow2(0, 32)
1284 .widenScalarToNextPow2(1, 32)
1285 .custom();
1286
1287 // The 64-bit versions produce 32-bit results, but only on the SALU.
1288 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1289 .legalFor({{S32, S32}, {S32, S64}})
1290 .customIf(scalarNarrowerThan(1, 32))
1291 .clampScalar(0, S32, S32)
1292 .clampScalar(1, S32, S64)
1293 .scalarize(0)
1294 .widenScalarToNextPow2(0, 32)
1295 .widenScalarToNextPow2(1, 32);
1296
1297 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1298 .legalFor({{S32, S32}, {S32, S64}})
1299 .clampScalar(0, S32, S32)
1300 .clampScalar(1, S32, S64)
1301 .scalarize(0)
1302 .widenScalarToNextPow2(0, 32)
1303 .widenScalarToNextPow2(1, 32);
1304
1305 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1306 // RegBankSelect.
1307 getActionDefinitionsBuilder(G_BITREVERSE)
1308 .legalFor({S32, S64})
1309 .clampScalar(0, S32, S64)
1310 .scalarize(0)
1311 .widenScalarToNextPow2(0);
1312
1313 if (ST.has16BitInsts()) {
1314 getActionDefinitionsBuilder(G_BSWAP)
1315 .legalFor({S16, S32, V2S16})
1316 .clampMaxNumElementsStrict(0, S16, 2)
1317 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1318 // narrowScalar limitation.
1319 .widenScalarToNextPow2(0)
1320 .clampScalar(0, S16, S32)
1321 .scalarize(0);
1322
1323 if (ST.hasVOP3PInsts()) {
1324 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1325 .legalFor({S32, S16, V2S16})
1326 .clampMaxNumElements(0, S16, 2)
1327 .minScalar(0, S16)
1328 .widenScalarToNextPow2(0)
1329 .scalarize(0)
1330 .lower();
1331 } else {
1332 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1333 .legalFor({S32, S16})
1334 .widenScalarToNextPow2(0)
1335 .minScalar(0, S16)
1336 .scalarize(0)
1337 .lower();
1338 }
1339 } else {
1340 // TODO: Should have same legality without v_perm_b32
1341 getActionDefinitionsBuilder(G_BSWAP)
1342 .legalFor({S32})
1343 .lowerIf(scalarNarrowerThan(0, 32))
1344 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1345 // narrowScalar limitation.
1346 .widenScalarToNextPow2(0)
1347 .maxScalar(0, S32)
1348 .scalarize(0)
1349 .lower();
1350
1351 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1352 .legalFor({S32})
1353 .minScalar(0, S32)
1354 .widenScalarToNextPow2(0)
1355 .scalarize(0)
1356 .lower();
1357 }
1358
1359 getActionDefinitionsBuilder(G_INTTOPTR)
1360 // List the common cases
1361 .legalForCartesianProduct(AddrSpaces64, {S64})
1362 .legalForCartesianProduct(AddrSpaces32, {S32})
1363 .scalarize(0)
1364 // Accept any address space as long as the size matches
1365 .legalIf(sameSize(0, 1))
1366 .widenScalarIf(smallerThan(1, 0),
1367 [](const LegalityQuery &Query) {
1368 return std::pair(
1369 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1370 })
1371 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1372 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1373 });
1374
1375 getActionDefinitionsBuilder(G_PTRTOINT)
1376 // List the common cases
1377 .legalForCartesianProduct(AddrSpaces64, {S64})
1378 .legalForCartesianProduct(AddrSpaces32, {S32})
1379 .scalarize(0)
1380 // Accept any address space as long as the size matches
1381 .legalIf(sameSize(0, 1))
1382 .widenScalarIf(smallerThan(0, 1),
1383 [](const LegalityQuery &Query) {
1384 return std::pair(
1385 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1386 })
1387 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1388 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1389 });
1390
1391 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1392 .scalarize(0)
1393 .custom();
1394
1395 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1396 bool IsLoad) -> bool {
1397 const LLT DstTy = Query.Types[0];
1398
1399 // Split vector extloads.
1400 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1401
1402 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1403 return true;
1404
1405 const LLT PtrTy = Query.Types[1];
1406 unsigned AS = PtrTy.getAddressSpace();
1407 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1408 Query.MMODescrs[0].Ordering !=
1410 return true;
1411
1412 // Catch weird sized loads that don't evenly divide into the access sizes
1413 // TODO: May be able to widen depending on alignment etc.
1414 unsigned NumRegs = (MemSize + 31) / 32;
1415 if (NumRegs == 3) {
1416 if (!ST.hasDwordx3LoadStores())
1417 return true;
1418 } else {
1419 // If the alignment allows, these should have been widened.
1420 if (!isPowerOf2_32(NumRegs))
1421 return true;
1422 }
1423
1424 return false;
1425 };
1426
1427 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1428 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1429 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1430
1431 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1432 // LDS
1433 // TODO: Unsupported flat for SI.
1434
1435 for (unsigned Op : {G_LOAD, G_STORE}) {
1436 const bool IsStore = Op == G_STORE;
1437
1438 auto &Actions = getActionDefinitionsBuilder(Op);
1439 // Explicitly list some common cases.
1440 // TODO: Does this help compile time at all?
1441 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1442 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1443 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1444 {S64, GlobalPtr, S64, GlobalAlign32},
1445 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1446 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1447 {S32, GlobalPtr, S8, GlobalAlign8},
1448 {S32, GlobalPtr, S16, GlobalAlign16},
1449
1450 {S32, LocalPtr, S32, 32},
1451 {S64, LocalPtr, S64, 32},
1452 {V2S32, LocalPtr, V2S32, 32},
1453 {S32, LocalPtr, S8, 8},
1454 {S32, LocalPtr, S16, 16},
1455 {V2S16, LocalPtr, S32, 32},
1456
1457 {S32, PrivatePtr, S32, 32},
1458 {S32, PrivatePtr, S8, 8},
1459 {S32, PrivatePtr, S16, 16},
1460 {V2S16, PrivatePtr, S32, 32},
1461
1462 {S32, ConstantPtr, S32, GlobalAlign32},
1463 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1464 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1465 {S64, ConstantPtr, S64, GlobalAlign32},
1466 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1467 Actions.legalIf(
1468 [=](const LegalityQuery &Query) -> bool {
1469 return isLoadStoreLegal(ST, Query);
1470 });
1471
1472 // The custom pointers (fat pointers, buffer resources) don't work with load
1473 // and store at this level. Fat pointers should have been lowered to
1474 // intrinsics before the translation to MIR.
1475 Actions.unsupportedIf(
1476 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1477
1478 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1479 // ptrtoint. This is needed to account for the fact that we can't have i128
1480 // as a register class for SelectionDAG reasons.
1481 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1482 return hasBufferRsrcWorkaround(Query.Types[0]);
1483 });
1484
1485 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1486 // 64-bits.
1487 //
1488 // TODO: Should generalize bitcast action into coerce, which will also cover
1489 // inserting addrspacecasts.
1490 Actions.customIf(typeIs(1, Constant32Ptr));
1491
1492 // Turn any illegal element vectors into something easier to deal
1493 // with. These will ultimately produce 32-bit scalar shifts to extract the
1494 // parts anyway.
1495 //
1496 // For odd 16-bit element vectors, prefer to split those into pieces with
1497 // 16-bit vector parts.
1498 Actions.bitcastIf(
1499 [=](const LegalityQuery &Query) -> bool {
1500 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1501 Query.MMODescrs[0].MemoryTy);
1502 }, bitcastToRegisterType(0));
1503
1504 if (!IsStore) {
1505 // Widen suitably aligned loads by loading extra bytes. The standard
1506 // legalization actions can't properly express widening memory operands.
1507 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1508 return shouldWidenLoad(ST, Query, G_LOAD);
1509 });
1510 }
1511
1512 // FIXME: load/store narrowing should be moved to lower action
1513 Actions
1514 .narrowScalarIf(
1515 [=](const LegalityQuery &Query) -> bool {
1516 return !Query.Types[0].isVector() &&
1517 needToSplitMemOp(Query, Op == G_LOAD);
1518 },
1519 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1520 const LLT DstTy = Query.Types[0];
1521 const LLT PtrTy = Query.Types[1];
1522
1523 const unsigned DstSize = DstTy.getSizeInBits();
1524 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1525
1526 // Split extloads.
1527 if (DstSize > MemSize)
1528 return std::pair(0, LLT::scalar(MemSize));
1529
1530 unsigned MaxSize = maxSizeForAddrSpace(
1531 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1532 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1533 if (MemSize > MaxSize)
1534 return std::pair(0, LLT::scalar(MaxSize));
1535
1536 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1537 return std::pair(0, LLT::scalar(Align));
1538 })
1539 .fewerElementsIf(
1540 [=](const LegalityQuery &Query) -> bool {
1541 return Query.Types[0].isVector() &&
1542 needToSplitMemOp(Query, Op == G_LOAD);
1543 },
1544 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1545 const LLT DstTy = Query.Types[0];
1546 const LLT PtrTy = Query.Types[1];
1547
1548 LLT EltTy = DstTy.getElementType();
1549 unsigned MaxSize = maxSizeForAddrSpace(
1550 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1551 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1552
1553 // FIXME: Handle widened to power of 2 results better. This ends
1554 // up scalarizing.
1555 // FIXME: 3 element stores scalarized on SI
1556
1557 // Split if it's too large for the address space.
1558 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1559 if (MemSize > MaxSize) {
1560 unsigned NumElts = DstTy.getNumElements();
1561 unsigned EltSize = EltTy.getSizeInBits();
1562
1563 if (MaxSize % EltSize == 0) {
1564 return std::pair(
1566 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1567 }
1568
1569 unsigned NumPieces = MemSize / MaxSize;
1570
1571 // FIXME: Refine when odd breakdowns handled
1572 // The scalars will need to be re-legalized.
1573 if (NumPieces == 1 || NumPieces >= NumElts ||
1574 NumElts % NumPieces != 0)
1575 return std::pair(0, EltTy);
1576
1577 return std::pair(0,
1578 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1579 }
1580
1581 // FIXME: We could probably handle weird extending loads better.
1582 if (DstTy.getSizeInBits() > MemSize)
1583 return std::pair(0, EltTy);
1584
1585 unsigned EltSize = EltTy.getSizeInBits();
1586 unsigned DstSize = DstTy.getSizeInBits();
1587 if (!isPowerOf2_32(DstSize)) {
1588 // We're probably decomposing an odd sized store. Try to split
1589 // to the widest type. TODO: Account for alignment. As-is it
1590 // should be OK, since the new parts will be further legalized.
1591 unsigned FloorSize = llvm::bit_floor(DstSize);
1592 return std::pair(
1594 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1595 }
1596
1597 // May need relegalization for the scalars.
1598 return std::pair(0, EltTy);
1599 })
1600 .minScalar(0, S32)
1601 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1602 .widenScalarToNextPow2(0)
1603 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1604 .lower();
1605 }
1606
1607 // FIXME: Unaligned accesses not lowered.
1608 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1609 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1610 {S32, GlobalPtr, S16, 2 * 8},
1611 {S32, LocalPtr, S8, 8},
1612 {S32, LocalPtr, S16, 16},
1613 {S32, PrivatePtr, S8, 8},
1614 {S32, PrivatePtr, S16, 16},
1615 {S32, ConstantPtr, S8, 8},
1616 {S32, ConstantPtr, S16, 2 * 8}})
1617 .legalIf(
1618 [=](const LegalityQuery &Query) -> bool {
1619 return isLoadStoreLegal(ST, Query);
1620 });
1621
1622 if (ST.hasFlatAddressSpace()) {
1623 ExtLoads.legalForTypesWithMemDesc(
1624 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1625 }
1626
1627 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1628 // 64-bits.
1629 //
1630 // TODO: Should generalize bitcast action into coerce, which will also cover
1631 // inserting addrspacecasts.
1632 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1633
1634 ExtLoads.clampScalar(0, S32, S32)
1635 .widenScalarToNextPow2(0)
1636 .lower();
1637
1638 auto &Atomics = getActionDefinitionsBuilder(
1639 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1640 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1641 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1642 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1643 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1644 {S64, GlobalPtr}, {S64, LocalPtr},
1645 {S32, RegionPtr}, {S64, RegionPtr}});
1646 if (ST.hasFlatAddressSpace()) {
1647 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1648 }
1649
1650 // TODO: v2bf16 operations, and fat buffer pointer support.
1651 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1652 if (ST.hasLDSFPAtomicAddF32()) {
1653 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1654 if (ST.hasLdsAtomicAddF64())
1655 Atomic.legalFor({{S64, LocalPtr}});
1656 if (ST.hasAtomicDsPkAdd16Insts())
1657 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1658 }
1659 if (ST.hasAtomicFaddInsts())
1660 Atomic.legalFor({{S32, GlobalPtr}});
1661 if (ST.hasFlatAtomicFaddF32Inst())
1662 Atomic.legalFor({{S32, FlatPtr}});
1663
1664 if (ST.hasGFX90AInsts()) {
1665 // These are legal with some caveats, and should have undergone expansion in
1666 // the IR in most situations
1667 // TODO: Move atomic expansion into legalizer
1668 Atomic.legalFor({
1669 {S32, GlobalPtr},
1670 {S64, GlobalPtr},
1671 {S64, FlatPtr}
1672 });
1673 }
1674
1675 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1676 ST.hasAtomicBufferGlobalPkAddF16Insts())
1677 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1678 if (ST.hasAtomicGlobalPkAddBF16Inst())
1679 Atomic.legalFor({{V2BF16, GlobalPtr}});
1680 if (ST.hasAtomicFlatPkAdd16Insts())
1681 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1682
1683
1684 // Most of the legalization work here is done by AtomicExpand. We could
1685 // probably use a simpler legality rule that just assumes anything is OK.
1686 auto &AtomicFMinFMax =
1687 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1688 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1689
1690 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1691 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1692 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1693 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1694 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1695 AtomicFMinFMax.legalFor({F32, FlatPtr});
1696 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1697 AtomicFMinFMax.legalFor({F64, FlatPtr});
1698
1699 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1700 // demarshalling
1701 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1702 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1703 {S32, FlatPtr}, {S64, FlatPtr}})
1704 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1705 {S32, RegionPtr}, {S64, RegionPtr}});
1706 // TODO: Pointer types, any 32-bit or 64-bit vector
1707
1708 // Condition should be s32 for scalar, s1 for vector.
1709 getActionDefinitionsBuilder(G_SELECT)
1710 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1711 LocalPtr, FlatPtr, PrivatePtr,
1712 LLT::fixed_vector(2, LocalPtr),
1713 LLT::fixed_vector(2, PrivatePtr)},
1714 {S1, S32})
1715 .clampScalar(0, S16, S64)
1716 .scalarize(1)
1717 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1718 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1719 .clampMaxNumElements(0, S32, 2)
1720 .clampMaxNumElements(0, LocalPtr, 2)
1721 .clampMaxNumElements(0, PrivatePtr, 2)
1722 .scalarize(0)
1723 .widenScalarToNextPow2(0)
1724 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1725
1726 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1727 // be more flexible with the shift amount type.
1728 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1729 .legalFor({{S32, S32}, {S64, S32}});
1730 if (ST.has16BitInsts()) {
1731 if (ST.hasVOP3PInsts()) {
1732 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1733 .clampMaxNumElements(0, S16, 2);
1734 } else
1735 Shifts.legalFor({{S16, S16}});
1736
1737 // TODO: Support 16-bit shift amounts for all types
1738 Shifts.widenScalarIf(
1739 [=](const LegalityQuery &Query) {
1740 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1741 // 32-bit amount.
1742 const LLT ValTy = Query.Types[0];
1743 const LLT AmountTy = Query.Types[1];
1744 return ValTy.getSizeInBits() <= 16 &&
1745 AmountTy.getSizeInBits() < 16;
1746 }, changeTo(1, S16));
1747 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1748 Shifts.clampScalar(1, S32, S32);
1749 Shifts.widenScalarToNextPow2(0, 16);
1750 Shifts.clampScalar(0, S16, S64);
1751
1752 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1753 .minScalar(0, S16)
1754 .scalarize(0)
1755 .lower();
1756 } else {
1757 // Make sure we legalize the shift amount type first, as the general
1758 // expansion for the shifted type will produce much worse code if it hasn't
1759 // been truncated already.
1760 Shifts.clampScalar(1, S32, S32);
1761 Shifts.widenScalarToNextPow2(0, 32);
1762 Shifts.clampScalar(0, S32, S64);
1763
1764 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1765 .minScalar(0, S32)
1766 .scalarize(0)
1767 .lower();
1768 }
1769 Shifts.scalarize(0);
1770
1771 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1772 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1773 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1774 unsigned IdxTypeIdx = 2;
1775
1776 getActionDefinitionsBuilder(Op)
1777 .customIf([=](const LegalityQuery &Query) {
1778 const LLT EltTy = Query.Types[EltTypeIdx];
1779 const LLT VecTy = Query.Types[VecTypeIdx];
1780 const LLT IdxTy = Query.Types[IdxTypeIdx];
1781 const unsigned EltSize = EltTy.getSizeInBits();
1782 const bool isLegalVecType =
1783 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1784 // Address space 8 pointers are 128-bit wide values, but the logic
1785 // below will try to bitcast them to 2N x s64, which will fail.
1786 // Therefore, as an intermediate step, ptrtoint the vector and scalar
1787 // arguments (and inttoptr the extraction result) so that the operation
1788 // becomes one on a vector of integers, which the logic below can
1789 // handle.
1790 if (EltTy.isPointer() && EltSize > 64)
1791 return true;
1792 return (EltSize == 32 || EltSize == 64) &&
1793 VecTy.getSizeInBits() % 32 == 0 &&
1794 VecTy.getSizeInBits() <= MaxRegisterSize &&
1795 IdxTy.getSizeInBits() == 32 &&
1796 isLegalVecType;
1797 })
1798 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1799 bitcastToVectorElement32(VecTypeIdx))
1800 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1801 .bitcastIf(
1802 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1803 [=](const LegalityQuery &Query) {
1804 // For > 64-bit element types, try to turn this into a 64-bit
1805 // element vector since we may be able to do better indexing
1806 // if this is scalar. If not, fall back to 32.
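 // Illustrative examples, not part of the original source: a <2 x s128>
 // vector becomes <4 x s64> here, while <4 x s96> (96 is not a multiple
 // of 64) falls back to <12 x s32>.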
1807 const LLT EltTy = Query.Types[EltTypeIdx];
1808 const LLT VecTy = Query.Types[VecTypeIdx];
1809 const unsigned DstEltSize = EltTy.getSizeInBits();
1810 const unsigned VecSize = VecTy.getSizeInBits();
1811
1812 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1813 return std::pair(
1814 VecTypeIdx,
1815 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1816 })
1817 .clampScalar(EltTypeIdx, S32, S64)
1818 .clampScalar(VecTypeIdx, S32, S64)
1819 .clampScalar(IdxTypeIdx, S32, S32)
1820 .clampMaxNumElements(VecTypeIdx, S32, 32)
1821 // TODO: Clamp elements for 64-bit vectors?
1822 .moreElementsIf(
1823 isIllegalRegisterType(VecTypeIdx),
1824 moreElementsToNextExistingRegClass(VecTypeIdx))
1825 // It should only be necessary with variable indexes.
1826 // As a last resort, lower to the stack
1827 .lower();
1828 }
1829
1830 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1831 .unsupportedIf([=](const LegalityQuery &Query) {
1832 const LLT &EltTy = Query.Types[1].getElementType();
1833 return Query.Types[0] != EltTy;
1834 });
1835
1836 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1837 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1838 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1839
1840 // FIXME: Doesn't handle extract of illegal sizes.
1841 getActionDefinitionsBuilder(Op)
1842 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1843 .lowerIf([=](const LegalityQuery &Query) {
1844 // Sub-vector (or single-element) insert and extract.
1845 // TODO: verify immediate offset here since lower only works with
1846 // whole elements.
1847 const LLT BigTy = Query.Types[BigTyIdx];
1848 return BigTy.isVector();
1849 })
1850 // FIXME: Multiples of 16 should not be legal.
1851 .legalIf([=](const LegalityQuery &Query) {
1852 const LLT BigTy = Query.Types[BigTyIdx];
1853 const LLT LitTy = Query.Types[LitTyIdx];
1854 return (BigTy.getSizeInBits() % 32 == 0) &&
1855 (LitTy.getSizeInBits() % 16 == 0);
1856 })
1857 .widenScalarIf(
1858 [=](const LegalityQuery &Query) {
1859 const LLT BigTy = Query.Types[BigTyIdx];
1860 return (BigTy.getScalarSizeInBits() < 16);
1861 },
1862 widenScalarOrEltToNextPow2(BigTyIdx, 16))
1863 .widenScalarIf(
1864 [=](const LegalityQuery &Query) {
1865 const LLT LitTy = Query.Types[LitTyIdx];
1866 return (LitTy.getScalarSizeInBits() < 16);
1867 },
1868 widenScalarOrEltToNextPow2(LitTyIdx, 16))
1869 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1870 .widenScalarToNextPow2(BigTyIdx, 32);
1871
1872 }
1873
1874 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1875 .legalForCartesianProduct(AllS32Vectors, {S32})
1876 .legalForCartesianProduct(AllS64Vectors, {S64})
1877 .clampNumElements(0, V16S32, V32S32)
1878 .clampNumElements(0, V2S64, V16S64)
1879 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1880 .moreElementsIf(
1881 isIllegalRegisterType(0),
1882 moreElementsToNextExistingRegClass(0));
1883
1884 if (ST.hasScalarPackInsts()) {
1885 BuildVector
1886 // FIXME: Should probably widen s1 vectors straight to s32
1887 .minScalarOrElt(0, S16)
1888 .minScalar(1, S16);
1889
1890 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1891 .legalFor({V2S16, S32})
1892 .lower();
1893 } else {
1894 BuildVector.customFor({V2S16, S16});
1895 BuildVector.minScalarOrElt(0, S32);
1896
1897 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1898 .customFor({V2S16, S32})
1899 .lower();
1900 }
1901
1902 BuildVector.legalIf(isRegisterType(0));
1903
1904 // FIXME: Clamp maximum size
1905 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1906 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1907 .clampMaxNumElements(0, S32, 32)
1908 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1909 .clampMaxNumElements(0, S16, 64);
1910
1911 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1912
1913 // Merge/Unmerge
1914 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1915 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1916 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1917
1918 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1919 const LLT Ty = Query.Types[TypeIdx];
1920 if (Ty.isVector()) {
1921 const LLT &EltTy = Ty.getElementType();
1922 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1923 return true;
1924 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1925 return true;
1926 }
1927 return false;
1928 };
1929
1930 auto &Builder = getActionDefinitionsBuilder(Op)
1931 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1932 .lowerFor({{S16, V2S16}})
1933 .lowerIf([=](const LegalityQuery &Query) {
1934 const LLT BigTy = Query.Types[BigTyIdx];
1935 return BigTy.getSizeInBits() == 32;
1936 })
1937 // Try to widen to s16 first for small types.
1938 // TODO: Only do this on targets with legal s16 shifts
1939 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1940 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1941 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1942 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1943 elementTypeIs(1, S16)),
1944 changeTo(1, V2S16))
1945 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1946 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1947 // valid.
1948 .clampScalar(LitTyIdx, S32, S512)
1949 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1950 // Break up vectors with weird elements into scalars
1951 .fewerElementsIf(
1952 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1953 scalarize(0))
1954 .fewerElementsIf(
1955 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1956 scalarize(1))
1957 .clampScalar(BigTyIdx, S32, MaxScalar);
1958
1959 if (Op == G_MERGE_VALUES) {
1960 Builder.widenScalarIf(
1961 // TODO: Use 16-bit shifts if legal for 8-bit values?
1962 [=](const LegalityQuery &Query) {
1963 const LLT Ty = Query.Types[LitTyIdx];
1964 return Ty.getSizeInBits() < 32;
1965 },
1966 changeTo(LitTyIdx, S32));
1967 }
1968
1969 Builder.widenScalarIf(
1970 [=](const LegalityQuery &Query) {
1971 const LLT Ty = Query.Types[BigTyIdx];
1972 return Ty.getSizeInBits() % 16 != 0;
1973 },
1974 [=](const LegalityQuery &Query) {
1975 // Pick the next power of 2, or a multiple of 64 once past 128 bits,
1976 // whichever is smaller.
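 // Illustrative examples, not part of the original source: a 100-bit type
 // widens to the next power of two, 128; a 300-bit type widens to
 // alignTo<64>(301) = 320 rather than to the next power of two, 512.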
1977 const LLT &Ty = Query.Types[BigTyIdx];
1978 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1979 if (NewSizeInBits >= 256) {
1980 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1981 if (RoundedTo < NewSizeInBits)
1982 NewSizeInBits = RoundedTo;
1983 }
1984 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1985 })
1986 // Any vectors left are the wrong size. Scalarize them.
1987 .scalarize(0)
1988 .scalarize(1);
1989 }
1990
1991 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1992 // RegBankSelect.
1993 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1994 .legalFor({{S32}, {S64}});
1995
1996 if (ST.hasVOP3PInsts()) {
1997 SextInReg.lowerFor({{V2S16}})
1998 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1999 // get more vector shift opportunities, since we'll get those when
2000 // expanded.
2001 .clampMaxNumElementsStrict(0, S16, 2);
2002 } else if (ST.has16BitInsts()) {
2003 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2004 } else {
2005 // Prefer to promote to s32 before lowering if we don't have 16-bit
2006 // shifts. This avoids a lot of intermediate truncate and extend operations.
2007 SextInReg.lowerFor({{S32}, {S64}});
2008 }
2009
2010 SextInReg
2011 .scalarize(0)
2012 .clampScalar(0, S32, S64)
2013 .lower();
2014
2015 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2016 .scalarize(0)
2017 .lower();
2018
2019 // TODO: Only try to form v2s16 with legal packed instructions.
2020 getActionDefinitionsBuilder(G_FSHR)
2021 .legalFor({{S32, S32}})
2022 .lowerFor({{V2S16, V2S16}})
2023 .clampMaxNumElementsStrict(0, S16, 2)
2024 .scalarize(0)
2025 .lower();
2026
2027 if (ST.hasVOP3PInsts()) {
2028 getActionDefinitionsBuilder(G_FSHL)
2029 .lowerFor({{V2S16, V2S16}})
2030 .clampMaxNumElementsStrict(0, S16, 2)
2031 .scalarize(0)
2032 .lower();
2033 } else {
2034 getActionDefinitionsBuilder(G_FSHL)
2035 .scalarize(0)
2036 .lower();
2037 }
2038
2039 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2040 .legalFor({S64});
2041
2042 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2043
2044 getActionDefinitionsBuilder(G_FENCE)
2045 .alwaysLegal();
2046
2047 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2048 .scalarize(0)
2049 .minScalar(0, S32)
2050 .lower();
2051
2052 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2053 .legalFor({{S32, S32}, {S64, S32}})
2054 .clampScalar(1, S32, S32)
2055 .clampScalar(0, S32, S64)
2056 .widenScalarToNextPow2(0)
2057 .scalarize(0);
2058
2059 getActionDefinitionsBuilder(
2060 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2061 G_FCOPYSIGN,
2062
2063 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2064 G_READ_REGISTER, G_WRITE_REGISTER,
2065
2066 G_SADDO, G_SSUBO})
2067 .lower();
2068
2069 if (ST.hasIEEEMinMax()) {
2070 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2071 .legalFor(FPTypesPK16)
2072 .clampMaxNumElements(0, S16, 2)
2073 .scalarize(0);
2074 } else {
2075 // TODO: Implement
2076 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2077 }
2078
2079 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2080 .lower();
2081
2082 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2083
2084 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2085 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2086 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2087 .unsupported();
2088
2089 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2090
2091 getLegacyLegalizerInfo().computeTables();
2092 verify(*ST.getInstrInfo());
2093}
2094
2095bool AMDGPULegalizerInfo::legalizeCustom(
2096 LegalizerHelper &Helper, MachineInstr &MI,
2097 LostDebugLocObserver &LocObserver) const {
2098 MachineIRBuilder &B = Helper.MIRBuilder;
2099 MachineRegisterInfo &MRI = *B.getMRI();
2100
2101 switch (MI.getOpcode()) {
2102 case TargetOpcode::G_ADDRSPACE_CAST:
2103 return legalizeAddrSpaceCast(MI, MRI, B);
2104 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2105 return legalizeFroundeven(MI, MRI, B);
2106 case TargetOpcode::G_FCEIL:
2107 return legalizeFceil(MI, MRI, B);
2108 case TargetOpcode::G_FREM:
2109 return legalizeFrem(MI, MRI, B);
2110 case TargetOpcode::G_INTRINSIC_TRUNC:
2111 return legalizeIntrinsicTrunc(MI, MRI, B);
2112 case TargetOpcode::G_SITOFP:
2113 return legalizeITOFP(MI, MRI, B, true);
2114 case TargetOpcode::G_UITOFP:
2115 return legalizeITOFP(MI, MRI, B, false);
2116 case TargetOpcode::G_FPTOSI:
2117 return legalizeFPTOI(MI, MRI, B, true);
2118 case TargetOpcode::G_FPTOUI:
2119 return legalizeFPTOI(MI, MRI, B, false);
2120 case TargetOpcode::G_FMINNUM:
2121 case TargetOpcode::G_FMAXNUM:
2122 case TargetOpcode::G_FMINNUM_IEEE:
2123 case TargetOpcode::G_FMAXNUM_IEEE:
2124 return legalizeMinNumMaxNum(Helper, MI);
2125 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2126 return legalizeExtractVectorElt(MI, MRI, B);
2127 case TargetOpcode::G_INSERT_VECTOR_ELT:
2128 return legalizeInsertVectorElt(MI, MRI, B);
2129 case TargetOpcode::G_FSIN:
2130 case TargetOpcode::G_FCOS:
2131 return legalizeSinCos(MI, MRI, B);
2132 case TargetOpcode::G_GLOBAL_VALUE:
2133 return legalizeGlobalValue(MI, MRI, B);
2134 case TargetOpcode::G_LOAD:
2135 case TargetOpcode::G_SEXTLOAD:
2136 case TargetOpcode::G_ZEXTLOAD:
2137 return legalizeLoad(Helper, MI);
2138 case TargetOpcode::G_STORE:
2139 return legalizeStore(Helper, MI);
2140 case TargetOpcode::G_FMAD:
2141 return legalizeFMad(MI, MRI, B);
2142 case TargetOpcode::G_FDIV:
2143 return legalizeFDIV(MI, MRI, B);
2144 case TargetOpcode::G_FFREXP:
2145 return legalizeFFREXP(MI, MRI, B);
2146 case TargetOpcode::G_FSQRT:
2147 return legalizeFSQRT(MI, MRI, B);
2148 case TargetOpcode::G_UDIV:
2149 case TargetOpcode::G_UREM:
2150 case TargetOpcode::G_UDIVREM:
2151 return legalizeUnsignedDIV_REM(MI, MRI, B);
2152 case TargetOpcode::G_SDIV:
2153 case TargetOpcode::G_SREM:
2154 case TargetOpcode::G_SDIVREM:
2155 return legalizeSignedDIV_REM(MI, MRI, B);
2156 case TargetOpcode::G_ATOMIC_CMPXCHG:
2157 return legalizeAtomicCmpXChg(MI, MRI, B);
2158 case TargetOpcode::G_FLOG2:
2159 return legalizeFlog2(MI, B);
2160 case TargetOpcode::G_FLOG:
2161 case TargetOpcode::G_FLOG10:
2162 return legalizeFlogCommon(MI, B);
2163 case TargetOpcode::G_FEXP2:
2164 return legalizeFExp2(MI, B);
2165 case TargetOpcode::G_FEXP:
2166 case TargetOpcode::G_FEXP10:
2167 return legalizeFExp(MI, B);
2168 case TargetOpcode::G_FPOW:
2169 return legalizeFPow(MI, B);
2170 case TargetOpcode::G_FFLOOR:
2171 return legalizeFFloor(MI, MRI, B);
2172 case TargetOpcode::G_BUILD_VECTOR:
2173 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2174 return legalizeBuildVector(MI, MRI, B);
2175 case TargetOpcode::G_MUL:
2176 return legalizeMul(Helper, MI);
2177 case TargetOpcode::G_CTLZ:
2178 case TargetOpcode::G_CTTZ:
2179 return legalizeCTLZ_CTTZ(MI, MRI, B);
2180 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2181 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2182 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2183 return legalizeFPTruncRound(MI, B);
2184 case TargetOpcode::G_STACKSAVE:
2185 return legalizeStackSave(MI, B);
2186 case TargetOpcode::G_GET_FPENV:
2187 return legalizeGetFPEnv(MI, MRI, B);
2188 case TargetOpcode::G_SET_FPENV:
2189 return legalizeSetFPEnv(MI, MRI, B);
2190 case TargetOpcode::G_TRAP:
2191 return legalizeTrap(MI, MRI, B);
2192 case TargetOpcode::G_DEBUGTRAP:
2193 return legalizeDebugTrap(MI, MRI, B);
2194 default:
2195 return false;
2196 }
2197
2198 llvm_unreachable("expected switch to return");
2199}
2200
2201Register AMDGPULegalizerInfo::getSegmentAperture(
2202 unsigned AS,
2203 MachineRegisterInfo &MRI,
2204 MachineIRBuilder &B) const {
2205 MachineFunction &MF = B.getMF();
2206 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2207 const LLT S32 = LLT::scalar(32);
2208 const LLT S64 = LLT::scalar(64);
2209
2211
2212 if (ST.hasApertureRegs()) {
2213 // Note: this register is somewhat broken. When used as a 32-bit operand,
2214 // it only returns zeroes. The real value is in the upper 32 bits.
2215 // Thus, we must extract the high 32 bits.
2216 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2217 ? AMDGPU::SRC_SHARED_BASE
2218 : AMDGPU::SRC_PRIVATE_BASE;
2219 // FIXME: It would be more natural to emit a COPY here, but then copy
2220 // coalescing would kick in and it would think it's okay to use the "HI"
2221 // subregister (instead of extracting the HI 32 bits) which is an artificial
2222 // (unusable) register.
2223 // Register TableGen definitions would need an overhaul to get rid of the
2224 // artificial "HI" aperture registers and prevent this kind of issue from
2225 // happening.
2226 Register Dst = MRI.createGenericVirtualRegister(S64);
2227 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2228 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2229 return B.buildUnmerge(S32, Dst).getReg(1);
2230 }
2231
2232 // TODO: can we be smarter about machine pointer info?
2234 Register LoadAddr = MRI.createGenericVirtualRegister(
2236 // For code object version 5, private_base and shared_base are passed through
2237 // implicit kernargs.
2244 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2245
2246 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2248
2249 if (!loadInputValue(KernargPtrReg, B,
2251 return Register();
2252
2254 PtrInfo,
2258
2259 // Pointer address
2260 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2261 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2262 // Load address
2263 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2264 }
2265
2266 Register QueuePtr = MRI.createGenericVirtualRegister(
2268
2270 return Register();
2271
2272 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2273 // private_segment_aperture_base_hi.
2274 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2275
2277 PtrInfo,
2280 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2281
2282 B.buildPtrAdd(LoadAddr, QueuePtr,
2283 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2284 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2285}
2286
2287/// Return true if the value is a known valid address, such that a null check is
2288/// not necessary.
2290 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2291 MachineInstr *Def = MRI.getVRegDef(Val);
2292 switch (Def->getOpcode()) {
2293 case AMDGPU::G_FRAME_INDEX:
2294 case AMDGPU::G_GLOBAL_VALUE:
2295 case AMDGPU::G_BLOCK_ADDR:
2296 return true;
2297 case AMDGPU::G_CONSTANT: {
2298 const ConstantInt *CI = Def->getOperand(1).getCImm();
2299 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2300 }
2301 default:
2302 return false;
2303 }
2304
2305 return false;
2306}
2307
2310 MachineIRBuilder &B) const {
2311 MachineFunction &MF = B.getMF();
2312
2313 // MI can either be a G_ADDRSPACE_CAST or a
2314 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2315 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2316 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2317 Intrinsic::amdgcn_addrspacecast_nonnull));
2318
2319 const LLT S32 = LLT::scalar(32);
2320 Register Dst = MI.getOperand(0).getReg();
2321 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2322 : MI.getOperand(1).getReg();
2323 LLT DstTy = MRI.getType(Dst);
2324 LLT SrcTy = MRI.getType(Src);
2325 unsigned DestAS = DstTy.getAddressSpace();
2326 unsigned SrcAS = SrcTy.getAddressSpace();
2327
2328 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2329 // vector element.
2330 assert(!DstTy.isVector());
2331
2332 const AMDGPUTargetMachine &TM
2333 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2334
2335 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2336 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2337 return true;
2338 }
2339
2340 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2341 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2342 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2343 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2344 // G_ADDRSPACE_CAST we need to guess.
2345 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2346 // Extract low 32-bits of the pointer.
2347 B.buildExtract(Dst, Src, 0);
2348 MI.eraseFromParent();
2349 return true;
2350 }
2351
2352 unsigned NullVal = TM.getNullPointerValue(DestAS);
2353
2354 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2355 auto FlatNull = B.buildConstant(SrcTy, 0);
2356
2357 // Extract low 32-bits of the pointer.
2358 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2359
2360 auto CmpRes =
2361 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2362 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2363
2364 MI.eraseFromParent();
2365 return true;
2366 }
2367
2368 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2369 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2370 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2371 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2372 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2373 if (!ApertureReg.isValid())
2374 return false;
2375
2376 // Coerce the type of the low half of the result so we can use
2377 // merge_values.
2378 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2379
2380 // TODO: Should we allow mismatched types but matching sizes in merges to
2381 // avoid the ptrtoint?
2382 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2383 };
2384
2385 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2386 // G_ADDRSPACE_CAST we need to guess.
2387 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2388 castLocalOrPrivateToFlat(Dst);
2389 MI.eraseFromParent();
2390 return true;
2391 }
2392
2393 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2394
2395 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2396 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2397
2398 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2399 SegmentNull.getReg(0));
2400
2401 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2402
2403 MI.eraseFromParent();
2404 return true;
2405 }
2406
2407 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2408 SrcTy.getSizeInBits() == 64) {
2409 // Truncate.
2410 B.buildExtract(Dst, Src, 0);
2411 MI.eraseFromParent();
2412 return true;
2413 }
2414
2415 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2416 DstTy.getSizeInBits() == 64) {
2417 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2418 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2419 auto PtrLo = B.buildPtrToInt(S32, Src);
2420 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2421 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2422 MI.eraseFromParent();
2423 return true;
2424 }
2425
2426 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2427 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2428
2429 LLVMContext &Ctx = MF.getFunction().getContext();
2430 Ctx.diagnose(InvalidAddrSpaceCast);
2431 B.buildUndef(Dst);
2432 MI.eraseFromParent();
2433 return true;
2434}
2435
2438 MachineIRBuilder &B) const {
2439 Register Src = MI.getOperand(1).getReg();
2440 LLT Ty = MRI.getType(Src);
2441 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2442
2443 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2444 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
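 // Explanatory note, not part of the original source: adding a copysigned
 // 0x1.0p+52 pushes the fractional bits of a double out of the mantissa, so
 // the add/sub pair below rounds Src to an integer in the current
 // (round-to-nearest-even) mode; inputs with |x| > 0x1.fffffffffffffp+51 are
 // already integral and are passed through unchanged by the final select.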
2445
2446 auto C1 = B.buildFConstant(Ty, C1Val);
2447 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2448
2449 // TODO: Should this propagate fast-math-flags?
2450 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2451 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2452
2453 auto C2 = B.buildFConstant(Ty, C2Val);
2454 auto Fabs = B.buildFAbs(Ty, Src);
2455
2456 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2457 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2458 MI.eraseFromParent();
2459 return true;
2460}
2461
2464 MachineIRBuilder &B) const {
2465
2466 const LLT S1 = LLT::scalar(1);
2467 const LLT S64 = LLT::scalar(64);
2468
2469 Register Src = MI.getOperand(1).getReg();
2470 assert(MRI.getType(Src) == S64);
2471
2472 // result = trunc(src)
2473 // if (src > 0.0 && src != result)
2474 // result += 1.0
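 // Illustrative example, not part of the original source: for src = 2.25,
 // trunc = 2.0 and both conditions hold, so the result is 3.0; for
 // src = -2.25, trunc = -2.0 is already the ceiling and is returned as is.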
2475
2476 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2477
2478 const auto Zero = B.buildFConstant(S64, 0.0);
2479 const auto One = B.buildFConstant(S64, 1.0);
2480 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2481 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2482 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2483 auto Add = B.buildSelect(S64, And, One, Zero);
2484
2485 // TODO: Should this propagate fast-math-flags?
2486 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2487 MI.eraseFromParent();
2488 return true;
2489}
2490
2493 MachineIRBuilder &B) const {
2494 Register DstReg = MI.getOperand(0).getReg();
2495 Register Src0Reg = MI.getOperand(1).getReg();
2496 Register Src1Reg = MI.getOperand(2).getReg();
2497 auto Flags = MI.getFlags();
2498 LLT Ty = MRI.getType(DstReg);
2499
2500 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2501 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2502 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2503 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2504 MI.eraseFromParent();
2505 return true;
2506}
2507
2510 const unsigned FractBits = 52;
2511 const unsigned ExpBits = 11;
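 // Explanatory note, not part of the original source: in an IEEE-754 double
 // the exponent field occupies bits [62:52], i.e. bits [30:20] of the high
 // 32-bit word, so ubfe(Hi, 52 - 32, 11) extracts it and subtracting 1023
 // removes the exponent bias.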
2512 LLT S32 = LLT::scalar(32);
2513
2514 auto Const0 = B.buildConstant(S32, FractBits - 32);
2515 auto Const1 = B.buildConstant(S32, ExpBits);
2516
2517 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2518 .addUse(Hi)
2519 .addUse(Const0.getReg(0))
2520 .addUse(Const1.getReg(0));
2521
2522 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2523}
2524
2527 MachineIRBuilder &B) const {
2528 const LLT S1 = LLT::scalar(1);
2529 const LLT S32 = LLT::scalar(32);
2530 const LLT S64 = LLT::scalar(64);
2531
2532 Register Src = MI.getOperand(1).getReg();
2533 assert(MRI.getType(Src) == S64);
2534
2535 // TODO: Should this use extract since the low half is unused?
2536 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2537 Register Hi = Unmerge.getReg(1);
2538
2539 // Extract the upper half, since this is where we will find the sign and
2540 // exponent.
2541 auto Exp = extractF64Exponent(Hi, B);
2542
2543 const unsigned FractBits = 52;
2544
2545 // Extract the sign bit.
2546 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2547 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2548
2549 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2550
2551 const auto Zero32 = B.buildConstant(S32, 0);
2552
2553 // Extend back to 64-bits.
2554 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2555
2556 auto Shr = B.buildAShr(S64, FractMask, Exp);
2557 auto Not = B.buildNot(S64, Shr);
2558 auto Tmp0 = B.buildAnd(S64, Src, Not);
2559 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2560
2561 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2562 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2563
2564 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2565 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2566 MI.eraseFromParent();
2567 return true;
2568}
2569
2572 MachineIRBuilder &B, bool Signed) const {
2573
2574 Register Dst = MI.getOperand(0).getReg();
2575 Register Src = MI.getOperand(1).getReg();
2576
2577 const LLT S64 = LLT::scalar(64);
2578 const LLT S32 = LLT::scalar(32);
2579
2580 assert(MRI.getType(Src) == S64);
2581
2582 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2583 auto ThirtyTwo = B.buildConstant(S32, 32);
2584
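 // Explanatory sketch, not part of the original source: a 64-bit integer
 // {Lo, Hi} has the value Hi * 2^32 + Lo, so the 64-bit FP result below is
 // built as ldexp(itofp(Hi), 32) + uitofp(Lo); Lo is always converted
 // unsigned since it carries no sign information of its own.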
2585 if (MRI.getType(Dst) == S64) {
2586 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2587 : B.buildUITOFP(S64, Unmerge.getReg(1));
2588
2589 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2590 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2591
2592 // TODO: Should this propagate fast-math-flags?
2593 B.buildFAdd(Dst, LdExp, CvtLo);
2594 MI.eraseFromParent();
2595 return true;
2596 }
2597
2598 assert(MRI.getType(Dst) == S32);
2599
2600 auto One = B.buildConstant(S32, 1);
2601
2602 MachineInstrBuilder ShAmt;
2603 if (Signed) {
2604 auto ThirtyOne = B.buildConstant(S32, 31);
2605 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2606 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2607 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2608 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2609 .addUse(Unmerge.getReg(1));
2610 auto LS2 = B.buildSub(S32, LS, One);
2611 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2612 } else
2613 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2614 auto Norm = B.buildShl(S64, Src, ShAmt);
2615 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2616 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2617 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2618 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2619 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2620 B.buildFLdexp(Dst, FVal, Scale);
2621 MI.eraseFromParent();
2622 return true;
2623}
2624
2625// TODO: Copied from DAG implementation. Verify logic and document how this
2626// actually works.
2630 bool Signed) const {
2631
2632 Register Dst = MI.getOperand(0).getReg();
2633 Register Src = MI.getOperand(1).getReg();
2634
2635 const LLT S64 = LLT::scalar(64);
2636 const LLT S32 = LLT::scalar(32);
2637
2638 const LLT SrcLT = MRI.getType(Src);
2639 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2640
2641 unsigned Flags = MI.getFlags();
2642
2643 // The basic idea of converting a floating point number into a pair of 32-bit
2644 // integers is illustrated as follows:
2645 //
2646 // tf := trunc(val);
2647 // hif := floor(tf * 2^-32);
2648 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2649 // hi := fptoi(hif);
2650 // lo := fptoi(lof);
2651 //
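 // Illustrative example, not part of the original source: for
 // val = 2^33 + 2^31, tf = val, hif = floor(2.5) = 2 and lof = 2^31, so the
 // result pair is {hi = 2, lo = 0x80000000}, i.e. 2 * 2^32 + 2^31.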
2652 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2653 MachineInstrBuilder Sign;
2654 if (Signed && SrcLT == S32) {
2655 // However, a 32-bit floating point number has only 23 bits mantissa and
2656 // it's not enough to hold all the significant bits of `lof` if val is
2657 // negative. To avoid the loss of precision, we need to take the absolute
2658 // value after truncating and flip the result back based on the original
2659 // signedness.
2660 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2661 Trunc = B.buildFAbs(S32, Trunc, Flags);
2662 }
2663 MachineInstrBuilder K0, K1;
2664 if (SrcLT == S64) {
2665 K0 = B.buildFConstant(
2666 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2667 K1 = B.buildFConstant(
2668 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2669 } else {
2670 K0 = B.buildFConstant(
2671 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2672 K1 = B.buildFConstant(
2673 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2674 }
2675
2676 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2677 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2678 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2679
2680 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2681 : B.buildFPTOUI(S32, FloorMul);
2682 auto Lo = B.buildFPTOUI(S32, Fma);
2683
2684 if (Signed && SrcLT == S32) {
2685 // Flip the result based on the signedness, which is either all 0s or 1s.
2686 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2687 // r := xor({lo, hi}, sign) - sign;
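    // Explanatory note, not part of the original source: Sign is 0 for a
    // non-negative input (xor and sub become no-ops) and all ones (-1) for a
    // negative one, in which case xor produces the bitwise complement and
    // subtracting -1 adds 1, i.e. a two's-complement negation of the result.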
2688 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2689 Sign);
2690 } else
2691 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2692 MI.eraseFromParent();
2693
2694 return true;
2695}
2696
2698 MachineInstr &MI) const {
2699 MachineFunction &MF = Helper.MIRBuilder.getMF();
2700 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2701
2702 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2703 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2704
2705 // With ieee_mode disabled, the instructions have the correct behavior
2706 // already for G_FMINNUM/G_FMAXNUM
2707 if (!MFI->getMode().IEEE)
2708 return !IsIEEEOp;
2709
2710 if (IsIEEEOp)
2711 return true;
2712
2713 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2714}
2715
2718 MachineIRBuilder &B) const {
2719 // TODO: Should move some of this into LegalizerHelper.
2720
2721 // TODO: Promote dynamic indexing of s16 to s32
2722
2723 Register Dst = MI.getOperand(0).getReg();
2724 Register Vec = MI.getOperand(1).getReg();
2725
2726 LLT VecTy = MRI.getType(Vec);
2727 LLT EltTy = VecTy.getElementType();
2728 assert(EltTy == MRI.getType(Dst));
2729
2730 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2731 // but we can't go directly to that logic because you can't bitcast a vector
2732 // of pointers to a vector of integers. Therefore, introduce an intermediate
2733 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2734 // drive the legalization forward.
2735 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2736 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2737 LLT IntVecTy = VecTy.changeElementType(IntTy);
2738
2739 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2740 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2741 B.buildIntToPtr(Dst, IntElt);
2742
2743 MI.eraseFromParent();
2744 return true;
2745 }
2746
2747 // FIXME: Artifact combiner probably should have replaced the truncated
2748 // constant before this, so we shouldn't need
2749 // getIConstantVRegValWithLookThrough.
2750 std::optional<ValueAndVReg> MaybeIdxVal =
2751 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2752 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2753 return true;
2754 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2755
2756 if (IdxVal < VecTy.getNumElements()) {
2757 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2758 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2759 } else {
2760 B.buildUndef(Dst);
2761 }
2762
2763 MI.eraseFromParent();
2764 return true;
2765}
2766
2769 MachineIRBuilder &B) const {
2770 // TODO: Should move some of this into LegalizerHelper.
2771
2772 // TODO: Promote dynamic indexing of s16 to s32
2773
2774 Register Dst = MI.getOperand(0).getReg();
2775 Register Vec = MI.getOperand(1).getReg();
2776 Register Ins = MI.getOperand(2).getReg();
2777
2778 LLT VecTy = MRI.getType(Vec);
2779 LLT EltTy = VecTy.getElementType();
2780 assert(EltTy == MRI.getType(Ins));
2781
2782 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2783 // but we can't go directly to that logic because you can't bitcast a vector
2784 // of pointers to a vector of integers. Therefore, make the pointer vector
2785 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2786 // new value, and then inttoptr the result vector back. This will then allow
2787 // the rest of legalization to take over.
2788 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2789 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2790 LLT IntVecTy = VecTy.changeElementType(IntTy);
2791
2792 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2793 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2794 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2795 MI.getOperand(3));
2796 B.buildIntToPtr(Dst, IntVecDest);
2797 MI.eraseFromParent();
2798 return true;
2799 }
2800
2801 // FIXME: Artifact combiner probably should have replaced the truncated
2802 // constant before this, so we shouldn't need
2803 // getIConstantVRegValWithLookThrough.
2804 std::optional<ValueAndVReg> MaybeIdxVal =
2805 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2806 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2807 return true;
2808
2809 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2810
2811 unsigned NumElts = VecTy.getNumElements();
2812 if (IdxVal < NumElts) {
2813 SmallVector<Register, 8> SrcRegs;
2814 for (unsigned i = 0; i < NumElts; ++i)
2815 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2816 B.buildUnmerge(SrcRegs, Vec);
2817
2818 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2819 B.buildMergeLikeInstr(Dst, SrcRegs);
2820 } else {
2821 B.buildUndef(Dst);
2822 }
2823
2824 MI.eraseFromParent();
2825 return true;
2826}
2827
2830 MachineIRBuilder &B) const {
2831
2832 Register DstReg = MI.getOperand(0).getReg();
2833 Register SrcReg = MI.getOperand(1).getReg();
2834 LLT Ty = MRI.getType(DstReg);
2835 unsigned Flags = MI.getFlags();
2836
2837 Register TrigVal;
2838 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2839 if (ST.hasTrigReducedRange()) {
2840 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2841 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2842 .addUse(MulVal.getReg(0))
2843 .setMIFlags(Flags)
2844 .getReg(0);
2845 } else
2846 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2847
2848 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2849 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2850 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2851 .addUse(TrigVal)
2852 .setMIFlags(Flags);
2853 MI.eraseFromParent();
2854 return true;
2855}
2856
2859 const GlobalValue *GV,
2860 int64_t Offset,
2861 unsigned GAFlags) const {
2862 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2863 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2864 // to the following code sequence:
2865 //
2866 // For constant address space:
2867 // s_getpc_b64 s[0:1]
2868 // s_add_u32 s0, s0, $symbol
2869 // s_addc_u32 s1, s1, 0
2870 //
2871 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2872 // a fixup or relocation is emitted to replace $symbol with a literal
2873 // constant, which is a pc-relative offset from the encoding of the $symbol
2874 // operand to the global variable.
2875 //
2876 // For global address space:
2877 // s_getpc_b64 s[0:1]
2878 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2879 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2880 //
2881 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2882 // fixups or relocations are emitted to replace $symbol@*@lo and
2883 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2884 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2885 // operand to the global variable.
2886
2888
2889 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2890 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2891
2892 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2893 .addDef(PCReg);
2894
2895 MIB.addGlobalAddress(GV, Offset, GAFlags);
2896 if (GAFlags == SIInstrInfo::MO_NONE)
2897 MIB.addImm(0);
2898 else
2899 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2900
2901 if (!B.getMRI()->getRegClassOrNull(PCReg))
2902 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2903
2904 if (PtrTy.getSizeInBits() == 32)
2905 B.buildExtract(DstReg, PCReg, 0);
2906 return true;
2907}
2908
2909// Emit an ABS32_LO / ABS32_HI relocation stub.
2911 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2912 MachineRegisterInfo &MRI) const {
2913 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2914
2915 LLT S32 = LLT::scalar(32);
2916
2917 // Use the destination directly, if and only if we store the lower address
2918 // part only and we don't have a register class being set.
2919 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2920 ? DstReg
2921 : MRI.createGenericVirtualRegister(S32);
2922
2923 if (!MRI.getRegClassOrNull(AddrLo))
2924 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2925
2926 // Write the lower half.
2927 B.buildInstr(AMDGPU::S_MOV_B32)
2928 .addDef(AddrLo)
2929 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2930
2931 // If required, write the upper half as well.
2932 if (RequiresHighHalf) {
2933 assert(PtrTy.getSizeInBits() == 64 &&
2934 "Must provide a 64-bit pointer type!");
2935
2936 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2937 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2938
2939 B.buildInstr(AMDGPU::S_MOV_B32)
2940 .addDef(AddrHi)
2941 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2942
2943 // Use the destination directly, if and only if we don't have a register
2944 // class being set.
2945 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2946 ? DstReg
2947 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2948
2949 if (!MRI.getRegClassOrNull(AddrDst))
2950 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2951
2952 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2953
2954 // If we created a new register for the destination, cast the result into
2955 // the final output.
2956 if (AddrDst != DstReg)
2957 B.buildCast(DstReg, AddrDst);
2958 } else if (AddrLo != DstReg) {
2959 // If we created a new register for the destination, cast the result into
2960 // the final output.
2961 B.buildCast(DstReg, AddrLo);
2962 }
2963}
2964
2967 MachineIRBuilder &B) const {
2968 Register DstReg = MI.getOperand(0).getReg();
2969 LLT Ty = MRI.getType(DstReg);
2970 unsigned AS = Ty.getAddressSpace();
2971
2972 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2973 MachineFunction &MF = B.getMF();
2974 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2975
2976 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2977 if (!MFI->isModuleEntryFunction() &&
2978 GV->getName() != "llvm.amdgcn.module.lds") {
2979 const Function &Fn = MF.getFunction();
2980 DiagnosticInfoUnsupported BadLDSDecl(
2981 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2982 DS_Warning);
2983 Fn.getContext().diagnose(BadLDSDecl);
2984
2985 // We currently don't have a way to correctly allocate LDS objects that
2986 // aren't directly associated with a kernel. We do force inlining of
2987 // functions that use local objects. However, if these dead functions are
2988 // not eliminated, we don't want a compile time error. Just emit a warning
2989 // and a trap, since there should be no callable path here.
2990 B.buildTrap();
2991 B.buildUndef(DstReg);
2992 MI.eraseFromParent();
2993 return true;
2994 }
2995
2996 // TODO: We could emit code to handle the initialization somewhere.
2997 // We ignore the initializer for now and legalize it to allow selection.
2998 // The initializer will be diagnosed as an error during assembly emission anyway.
2999 const SITargetLowering *TLI = ST.getTargetLowering();
3000 if (!TLI->shouldUseLDSConstAddress(GV)) {
3001 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3002 return true; // Leave in place;
3003 }
3004
3005 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3006 Type *Ty = GV->getValueType();
3007 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3008 // zero-sized type in other languages to declare the dynamic shared
3009 // memory whose size is not known at compile time. Such arrays are
3010 // allocated by the runtime and placed directly after the statically
3011 // allocated ones. They all share the same offset.
3012 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3013 // Adjust alignment for that dynamic shared memory array.
3014 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3015 LLT S32 = LLT::scalar(32);
3016 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3017 B.buildIntToPtr(DstReg, Sz);
3018 MI.eraseFromParent();
3019 return true;
3020 }
3021 }
3022
3023 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3024 *cast<GlobalVariable>(GV)));
3025 MI.eraseFromParent();
3026 return true;
3027 }
3028
3029 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3030 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3031 MI.eraseFromParent();
3032 return true;
3033 }
3034
3035 const SITargetLowering *TLI = ST.getTargetLowering();
3036
3037 if (TLI->shouldEmitFixup(GV)) {
3038 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3039 MI.eraseFromParent();
3040 return true;
3041 }
3042
3043 if (TLI->shouldEmitPCReloc(GV)) {
3044 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3045 MI.eraseFromParent();
3046 return true;
3047 }
3048
3050 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3051
3052 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3057 LoadTy, Align(8));
3058
3059 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3060
3061 if (Ty.getSizeInBits() == 32) {
3062 // Truncate if this is a 32-bit constant address.
3063 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3064 B.buildExtract(DstReg, Load, 0);
3065 } else
3066 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3067
3068 MI.eraseFromParent();
3069 return true;
3070}
3071
3072static LLT widenToNextPowerOf2(LLT Ty) {
3073 if (Ty.isVector())
3074 return Ty.changeElementCount(
3075 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3076 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3077}
3078
3080 MachineInstr &MI) const {
3081 MachineIRBuilder &B = Helper.MIRBuilder;
3082 MachineRegisterInfo &MRI = *B.getMRI();
3083 GISelChangeObserver &Observer = Helper.Observer;
3084
3085 Register PtrReg = MI.getOperand(1).getReg();
3086 LLT PtrTy = MRI.getType(PtrReg);
3087 unsigned AddrSpace = PtrTy.getAddressSpace();
3088
3089 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3091 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3092 Observer.changingInstr(MI);
3093 MI.getOperand(1).setReg(Cast.getReg(0));
3094 Observer.changedInstr(MI);
3095 return true;
3096 }
3097
3098 if (MI.getOpcode() != AMDGPU::G_LOAD)
3099 return false;
3100
3101 Register ValReg = MI.getOperand(0).getReg();
3102 LLT ValTy = MRI.getType(ValReg);
3103
3104 if (hasBufferRsrcWorkaround(ValTy)) {
3105 Observer.changingInstr(MI);
3107 Observer.changedInstr(MI);
3108 return true;
3109 }
3110
3111 MachineMemOperand *MMO = *MI.memoperands_begin();
3112 const unsigned ValSize = ValTy.getSizeInBits();
3113 const LLT MemTy = MMO->getMemoryType();
3114 const Align MemAlign = MMO->getAlign();
3115 const unsigned MemSize = MemTy.getSizeInBits();
3116 const uint64_t AlignInBits = 8 * MemAlign.value();
3117
3118 // Widen non-power-of-2 loads to the alignment if needed
3119 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3120 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3121
3122 // This was already the correct extending load result type, so just adjust
3123 // the memory type.
3124 if (WideMemSize == ValSize) {
3125 MachineFunction &MF = B.getMF();
3126
3127 MachineMemOperand *WideMMO =
3128 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3129 Observer.changingInstr(MI);
3130 MI.setMemRefs(MF, {WideMMO});
3131 Observer.changedInstr(MI);
3132 return true;
3133 }
3134
3135 // Don't bother handling an edge case that should probably never be produced.
3136 if (ValSize > WideMemSize)
3137 return false;
3138
3139 LLT WideTy = widenToNextPowerOf2(ValTy);
3140
3141 Register WideLoad;
3142 if (!WideTy.isVector()) {
3143 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3144 B.buildTrunc(ValReg, WideLoad).getReg(0);
3145 } else {
3146 // Extract the subvector.
3147
3148 if (isRegisterType(ValTy)) {
3149 // If this a case where G_EXTRACT is legal, use it.
3150 // (e.g. <3 x s32> -> <4 x s32>)
3151 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3152 B.buildExtract(ValReg, WideLoad, 0);
3153 } else {
3154 // For cases where the widened type isn't a nice register value, unmerge
3155 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3156 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3157 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3158 }
3159 }
3160
3161 MI.eraseFromParent();
3162 return true;
3163 }
3164
3165 return false;
3166}
3167
3169 MachineInstr &MI) const {
3170 MachineIRBuilder &B = Helper.MIRBuilder;
3171 MachineRegisterInfo &MRI = *B.getMRI();
3172 GISelChangeObserver &Observer = Helper.Observer;
3173
3174 Register DataReg = MI.getOperand(0).getReg();
3175 LLT DataTy = MRI.getType(DataReg);
3176
3177 if (hasBufferRsrcWorkaround(DataTy)) {
3178 Observer.changingInstr(MI);
3180 Observer.changedInstr(MI);
3181 return true;
3182 }
3183 return false;
3184}
3185
3188 MachineIRBuilder &B) const {
3189 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3190 assert(Ty.isScalar());
3191
3192 MachineFunction &MF = B.getMF();
3194
3195 // TODO: Always legal with future ftz flag.
3196 // FIXME: Do we need just output?
3197 if (Ty == LLT::float32() &&
3199 return true;
3200 if (Ty == LLT::float16() &&
3202 return true;
3203
3204 MachineIRBuilder HelperBuilder(MI);
3205 GISelObserverWrapper DummyObserver;
3206 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3207 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3208}
3209
3212 Register DstReg = MI.getOperand(0).getReg();
3213 Register PtrReg = MI.getOperand(1).getReg();
3214 Register CmpVal = MI.getOperand(2).getReg();
3215 Register NewVal = MI.getOperand(3).getReg();
3216
3217 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3218 "this should not have been custom lowered");
3219
3220 LLT ValTy = MRI.getType(CmpVal);
3221 LLT VecTy = LLT::fixed_vector(2, ValTy);
3222
3223 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3224
3225 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3226 .addDef(DstReg)
3227 .addUse(PtrReg)
3228 .addUse(PackedVal)
3229 .setMemRefs(MI.memoperands());
3230
3231 MI.eraseFromParent();
3232 return true;
3233}
3234
3235/// Return true if it's known that \p Src can never be an f32 denormal value.
3237 Register Src) {
3238 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3239 switch (DefMI->getOpcode()) {
3240 case TargetOpcode::G_INTRINSIC: {
3241 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3242 case Intrinsic::amdgcn_frexp_mant:
3243 return true;
3244 default:
3245 break;
3246 }
3247
3248 break;
3249 }
3250 case TargetOpcode::G_FFREXP: {
3251 if (DefMI->getOperand(0).getReg() == Src)
3252 return true;
3253 break;
3254 }
3255 case TargetOpcode::G_FPEXT: {
3256 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3257 }
3258 default:
3259 return false;
3260 }
3261
3262 return false;
3263}
3264
3265static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3266 if (Flags & MachineInstr::FmAfn)
3267 return true;
3268 const auto &Options = MF.getTarget().Options;
3269 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3270}
3271
3273 unsigned Flags) {
3274 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3277}
3278
3279std::pair<Register, Register>
3281 unsigned Flags) const {
3282 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3283 return {};
3284
3285 const LLT F32 = LLT::scalar(32);
3286 auto SmallestNormal = B.buildFConstant(
3288 auto IsLtSmallestNormal =
3289 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3290
3291 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3292 auto One = B.buildFConstant(F32, 1.0);
3293 auto ScaleFactor =
3294 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3295 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3296
3297 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3298}
3299
3301 MachineIRBuilder &B) const {
3302 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3303 // If we have to handle denormals, scale up the input and adjust the result.
3304
3305 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3306 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
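 // Explanatory identity, not part of the original source:
 // log2(x * 2^32) == log2(x) + 32, so subtracting 32 from the result of the
 // scaled input recovers log2(x) for inputs that would otherwise be flushed.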
3307
3308 Register Dst = MI.getOperand(0).getReg();
3309 Register Src = MI.getOperand(1).getReg();
3310 LLT Ty = B.getMRI()->getType(Dst);
3311 unsigned Flags = MI.getFlags();
3312
3313 if (Ty == LLT::scalar(16)) {
3314 const LLT F32 = LLT::scalar(32);
3315 // Nothing in half is a denormal when promoted to f32.
3316 auto Ext = B.buildFPExt(F32, Src, Flags);
3317 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3318 .addUse(Ext.getReg(0))
3319 .setMIFlags(Flags);
3320 B.buildFPTrunc(Dst, Log2, Flags);
3321 MI.eraseFromParent();
3322 return true;
3323 }
3324
3325 assert(Ty == LLT::scalar(32));
3326
3327 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3328 if (!ScaledInput) {
3329 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3330 .addUse(Src)
3331 .setMIFlags(Flags);
3332 MI.eraseFromParent();
3333 return true;
3334 }
3335
3336 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3337 .addUse(ScaledInput)
3338 .setMIFlags(Flags);
3339
3340 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3341 auto Zero = B.buildFConstant(Ty, 0.0);
3342 auto ResultOffset =
3343 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3344 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3345
3346 MI.eraseFromParent();
3347 return true;
3348}
3349
3351 Register Z, unsigned Flags) {
3352 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3353 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3354}
3355
3357 MachineIRBuilder &B) const {
3358 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3359 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3360
3361 MachineRegisterInfo &MRI = *B.getMRI();
3362 Register Dst = MI.getOperand(0).getReg();
3363 Register X = MI.getOperand(1).getReg();
3364 unsigned Flags = MI.getFlags();
3365 const LLT Ty = MRI.getType(X);
3366 MachineFunction &MF = B.getMF();
3367
3368 const LLT F32 = LLT::scalar(32);
3369 const LLT F16 = LLT::scalar(16);
3370
3371 const AMDGPUTargetMachine &TM =
3372 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3373
3374 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3375 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3376 if (Ty == F16 && !ST.has16BitInsts()) {
3377 Register LogVal = MRI.createGenericVirtualRegister(F32);
3378 auto PromoteSrc = B.buildFPExt(F32, X);
3379 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3380 B.buildFPTrunc(Dst, LogVal);
3381 } else {
3382 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3383 }
3384
3385 MI.eraseFromParent();
3386 return true;
3387 }
3388
3389 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3390 if (ScaledInput)
3391 X = ScaledInput;
3392
3393 auto Y =
3394 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3395
3396 Register R;
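 // Explanatory note, not part of the original source: both branches compute
 // R ~= Y * K with Y = log2(x) and K = ln(2) for G_FLOG or log10(2) for
 // G_FLOG10; K (and, on the non-FMA path, Y as well) is split into a head
 // and a tail so the product keeps more precision than one f32 multiply.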
3397 if (ST.hasFastFMAF32()) {
3398 // c+cc are ln(2)/ln(10) to more than 49 bits
3399 const float c_log10 = 0x1.344134p-2f;
3400 const float cc_log10 = 0x1.09f79ep-26f;
3401
3402 // c + cc is ln(2) to more than 49 bits
3403 const float c_log = 0x1.62e42ep-1f;
3404 const float cc_log = 0x1.efa39ep-25f;
3405
3406 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3407 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3408
3409 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3410 auto NegR = B.buildFNeg(Ty, R, Flags);
3411 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3412 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3413 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3414 } else {
3415 // ch+ct is ln(2)/ln(10) to more than 36 bits
3416 const float ch_log10 = 0x1.344000p-2f;
3417 const float ct_log10 = 0x1.3509f6p-18f;
3418
3419 // ch + ct is ln(2) to more than 36 bits
3420 const float ch_log = 0x1.62e000p-1f;
3421 const float ct_log = 0x1.0bfbe8p-15f;
3422
3423 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3424 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3425
3426 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3427 auto YH = B.buildAnd(Ty, Y, MaskConst);
3428 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3429 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3430
3431 Register Mad0 =
3432 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3433 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3434 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3435 }
3436
3437 const bool IsFiniteOnly =
3438 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3439 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3440
3441 if (!IsFiniteOnly) {
3442 // Expand isfinite(x) => fabs(x) < inf
3443 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3444 auto Fabs = B.buildFAbs(Ty, Y);
3445 auto IsFinite =
3446 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3447 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3448 }
3449
3450 if (ScaledInput) {
3451 auto Zero = B.buildFConstant(Ty, 0.0);
3452 auto ShiftK =
3453 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3454 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3455 B.buildFSub(Dst, R, Shift, Flags);
3456 } else {
3457 B.buildCopy(Dst, R);
3458 }
3459
3460 MI.eraseFromParent();
3461 return true;
3462}
3463
3465 Register Src, bool IsLog10,
3466 unsigned Flags) const {
3467 const double Log2BaseInverted =
3469
3470 LLT Ty = B.getMRI()->getType(Dst);
3471
3472 if (Ty == LLT::scalar(32)) {
3473 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3474 if (ScaledInput) {
3475 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3476 .addUse(Src)
3477 .setMIFlags(Flags);
3478 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3479 auto Zero = B.buildFConstant(Ty, 0.0);
3480 auto ResultOffset =
3481 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3482 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3483
3484 if (ST.hasFastFMAF32())
3485 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3486 else {
3487 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3488 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3489 }
3490
3491 return true;
3492 }
3493 }
3494
3495 auto Log2Operand = Ty == LLT::scalar(16)
3496 ? B.buildFLog2(Ty, Src, Flags)
3497 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3498 .addUse(Src)
3499 .setMIFlags(Flags);
3500 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3501 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3502 return true;
3503}
3504
3505bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3506 MachineIRBuilder &B) const {
3507 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3508 // If we have to handle denormals, scale up the input and adjust the result.
3509
3510 Register Dst = MI.getOperand(0).getReg();
3511 Register Src = MI.getOperand(1).getReg();
3512 unsigned Flags = MI.getFlags();
3513 LLT Ty = B.getMRI()->getType(Dst);
3514 const LLT F16 = LLT::scalar(16);
3515 const LLT F32 = LLT::scalar(32);
3516
3517 if (Ty == F16) {
3518 // Nothing in half is a denormal when promoted to f32.
3519 auto Ext = B.buildFPExt(F32, Src, Flags);
3520 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3521 .addUse(Ext.getReg(0))
3522 .setMIFlags(Flags);
3523 B.buildFPTrunc(Dst, Log2, Flags);
3524 MI.eraseFromParent();
3525 return true;
3526 }
3527
3528 assert(Ty == F32);
3529
3530 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3531 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3532 .addUse(Src)
3533 .setMIFlags(Flags);
3534 MI.eraseFromParent();
3535 return true;
3536 }
3537
3538 // bool needs_scaling = x < -0x1.f80000p+6f;
3539 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
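// In other words: if x is small enough that exp2(x) would be denormal
// (roughly x < -126), bias the input up by 64 and undo it afterwards by
// multiplying the result by 2^-64, keeping the intermediate value normal.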
3540
3541 // -nextafter(128.0, -1)
3542 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3543 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3544 RangeCheckConst, Flags);
3545
3546 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3547 auto Zero = B.buildFConstant(Ty, 0.0);
3548 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3549 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3550
3551 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3552 .addUse(AddInput.getReg(0))
3553 .setMIFlags(Flags);
3554
3555 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3556 auto One = B.buildFConstant(Ty, 1.0);
3557 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3558 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3559 MI.eraseFromParent();
3560 return true;
3561}
3562
3563bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3564 Register X, unsigned Flags) const {
3565 LLT Ty = B.getMRI()->getType(Dst);
3566 LLT F32 = LLT::scalar(32);
3567
3568 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3569 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3570 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3571
3572 if (Ty == F32) {
3573 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3574 .addUse(Mul.getReg(0))
3575 .setMIFlags(Flags);
3576 } else {
3577 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3578 }
3579
3580 return true;
3581 }
3582
3583 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3584 auto NeedsScaling =
3585 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3586 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3587 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3588 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3589
3590 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3591 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3592
3593 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3594 .addUse(ExpInput.getReg(0))
3595 .setMIFlags(Flags);
3596
3597 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3598 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3599 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3600 return true;
3601}
3602
3603bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3604 MachineIRBuilder &B) const {
3605 Register Dst = MI.getOperand(0).getReg();
3606 Register X = MI.getOperand(1).getReg();
3607 const unsigned Flags = MI.getFlags();
3608 MachineFunction &MF = B.getMF();
3609 MachineRegisterInfo &MRI = *B.getMRI();
3610 LLT Ty = MRI.getType(Dst);
3611 const LLT F16 = LLT::scalar(16);
3612 const LLT F32 = LLT::scalar(32);
3613 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3614
3615 if (Ty == F16) {
3616 // v_exp_f16 (fmul x, log2e)
3617 if (allowApproxFunc(MF, Flags)) {
3618 // TODO: Does this really require fast?
3619 legalizeFExpUnsafe(B, Dst, X, Flags);
3620 MI.eraseFromParent();
3621 return true;
3622 }
3623
3624 // exp(f16 x) ->
3625 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3626
3627 // Nothing in half is a denormal when promoted to f32.
3628 auto Ext = B.buildFPExt(F32, X, Flags);
3629 Register Lowered = MRI.createGenericVirtualRegister(F32);
3630 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3631 B.buildFPTrunc(Dst, Lowered, Flags);
3632 MI.eraseFromParent();
3633 return true;
3634 }
3635
3636 assert(Ty == F32);
3637
3638 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3639 // library behavior. Also, is known-not-daz source sufficient?
3640 if (allowApproxFunc(MF, Flags)) {
3641 legalizeFExpUnsafe(B, Dst, X, Flags);
3642 MI.eraseFromParent();
3643 return true;
3644 }
3645
3646 // Algorithm:
3647 //
3648 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3649 //
3650 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3651 // n = 64*m + j, 0 <= j < 64
3652 //
3653 // e^x = 2^((64*m + j + f)/64)
3654 // = (2^m) * (2^(j/64)) * 2^(f/64)
3655 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3656 //
3657 // f = x*(64/ln(2)) - n
3658 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3659 //
3660 // e^x = (2^m) * (2^(j/64)) * e^r
3661 //
3662 // (2^(j/64)) is precomputed
3663 //
3664 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3665 // e^r = 1 + q
3666 //
3667 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3668 //
3669 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
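// The sequence below implements this reduction with the hardware exp2 rather
// than a 2^(j/64) table: PH + PL approximates x*log2(e) (or x*log2(10) for
// exp10), E = roundeven(PH) takes the role of n, and the final ldexp applies
// the 2^E scaling.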
3670 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3671 Register PH, PL;
3672
3673 if (ST.hasFastFMAF32()) {
3674 const float c_exp = numbers::log2ef;
3675 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3676 const float c_exp10 = 0x1.a934f0p+1f;
3677 const float cc_exp10 = 0x1.2f346ep-24f;
3678
3679 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3680 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3681 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3682 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3683
3684 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3685 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3686 } else {
3687 const float ch_exp = 0x1.714000p+0f;
3688 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3689
3690 const float ch_exp10 = 0x1.a92000p+1f;
3691 const float cl_exp10 = 0x1.4f0978p-11f;
3692
3693 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3694 auto XH = B.buildAnd(Ty, X, MaskConst);
3695 auto XL = B.buildFSub(Ty, X, XH, Flags);
3696
3697 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3698 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3699
3700 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3701 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3702
3703 Register Mad0 =
3704 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3705 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3706 }
3707
3708 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3709
3710 // It is unsafe to contract this fsub into the PH multiply.
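// (Contracting PH - E back into the multiply that produced PH would change
// PH's rounding and invalidate the PL compensation term.)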
3711 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3712 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3713 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3714
3715 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3716 .addUse(A.getReg(0))
3717 .setMIFlags(Flags);
3718 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3719
3720 auto UnderflowCheckConst =
3721 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3722 auto Zero = B.buildFConstant(Ty, 0.0);
3723 auto Underflow =
3724 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3725
3726 R = B.buildSelect(Ty, Underflow, Zero, R);
3727
3728 const auto &Options = MF.getTarget().Options;
3729
3730 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3731 auto OverflowCheckConst =
3732 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3733
3734 auto Overflow =
3735 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3736 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3737 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3738 }
3739
3740 B.buildCopy(Dst, R);
3741 MI.eraseFromParent();
3742 return true;
3743}
3744
3745bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3746 MachineIRBuilder &B) const {
3747 Register Dst = MI.getOperand(0).getReg();
3748 Register Src0 = MI.getOperand(1).getReg();
3749 Register Src1 = MI.getOperand(2).getReg();
3750 unsigned Flags = MI.getFlags();
3751 LLT Ty = B.getMRI()->getType(Dst);
3752 const LLT F16 = LLT::float16();
3753 const LLT F32 = LLT::float32();
3754
3755 if (Ty == F32) {
3756 auto Log = B.buildFLog2(F32, Src0, Flags);
3757 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3758 .addUse(Log.getReg(0))
3759 .addUse(Src1)
3760 .setMIFlags(Flags);
3761 B.buildFExp2(Dst, Mul, Flags);
3762 } else if (Ty == F16) {
3763 // There's no f16 fmul_legacy, so we need to convert for it.
3764 auto Log = B.buildFLog2(F16, Src0, Flags);
3765 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3766 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3767 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3768 .addUse(Ext0.getReg(0))
3769 .addUse(Ext1.getReg(0))
3770 .setMIFlags(Flags);
3771 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3772 } else
3773 return false;
3774
3775 MI.eraseFromParent();
3776 return true;
3777}
3778
3779// Find a source register, ignoring any possible source modifiers.
3780static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3781 Register ModSrc = OrigSrc;
3782 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3783 ModSrc = SrcFNeg->getOperand(1).getReg();
3784 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3785 ModSrc = SrcFAbs->getOperand(1).getReg();
3786 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3787 ModSrc = SrcFAbs->getOperand(1).getReg();
3788 return ModSrc;
3789}
3790
3791bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3792 MachineRegisterInfo &MRI,
3793 MachineIRBuilder &B) const {
3794
3795 const LLT S1 = LLT::scalar(1);
3796 const LLT F64 = LLT::float64();
3797 Register Dst = MI.getOperand(0).getReg();
3798 Register OrigSrc = MI.getOperand(1).getReg();
3799 unsigned Flags = MI.getFlags();
3800 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3801 "this should not have been custom lowered");
3802
3803 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3804 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3805 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3806 // V_FRACT bug is:
3807 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3808 //
3809 // Convert floor(x) to (x - fract(x))
3810
3811 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3812 .addUse(OrigSrc)
3813 .setMIFlags(Flags);
3814
3815 // Give source modifier matching some assistance before obscuring a foldable
3816 // pattern.
3817
3818 // TODO: We can avoid the neg on the fract? The input sign to fract
3819 // shouldn't matter?
3820 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3821
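// 0x3fefffffffffffff is the largest f64 strictly less than 1.0, i.e. the
// 0.99999999999999999 clamp value from the workaround formula above.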
3822 auto Const =
3823 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3824
3825 Register Min = MRI.createGenericVirtualRegister(F64);
3826
3827 // We don't need to concern ourselves with the snan handling difference, so
3828 // use the one which will directly select.
3829 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3830 if (MFI->getMode().IEEE)
3831 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3832 else
3833 B.buildFMinNum(Min, Fract, Const, Flags);
3834
3835 Register CorrectedFract = Min;
3836 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3837 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3838 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3839 }
3840
3841 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3842 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3843
3844 MI.eraseFromParent();
3845 return true;
3846}
3847
3848// Turn an illegal packed v2s16 build vector into bit operations.
3849// TODO: This should probably be a bitcast action in LegalizerHelper.
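// The lowering below packs the two 16-bit elements into a single 32-bit value
// (element 0 in the low half, element 1 in the high half) and then bitcasts
// that scalar back to <2 x s16>.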
3850bool AMDGPULegalizerInfo::legalizeBuildVector(
3851 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3852 Register Dst = MI.getOperand(0).getReg();
3853 const LLT S32 = LLT::scalar(32);
3854 const LLT S16 = LLT::scalar(16);
3855 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3856
3857 Register Src0 = MI.getOperand(1).getReg();
3858 Register Src1 = MI.getOperand(2).getReg();
3859
3860 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3861 assert(MRI.getType(Src0) == S32);
3862 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3863 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3864 }
3865
3866 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3867 B.buildBitcast(Dst, Merge);
3868
3869 MI.eraseFromParent();
3870 return true;
3871}
3872
3873// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3874//
3875// Source and accumulation registers must all be 32-bits.
3876//
3877// TODO: When the multiply is uniform, we should produce a code sequence
3878// that is better suited to instruction selection on the SALU. Instead of
3879// the outer loop going over parts of the result, the outer loop should go
3880// over parts of one of the factors. This should result in instruction
3881// selection that makes full use of S_ADDC_U32 instructions.
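// In effect this is schoolbook multiplication in 32-bit limbs: destination
// word k accumulates the products Src0[j]*Src1[k-j] plus incoming carries,
// and MAD_64_32 supplies each 32x32->64 product added onto a 64-bit
// accumulator.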
3882void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3883 MutableArrayRef<Register> Accum,
3884 ArrayRef<Register> Src0,
3885 ArrayRef<Register> Src1,
3886 bool UsePartialMad64_32,
3887 bool SeparateOddAlignedProducts) const {
3888 // Use (possibly empty) vectors of S1 registers to represent the set of
3889 // carries from one pair of positions to the next.
3890 using Carry = SmallVector<Register, 2>;
3891
3892 MachineIRBuilder &B = Helper.MIRBuilder;
3893 GISelKnownBits &KB = *Helper.getKnownBits();
3894
3895 const LLT S1 = LLT::scalar(1);
3896 const LLT S32 = LLT::scalar(32);
3897 const LLT S64 = LLT::scalar(64);
3898
3899 Register Zero32;
3900 Register Zero64;
3901
3902 auto getZero32 = [&]() -> Register {
3903 if (!Zero32)
3904 Zero32 = B.buildConstant(S32, 0).getReg(0);
3905 return Zero32;
3906 };
3907 auto getZero64 = [&]() -> Register {
3908 if (!Zero64)
3909 Zero64 = B.buildConstant(S64, 0).getReg(0);
3910 return Zero64;
3911 };
3912
3913 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3914 for (unsigned i = 0; i < Src0.size(); ++i) {
3915 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3916 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3917 }
3918
3919 // Merge the given carries into the 32-bit LocalAccum, which is modified
3920 // in-place.
3921 //
3922 // Returns the carry-out, which is a single S1 register or null.
3923 auto mergeCarry =
3924 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3925 if (CarryIn.empty())
3926 return Register();
3927
3928 bool HaveCarryOut = true;
3929 Register CarryAccum;
3930 if (CarryIn.size() == 1) {
3931 if (!LocalAccum) {
3932 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3933 return Register();
3934 }
3935
3936 CarryAccum = getZero32();
3937 } else {
3938 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3939 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3940 CarryAccum =
3941 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3942 .getReg(0);
3943 }
3944
3945 if (!LocalAccum) {
3946 LocalAccum = getZero32();
3947 HaveCarryOut = false;
3948 }
3949 }
3950
3951 auto Add =
3952 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3953 LocalAccum = Add.getReg(0);
3954 return HaveCarryOut ? Add.getReg(1) : Register();
3955 };
3956
3957 // Build a multiply-add chain to compute
3958 //
3959 // LocalAccum + (partial products at DstIndex)
3960 // + (opportunistic subset of CarryIn)
3961 //
3962 // LocalAccum is an array of one or two 32-bit registers that are updated
3963 // in-place. The incoming registers may be null.
3964 //
3965 // In some edge cases, carry-ins can be consumed "for free". In that case,
3966 // the consumed carry bits are removed from CarryIn in-place.
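// For a given DstIndex this walks the diagonal Src0[j0]*Src1[DstIndex-j0];
// in the 64-bit path each MAD_U64_U32 adds a full 32x32 product onto the
// running 64-bit accumulator, and the high half naturally carries into the
// next column.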
3967 auto buildMadChain =
3968 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3969 -> Carry {
3970 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3971 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3972
3973 Carry CarryOut;
3974 unsigned j0 = 0;
3975
3976 // Use plain 32-bit multiplication for the most significant part of the
3977 // result by default.
3978 if (LocalAccum.size() == 1 &&
3979 (!UsePartialMad64_32 || !CarryIn.empty())) {
3980 do {
3981 // Skip multiplication if one of the operands is 0
3982 unsigned j1 = DstIndex - j0;
3983 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3984 ++j0;
3985 continue;
3986 }
3987 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3988 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3989 LocalAccum[0] = Mul.getReg(0);
3990 } else {
3991 if (CarryIn.empty()) {
3992 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3993 } else {
3994 LocalAccum[0] =
3995 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3996 .getReg(0);
3997 CarryIn.pop_back();
3998 }
3999 }
4000 ++j0;
4001 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4002 }
4003
4004 // Build full 64-bit multiplies.
4005 if (j0 <= DstIndex) {
4006 bool HaveSmallAccum = false;
4007 Register Tmp;
4008
4009 if (LocalAccum[0]) {
4010 if (LocalAccum.size() == 1) {
4011 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4012 HaveSmallAccum = true;
4013 } else if (LocalAccum[1]) {
4014 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4015 HaveSmallAccum = false;
4016 } else {
4017 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4018 HaveSmallAccum = true;
4019 }
4020 } else {
4021 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4022 Tmp = getZero64();
4023 HaveSmallAccum = true;
4024 }
4025
4026 do {
4027 unsigned j1 = DstIndex - j0;
4028 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4029 ++j0;
4030 continue;
4031 }
4032 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4033 {Src0[j0], Src1[j1], Tmp});
4034 Tmp = Mad.getReg(0);
4035 if (!HaveSmallAccum)
4036 CarryOut.push_back(Mad.getReg(1));
4037 HaveSmallAccum = false;
4038
4039 ++j0;
4040 } while (j0 <= DstIndex);
4041
4042 auto Unmerge = B.buildUnmerge(S32, Tmp);
4043 LocalAccum[0] = Unmerge.getReg(0);
4044 if (LocalAccum.size() > 1)
4045 LocalAccum[1] = Unmerge.getReg(1);
4046 }
4047
4048 return CarryOut;
4049 };
4050
4051 // Outer multiply loop, iterating over destination parts from least
4052 // significant to most significant parts.
4053 //
4054 // The columns of the following diagram correspond to the destination parts
4055 // affected by one iteration of the outer loop (ignoring boundary
4056 // conditions).
4057 //
4058 // Dest index relative to 2 * i: 1 0 -1
4059 // ------
4060 // Carries from previous iteration: e o
4061 // Even-aligned partial product sum: E E .
4062 // Odd-aligned partial product sum: O O
4063 //
4064 // 'o' is OddCarry, 'e' is EvenCarry.
4065 // EE and OO are computed from partial products via buildMadChain and use
4066 // accumulation where possible and appropriate.
4067 //
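// Each iteration i therefore produces the even-aligned products feeding
// words 2*i and 2*i+1 and the odd-aligned products feeding words 2*i-1 and
// 2*i, with the two carry sets handed across iterations.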
4068 Register SeparateOddCarry;
4069 Carry EvenCarry;
4070 Carry OddCarry;
4071
4072 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4073 Carry OddCarryIn = std::move(OddCarry);
4074 Carry EvenCarryIn = std::move(EvenCarry);
4075 OddCarry.clear();
4076 EvenCarry.clear();
4077
4078 // Partial products at offset 2 * i.
4079 if (2 * i < Accum.size()) {
4080 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4081 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4082 }
4083
4084 // Partial products at offset 2 * i - 1.
4085 if (i > 0) {
4086 if (!SeparateOddAlignedProducts) {
4087 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4088 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4089 } else {
4090 bool IsHighest = 2 * i >= Accum.size();
4091 Register SeparateOddOut[2];
4092 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4093 .take_front(IsHighest ? 1 : 2);
4094 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4095
4096 MachineInstr *Lo;
4097
4098 if (i == 1) {
4099 if (!IsHighest)
4100 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4101 else
4102 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4103 } else {
4104 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4105 SeparateOddCarry);
4106 }
4107 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4108
4109 if (!IsHighest) {
4110 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4111 Lo->getOperand(1).getReg());
4112 Accum[2 * i] = Hi.getReg(0);
4113 SeparateOddCarry = Hi.getReg(1);
4114 }
4115 }
4116 }
4117
4118 // Add in the carries from the previous iteration
4119 if (i > 0) {
4120 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4121 EvenCarryIn.push_back(CarryOut);
4122
4123 if (2 * i < Accum.size()) {
4124 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4125 OddCarry.push_back(CarryOut);
4126 }
4127 }
4128 }
4129}
4130
4131// Custom narrowing of wide multiplies using wide multiply-add instructions.
4132//
4133// TODO: If the multiply is followed by an addition, we should attempt to
4134// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4135bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4136 MachineInstr &MI) const {
4137 assert(ST.hasMad64_32());
4138 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4139
4140 MachineIRBuilder &B = Helper.MIRBuilder;
4141 MachineRegisterInfo &MRI = *B.getMRI();
4142
4143 Register DstReg = MI.getOperand(0).getReg();
4144 Register Src0 = MI.getOperand(1).getReg();
4145 Register Src1 = MI.getOperand(2).getReg();
4146
4147 LLT Ty = MRI.getType(DstReg);
4148 assert(Ty.isScalar());
4149
4150 unsigned Size = Ty.getSizeInBits();
4151 unsigned NumParts = Size / 32;
4152 assert((Size % 32) == 0);
4153 assert(NumParts >= 2);
4154
4155 // Whether to use MAD_64_32 for partial products whose high half is
4156 // discarded. This avoids some ADD instructions but risks false dependency
4157 // stalls on some subtargets in some cases.
4158 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4159
4160 // Whether to compute odd-aligned partial products separately. This is
4161 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4162 // in an even-aligned VGPR.
4163 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4164
4165 LLT S32 = LLT::scalar(32);
4166 SmallVector<Register, 2> Src0Parts, Src1Parts;
4167 for (unsigned i = 0; i < NumParts; ++i) {
4168 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4169 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4170 }
4171 B.buildUnmerge(Src0Parts, Src0);
4172 B.buildUnmerge(Src1Parts, Src1);
4173
4174 SmallVector<Register, 2> AccumRegs(NumParts);
4175 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4176 SeparateOddAlignedProducts);
4177
4178 B.buildMergeLikeInstr(DstReg, AccumRegs);
4179 MI.eraseFromParent();
4180 return true;
4181}
4182
4183// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4184// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4185// case with a single min instruction instead of a compare+select.
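// e.g. for a 32-bit ctlz, FFBH_U32 returns 0xffffffff for a zero input, and
// the unsigned min with the bit width (32) folds that to the defined
// ctlz(0) result.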
4186bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4187 MachineRegisterInfo &MRI,
4188 MachineIRBuilder &B) const {
4189 Register Dst = MI.getOperand(0).getReg();
4190 Register Src = MI.getOperand(1).getReg();
4191 LLT DstTy = MRI.getType(Dst);
4192 LLT SrcTy = MRI.getType(Src);
4193
4194 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4195 ? AMDGPU::G_AMDGPU_FFBH_U32
4196 : AMDGPU::G_AMDGPU_FFBL_B32;
4197 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4198 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4199
4200 MI.eraseFromParent();
4201 return true;
4202}
4203
4204bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4205 MachineRegisterInfo &MRI,
4206 MachineIRBuilder &B) const {
4207 Register Dst = MI.getOperand(0).getReg();
4208 Register Src = MI.getOperand(1).getReg();
4209 LLT SrcTy = MRI.getType(Src);
4210 TypeSize NumBits = SrcTy.getSizeInBits();
4211
4212 assert(NumBits < 32u);
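// Any-extend the narrow source to 32 bits and shift it into the top bits; a
// 32-bit FFBH on that value then yields the leading-zero count of the
// original narrow type directly (the zero-input case remains undefined).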
4213
4214 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4215 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4216 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4217 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4218 B.buildTrunc(Dst, Ctlz);
4219 MI.eraseFromParent();
4220 return true;
4221}
4222
4223// Check that this is a G_XOR x, -1
4224static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4225 if (MI.getOpcode() != TargetOpcode::G_XOR)
4226 return false;
4227 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4228 return ConstVal && *ConstVal == -1;
4229}
4230
4231// Return the use branch instruction, otherwise null if the usage is invalid.
4232static MachineInstr *
4233verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4234 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4235 Register CondDef = MI.getOperand(0).getReg();
4236 if (!MRI.hasOneNonDBGUse(CondDef))
4237 return nullptr;
4238
4239 MachineBasicBlock *Parent = MI.getParent();
4240 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4241
4242 if (isNot(MRI, *UseMI)) {
4243 Register NegatedCond = UseMI->getOperand(0).getReg();
4244 if (!MRI.hasOneNonDBGUse(NegatedCond))
4245 return nullptr;
4246
4247 // We're deleting the def of this value, so we need to remove it.
4248 eraseInstr(*UseMI, MRI);
4249
4250 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4251 Negated = true;
4252 }
4253
4254 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4255 return nullptr;
4256
4257 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4258 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4259 if (Next == Parent->end()) {
4260 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4261 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4262 return nullptr;
4263 UncondBrTarget = &*NextMBB;
4264 } else {
4265 if (Next->getOpcode() != AMDGPU::G_BR)
4266 return nullptr;
4267 Br = &*Next;
4268 UncondBrTarget = Br->getOperand(0).getMBB();
4269 }
4270
4271 return UseMI;
4272}
4273
4274bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4275 const ArgDescriptor *Arg,
4276 const TargetRegisterClass *ArgRC,
4277 LLT ArgTy) const {
4278 MCRegister SrcReg = Arg->getRegister();
4279 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4280 assert(DstReg.isVirtual() && "Virtual register expected");
4281
4282 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4283 *ArgRC, B.getDebugLoc(), ArgTy);
4284 if (Arg->isMasked()) {
4285 // TODO: Should we try to emit this once in the entry block?
4286 const LLT S32 = LLT::scalar(32);
4287 const unsigned Mask = Arg->getMask();
4288 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4289
4290 Register AndMaskSrc = LiveIn;
4291
4292 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4293 // 0.
4294 if (Shift != 0) {
4295 auto ShiftAmt = B.buildConstant(S32, Shift);
4296 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4297 }
4298
4299 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4300 } else {
4301 B.buildCopy(DstReg, LiveIn);
4302 }
4303
4304 return true;
4305}
4306
4307bool AMDGPULegalizerInfo::loadInputValue(
4308 Register DstReg, MachineIRBuilder &B,
4309 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4310 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4311 const ArgDescriptor *Arg = nullptr;
4312 const TargetRegisterClass *ArgRC;
4313 LLT ArgTy;
4314
4315 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4316 const ArgDescriptor WorkGroupIDX =
4317 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4318 // If GridZ is not programmed in an entry function then the hardware will set
4319 // it to all zeros, so there is no need to mask the GridY value in the low
4320 // order bits.
4321 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4322 AMDGPU::TTMP7,
4323 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4324 const ArgDescriptor WorkGroupIDZ =
4325 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4326 if (ST.hasArchitectedSGPRs() &&
4327 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4328 switch (ArgType) {
4329 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4330 Arg = &WorkGroupIDX;
4331 ArgRC = &AMDGPU::SReg_32RegClass;
4332 ArgTy = LLT::scalar(32);
4333 break;
4334 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4335 Arg = &WorkGroupIDY;
4336 ArgRC = &AMDGPU::SReg_32RegClass;
4337 ArgTy = LLT::scalar(32);
4338 break;
4339 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4340 Arg = &WorkGroupIDZ;
4341 ArgRC = &AMDGPU::SReg_32RegClass;
4342 ArgTy = LLT::scalar(32);
4343 break;
4344 default:
4345 break;
4346 }
4347 }
4348
4349 if (!Arg)
4350 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4351
4352 if (!Arg) {
4353 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4354 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4355 // case the pointer argument may be missing and we use null.
4356 B.buildConstant(DstReg, 0);
4357 return true;
4358 }
4359
4360 // It's undefined behavior if a function marked with the amdgpu-no-*
4361 // attributes uses the corresponding intrinsic.
4362 B.buildUndef(DstReg);
4363 return true;
4364 }
4365
4366 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4367 return false; // TODO: Handle these
4368 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4369}
4370
4371bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4372 MachineInstr &MI, MachineIRBuilder &B,
4373 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4374 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4375 return false;
4376
4377 MI.eraseFromParent();
4378 return true;
4379}
4380
4381static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4382 int64_t C) {
4383 B.buildConstant(MI.getOperand(0).getReg(), C);
4384 MI.eraseFromParent();
4385 return true;
4386}
4387
4388bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4389 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4390 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4391 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4392 if (MaxID == 0)
4393 return replaceWithConstant(B, MI, 0);
4394
4395 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4396 const ArgDescriptor *Arg;
4397 const TargetRegisterClass *ArgRC;
4398 LLT ArgTy;
4399 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4400
4401 Register DstReg = MI.getOperand(0).getReg();
4402 if (!Arg) {
4403 // It's undefined behavior if a function marked with the amdgpu-no-*
4404 // attributes uses the corresponding intrinsic.
4405 B.buildUndef(DstReg);
4406 MI.eraseFromParent();
4407 return true;
4408 }
4409
4410 if (Arg->isMasked()) {
4411 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4412 // masking operations anyway.
4413 //
4414 // TODO: We could assert the top bit is 0 for the source copy.
4415 if (!loadInputValue(DstReg, B, ArgType))
4416 return false;
4417 } else {
4418 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4419 if (!loadInputValue(TmpReg, B, ArgType))
4420 return false;
4421 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4422 }
4423
4424 MI.eraseFromParent();
4425 return true;
4426}
4427
4428Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4429 int64_t Offset) const {
4430 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4431 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4432
4433 // TODO: If we passed in the base kernel offset we could have a better
4434 // alignment than 4, but we don't really need it.
4435 if (!loadInputValue(KernArgReg, B,
4436 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4437 llvm_unreachable("failed to find kernarg segment ptr");
4438
4439 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4440 // TODO: Should get nuw
4441 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4442}
4443
4444/// Legalize a value that's loaded from kernel arguments. This is only used by
4445/// legacy intrinsics.
4446bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4447 MachineIRBuilder &B,
4448 uint64_t Offset,
4449 Align Alignment) const {
4450 Register DstReg = MI.getOperand(0).getReg();
4451
4452 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4453 "unexpected kernarg parameter type");
4454
4455 Register Ptr = getKernargParameterPtr(B, Offset);
4456 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4457 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4458 MachineMemOperand::MODereferenceable |
4459 MachineMemOperand::MOInvariant);
4460 MI.eraseFromParent();
4461 return true;
4462}
4463}
4464bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4465 MachineRegisterInfo &MRI,
4466 MachineIRBuilder &B) const {
4467 Register Dst = MI.getOperand(0).getReg();
4468 LLT DstTy = MRI.getType(Dst);
4469 LLT S16 = LLT::scalar(16);
4470 LLT S32 = LLT::scalar(32);