//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);

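// 1024 bits matches the widest SGPR/VGPR register tuple the target provides
// (32 x 32-bit registers), so no legalization rule should produce a wider
// type than this.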
static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

69/// \returns true if this is an odd sized vector which should widen by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
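// Worked example: a <9 x s32> (288 bits) is divided into ceil(288 / 64) = 5
// pieces, so the mutation above yields a <2 x s32>.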
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
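// Worked example: a <3 x s16> (48 bits) rounds up to 2 x 32 = 64 bits, so the
// mutation above yields a <4 x s16>.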
139
// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
    assert(EltSize == 32 || EltSize == 64);
    assert(Ty.getSizeInBits() < MaxRegisterSize);

151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
}

static LLT getBufferRsrcScalarType(const LLT Ty) {
  if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
}

static LLT getBufferRsrcRegisterType(const LLT Ty) {
  if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
    return isRegisterType(Ty) &&
           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
  };
270}
271
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(1);
283static const LLT S8 = LLT::scalar(8);
284static const LLT S16 = LLT::scalar(16);
285static const LLT S32 = LLT::scalar(32);
286static const LLT S64 = LLT::scalar(64);
287static const LLT S96 = LLT::scalar(96);
288static const LLT S128 = LLT::scalar(128);
289static const LLT S160 = LLT::scalar(160);
290static const LLT S224 = LLT::scalar(224);
291static const LLT S256 = LLT::scalar(256);
static const LLT S512 = LLT::scalar(512);
static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

295static const LLT V2S8 = LLT::fixed_vector(2, 8);
296static const LLT V2S16 = LLT::fixed_vector(2, 16);
297static const LLT V4S16 = LLT::fixed_vector(4, 16);
298static const LLT V6S16 = LLT::fixed_vector(6, 16);
299static const LLT V8S16 = LLT::fixed_vector(8, 16);
300static const LLT V10S16 = LLT::fixed_vector(10, 16);
301static const LLT V12S16 = LLT::fixed_vector(12, 16);
302static const LLT V16S16 = LLT::fixed_vector(16, 16);
303
304static const LLT V2S32 = LLT::fixed_vector(2, 32);
305static const LLT V3S32 = LLT::fixed_vector(3, 32);
306static const LLT V4S32 = LLT::fixed_vector(4, 32);
307static const LLT V5S32 = LLT::fixed_vector(5, 32);
308static const LLT V6S32 = LLT::fixed_vector(6, 32);
309static const LLT V7S32 = LLT::fixed_vector(7, 32);
310static const LLT V8S32 = LLT::fixed_vector(8, 32);
311static const LLT V9S32 = LLT::fixed_vector(9, 32);
312static const LLT V10S32 = LLT::fixed_vector(10, 32);
313static const LLT V11S32 = LLT::fixed_vector(11, 32);
314static const LLT V12S32 = LLT::fixed_vector(12, 32);
315static const LLT V16S32 = LLT::fixed_vector(16, 32);
316static const LLT V32S32 = LLT::fixed_vector(32, 32);
317
318static const LLT V2S64 = LLT::fixed_vector(2, 64);
319static const LLT V3S64 = LLT::fixed_vector(3, 64);
320static const LLT V4S64 = LLT::fixed_vector(4, 64);
321static const LLT V5S64 = LLT::fixed_vector(5, 64);
322static const LLT V6S64 = LLT::fixed_vector(6, 64);
323static const LLT V7S64 = LLT::fixed_vector(7, 64);
324static const LLT V8S64 = LLT::fixed_vector(8, 64);
325static const LLT V16S64 = LLT::fixed_vector(16, 64);
326
327static const LLT V2S128 = LLT::fixed_vector(2, 128);
328static const LLT V4S128 = LLT::fixed_vector(4, 128);
329
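// Explicit lists of the scalar and vector types that map directly onto a
// register class; isRegisterClassType() below just checks membership in them.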
static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
                                                    S160, S224, S256, S512};

static std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

static std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32,  V4S32,  V5S32,  V6S32,  V7S32,  V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
                                                   V6S64, V7S64, V8S64, V16S64};

// Checks whether a type is in the list of legal register types.
static bool isRegisterClassType(LLT Ty) {
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

  return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
         is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
}
351
352static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
353 return [TypeIdx](const LegalityQuery &Query) {
354 return isRegisterClassType(Query.Types[TypeIdx]);
355 };
356}
357
358// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
362 const LLT Ty = Query.Types[TypeIdx];
363 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
364 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
365 };
366}
367
368// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
369// handle some operations by just promoting the register during
370// selection. There are also d16 loads on GFX9+ which preserve the high bits.
371static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
372 bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
390 default:
391 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
392 // if they may alias scratch depending on the subtarget. This needs to be
393 // moved to custom handling to use addressMayBeAccessedAsPrivate
394 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
395 }
396}
397
398static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
399 const LegalityQuery &Query) {
400 const LLT Ty = Query.Types[0];
401
402 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
403 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
404
405 unsigned RegSize = Ty.getSizeInBits();
406 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
407 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
408 unsigned AS = Query.Types[1].getAddressSpace();
409
  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
413
414 // Do not handle extending vector loads.
415 if (Ty.isVector() && MemSize != RegSize)
416 return false;
417
418 // TODO: We should be able to widen loads if the alignment is high enough, but
419 // we also need to modify the memory access size.
420#if 0
421 // Accept widening loads based on alignment.
422 if (IsLoad && MemSize < Size)
423 MemSize = std::max(MemSize, Align);
424#endif
425
426 // Only 1-byte and 2-byte to 32-bit extloads are valid.
427 if (MemSize != RegSize && RegSize != 32)
428 return false;
429
430 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
431 Query.MMODescrs[0].Ordering !=
432 AtomicOrdering::NotAtomic))
433 return false;
434
435 switch (MemSize) {
436 case 8:
437 case 16:
438 case 32:
439 case 64:
440 case 128:
441 break;
442 case 96:
443 if (!ST.hasDwordx3LoadStores())
444 return false;
445 break;
446 case 256:
447 case 512:
448 // These may contextually need to be broken down.
449 break;
450 default:
451 return false;
452 }
453
454 assert(RegSize >= MemSize);
455
456 if (AlignBits < MemSize) {
457 const SITargetLowering *TLI = ST.getTargetLowering();
458 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
459 Align(AlignBits / 8)))
460 return false;
461 }
462
463 return true;
464}
465
// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers - or vectors of them -
// to integer values here.
static bool hasBufferRsrcWorkaround(const LLT Ty) {
  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
    return true;
474 if (Ty.isVector()) {
475 const LLT ElemTy = Ty.getElementType();
476 return hasBufferRsrcWorkaround(ElemTy);
477 }
478 return false;
479}
480
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;
488
489 const unsigned Size = Ty.getSizeInBits();
490 if (Size <= 64)
491 return false;
  // Address space 8 pointers get their own workaround.
  if (hasBufferRsrcWorkaround(Ty))
    return false;
495 if (!Ty.isVector())
496 return true;
497
498 if (Ty.isPointerVector())
499 return true;
500
501 unsigned EltSize = Ty.getScalarSizeInBits();
502 return EltSize != 32 && EltSize != 64;
503}
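// For example, an <8 x s16> load (128 bits with 16-bit elements) hits this
// workaround and is later bitcast to <4 x s32> via bitcastToRegisterType().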
504
505static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
506 const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}
510
511/// Return true if a load or store of the type should be lowered with a bitcast
512/// to a different type.
513static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
514 const LLT MemTy) {
515 const unsigned MemSizeInBits = MemTy.getSizeInBits();
516 const unsigned Size = Ty.getSizeInBits();
517 if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
522
523 // Don't try to handle bitcasting vector ext loads for now.
524 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
528
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
532static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
533 uint64_t AlignInBits, unsigned AddrSpace,
534 unsigned Opcode) {
535 unsigned SizeInBits = MemoryTy.getSizeInBits();
536 // We don't want to widen cases that are naturally legal.
537 if (isPowerOf2_32(SizeInBits))
538 return false;
539
540 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
541 // end up widening these for a scalar load during RegBankSelect, if we don't
542 // have 96-bit scalar loads.
543 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
544 return false;
545
546 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
547 return false;
548
549 // A load is known dereferenceable up to the alignment, so it's legal to widen
550 // to it.
551 //
552 // TODO: Could check dereferenceable for less aligned cases.
553 unsigned RoundedSize = NextPowerOf2(SizeInBits);
554 if (AlignInBits < RoundedSize)
555 return false;
556
557 // Do not widen if it would introduce a slow unaligned load.
558 const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
565
566static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
567 unsigned Opcode) {
568 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
569 return false;
570
571 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
572 Query.MMODescrs[0].AlignInBits,
573 Query.Types[1].getAddressSpace(), Opcode);
574}
575
/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
581 MachineOperand &MO = MI.getOperand(Idx);
582
583 const LLT PointerTy = MRI.getType(MO.getReg());
584
  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;
588
589 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
590 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
591 if (!PointerTy.isVector()) {
592 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
593 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
594 const LLT S32 = LLT::scalar(32);
595
596 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
597 std::array<Register, 4> VectorElems;
598 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
599 for (unsigned I = 0; I < NumParts; ++I)
600 VectorElems[I] =
601 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
602 B.buildMergeValues(MO, VectorElems);
603 MO.setReg(VectorReg);
604 return VectorTy;
605 }
606 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
607 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
608 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
609 B.buildIntToPtr(MO, Scalar);
610 MO.setReg(BitcastReg);
611
612 return VectorTy;
613}
614
615/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
616/// the form in which the value must be in order to be passed to the low-level
617/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
618/// needed in order to account for the fact that we can't define a register
/// class for s128 without breaking SelectionDAG.
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
  MachineRegisterInfo &MRI = *B.getMRI();
622 const LLT PointerTy = MRI.getType(Pointer);
623 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
624 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
625
626 if (!PointerTy.isVector()) {
627 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
628 SmallVector<Register, 4> PointerParts;
629 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
630 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
631 for (unsigned I = 0; I < NumParts; ++I)
632 PointerParts.push_back(Unmerged.getReg(I));
633 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
634 }
635 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
636 return B.buildBitcast(VectorTy, Scalar).getReg(0);
}

static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                     unsigned Idx) {
641 MachineOperand &MO = MI.getOperand(Idx);
642
643 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return;
  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
653 using namespace TargetOpcode;
654
655 auto GetAddrSpacePtr = [&TM](unsigned AS) {
656 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
657 };
658
659 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
660 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
661 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
662 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
663 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
664 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
665 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
666 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
667 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
668 const LLT BufferStridedPtr =
669 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
670
671 const LLT CodePtr = FlatPtr;
672
673 const std::initializer_list<LLT> AddrSpaces64 = {
674 GlobalPtr, ConstantPtr, FlatPtr
675 };
676
677 const std::initializer_list<LLT> AddrSpaces32 = {
678 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
679 };
680
681 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
682
683 const std::initializer_list<LLT> FPTypesBase = {
684 S32, S64
685 };
686
687 const std::initializer_list<LLT> FPTypes16 = {
688 S32, S64, S16
689 };
690
691 const std::initializer_list<LLT> FPTypesPK16 = {
692 S32, S64, S16, V2S16
693 };
694
695 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
696
  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
      .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
      .legalFor(AllS32Vectors)
      .legalFor(AllS64Vectors)
      .legalFor(AddrSpaces64)
      .legalFor(AddrSpaces32)
      .legalFor(AddrSpaces128)
      .legalIf(isPointer(0))
      .clampScalar(0, S16, S256)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .scalarize(0);
715
716 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
717 // Full set of gfx9 features.
718 if (ST.hasScalarAddSub64()) {
719 getActionDefinitionsBuilder({G_ADD, G_SUB})
720 .legalFor({S64, S32, S16, V2S16})
721 .clampMaxNumElementsStrict(0, S16, 2)
722 .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .maxScalar(0, S32);
726 } else {
727 getActionDefinitionsBuilder({G_ADD, G_SUB})
728 .legalFor({S32, S16, V2S16})
729 .clampMaxNumElementsStrict(0, S16, 2)
730 .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .maxScalar(0, S32);
734 }
735
    if (ST.hasScalarSMulU64()) {
      getActionDefinitionsBuilder(G_MUL)
          .legalFor({S64, S32, S16, V2S16})
739 .clampMaxNumElementsStrict(0, S16, 2)
740 .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .custom();
    } else {
      getActionDefinitionsBuilder(G_MUL)
          .legalFor({S32, S16, V2S16})
747 .clampMaxNumElementsStrict(0, S16, 2)
748 .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .custom();
752 }
753 assert(ST.hasMad64_32());
754
755 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
756 .legalFor({S32, S16, V2S16}) // Clamp modifier
        .minScalarOrElt(0, S16)
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .widenScalarToNextPow2(0, 32)
        .lower();
762 } else if (ST.has16BitInsts()) {
763 getActionDefinitionsBuilder({G_ADD, G_SUB})
764 .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16})
772 .scalarize(0)
773 .minScalar(0, S16)
774 .widenScalarToNextMultipleOf(0, 32)
775 .custom();
776 assert(ST.hasMad64_32());
777
778 // Technically the saturating operations require clamp bit support, but this
779 // was introduced at the same time as 16-bit operations.
780 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
781 .legalFor({S32, S16}) // Clamp modifier
782 .minScalar(0, S16)
783 .scalarize(0)
785 .lower();
786
787 // We're just lowering this, but it helps get a better result to try to
788 // coerce to the desired type first.
789 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
790 .minScalar(0, S16)
791 .scalarize(0)
792 .lower();
793 } else {
794 getActionDefinitionsBuilder({G_ADD, G_SUB})
795 .legalFor({S32})
796 .widenScalarToNextMultipleOf(0, 32)
797 .clampScalar(0, S32, S32)
798 .scalarize(0);
799
800 auto &Mul = getActionDefinitionsBuilder(G_MUL)
801 .legalFor({S32})
802 .scalarize(0)
803 .minScalar(0, S32)
804 .widenScalarToNextMultipleOf(0, 32);
805
806 if (ST.hasMad64_32())
807 Mul.custom();
808 else
809 Mul.maxScalar(0, S32);
810
811 if (ST.hasIntClamp()) {
812 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
813 .legalFor({S32}) // Clamp modifier.
814 .scalarize(0)
815 .minScalarOrElt(0, S32)
816 .lower();
817 } else {
818 // Clamp bit support was added in VI, along with 16-bit operations.
819 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
820 .minScalar(0, S32)
821 .scalarize(0)
822 .lower();
823 }
824
825 // FIXME: DAG expansion gets better results. The widening uses the smaller
826 // range values and goes for the min/max lowering directly.
827 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
828 .minScalar(0, S32)
829 .scalarize(0)
830 .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
835 .customFor({S32, S64})
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .scalarize(0);
839
840 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
841 .legalFor({S32})
842 .maxScalar(0, S32);
843
844 if (ST.hasVOP3PInsts()) {
845 Mulh
846 .clampMaxNumElements(0, S8, 2)
847 .lowerFor({V2S8});
848 }
849
850 Mulh
851 .scalarize(0)
852 .lower();
853
854 // Report legal for any types we can handle anywhere. For the cases only legal
855 // on the SALU, RegBankSelect will be able to re-legalize.
856 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
857 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder(
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
866 .legalFor({{S32, S1}, {S32, S32}})
867 .clampScalar(0, S32, S32)
868 .scalarize(0);
869
871 // Don't worry about the size constraint.
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
877 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
878 .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

882 getActionDefinitionsBuilder(G_FCONSTANT)
883 .legalFor({S32, S64, S16})
884 .clampScalar(0, S16, S64);
885
886 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
887 .legalIf(isRegisterType(0))
888 // s1 and s16 are special cases because they have legal operations on
889 // them, but don't really occupy registers in the normal way.
890 .legalFor({S1, S16})
891 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
894 .clampMaxNumElements(0, S32, 16);
895
896 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
897
898 // If the amount is divergent, we have to do a wave reduction to get the
899 // maximum value, so this is expanded during RegBankSelect.
900 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
901 .legalFor({{PrivatePtr, S32}});
902
903 getActionDefinitionsBuilder(G_STACKSAVE)
904 .customFor({PrivatePtr});
905 getActionDefinitionsBuilder(G_STACKRESTORE)
906 .legalFor({PrivatePtr});
907
908 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
909
910 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
911 .customIf(typeIsNot(0, PrivatePtr));
912
913 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
914
915 auto &FPOpActions = getActionDefinitionsBuilder(
916 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
917 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
918 .legalFor({S32, S64});
919 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
920 .customFor({S32, S64});
921 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
922 .customFor({S32, S64});
923
924 if (ST.has16BitInsts()) {
925 if (ST.hasVOP3PInsts())
926 FPOpActions.legalFor({S16, V2S16});
927 else
928 FPOpActions.legalFor({S16});
929
930 TrigActions.customFor({S16});
931 FDIVActions.customFor({S16});
932 }
933
934 if (ST.hasPackedFP32Ops()) {
935 FPOpActions.legalFor({V2S32});
936 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
937 }
938
939 auto &MinNumMaxNum = getActionDefinitionsBuilder({
940 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
941
942 if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
947 .scalarize(0);
948 } else if (ST.has16BitInsts()) {
949 MinNumMaxNum.customFor(FPTypes16)
950 .clampScalar(0, S16, S64)
951 .scalarize(0);
952 } else {
953 MinNumMaxNum.customFor(FPTypesBase)
954 .clampScalar(0, S32, S64)
955 .scalarize(0);
956 }
957
958 if (ST.hasVOP3PInsts())
959 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
960
961 FPOpActions
962 .scalarize(0)
963 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
964
965 TrigActions
966 .scalarize(0)
967 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
968
969 FDIVActions
970 .scalarize(0)
971 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
972
973 getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
977 .clampScalar(0, S16, S64);
978
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S16})
982 .customFor({S32, S64})
983 .scalarize(0)
      .unsupported();
    getActionDefinitionsBuilder(G_FFLOOR)
      .legalFor({S32, S64, S16})
987 .scalarize(0)
988 .clampScalar(0, S16, S64);
989
990 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
991 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
992 .scalarize(0)
993 .maxScalarIf(typeIs(0, S16), 1, S16)
994 .clampScalar(1, S32, S32)
      .lower();

    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
999 .scalarize(0)
1000 .lower();
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .customFor({S32, S64, S16})
1004 .scalarize(0)
1005 .unsupported();
1006
1007
    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
1011 .legalFor({S32, S64})
1012 .scalarize(0)
1013 .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 }
1020
1021 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1022 .legalFor({{S32, S32}, {S64, S32}})
1023 .scalarize(0)
1024 .clampScalar(0, S32, S64)
1025 .clampScalar(1, S32, S32)
      .lower();

    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}})
1030 .scalarize(0)
1031 .minScalar(0, S32)
1032 .clampScalar(1, S32, S32)
1033 .lower();
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
1038 .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
1043 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1044 .scalarize(0);
1045
1046 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1047 if (ST.has16BitInsts()) {
1048 FSubActions
1049 // Use actual fsub instruction
1050 .legalFor({S32, S16})
1051 // Must use fadd + fneg
1052 .lowerFor({S64, V2S16});
1053 } else {
1054 FSubActions
1055 // Use actual fsub instruction
1056 .legalFor({S32})
1057 // Must use fadd + fneg
1058 .lowerFor({S64, S16, V2S16});
1059 }
1060
1061 FSubActions
1062 .scalarize(0)
1063 .clampScalar(0, S32, S64);
1064
1065 // Whether this is legal depends on the floating point mode for the function.
1066 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1067 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1068 FMad.customFor({S32, S16});
1069 else if (ST.hasMadMacF32Insts())
1070 FMad.customFor({S32});
1071 else if (ST.hasMadF16())
1072 FMad.customFor({S16});
1073 FMad.scalarize(0)
1074 .lower();
1075
1076 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1077 if (ST.has16BitInsts()) {
1078 FRem.customFor({S16, S32, S64});
1079 } else {
1080 FRem.minScalar(0, S32)
1081 .customFor({S32, S64});
1082 }
1083 FRem.scalarize(0);
1084
  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
1088 .legalFor({{V2S16, V2S32}})
1089 .clampMaxNumElements(0, S16, 2)
1090 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1091 // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), scalarize(0))
    .alwaysLegal();
1095
1096 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1097 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1098 {S32, S1}, {S64, S1}, {S16, S1}})
1099 .scalarize(0)
1100 .clampScalar(0, S32, S64)
1101 .widenScalarToNextPow2(1, 32);
1102
1103 // TODO: Split s1->s64 during regbankselect for VALU.
1104 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1105 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1106 .lowerIf(typeIs(1, S1))
1107 .customFor({{S32, S64}, {S64, S64}});
1108 if (ST.has16BitInsts())
1109 IToFP.legalFor({{S16, S16}});
1110 IToFP.clampScalar(1, S32, S64)
1111 .minScalar(0, S32)
    .scalarize(0)
    .widenScalarToNextPow2(1);

1115 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1116 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1117 .customFor({{S64, S32}, {S64, S64}})
1118 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1119 if (ST.has16BitInsts())
1120 FPToI.legalFor({{S16, S16}});
1121 else
1122 FPToI.minScalar(1, S32);
1123
1124 FPToI.minScalar(0, S32)
1125 .widenScalarToNextPow2(0, 32)
1126 .scalarize(0)
1127 .lower();
1128
1129 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1130 .customFor({S16, S32})
1131 .scalarize(0)
1132 .lower();
1133
1134 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1135 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1136 .scalarize(0)
1137 .lower();
1138
1139 if (ST.has16BitInsts()) {
1140 getActionDefinitionsBuilder(
1141 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1142 .legalFor({S16, S32, S64})
1143 .clampScalar(0, S16, S64)
1144 .scalarize(0);
1145 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1146 getActionDefinitionsBuilder(
1147 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1148 .legalFor({S32, S64})
1149 .clampScalar(0, S32, S64)
1150 .scalarize(0);
1151 } else {
1152 getActionDefinitionsBuilder(
1153 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1154 .legalFor({S32})
1155 .customFor({S64})
1156 .clampScalar(0, S32, S64)
1157 .scalarize(0);
1158 }
1159
1160 getActionDefinitionsBuilder(G_PTR_ADD)
1161 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1162 .legalIf(all(isPointer(0), sameSize(0, 1)))
1163 .scalarize(0)
1164 .scalarSameSizeAs(1, 0);
1165
1166 getActionDefinitionsBuilder(G_PTRMASK)
1167 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1168 .scalarSameSizeAs(1, 0)
1169 .scalarize(0);
1170
1171 auto &CmpBuilder =
1172 getActionDefinitionsBuilder(G_ICMP)
1173 // The compare output type differs based on the register bank of the output,
1174 // so make both s1 and s32 legal.
1175 //
1176 // Scalar compares producing output in scc will be promoted to s32, as that
1177 // is the allocatable register type that will be needed for the copy from
1178 // scc. This will be promoted during RegBankSelect, and we assume something
1179 // before that won't try to use s32 result types.
1180 //
1181 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1182 // bank.
1183 .legalForCartesianProduct(
1184 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1185 .legalForCartesianProduct(
1186 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1187 if (ST.has16BitInsts()) {
1188 CmpBuilder.legalFor({{S1, S16}});
1189 }
1190
1191 CmpBuilder
1192 .widenScalarToNextPow2(1)
1193 .clampScalar(1, S32, S64)
1194 .scalarize(0)
1195 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1196
1197 auto &FCmpBuilder =
1198 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1199 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1200
1201 if (ST.hasSALUFloatInsts())
1202 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1203
1204 FCmpBuilder
1205 .widenScalarToNextPow2(1)
1206 .clampScalar(1, S32, S64)
1207 .scalarize(0);
1208
1209 // FIXME: fpow has a selection pattern that should move to custom lowering.
1210 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1211 if (ST.has16BitInsts())
1212 ExpOps.customFor({{S32}, {S16}});
1213 else
1214 ExpOps.customFor({S32});
1215 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1216 .scalarize(0);
1217
1218 getActionDefinitionsBuilder(G_FPOWI)
1219 .clampScalar(0, MinScalarFPTy, S32)
1220 .lower();
1221
1222 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1223 Log2Ops.customFor({S32});
1224 if (ST.has16BitInsts())
1225 Log2Ops.legalFor({S16});
1226 else
1227 Log2Ops.customFor({S16});
1228 Log2Ops.scalarize(0)
1229 .lower();
1230
1231 auto &LogOps =
1232 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1233 LogOps.customFor({S32, S16});
1234 LogOps.clampScalar(0, MinScalarFPTy, S32)
1235 .scalarize(0);
1236
1237 // The 64-bit versions produce 32-bit results, but only on the SALU.
1238 getActionDefinitionsBuilder(G_CTPOP)
1239 .legalFor({{S32, S32}, {S32, S64}})
1240 .clampScalar(0, S32, S32)
1241 .widenScalarToNextPow2(1, 32)
1242 .clampScalar(1, S32, S64)
1243 .scalarize(0)
1244 .widenScalarToNextPow2(0, 32);
1245
1246 // If no 16 bit instr is available, lower into different instructions.
1247 if (ST.has16BitInsts())
1248 getActionDefinitionsBuilder(G_IS_FPCLASS)
1249 .legalForCartesianProduct({S1}, FPTypes16)
1250 .widenScalarToNextPow2(1)
1251 .scalarize(0)
1252 .lower();
1253 else
1254 getActionDefinitionsBuilder(G_IS_FPCLASS)
1255 .legalForCartesianProduct({S1}, FPTypesBase)
1256 .lowerFor({S1, S16})
1257 .widenScalarToNextPow2(1)
1258 .scalarize(0)
1259 .lower();
1260
1261 // The hardware instructions return a different result on 0 than the generic
1262 // instructions expect. The hardware produces -1, but these produce the
1263 // bitwidth.
1264 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1265 .scalarize(0)
1266 .clampScalar(0, S32, S32)
1267 .clampScalar(1, S32, S64)
1268 .widenScalarToNextPow2(0, 32)
1269 .widenScalarToNextPow2(1, 32)
1270 .custom();
1271
1272 // The 64-bit versions produce 32-bit results, but only on the SALU.
1273 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1274 .legalFor({{S32, S32}, {S32, S64}})
1275 .clampScalar(0, S32, S32)
1276 .clampScalar(1, S32, S64)
1277 .scalarize(0)
1278 .widenScalarToNextPow2(0, 32)
1279 .widenScalarToNextPow2(1, 32);
1280
1281 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1282 // RegBankSelect.
1283 getActionDefinitionsBuilder(G_BITREVERSE)
1284 .legalFor({S32, S64})
1285 .clampScalar(0, S32, S64)
1286 .scalarize(0)
1287 .widenScalarToNextPow2(0);
1288
1289 if (ST.has16BitInsts()) {
1290 getActionDefinitionsBuilder(G_BSWAP)
1291 .legalFor({S16, S32, V2S16})
1292 .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1295 .widenScalarToNextPow2(0)
1296 .clampScalar(0, S16, S32)
1297 .scalarize(0);
1298
1299 if (ST.hasVOP3PInsts()) {
1300 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1301 .legalFor({S32, S16, V2S16})
1302 .clampMaxNumElements(0, S16, 2)
1303 .minScalar(0, S16)
1304 .widenScalarToNextPow2(0)
1305 .scalarize(0)
1306 .lower();
1307 } else {
1308 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1309 .legalFor({S32, S16})
1310 .widenScalarToNextPow2(0)
1311 .minScalar(0, S16)
1312 .scalarize(0)
1313 .lower();
1314 }
1315 } else {
1316 // TODO: Should have same legality without v_perm_b32
1317 getActionDefinitionsBuilder(G_BSWAP)
1318 .legalFor({S32})
1319 .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
1322 .widenScalarToNextPow2(0)
1323 .maxScalar(0, S32)
1324 .scalarize(0)
1325 .lower();
1326
1327 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1328 .legalFor({S32})
1329 .minScalar(0, S32)
1330 .widenScalarToNextPow2(0)
1331 .scalarize(0)
1332 .lower();
1333 }
1334
1335 getActionDefinitionsBuilder(G_INTTOPTR)
1336 // List the common cases
1337 .legalForCartesianProduct(AddrSpaces64, {S64})
1338 .legalForCartesianProduct(AddrSpaces32, {S32})
1339 .scalarize(0)
1340 // Accept any address space as long as the size matches
1341 .legalIf(sameSize(0, 1))
1342 .widenScalarIf(smallerThan(1, 0),
1343 [](const LegalityQuery &Query) {
1344 return std::pair(
1345 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1346 })
1347 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1348 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1349 });
1350
1351 getActionDefinitionsBuilder(G_PTRTOINT)
1352 // List the common cases
1353 .legalForCartesianProduct(AddrSpaces64, {S64})
1354 .legalForCartesianProduct(AddrSpaces32, {S32})
1355 .scalarize(0)
1356 // Accept any address space as long as the size matches
1357 .legalIf(sameSize(0, 1))
1358 .widenScalarIf(smallerThan(0, 1),
1359 [](const LegalityQuery &Query) {
1360 return std::pair(
1361 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1362 })
1363 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1364 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1365 });
1366
1367 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1368 .scalarize(0)
1369 .custom();
1370
1371 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1372 bool IsLoad) -> bool {
1373 const LLT DstTy = Query.Types[0];
1374
1375 // Split vector extloads.
1376 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1377
1378 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1379 return true;
1380
1381 const LLT PtrTy = Query.Types[1];
1382 unsigned AS = PtrTy.getAddressSpace();
1383 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                      Query.MMODescrs[0].Ordering !=
                                          AtomicOrdering::NotAtomic))
      return true;
1387
1388 // Catch weird sized loads that don't evenly divide into the access sizes
1389 // TODO: May be able to widen depending on alignment etc.
1390 unsigned NumRegs = (MemSize + 31) / 32;
1391 if (NumRegs == 3) {
1392 if (!ST.hasDwordx3LoadStores())
1393 return true;
1394 } else {
1395 // If the alignment allows, these should have been widened.
1396 if (!isPowerOf2_32(NumRegs))
1397 return true;
1398 }
1399
1400 return false;
1401 };
1402
1403 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1404 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1405 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1406
1407 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1408 // LDS
1409 // TODO: Unsupported flat for SI.
1410
1411 for (unsigned Op : {G_LOAD, G_STORE}) {
1412 const bool IsStore = Op == G_STORE;
1413
1414 auto &Actions = getActionDefinitionsBuilder(Op);
1415 // Explicitly list some common cases.
1416 // TODO: Does this help compile time at all?
1417 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1418 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1419 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1420 {S64, GlobalPtr, S64, GlobalAlign32},
1421 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1422 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1423 {S32, GlobalPtr, S8, GlobalAlign8},
1424 {S32, GlobalPtr, S16, GlobalAlign16},
1425
1426 {S32, LocalPtr, S32, 32},
1427 {S64, LocalPtr, S64, 32},
1428 {V2S32, LocalPtr, V2S32, 32},
1429 {S32, LocalPtr, S8, 8},
1430 {S32, LocalPtr, S16, 16},
1431 {V2S16, LocalPtr, S32, 32},
1432
1433 {S32, PrivatePtr, S32, 32},
1434 {S32, PrivatePtr, S8, 8},
1435 {S32, PrivatePtr, S16, 16},
1436 {V2S16, PrivatePtr, S32, 32},
1437
1438 {S32, ConstantPtr, S32, GlobalAlign32},
1439 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1440 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1441 {S64, ConstantPtr, S64, GlobalAlign32},
1442 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1443 Actions.legalIf(
1444 [=](const LegalityQuery &Query) -> bool {
1445 return isLoadStoreLegal(ST, Query);
1446 });
1447
1448 // The custom pointers (fat pointers, buffer resources) don't work with load
1449 // and store at this level. Fat pointers should have been lowered to
1450 // intrinsics before the translation to MIR.
1451 Actions.unsupportedIf(
1452 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1453
1454 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1455 // ptrtoint. This is needed to account for the fact that we can't have i128
1456 // as a register class for SelectionDAG reasons.
1457 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1458 return hasBufferRsrcWorkaround(Query.Types[0]);
1459 });
1460
1461 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1462 // 64-bits.
1463 //
1464 // TODO: Should generalize bitcast action into coerce, which will also cover
1465 // inserting addrspacecasts.
1466 Actions.customIf(typeIs(1, Constant32Ptr));
1467
1468 // Turn any illegal element vectors into something easier to deal
1469 // with. These will ultimately produce 32-bit scalar shifts to extract the
1470 // parts anyway.
1471 //
1472 // For odd 16-bit element vectors, prefer to split those into pieces with
1473 // 16-bit vector parts.
1474 Actions.bitcastIf(
1475 [=](const LegalityQuery &Query) -> bool {
1476 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1477 Query.MMODescrs[0].MemoryTy);
1478 }, bitcastToRegisterType(0));
1479
1480 if (!IsStore) {
1481 // Widen suitably aligned loads by loading extra bytes. The standard
1482 // legalization actions can't properly express widening memory operands.
1483 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1484 return shouldWidenLoad(ST, Query, G_LOAD);
1485 });
1486 }
1487
1488 // FIXME: load/store narrowing should be moved to lower action
1489 Actions
1490 .narrowScalarIf(
1491 [=](const LegalityQuery &Query) -> bool {
1492 return !Query.Types[0].isVector() &&
1493 needToSplitMemOp(Query, Op == G_LOAD);
1494 },
1495 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1496 const LLT DstTy = Query.Types[0];
1497 const LLT PtrTy = Query.Types[1];
1498
1499 const unsigned DstSize = DstTy.getSizeInBits();
1500 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1501
1502 // Split extloads.
1503 if (DstSize > MemSize)
1504 return std::pair(0, LLT::scalar(MemSize));
1505
1506 unsigned MaxSize = maxSizeForAddrSpace(
1507 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1508 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1509 if (MemSize > MaxSize)
1510 return std::pair(0, LLT::scalar(MaxSize));
1511
1512 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1513 return std::pair(0, LLT::scalar(Align));
1514 })
1515 .fewerElementsIf(
1516 [=](const LegalityQuery &Query) -> bool {
1517 return Query.Types[0].isVector() &&
1518 needToSplitMemOp(Query, Op == G_LOAD);
1519 },
1520 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1521 const LLT DstTy = Query.Types[0];
1522 const LLT PtrTy = Query.Types[1];
1523
1524 LLT EltTy = DstTy.getElementType();
1525 unsigned MaxSize = maxSizeForAddrSpace(
1526 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1527 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1528
1529 // FIXME: Handle widened to power of 2 results better. This ends
1530 // up scalarizing.
1531 // FIXME: 3 element stores scalarized on SI
1532
1533 // Split if it's too large for the address space.
1534 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1535 if (MemSize > MaxSize) {
1536 unsigned NumElts = DstTy.getNumElements();
1537 unsigned EltSize = EltTy.getSizeInBits();
1538
1539 if (MaxSize % EltSize == 0) {
                return std::pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(MaxSize / EltSize), EltTy));
1543 }
1544
1545 unsigned NumPieces = MemSize / MaxSize;
1546
1547 // FIXME: Refine when odd breakdowns handled
1548 // The scalars will need to be re-legalized.
1549 if (NumPieces == 1 || NumPieces >= NumElts ||
1550 NumElts % NumPieces != 0)
1551 return std::pair(0, EltTy);
1552
1553 return std::pair(0,
1554 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1555 }
1556
1557 // FIXME: We could probably handle weird extending loads better.
1558 if (DstTy.getSizeInBits() > MemSize)
1559 return std::pair(0, EltTy);
1560
1561 unsigned EltSize = EltTy.getSizeInBits();
1562 unsigned DstSize = DstTy.getSizeInBits();
1563 if (!isPowerOf2_32(DstSize)) {
1564 // We're probably decomposing an odd sized store. Try to split
1565 // to the widest type. TODO: Account for alignment. As-is it
1566 // should be OK, since the new parts will be further legalized.
1567 unsigned FloorSize = llvm::bit_floor(DstSize);
                return std::pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
1571 }
1572
1573 // May need relegalization for the scalars.
1574 return std::pair(0, EltTy);
1575 })
1576 .minScalar(0, S32)
1577 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1578 .widenScalarToNextPow2(0)
1579 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1580 .lower();
1581 }
1582
1583 // FIXME: Unaligned accesses not lowered.
1584 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1585 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1586 {S32, GlobalPtr, S16, 2 * 8},
1587 {S32, LocalPtr, S8, 8},
1588 {S32, LocalPtr, S16, 16},
1589 {S32, PrivatePtr, S8, 8},
1590 {S32, PrivatePtr, S16, 16},
1591 {S32, ConstantPtr, S8, 8},
1592 {S32, ConstantPtr, S16, 2 * 8}})
1593 .legalIf(
1594 [=](const LegalityQuery &Query) -> bool {
1595 return isLoadStoreLegal(ST, Query);
1596 });
1597
1598 if (ST.hasFlatAddressSpace()) {
1599 ExtLoads.legalForTypesWithMemDesc(
1600 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1601 }
1602
1603 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1604 // 64-bits.
1605 //
1606 // TODO: Should generalize bitcast action into coerce, which will also cover
1607 // inserting addrspacecasts.
1608 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1609
1610 ExtLoads.clampScalar(0, S32, S32)
1611 .widenScalarToNextPow2(0)
1612 .lower();
1613
1614 auto &Atomics = getActionDefinitionsBuilder(
1615 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1616 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1617 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1618 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1619 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1620 {S64, GlobalPtr}, {S64, LocalPtr},
1621 {S32, RegionPtr}, {S64, RegionPtr}});
1622 if (ST.hasFlatAddressSpace()) {
1623 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1624 }
1625
1626 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1627 if (ST.hasLDSFPAtomicAdd()) {
1628 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1629 if (ST.hasLdsAtomicAddF64())
1630 Atomic.legalFor({{S64, LocalPtr}});
1631 if (ST.hasAtomicDsPkAdd16Insts())
1632 Atomic.legalFor({{V2S16, LocalPtr}});
1633 }
1634 if (ST.hasAtomicFaddInsts())
1635 Atomic.legalFor({{S32, GlobalPtr}});
1636 if (ST.hasFlatAtomicFaddF32Inst())
1637 Atomic.legalFor({{S32, FlatPtr}});
1638
1639 if (ST.hasGFX90AInsts()) {
1640 // These are legal with some caveats, and should have undergone expansion in
1641 // the IR in most situations
1642 // TODO: Move atomic expansion into legalizer
1643 Atomic.legalFor({
1644 {S32, GlobalPtr},
1645 {S64, GlobalPtr},
1646 {S64, FlatPtr}
1647 });
1648 }
1649
1650 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1651 // demarshalling
1652 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1653 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1654 {S32, FlatPtr}, {S64, FlatPtr}})
1655 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1656 {S32, RegionPtr}, {S64, RegionPtr}});
1657 // TODO: Pointer types, any 32-bit or 64-bit vector
1658
1659 // Condition should be s32 for scalar, s1 for vector.
1660 getActionDefinitionsBuilder(G_SELECT)
1661 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1662 LocalPtr, FlatPtr, PrivatePtr,
1663 LLT::fixed_vector(2, LocalPtr),
1664 LLT::fixed_vector(2, PrivatePtr)},
1665 {S1, S32})
1666 .clampScalar(0, S16, S64)
1667 .scalarize(1)
1668 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1669 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1670 .clampMaxNumElements(0, S32, 2)
1671 .clampMaxNumElements(0, LocalPtr, 2)
1672 .clampMaxNumElements(0, PrivatePtr, 2)
1673 .scalarize(0)
1674 .widenScalarToNextPow2(0)
1675 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1676
1677 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1678 // be more flexible with the shift amount type.
1679 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1680 .legalFor({{S32, S32}, {S64, S32}});
1681 if (ST.has16BitInsts()) {
1682 if (ST.hasVOP3PInsts()) {
1683 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1684 .clampMaxNumElements(0, S16, 2);
1685 } else
1686 Shifts.legalFor({{S16, S16}});
1687
1688 // TODO: Support 16-bit shift amounts for all types
1689 Shifts.widenScalarIf(
1690 [=](const LegalityQuery &Query) {
1691 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1692 // 32-bit amount.
1693 const LLT ValTy = Query.Types[0];
1694 const LLT AmountTy = Query.Types[1];
1695 return ValTy.getSizeInBits() <= 16 &&
1696 AmountTy.getSizeInBits() < 16;
1697 }, changeTo(1, S16));
1698 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1699 Shifts.clampScalar(1, S32, S32);
1700 Shifts.widenScalarToNextPow2(0, 16);
1701 Shifts.clampScalar(0, S16, S64);
1702
1703 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1704 .minScalar(0, S16)
1705 .scalarize(0)
1706 .lower();
1707 } else {
1708 // Make sure we legalize the shift amount type first, as the general
1709 // expansion for the shifted type will produce much worse code if it hasn't
1710 // been truncated already.
1711 Shifts.clampScalar(1, S32, S32);
1712 Shifts.widenScalarToNextPow2(0, 32);
1713 Shifts.clampScalar(0, S32, S64);
1714
1715 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1716 .minScalar(0, S32)
1717 .scalarize(0)
1718 .lower();
1719 }
1720 Shifts.scalarize(0);
1721
1722 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1723 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1724 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1725 unsigned IdxTypeIdx = 2;
1726
1727 getActionDefinitionsBuilder(Op)
1728 .customIf([=](const LegalityQuery &Query) {
1729 const LLT EltTy = Query.Types[EltTypeIdx];
1730 const LLT VecTy = Query.Types[VecTypeIdx];
1731 const LLT IdxTy = Query.Types[IdxTypeIdx];
1732 const unsigned EltSize = EltTy.getSizeInBits();
            const bool isLegalVecType =
                !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
            // Address space 8 pointers are 128-bit wide values, but the logic
1736 // below will try to bitcast them to 2N x s64, which will fail.
1737 // Therefore, as an intermediate step, wrap extracts/insertions from a
1738 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1739 // extraction result) in order to produce a vector operation that can
1740 // be handled by the logic below.
1741 if (EltTy.isPointer() && EltSize > 64)
1742 return true;
1743 return (EltSize == 32 || EltSize == 64) &&
1744 VecTy.getSizeInBits() % 32 == 0 &&
1745 VecTy.getSizeInBits() <= MaxRegisterSize &&
1746 IdxTy.getSizeInBits() == 32 &&
1747 isLegalVecType;
1748 })
1749 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1750 bitcastToVectorElement32(VecTypeIdx))
1751 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1752 .bitcastIf(
1753 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1754 [=](const LegalityQuery &Query) {
1755 // For > 64-bit element types, try to turn this into a 64-bit
1756 // element vector since we may be able to do better indexing
1757 // if this is scalar. If not, fall back to 32.
1758 const LLT EltTy = Query.Types[EltTypeIdx];
1759 const LLT VecTy = Query.Types[VecTypeIdx];
1760 const unsigned DstEltSize = EltTy.getSizeInBits();
1761 const unsigned VecSize = VecTy.getSizeInBits();
1762
1763 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1764 return std::pair(
1765 VecTypeIdx,
1766 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1767 })
1768 .clampScalar(EltTypeIdx, S32, S64)
1769 .clampScalar(VecTypeIdx, S32, S64)
1770 .clampScalar(IdxTypeIdx, S32, S32)
1771 .clampMaxNumElements(VecTypeIdx, S32, 32)
1772 // TODO: Clamp elements for 64-bit vectors?
1773 .moreElementsIf(
1774 isIllegalRegisterType(VecTypeIdx),
1775 moreElementsToNextExistingRegClass(VecTypeIdx))
1776 // It should only be necessary with variable indexes.
1777 // As a last resort, lower to the stack
1778 .lower();
1779 }
1780
1781 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1782 .unsupportedIf([=](const LegalityQuery &Query) {
1783 const LLT &EltTy = Query.Types[1].getElementType();
1784 return Query.Types[0] != EltTy;
1785 });
1786
1787 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1788 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1789 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1790
1791 // FIXME: Doesn't handle extract of illegal sizes.
1792 getActionDefinitionsBuilder(Op)
1793 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1794 .lowerIf([=](const LegalityQuery &Query) {
1795 // Sub-vector (or single element) insert and extract.
1796 // TODO: verify immediate offset here since lower only works with
1797 // whole elements.
1798 const LLT BigTy = Query.Types[BigTyIdx];
1799 return BigTy.isVector();
1800 })
1801 // FIXME: Multiples of 16 should not be legal.
1802 .legalIf([=](const LegalityQuery &Query) {
1803 const LLT BigTy = Query.Types[BigTyIdx];
1804 const LLT LitTy = Query.Types[LitTyIdx];
1805 return (BigTy.getSizeInBits() % 32 == 0) &&
1806 (LitTy.getSizeInBits() % 16 == 0);
1807 })
1808 .widenScalarIf(
1809 [=](const LegalityQuery &Query) {
1810 const LLT BigTy = Query.Types[BigTyIdx];
1811 return (BigTy.getScalarSizeInBits() < 16);
1812 },
1813 widenScalarOrEltToNextPow2(BigTyIdx, 16))
1814 .widenScalarIf(
1815 [=](const LegalityQuery &Query) {
1816 const LLT LitTy = Query.Types[LitTyIdx];
1817 return (LitTy.getScalarSizeInBits() < 16);
1818 },
1819 widenScalarOrEltToNextPow2(LitTyIdx, 16))
1820 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1821 .widenScalarToNextPow2(BigTyIdx, 32);
1822
1823 }
1824
1825 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1826 .legalForCartesianProduct(AllS32Vectors, {S32})
1827 .legalForCartesianProduct(AllS64Vectors, {S64})
1828 .clampNumElements(0, V16S32, V32S32)
1829 .clampNumElements(0, V2S64, V16S64)
1830 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1831 .moreElementsIf(
1832 isIllegalRegisterType(0),
1833 moreElementsToNextExistingRegClass(0));
1834 
1835 if (ST.hasScalarPackInsts()) {
1836 BuildVector
1837 // FIXME: Should probably widen s1 vectors straight to s32
1838 .minScalarOrElt(0, S16)
1839 .minScalar(1, S16);
1840
1841 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1842 .legalFor({V2S16, S32})
1843 .lower();
1844 } else {
1845 BuildVector.customFor({V2S16, S16});
1846 BuildVector.minScalarOrElt(0, S32);
1847
1848 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1849 .customFor({V2S16, S32})
1850 .lower();
1851 }
1852
1853 BuildVector.legalIf(isRegisterType(0));
1854
1855 // FIXME: Clamp maximum size
1856 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1857 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1858 .clampMaxNumElements(0, S32, 32)
1859 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1860 .clampMaxNumElements(0, S16, 64);
1861
1862 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1863
1864 // Merge/Unmerge
1865 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1866 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1867 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1868
1869 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1870 const LLT Ty = Query.Types[TypeIdx];
1871 if (Ty.isVector()) {
1872 const LLT &EltTy = Ty.getElementType();
1873 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1874 return true;
1875 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1876 return true;
1877 }
1878 return false;
1879 };
1880
1881 auto &Builder = getActionDefinitionsBuilder(Op)
1882 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1883 .lowerFor({{S16, V2S16}})
1884 .lowerIf([=](const LegalityQuery &Query) {
1885 const LLT BigTy = Query.Types[BigTyIdx];
1886 return BigTy.getSizeInBits() == 32;
1887 })
1888 // Try to widen to s16 first for small types.
1889 // TODO: Only do this on targets with legal s16 shifts
1890 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1891 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1892 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1893 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1894 elementTypeIs(1, S16)),
1895 changeTo(1, V2S16))
1896 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1897 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1898 // valid.
1899 .clampScalar(LitTyIdx, S32, S512)
1900 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1901 // Break up vectors with weird elements into scalars
1902 .fewerElementsIf(
1903 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1904 scalarize(0))
1905 .fewerElementsIf(
1906 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1907 scalarize(1))
1908 .clampScalar(BigTyIdx, S32, MaxScalar);
1909
1910 if (Op == G_MERGE_VALUES) {
1911 Builder.widenScalarIf(
1912 // TODO: Use 16-bit shifts if legal for 8-bit values?
1913 [=](const LegalityQuery &Query) {
1914 const LLT Ty = Query.Types[LitTyIdx];
1915 return Ty.getSizeInBits() < 32;
1916 },
1917 changeTo(LitTyIdx, S32));
1918 }
1919
1920 Builder.widenScalarIf(
1921 [=](const LegalityQuery &Query) {
1922 const LLT Ty = Query.Types[BigTyIdx];
1923 return Ty.getSizeInBits() % 16 != 0;
1924 },
1925 [=](const LegalityQuery &Query) {
1926 // Pick the next power of 2, or a multiple of 64 once past 128,
1927 // whichever is smaller.
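// (e.g. a 65-bit wide type widens to 128, 184 bits goes to 192 rather than
// 256, and 300 bits to 320.)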
1928 const LLT &Ty = Query.Types[BigTyIdx];
1929 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1930 if (NewSizeInBits >= 256) {
1931 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1932 if (RoundedTo < NewSizeInBits)
1933 NewSizeInBits = RoundedTo;
1934 }
1935 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1936 })
1937 // Any vectors left are the wrong size. Scalarize them.
1938 .scalarize(0)
1939 .scalarize(1);
1940 }
1941
1942 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1943 // RegBankSelect.
1944 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1945 .legalFor({{S32}, {S64}});
1946
1947 if (ST.hasVOP3PInsts()) {
1948 SextInReg.lowerFor({{V2S16}})
1949 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1950 // get more vector shift opportunities, since we'll get those when
1951 // expanded.
1952 .clampMaxNumElementsStrict(0, S16, 2);
1953 } else if (ST.has16BitInsts()) {
1954 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1955 } else {
1956 // Prefer to promote to s32 before lowering if we don't have 16-bit
1957 // shifts. This avoids a lot of intermediate truncate and extend operations.
1958 SextInReg.lowerFor({{S32}, {S64}});
1959 }
1960
1961 SextInReg
1962 .scalarize(0)
1963 .clampScalar(0, S32, S64)
1964 .lower();
1965
1966 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1967 .scalarize(0)
1968 .lower();
1969
1970 // TODO: Only try to form v2s16 with legal packed instructions.
1971 getActionDefinitionsBuilder(G_FSHR)
1972 .legalFor({{S32, S32}})
1973 .lowerFor({{V2S16, V2S16}})
1974 .clampMaxNumElementsStrict(0, S16, 2)
1975 .scalarize(0)
1976 .lower();
1977
1978 if (ST.hasVOP3PInsts()) {
1979 getActionDefinitionsBuilder(G_FSHL)
1980 .lowerFor({{V2S16, V2S16}})
1981 .clampMaxNumElementsStrict(0, S16, 2)
1982 .scalarize(0)
1983 .lower();
1984 } else {
1985 getActionDefinitionsBuilder(G_FSHL)
1986 .scalarize(0)
1987 .lower();
1988 }
1989
1990 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1991 .legalFor({S64});
1992
1993 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
1994
1995 getActionDefinitionsBuilder(G_FENCE)
1996 .alwaysLegal();
1997
1998 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1999 .scalarize(0)
2000 .minScalar(0, S32)
2001 .lower();
2002
2003 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2004 .legalFor({{S32, S32}, {S64, S32}})
2005 .clampScalar(1, S32, S32)
2006 .clampScalar(0, S32, S64)
2007 .widenScalarToNextPow2(0)
2008 .scalarize(0);
2009
2010 getActionDefinitionsBuilder(
2011 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2012 G_FCOPYSIGN,
2013
2014 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2015 G_READ_REGISTER, G_WRITE_REGISTER,
2016
2017 G_SADDO, G_SSUBO})
2018 .lower();
2019
2020 if (ST.hasIEEEMinMax()) {
2021 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2022 .legalFor(FPTypesPK16)
2023 .clampMaxNumElements(0, S16, 2)
2024 .scalarize(0);
2025 } else {
2026 // TODO: Implement
2027 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2028 }
2029
2030 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2031 .lower();
2032
2033 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2034
2035 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2036 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2037 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2038 .unsupported();
2039
2040 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2041
2042 getLegacyLegalizerInfo().computeTables();
2043 verify(*ST.getInstrInfo());
2044}
2045 
2046 bool AMDGPULegalizerInfo::legalizeCustom(
2047 LegalizerHelper &Helper, MachineInstr &MI,
2048 LostDebugLocObserver &LocObserver) const {
2049 MachineIRBuilder &B = Helper.MIRBuilder;
2050 MachineRegisterInfo &MRI = *B.getMRI();
2051
2052 switch (MI.getOpcode()) {
2053 case TargetOpcode::G_ADDRSPACE_CAST:
2054 return legalizeAddrSpaceCast(MI, MRI, B);
2055 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2056 return legalizeFroundeven(MI, MRI, B);
2057 case TargetOpcode::G_FCEIL:
2058 return legalizeFceil(MI, MRI, B);
2059 case TargetOpcode::G_FREM:
2060 return legalizeFrem(MI, MRI, B);
2061 case TargetOpcode::G_INTRINSIC_TRUNC:
2062 return legalizeIntrinsicTrunc(MI, MRI, B);
2063 case TargetOpcode::G_SITOFP:
2064 return legalizeITOFP(MI, MRI, B, true);
2065 case TargetOpcode::G_UITOFP:
2066 return legalizeITOFP(MI, MRI, B, false);
2067 case TargetOpcode::G_FPTOSI:
2068 return legalizeFPTOI(MI, MRI, B, true);
2069 case TargetOpcode::G_FPTOUI:
2070 return legalizeFPTOI(MI, MRI, B, false);
2071 case TargetOpcode::G_FMINNUM:
2072 case TargetOpcode::G_FMAXNUM:
2073 case TargetOpcode::G_FMINNUM_IEEE:
2074 case TargetOpcode::G_FMAXNUM_IEEE:
2075 return legalizeMinNumMaxNum(Helper, MI);
2076 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2077 return legalizeExtractVectorElt(MI, MRI, B);
2078 case TargetOpcode::G_INSERT_VECTOR_ELT:
2079 return legalizeInsertVectorElt(MI, MRI, B);
2080 case TargetOpcode::G_FSIN:
2081 case TargetOpcode::G_FCOS:
2082 return legalizeSinCos(MI, MRI, B);
2083 case TargetOpcode::G_GLOBAL_VALUE:
2084 return legalizeGlobalValue(MI, MRI, B);
2085 case TargetOpcode::G_LOAD:
2086 case TargetOpcode::G_SEXTLOAD:
2087 case TargetOpcode::G_ZEXTLOAD:
2088 return legalizeLoad(Helper, MI);
2089 case TargetOpcode::G_STORE:
2090 return legalizeStore(Helper, MI);
2091 case TargetOpcode::G_FMAD:
2092 return legalizeFMad(MI, MRI, B);
2093 case TargetOpcode::G_FDIV:
2094 return legalizeFDIV(MI, MRI, B);
2095 case TargetOpcode::G_FFREXP:
2096 return legalizeFFREXP(MI, MRI, B);
2097 case TargetOpcode::G_FSQRT:
2098 return legalizeFSQRT(MI, MRI, B);
2099 case TargetOpcode::G_UDIV:
2100 case TargetOpcode::G_UREM:
2101 case TargetOpcode::G_UDIVREM:
2102 return legalizeUnsignedDIV_REM(MI, MRI, B);
2103 case TargetOpcode::G_SDIV:
2104 case TargetOpcode::G_SREM:
2105 case TargetOpcode::G_SDIVREM:
2106 return legalizeSignedDIV_REM(MI, MRI, B);
2107 case TargetOpcode::G_ATOMIC_CMPXCHG:
2108 return legalizeAtomicCmpXChg(MI, MRI, B);
2109 case TargetOpcode::G_FLOG2:
2110 return legalizeFlog2(MI, B);
2111 case TargetOpcode::G_FLOG:
2112 case TargetOpcode::G_FLOG10:
2113 return legalizeFlogCommon(MI, B);
2114 case TargetOpcode::G_FEXP2:
2115 return legalizeFExp2(MI, B);
2116 case TargetOpcode::G_FEXP:
2117 case TargetOpcode::G_FEXP10:
2118 return legalizeFExp(MI, B);
2119 case TargetOpcode::G_FPOW:
2120 return legalizeFPow(MI, B);
2121 case TargetOpcode::G_FFLOOR:
2122 return legalizeFFloor(MI, MRI, B);
2123 case TargetOpcode::G_BUILD_VECTOR:
2124 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2125 return legalizeBuildVector(MI, MRI, B);
2126 case TargetOpcode::G_MUL:
2127 return legalizeMul(Helper, MI);
2128 case TargetOpcode::G_CTLZ:
2129 case TargetOpcode::G_CTTZ:
2130 return legalizeCTLZ_CTTZ(MI, MRI, B);
2131 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2132 return legalizeFPTruncRound(MI, B);
2133 case TargetOpcode::G_STACKSAVE:
2134 return legalizeStackSave(MI, B);
2135 case TargetOpcode::G_GET_FPENV:
2136 return legalizeGetFPEnv(MI, MRI, B);
2137 case TargetOpcode::G_SET_FPENV:
2138 return legalizeSetFPEnv(MI, MRI, B);
2139 case TargetOpcode::G_TRAP:
2140 return legalizeTrap(MI, MRI, B);
2141 case TargetOpcode::G_DEBUGTRAP:
2142 return legalizeDebugTrap(MI, MRI, B);
2143 default:
2144 return false;
2145 }
2146
2147 llvm_unreachable("expected switch to return");
2148}
2149
2150 Register AMDGPULegalizerInfo::getSegmentAperture(
2151 unsigned AS,
2152 MachineRegisterInfo &MRI,
2153 MachineIRBuilder &B) const {
2154 MachineFunction &MF = B.getMF();
2155 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2156 const LLT S32 = LLT::scalar(32);
2157 const LLT S64 = LLT::scalar(64);
2158
2160
2161 if (ST.hasApertureRegs()) {
2162 // Note: this register is somewhat broken. When used as a 32-bit operand,
2163 // it only returns zeroes. The real value is in the upper 32 bits.
2164 // Thus, we must extract the high 32 bits.
2165 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2166 ? AMDGPU::SRC_SHARED_BASE
2167 : AMDGPU::SRC_PRIVATE_BASE;
2168 // FIXME: It would be more natural to emit a COPY here, but then copy
2169 // coalescing would kick in and it would think it's okay to use the "HI"
2170 // subregister (instead of extracting the HI 32 bits) which is an artificial
2171 // (unusable) register.
2172 // Register TableGen definitions would need an overhaul to get rid of the
2173 // artificial "HI" aperture registers and prevent this kind of issue from
2174 // happening.
2175 Register Dst = MRI.createGenericVirtualRegister(S64);
2176 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2177 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2178 return B.buildUnmerge(S32, Dst).getReg(1);
2179 }
2180
2181 // TODO: can we be smarter about machine pointer info?
2183 Register LoadAddr = MRI.createGenericVirtualRegister(
2185 // For code object version 5, private_base and shared_base are passed through
2186 // implicit kernargs.
2193 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2194
2195 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2197
2198 if (!loadInputValue(KernargPtrReg, B,
2200 return Register();
2201
2203 PtrInfo,
2207
2208 // Pointer address
2209 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2210 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2211 // Load address
2212 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2213 }
2214
2215 Register QueuePtr = MRI.createGenericVirtualRegister(
2216 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2217 
2218 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2219 return Register();
2220
2221 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2222 // private_segment_aperture_base_hi.
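// A rough sketch of the relevant slice of that (runtime-defined) layout:
//   struct amd_queue_t {
//     ...
//     uint32_t group_segment_aperture_base_hi;   // offset 0x40
//     uint32_t private_segment_aperture_base_hi; // offset 0x44
//     ...
//   };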
2223 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2224
2225 MachineMemOperand *MMO = MF.getMachineMemOperand(
2226 PtrInfo,
2227 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2228 MachineMemOperand::MOInvariant,
2229 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2230
2231 B.buildPtrAdd(LoadAddr, QueuePtr,
2232 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2233 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2234}
2235
2236/// Return true if the value is a known valid address, such that a null check is
2237/// not necessary.
2238 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2239 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2240 MachineInstr *Def = MRI.getVRegDef(Val);
2241 switch (Def->getOpcode()) {
2242 case AMDGPU::G_FRAME_INDEX:
2243 case AMDGPU::G_GLOBAL_VALUE:
2244 case AMDGPU::G_BLOCK_ADDR:
2245 return true;
2246 case AMDGPU::G_CONSTANT: {
2247 const ConstantInt *CI = Def->getOperand(1).getCImm();
2248 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2249 }
2250 default:
2251 return false;
2252 }
2253
2254 return false;
2255}
2256
2257 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2258 MachineInstr &MI, MachineRegisterInfo &MRI,
2259 MachineIRBuilder &B) const {
2260 MachineFunction &MF = B.getMF();
2261
2262 // MI can either be a G_ADDRSPACE_CAST or a
2263 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2264 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2265 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2266 Intrinsic::amdgcn_addrspacecast_nonnull));
2267
2268 const LLT S32 = LLT::scalar(32);
2269 Register Dst = MI.getOperand(0).getReg();
2270 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2271 : MI.getOperand(1).getReg();
2272 LLT DstTy = MRI.getType(Dst);
2273 LLT SrcTy = MRI.getType(Src);
2274 unsigned DestAS = DstTy.getAddressSpace();
2275 unsigned SrcAS = SrcTy.getAddressSpace();
2276
2277 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2278 // vector element.
2279 assert(!DstTy.isVector());
2280
2281 const AMDGPUTargetMachine &TM
2282 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2283
2284 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2285 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2286 return true;
2287 }
2288
2289 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2290 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2291 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2292 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2293 // G_ADDRSPACE_CAST we need to guess.
2294 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2295 // Extract low 32-bits of the pointer.
2296 B.buildExtract(Dst, Src, 0);
2297 MI.eraseFromParent();
2298 return true;
2299 }
2300
2301 unsigned NullVal = TM.getNullPointerValue(DestAS);
2302
2303 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2304 auto FlatNull = B.buildConstant(SrcTy, 0);
2305
2306 // Extract low 32-bits of the pointer.
2307 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2308
2309 auto CmpRes =
2310 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2311 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2312
2313 MI.eraseFromParent();
2314 return true;
2315 }
2316
2317 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2318 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2319 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2320 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2321 if (!ApertureReg.isValid())
2322 return false;
2323
2324 // Coerce the type of the low half of the result so we can use merge_values.
2325 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2326
2327 // TODO: Should we allow mismatched types but matching sizes in merges to
2328 // avoid the ptrtoint?
2329 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2330
2331 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2332 // G_ADDRSPACE_CAST we need to guess.
2333 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2334 B.buildCopy(Dst, BuildPtr);
2335 MI.eraseFromParent();
2336 return true;
2337 }
2338
2339 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2340 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2341
2342 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2343 SegmentNull.getReg(0));
2344
2345 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2346
2347 MI.eraseFromParent();
2348 return true;
2349 }
2350
2351 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2352 SrcTy.getSizeInBits() == 64) {
2353 // Truncate.
2354 B.buildExtract(Dst, Src, 0);
2355 MI.eraseFromParent();
2356 return true;
2357 }
2358
2359 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2360 DstTy.getSizeInBits() == 64) {
2361 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2362 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2363 auto PtrLo = B.buildPtrToInt(S32, Src);
2364 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2365 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2366 MI.eraseFromParent();
2367 return true;
2368 }
2369
2370 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2371 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2372
2373 LLVMContext &Ctx = MF.getFunction().getContext();
2374 Ctx.diagnose(InvalidAddrSpaceCast);
2375 B.buildUndef(Dst);
2376 MI.eraseFromParent();
2377 return true;
2378}
2379
2380 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2381 MachineRegisterInfo &MRI,
2382 MachineIRBuilder &B) const {
2383 Register Src = MI.getOperand(1).getReg();
2384 LLT Ty = MRI.getType(Src);
2385 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2386
2387 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2388 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2389
2390 auto C1 = B.buildFConstant(Ty, C1Val);
2391 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2392
2393 // TODO: Should this propagate fast-math-flags?
2394 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2395 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2396
2397 auto C2 = B.buildFConstant(Ty, C2Val);
2398 auto Fabs = B.buildFAbs(Ty, Src);
2399
2400 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2401 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2402 MI.eraseFromParent();
2403 return true;
2404}
2405
2406 bool AMDGPULegalizerInfo::legalizeFceil(MachineInstr &MI,
2407 MachineRegisterInfo &MRI,
2408 MachineIRBuilder &B) const {
2409
2410 const LLT S1 = LLT::scalar(1);
2411 const LLT S64 = LLT::scalar(64);
2412
2413 Register Src = MI.getOperand(1).getReg();
2414 assert(MRI.getType(Src) == S64);
2415
2416 // result = trunc(src)
2417 // if (src > 0.0 && src != result)
2418 // result += 1.0
2419
2420 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2421
2422 const auto Zero = B.buildFConstant(S64, 0.0);
2423 const auto One = B.buildFConstant(S64, 1.0);
2424 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2425 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2426 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2427 auto Add = B.buildSelect(S64, And, One, Zero);
2428
2429 // TODO: Should this propagate fast-math-flags?
2430 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2431 MI.eraseFromParent();
2432 return true;
2433}
2434
2435 bool AMDGPULegalizerInfo::legalizeFrem(MachineInstr &MI,
2436 MachineRegisterInfo &MRI,
2437 MachineIRBuilder &B) const {
2438 Register DstReg = MI.getOperand(0).getReg();
2439 Register Src0Reg = MI.getOperand(1).getReg();
2440 Register Src1Reg = MI.getOperand(2).getReg();
2441 auto Flags = MI.getFlags();
2442 LLT Ty = MRI.getType(DstReg);
2443
2444 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2445 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2446 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2447 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2448 MI.eraseFromParent();
2449 return true;
2450}
2451
2452 static MachineInstrBuilder extractF64Exponent(Register Hi,
2453 MachineIRBuilder &B) {
2454 const unsigned FractBits = 52;
2455 const unsigned ExpBits = 11;
2456 LLT S32 = LLT::scalar(32);
2457
2458 auto Const0 = B.buildConstant(S32, FractBits - 32);
2459 auto Const1 = B.buildConstant(S32, ExpBits);
2460
2461 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2462 .addUse(Hi)
2463 .addUse(Const0.getReg(0))
2464 .addUse(Const1.getReg(0));
2465
2466 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2467}
2468
2469 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(MachineInstr &MI,
2470 MachineRegisterInfo &MRI,
2471 MachineIRBuilder &B) const {
2472 const LLT S1 = LLT::scalar(1);
2473 const LLT S32 = LLT::scalar(32);
2474 const LLT S64 = LLT::scalar(64);
2475
2476 Register Src = MI.getOperand(1).getReg();
2477 assert(MRI.getType(Src) == S64);
2478
2479 // TODO: Should this use extract since the low half is unused?
2480 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2481 Register Hi = Unmerge.getReg(1);
2482
2483 // Extract the upper half, since this is where we will find the sign and
2484 // exponent.
2485 auto Exp = extractF64Exponent(Hi, B);
2486
2487 const unsigned FractBits = 52;
2488
2489 // Extract the sign bit.
2490 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2491 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2492
2493 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2494
2495 const auto Zero32 = B.buildConstant(S32, 0);
2496
2497 // Extend back to 64-bits.
2498 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2499
2500 auto Shr = B.buildAShr(S64, FractMask, Exp);
2501 auto Not = B.buildNot(S64, Shr);
2502 auto Tmp0 = B.buildAnd(S64, Src, Not);
2503 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2504
2505 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2506 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2507
2508 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2509 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2510 MI.eraseFromParent();
2511 return true;
2512}
2513
2514 bool AMDGPULegalizerInfo::legalizeITOFP(MachineInstr &MI,
2515 MachineRegisterInfo &MRI,
2516 MachineIRBuilder &B, bool Signed) const {
2517
2518 Register Dst = MI.getOperand(0).getReg();
2519 Register Src = MI.getOperand(1).getReg();
2520
2521 const LLT S64 = LLT::scalar(64);
2522 const LLT S32 = LLT::scalar(32);
2523
2524 assert(MRI.getType(Src) == S64);
2525
2526 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2527 auto ThirtyTwo = B.buildConstant(S32, 32);
2528
2529 if (MRI.getType(Dst) == S64) {
2530 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2531 : B.buildUITOFP(S64, Unmerge.getReg(1));
2532
2533 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2534 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2535
2536 // TODO: Should this propagate fast-math-flags?
2537 B.buildFAdd(Dst, LdExp, CvtLo);
2538 MI.eraseFromParent();
2539 return true;
2540 }
2541
2542 assert(MRI.getType(Dst) == S32);
2543
2544 auto One = B.buildConstant(S32, 1);
2545
2546 MachineInstrBuilder ShAmt;
2547 if (Signed) {
2548 auto ThirtyOne = B.buildConstant(S32, 31);
2549 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2550 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2551 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2552 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2553 .addUse(Unmerge.getReg(1));
2554 auto LS2 = B.buildSub(S32, LS, One);
2555 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2556 } else
2557 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2558 auto Norm = B.buildShl(S64, Src, ShAmt);
2559 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2560 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2561 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2562 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2563 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2564 B.buildFLdexp(Dst, FVal, Scale);
2565 MI.eraseFromParent();
2566 return true;
2567}
2568
2569// TODO: Copied from DAG implementation. Verify logic and document how this
2570// actually works.
2571 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2572 MachineRegisterInfo &MRI,
2573 MachineIRBuilder &B,
2574 bool Signed) const {
2575
2576 Register Dst = MI.getOperand(0).getReg();
2577 Register Src = MI.getOperand(1).getReg();
2578
2579 const LLT S64 = LLT::scalar(64);
2580 const LLT S32 = LLT::scalar(32);
2581
2582 const LLT SrcLT = MRI.getType(Src);
2583 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2584
2585 unsigned Flags = MI.getFlags();
2586
2587 // The basic idea of converting a floating point number into a pair of 32-bit
2588 // integers is illustrated as follows:
2589 //
2590 // tf := trunc(val);
2591 // hif := floor(tf * 2^-32);
2592 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2593 // hi := fptoi(hif);
2594 // lo := fptoi(lof);
2595 //
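// Worked example (illustrative): for val = 2^33 + 7,
//   tf  = 2^33 + 7
//   hif = floor(tf * 2^-32) = 2
//   lof = tf - hif * 2^32   = 7
// so hi = 2 and lo = 7, which re-merge to the 64-bit value 0x0000000200000007.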
2596 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2597 MachineInstrBuilder Sign;
2598 if (Signed && SrcLT == S32) {
2599 // However, a 32-bit floating point number has only a 23-bit mantissa,
2600 // which is not enough to hold all the significant bits of `lof` if val is
2601 // negative. To avoid the loss of precision, we need to take the absolute
2602 // value after truncating and flip the result back based on the original
2603 // signedness.
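// (With s = x >> 31, (m ^ s) - s yields m when s is 0 and -m when s is -1,
// i.e. a branchless two's-complement conditional negate; see the use below.)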
2604 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2605 Trunc = B.buildFAbs(S32, Trunc, Flags);
2606 }
2607 MachineInstrBuilder K0, K1;
2608 if (SrcLT == S64) {
2609 K0 = B.buildFConstant(
2610 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2611 K1 = B.buildFConstant(
2612 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2613 } else {
2614 K0 = B.buildFConstant(
2615 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2616 K1 = B.buildFConstant(
2617 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2618 }
2619
2620 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2621 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2622 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2623
2624 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2625 : B.buildFPTOUI(S32, FloorMul);
2626 auto Lo = B.buildFPTOUI(S32, Fma);
2627
2628 if (Signed && SrcLT == S32) {
2629 // Flip the result based on the signedness, which is either all 0s or 1s.
2630 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2631 // r := xor({lo, hi}, sign) - sign;
2632 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2633 Sign);
2634 } else
2635 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2636 MI.eraseFromParent();
2637
2638 return true;
2639}
2640
2641 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2642 MachineInstr &MI) const {
2643 MachineFunction &MF = Helper.MIRBuilder.getMF();
2644 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2645 
2646 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2647 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2648
2649 // With ieee_mode disabled, the instructions have the correct behavior
2650 // already for G_FMINNUM/G_FMAXNUM
2651 if (!MFI->getMode().IEEE)
2652 return !IsIEEEOp;
2653
2654 if (IsIEEEOp)
2655 return true;
2656
2657 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2658}
2659
2660 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2661 MachineInstr &MI, MachineRegisterInfo &MRI,
2662 MachineIRBuilder &B) const {
2663 // TODO: Should move some of this into LegalizerHelper.
2664
2665 // TODO: Promote dynamic indexing of s16 to s32
2666
2667 Register Dst = MI.getOperand(0).getReg();
2668 Register Vec = MI.getOperand(1).getReg();
2669
2670 LLT VecTy = MRI.getType(Vec);
2671 LLT EltTy = VecTy.getElementType();
2672 assert(EltTy == MRI.getType(Dst));
2673
2674 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2675 // but we can't go directly to that logic because you can't bitcast a vector
2676 // of pointers to a vector of integers. Therefore, introduce an intermediate
2677 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2678 // drive the legalization forward.
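// For illustration, an extract from a vector of 128-bit (addrspace(8))
// pointers is rewritten roughly as:
//   %iv:<N x s128> = G_PTRTOINT %vec
//   %ie:s128       = G_EXTRACT_VECTOR_ELT %iv, %idx
//   %elt:p8        = G_INTTOPTR %ie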
2679 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2680 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2681 LLT IntVecTy = VecTy.changeElementType(IntTy);
2682
2683 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2684 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2685 B.buildIntToPtr(Dst, IntElt);
2686
2687 MI.eraseFromParent();
2688 return true;
2689 }
2690
2691 // FIXME: Artifact combiner probably should have replaced the truncated
2692 // constant before this, so we shouldn't need
2693 // getIConstantVRegValWithLookThrough.
2694 std::optional<ValueAndVReg> MaybeIdxVal =
2695 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2696 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2697 return true;
2698 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2699
2700 if (IdxVal < VecTy.getNumElements()) {
2701 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2702 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2703 } else {
2704 B.buildUndef(Dst);
2705 }
2706
2707 MI.eraseFromParent();
2708 return true;
2709}
2710
2711 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2712 MachineInstr &MI, MachineRegisterInfo &MRI,
2713 MachineIRBuilder &B) const {
2714 // TODO: Should move some of this into LegalizerHelper.
2715
2716 // TODO: Promote dynamic indexing of s16 to s32
2717
2718 Register Dst = MI.getOperand(0).getReg();
2719 Register Vec = MI.getOperand(1).getReg();
2720 Register Ins = MI.getOperand(2).getReg();
2721
2722 LLT VecTy = MRI.getType(Vec);
2723 LLT EltTy = VecTy.getElementType();
2724 assert(EltTy == MRI.getType(Ins));
2725
2726 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2727 // but we can't go directly to that logic because you can't bitcast a vector
2728 // of pointers to a vector of integers. Therefore, make the pointer vector
2729 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2730 // new value, and then inttoptr the result vector back. This will then allow
2731 // the rest of legalization to take over.
2732 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2733 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2734 LLT IntVecTy = VecTy.changeElementType(IntTy);
2735
2736 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2737 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2738 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2739 MI.getOperand(3));
2740 B.buildIntToPtr(Dst, IntVecDest);
2741 MI.eraseFromParent();
2742 return true;
2743 }
2744
2745 // FIXME: Artifact combiner probably should have replaced the truncated
2746 // constant before this, so we shouldn't need
2747 // getIConstantVRegValWithLookThrough.
2748 std::optional<ValueAndVReg> MaybeIdxVal =
2749 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2750 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2751 return true;
2752
2753 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2754
2755 unsigned NumElts = VecTy.getNumElements();
2756 if (IdxVal < NumElts) {
2757 SmallVector<Register, 8> SrcRegs;
2758 for (unsigned i = 0; i < NumElts; ++i)
2759 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2760 B.buildUnmerge(SrcRegs, Vec);
2761
2762 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2763 B.buildMergeLikeInstr(Dst, SrcRegs);
2764 } else {
2765 B.buildUndef(Dst);
2766 }
2767
2768 MI.eraseFromParent();
2769 return true;
2770}
2771
2772 bool AMDGPULegalizerInfo::legalizeSinCos(
2773 MachineInstr &MI, MachineRegisterInfo &MRI,
2774 MachineIRBuilder &B) const {
2775
2776 Register DstReg = MI.getOperand(0).getReg();
2777 Register SrcReg = MI.getOperand(1).getReg();
2778 LLT Ty = MRI.getType(DstReg);
2779 unsigned Flags = MI.getFlags();
2780
2781 Register TrigVal;
2782 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2783 if (ST.hasTrigReducedRange()) {
2784 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2785 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2786 .addUse(MulVal.getReg(0))
2787 .setMIFlags(Flags)
2788 .getReg(0);
2789 } else
2790 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2791
2792 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2793 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2794 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2795 .addUse(TrigVal)
2796 .setMIFlags(Flags);
2797 MI.eraseFromParent();
2798 return true;
2799}
2800
2801 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2802 MachineIRBuilder &B,
2803 const GlobalValue *GV,
2804 int64_t Offset,
2805 unsigned GAFlags) const {
2806 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2807 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2808 // to the following code sequence:
2809 //
2810 // For constant address space:
2811 // s_getpc_b64 s[0:1]
2812 // s_add_u32 s0, s0, $symbol
2813 // s_addc_u32 s1, s1, 0
2814 //
2815 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2816 // a fixup or relocation is emitted to replace $symbol with a literal
2817 // constant, which is a pc-relative offset from the encoding of the $symbol
2818 // operand to the global variable.
2819 //
2820 // For global address space:
2821 // s_getpc_b64 s[0:1]
2822 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2823 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2824 //
2825 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2826 // fixups or relocations are emitted to replace $symbol@*@lo and
2827 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2828 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2829 // operand to the global variable.
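// For the GOT variants the address formed here is that of the GOT slot rather
// than of the symbol itself, so legalizeGlobalValue() below follows it with a
// load from constant address space; the end result is roughly:
//   s_getpc_b64     s[0:1]
//   s_add_u32       s0, s0, $symbol@gotpcrel32@lo
//   s_addc_u32      s1, s1, $symbol@gotpcrel32@hi
//   s_load_dwordx2  s[0:1], s[0:1], 0x0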
2830
2831 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2832 
2833 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2834 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2835
2836 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2837 .addDef(PCReg);
2838
2839 MIB.addGlobalAddress(GV, Offset, GAFlags);
2840 if (GAFlags == SIInstrInfo::MO_NONE)
2841 MIB.addImm(0);
2842 else
2843 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2844
2845 if (!B.getMRI()->getRegClassOrNull(PCReg))
2846 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2847
2848 if (PtrTy.getSizeInBits() == 32)
2849 B.buildExtract(DstReg, PCReg, 0);
2850 return true;
2851}
2852
2853 // Emit an ABS32_LO / ABS32_HI relocation stub.
2854 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2855 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2856 MachineRegisterInfo &MRI) const {
2857 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2858
2859 LLT S32 = LLT::scalar(32);
2860
2861 // Use the destination directly, if and only if we store the lower address
2862 // part only and we don't have a register class being set.
2863 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2864 ? DstReg
2865 : MRI.createGenericVirtualRegister(S32);
2866
2867 if (!MRI.getRegClassOrNull(AddrLo))
2868 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2869
2870 // Write the lower half.
2871 B.buildInstr(AMDGPU::S_MOV_B32)
2872 .addDef(AddrLo)
2873 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2874
2875 // If required, write the upper half as well.
2876 if (RequiresHighHalf) {
2877 assert(PtrTy.getSizeInBits() == 64 &&
2878 "Must provide a 64-bit pointer type!");
2879
2880 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2881 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2882
2883 B.buildInstr(AMDGPU::S_MOV_B32)
2884 .addDef(AddrHi)
2885 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2886
2887 // Use the destination directly, if and only if we don't have a register
2888 // class being set.
2889 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2890 ? DstReg
2891 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2892
2893 if (!MRI.getRegClassOrNull(AddrDst))
2894 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2895
2896 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2897
2898 // If we created a new register for the destination, cast the result into
2899 // the final output.
2900 if (AddrDst != DstReg)
2901 B.buildCast(DstReg, AddrDst);
2902 } else if (AddrLo != DstReg) {
2903 // If we created a new register for the destination, cast the result into
2904 // the final output.
2905 B.buildCast(DstReg, AddrLo);
2906 }
2907}
2908
2908 
2909 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2910 MachineInstr &MI, MachineRegisterInfo &MRI,
2911 MachineIRBuilder &B) const {
2912 Register DstReg = MI.getOperand(0).getReg();
2913 LLT Ty = MRI.getType(DstReg);
2914 unsigned AS = Ty.getAddressSpace();
2915
2916 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2917 MachineFunction &MF = B.getMF();
2918 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2919 
2920 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2921 if (!MFI->isModuleEntryFunction() &&
2922 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2923 const Function &Fn = MF.getFunction();
2924 DiagnosticInfoUnsupported BadLDSDecl(
2925 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2926 DS_Warning);
2927 Fn.getContext().diagnose(BadLDSDecl);
2928
2929 // We currently don't have a way to correctly allocate LDS objects that
2930 // aren't directly associated with a kernel. We do force inlining of
2931 // functions that use local objects. However, if these dead functions are
2932 // not eliminated, we don't want a compile time error. Just emit a warning
2933 // and a trap, since there should be no callable path here.
2934 B.buildTrap();
2935 B.buildUndef(DstReg);
2936 MI.eraseFromParent();
2937 return true;
2938 }
2939
2940 // TODO: We could emit code to handle the initialization somewhere.
2941 // We ignore the initializer for now and legalize it to allow selection.
2942 // The initializer will be rejected during assembly emission anyway.
2943 const SITargetLowering *TLI = ST.getTargetLowering();
2944 if (!TLI->shouldUseLDSConstAddress(GV)) {
2945 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2946 return true; // Leave in place;
2947 }
2948
2949 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2950 Type *Ty = GV->getValueType();
2951 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2952 // zero-sized type in other languages to declare the dynamic shared
2953 // memory whose size is not known at compile time. It will be
2954 // allocated by the runtime and placed directly after the statically
2955 // allocated ones. They all share the same offset.
2956 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2957 // Adjust alignment for that dynamic shared memory array.
2958 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2959 LLT S32 = LLT::scalar(32);
2960 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2961 B.buildIntToPtr(DstReg, Sz);
2962 MI.eraseFromParent();
2963 return true;
2964 }
2965 }
2966
2967 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2968 *cast<GlobalVariable>(GV)));
2969 MI.eraseFromParent();
2970 return true;
2971 }
2972
2973 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2974 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
2975 MI.eraseFromParent();
2976 return true;
2977 }
2978
2979 const SITargetLowering *TLI = ST.getTargetLowering();
2980
2981 if (TLI->shouldEmitFixup(GV)) {
2982 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2983 MI.eraseFromParent();
2984 return true;
2985 }
2986
2987 if (TLI->shouldEmitPCReloc(GV)) {
2988 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2989 MI.eraseFromParent();
2990 return true;
2991 }
2992
2993 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2994 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2995
2996 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2997 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2998 MachinePointerInfo::getGOT(MF),
2999 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3000 MachineMemOperand::MOInvariant,
3001 LoadTy, Align(8));
3002
3003 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3004
3005 if (Ty.getSizeInBits() == 32) {
3006 // Truncate if this is a 32-bit constant address.
3007 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3008 B.buildExtract(DstReg, Load, 0);
3009 } else
3010 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3011
3012 MI.eraseFromParent();
3013 return true;
3014}
3015
3016 static LLT widenToNextPowerOf2(LLT Ty) {
3017 if (Ty.isVector())
3018 return Ty.changeElementCount(
3019 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3020 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3021}
3022
3023 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3024 MachineInstr &MI) const {
3025 MachineIRBuilder &B = Helper.MIRBuilder;
3026 MachineRegisterInfo &MRI = *B.getMRI();
3027 GISelChangeObserver &Observer = Helper.Observer;
3028
3029 Register PtrReg = MI.getOperand(1).getReg();
3030 LLT PtrTy = MRI.getType(PtrReg);
3031 unsigned AddrSpace = PtrTy.getAddressSpace();
3032
3033 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3034 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3035 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3036 Observer.changingInstr(MI);
3037 MI.getOperand(1).setReg(Cast.getReg(0));
3038 Observer.changedInstr(MI);
3039 return true;
3040 }
3041
3042 if (MI.getOpcode() != AMDGPU::G_LOAD)
3043 return false;
3044
3045 Register ValReg = MI.getOperand(0).getReg();
3046 LLT ValTy = MRI.getType(ValReg);
3047
3048 if (hasBufferRsrcWorkaround(ValTy)) {
3049 Observer.changingInstr(MI);
3051 Observer.changedInstr(MI);
3052 return true;
3053 }
3054
3055 MachineMemOperand *MMO = *MI.memoperands_begin();
3056 const unsigned ValSize = ValTy.getSizeInBits();
3057 const LLT MemTy = MMO->getMemoryType();
3058 const Align MemAlign = MMO->getAlign();
3059 const unsigned MemSize = MemTy.getSizeInBits();
3060 const uint64_t AlignInBits = 8 * MemAlign.value();
3061
3062 // Widen non-power-of-2 loads to the alignment if needed
3063 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3064 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3065
3066 // This was already the correct extending load result type, so just adjust
3067 // the memory type.
3068 if (WideMemSize == ValSize) {
3069 MachineFunction &MF = B.getMF();
3070
3071 MachineMemOperand *WideMMO =
3072 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3073 Observer.changingInstr(MI);
3074 MI.setMemRefs(MF, {WideMMO});
3075 Observer.changedInstr(MI);
3076 return true;
3077 }
3078
3079 // Don't bother handling an edge case that should probably never be produced.
3080 if (ValSize > WideMemSize)
3081 return false;
3082
3083 LLT WideTy = widenToNextPowerOf2(ValTy);
3084
3085 Register WideLoad;
3086 if (!WideTy.isVector()) {
3087 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3088 B.buildTrunc(ValReg, WideLoad).getReg(0);
3089 } else {
3090 // Extract the subvector.
3091
3092 if (isRegisterType(ValTy)) {
3093 // If this a case where G_EXTRACT is legal, use it.
3094 // (e.g. <3 x s32> -> <4 x s32>)
3095 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3096 B.buildExtract(ValReg, WideLoad, 0);
3097 } else {
3098 // For cases where the widened type isn't a nice register value, unmerge
3099 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3100 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3101 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3102 }
3103 }
3104
3105 MI.eraseFromParent();
3106 return true;
3107 }
3108
3109 return false;
3110}
3111
3112 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3113 MachineInstr &MI) const {
3114 MachineIRBuilder &B = Helper.MIRBuilder;
3115 MachineRegisterInfo &MRI = *B.getMRI();
3116 GISelChangeObserver &Observer = Helper.Observer;
3117
3118 Register DataReg = MI.getOperand(0).getReg();
3119 LLT DataTy = MRI.getType(DataReg);
3120
3121 if (hasBufferRsrcWorkaround(DataTy)) {
3122 Observer.changingInstr(MI);
3124 Observer.changedInstr(MI);
3125 return true;
3126 }
3127 return false;
3128}
3129
3130 bool AMDGPULegalizerInfo::legalizeFMad(
3131 MachineInstr &MI, MachineRegisterInfo &MRI,
3132 MachineIRBuilder &B) const {
3133 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3134 assert(Ty.isScalar());
3135
3136 MachineFunction &MF = B.getMF();
3137 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3138 
3139 // TODO: Always legal with future ftz flag.
3140 // FIXME: Do we need just output?
3141 if (Ty == LLT::float32() &&
3142 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3143 return true;
3144 if (Ty == LLT::float16() &&
3145 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3146 return true;
3147
3148 MachineIRBuilder HelperBuilder(MI);
3149 GISelObserverWrapper DummyObserver;
3150 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3151 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3152}
3153
3154 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3155 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3156 Register DstReg = MI.getOperand(0).getReg();
3157 Register PtrReg = MI.getOperand(1).getReg();
3158 Register CmpVal = MI.getOperand(2).getReg();
3159 Register NewVal = MI.getOperand(3).getReg();
3160
3161 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3162 "this should not have been custom lowered");
3163
3164 LLT ValTy = MRI.getType(CmpVal);
3165 LLT VecTy = LLT::fixed_vector(2, ValTy);
3166
3167 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3168
3169 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3170 .addDef(DstReg)
3171 .addUse(PtrReg)
3172 .addUse(PackedVal)
3173 .setMemRefs(MI.memoperands());
3174
3175 MI.eraseFromParent();
3176 return true;
3177}
3178
3179/// Return true if it's known that \p Src can never be an f32 denormal value.
3180 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3181 Register Src) {
3182 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3183 switch (DefMI->getOpcode()) {
3184 case TargetOpcode::G_INTRINSIC: {
3185 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3186 case Intrinsic::amdgcn_frexp_mant:
3187 return true;
3188 default:
3189 break;
3190 }
3191
3192 break;
3193 }
3194 case TargetOpcode::G_FFREXP: {
3195 if (DefMI->getOperand(0).getReg() == Src)
3196 return true;
3197 break;
3198 }
3199 case TargetOpcode::G_FPEXT: {
3200 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3201 }
3202 default:
3203 return false;
3204 }
3205
3206 return false;
3207}
3208
3209static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3210 if (Flags & MachineInstr::FmAfn)
3211 return true;
3212 const auto &Options = MF.getTarget().Options;
3213 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3214}
3215
3216 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3217 unsigned Flags) {
3218 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3219 MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3220 DenormalMode::PreserveSign;
3221}
3222
3223std::pair<Register, Register>
3224 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3225 unsigned Flags) const {
3226 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3227 return {};
3228
3229 const LLT F32 = LLT::scalar(32);
3230 auto SmallestNormal = B.buildFConstant(
3231 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3232 auto IsLtSmallestNormal =
3233 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3234
3235 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3236 auto One = B.buildFConstant(F32, 1.0);
3237 auto ScaleFactor =
3238 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3239 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3240
3241 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3242}
3243
3244 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3245 MachineIRBuilder &B) const {
3246 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3247 // If we have to handle denormals, scale up the input and adjust the result.
3248
3249 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3250 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
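// (This relies on log2(x * 2^32) == log2(x) + 32, so subtracting 32 afterwards
// undoes the scaling applied to denormal inputs.)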
3251
3252 Register Dst = MI.getOperand(0).getReg();
3253 Register Src = MI.getOperand(1).getReg();
3254 LLT Ty = B.getMRI()->getType(Dst);
3255 unsigned Flags = MI.getFlags();
3256
3257 if (Ty == LLT::scalar(16)) {
3258 const LLT F32 = LLT::scalar(32);
3259 // Nothing in half is a denormal when promoted to f32.
3260 auto Ext = B.buildFPExt(F32, Src, Flags);
3261 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3262 .addUse(Ext.getReg(0))
3263 .setMIFlags(Flags);
3264 B.buildFPTrunc(Dst, Log2, Flags);
3265 MI.eraseFromParent();
3266 return true;
3267 }
3268
3269 assert(Ty == LLT::scalar(32));
3270
3271 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3272 if (!ScaledInput) {
3273 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3274 .addUse(Src)
3275 .setMIFlags(Flags);
3276 MI.eraseFromParent();
3277 return true;
3278 }
3279
3280 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3281 .addUse(ScaledInput)
3282 .setMIFlags(Flags);
3283
3284 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3285 auto Zero = B.buildFConstant(Ty, 0.0);
3286 auto ResultOffset =
3287 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3288 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3289
3290 MI.eraseFromParent();
3291 return true;
3292}
3293
3295 Register Z, unsigned Flags) {
3296 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3297 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3298}
3299
3300 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3301 MachineIRBuilder &B) const {
3302 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3303 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3304
3305 MachineRegisterInfo &MRI = *B.getMRI();
3306 Register Dst = MI.getOperand(0).getReg();
3307 Register X = MI.getOperand(1).getReg();
3308 unsigned Flags = MI.getFlags();
3309 const LLT Ty = MRI.getType(X);
3310 MachineFunction &MF = B.getMF();
3311
3312 const LLT F32 = LLT::scalar(32);
3313 const LLT F16 = LLT::scalar(16);
3314
3315 const AMDGPUTargetMachine &TM =
3316 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3317
3318 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3319 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3320 if (Ty == F16 && !ST.has16BitInsts()) {
3321 Register LogVal = MRI.createGenericVirtualRegister(F32);
3322 auto PromoteSrc = B.buildFPExt(F32, X);
3323 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3324 B.buildFPTrunc(Dst, LogVal);
3325 } else {
3326 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3327 }
3328
3329 MI.eraseFromParent();
3330 return true;
3331 }
3332
3333 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3334 if (ScaledInput)
3335 X = ScaledInput;
3336
3337 auto Y =
3338 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3339
3340 Register R;
3341 if (ST.hasFastFMAF32()) {
3342 // c+cc are ln(2)/ln(10) to more than 49 bits
3343 const float c_log10 = 0x1.344134p-2f;
3344 const float cc_log10 = 0x1.09f79ep-26f;
3345
3346 // c + cc is ln(2) to more than 49 bits
3347 const float c_log = 0x1.62e42ep-1f;
3348 const float cc_log = 0x1.efa39ep-25f;
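// Splitting the constant into c + cc lets the FMAs below recover the rounding
// error of the first product, i.e. effectively:
//   r  = y * c
//   r += fma(y, cc, fma(y, c, -r))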
3349
3350 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3351 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3352
3353 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3354 auto NegR = B.buildFNeg(Ty, R, Flags);
3355 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3356 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3357 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3358 } else {
3359 // ch+ct is ln(2)/ln(10) to more than 36 bits
3360 const float ch_log10 = 0x1.344000p-2f;
3361 const float ct_log10 = 0x1.3509f6p-18f;
3362
3363 // ch + ct is ln(2) to more than 36 bits
3364 const float ch_log = 0x1.62e000p-1f;
3365 const float ct_log = 0x1.0bfbe8p-15f;
3366
3367 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3368 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3369
3370 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3371 auto YH = B.buildAnd(Ty, Y, MaskConst);
3372 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3373 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3374
3375 Register Mad0 =
3376 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3377 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3378 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3379 }
3380
3381 const bool IsFiniteOnly =
3382 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3383 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3384
3385 if (!IsFiniteOnly) {
3386 // Expand isfinite(x) => fabs(x) < inf
3387 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3388 auto Fabs = B.buildFAbs(Ty, Y);
3389 auto IsFinite =
3390 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3391 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3392 }
3393
3394 if (ScaledInput) {
3395 auto Zero = B.buildFConstant(Ty, 0.0);
3396 auto ShiftK =
3397 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3398 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3399 B.buildFSub(Dst, R, Shift, Flags);
3400 } else {
3401 B.buildCopy(Dst, R);
3402 }
3403
3404 MI.eraseFromParent();
3405 return true;
3406}
3407
3408 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3409 Register Src, bool IsLog10,
3410 unsigned Flags) const {
3411 const double Log2BaseInverted =
3412 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3413 
3414 LLT Ty = B.getMRI()->getType(Dst);
3415
3416 if (Ty == LLT::scalar(32)) {
3417 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3418 if (ScaledInput) {
3419 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3420 .addUse(Src)
3421 .setMIFlags(Flags);
3422 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3423 auto Zero = B.buildFConstant(Ty, 0.0);
3424 auto ResultOffset =
3425 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3426 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3427
3428 if (ST.hasFastFMAF32())
3429 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3430 else {
3431 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3432 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3433 }
3434
3435 return true;
3436 }
3437 }
3438
3439 auto Log2Operand = Ty == LLT::scalar(16)
3440 ? B.buildFLog2(Ty, Src, Flags)
3441 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3442 .addUse(Src)
3443 .setMIFlags(Flags);
3444 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3445 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3446 return true;
3447}
3448
3449 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3450 MachineIRBuilder &B) const {
3451 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3452 // If we have to handle denormals, scale up the input and adjust the result.
3453
3454 Register Dst = MI.getOperand(0).getReg();
3455 Register Src = MI.getOperand(1).getReg();
3456 unsigned Flags = MI.getFlags();
3457 LLT Ty = B.getMRI()->getType(Dst);
3458 const LLT F16 = LLT::scalar(16);
3459 const LLT F32 = LLT::scalar(32);
3460
3461 if (Ty == F16) {
3462 // Nothing in half is a denormal when promoted to f32.
3463 auto Ext = B.buildFPExt(F32, Src, Flags);
3464 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3465 .addUse(Ext.getReg(0))
3466 .setMIFlags(Flags);
3467 B.buildFPTrunc(Dst, Log2, Flags);
3468 MI.eraseFromParent();
3469 return true;
3470 }
3471
3472 assert(Ty == F32);
3473
3474 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3475 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3476 .addUse(Src)
3477 .setMIFlags(Flags);
3478 MI.eraseFromParent();
3479 return true;
3480 }
3481
3482 // bool needs_scaling = x < -0x1.f80000p+6f;
3483 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
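// (This relies on exp2(x + 64) * 0x1.0p-64 == exp2(x); adding 64 first keeps
// the intrinsic's intermediate result out of the denormal range for inputs
// just below the cutoff.)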
3484
3485 // -nextafter(128.0, -1)
3486 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3487 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3488 RangeCheckConst, Flags);
3489
3490 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3491 auto Zero = B.buildFConstant(Ty, 0.0);
3492 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3493 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3494
3495 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3496 .addUse(AddInput.getReg(0))
3497 .setMIFlags(Flags);
3498
3499 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3500 auto One = B.buildFConstant(Ty, 1.0);
3501 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3502 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3503 MI.eraseFromParent();
3504 return true;
3505}
3506
3507bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3508 Register X, unsigned Flags) const {
3509 LLT Ty = B.getMRI()->getType(Dst);
3510 LLT F32 = LLT::scalar(32);
3511
3512 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3513 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3514 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3515
3516 if (Ty == F32) {
3517 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3518 .addUse(Mul.getReg(0))
3519 .setMIFlags(Flags);
3520 } else {
3521 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3522 }
3523
3524 return true;
3525 }
3526
3527 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3528 auto NeedsScaling =
3529 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3530 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3531 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3532 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3533
3534 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3535 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3536
3537 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3538 .addUse(ExpInput.getReg(0))
3539 .setMIFlags(Flags);
3540
3541 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3542 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3543 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3544 return true;
3545}
3546
3547bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3548 MachineIRBuilder &B) const {
3549 Register Dst = MI.getOperand(0).getReg();
3550 Register X = MI.getOperand(1).getReg();
3551 const unsigned Flags = MI.getFlags();
3552 MachineFunction &MF = B.getMF();
3553 MachineRegisterInfo &MRI = *B.getMRI();
3554 LLT Ty = MRI.getType(Dst);
3555 const LLT F16 = LLT::scalar(16);
3556 const LLT F32 = LLT::scalar(32);
3557 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3558
3559 if (Ty == F16) {
3560 // v_exp_f16 (fmul x, log2e)
3561 if (allowApproxFunc(MF, Flags)) {
3562 // TODO: Does this really require fast?
3563 legalizeFExpUnsafe(B, Dst, X, Flags);
3564 MI.eraseFromParent();
3565 return true;
3566 }
3567
3568 // exp(f16 x) ->
3569 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3570
3571 // Nothing in half is a denormal when promoted to f32.
3572 auto Ext = B.buildFPExt(F32, X, Flags);
3573 Register Lowered = MRI.createGenericVirtualRegister(F32);
3574 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3575 B.buildFPTrunc(Dst, Lowered, Flags);
3576 MI.eraseFromParent();
3577 return true;
3578 }
3579
3580 assert(Ty == F32);
3581
3582 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3583 // library behavior. Also, is known-not-daz source sufficient?
3584 if (allowApproxFunc(MF, Flags)) {
3585 legalizeFExpUnsafe(B, Dst, X, Flags);
3586 MI.eraseFromParent();
3587 return true;
3588 }
3589
3590 // Algorithm:
3591 //
3592 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3593 //
3594 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3595 // n = 64*m + j, 0 <= j < 64
3596 //
3597 // e^x = 2^((64*m + j + f)/64)
3598 // = (2^m) * (2^(j/64)) * 2^(f/64)
3599 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3600 //
3601 // f = x*(64/ln(2)) - n
3602 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3603 //
3604 // e^x = (2^m) * (2^(j/64)) * e^r
3605 //
3606 // (2^(j/64)) is precomputed
3607 //
3608 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3609 // e^r = 1 + q
3610 //
3611 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3612 //
3613 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
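// The lowering below folds the 2**(j/64) table away; in scalar terms it is
// roughly (with c = log2(e), or log2(10) for exp10):
//   float ph = x * c, pl = /* rounding error of x * c */;
//   float e = roundeven(ph);
//   float r = ldexp(exp2((ph - e) + pl), (int)e);
// followed by clamping r to 0.0 / +inf past the under/overflow bounds.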
3614 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3615 Register PH, PL;
3616
3617 if (ST.hasFastFMAF32()) {
3618 const float c_exp = numbers::log2ef;
3619 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3620 const float c_exp10 = 0x1.a934f0p+1f;
3621 const float cc_exp10 = 0x1.2f346ep-24f;
3622
3623 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3624 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3625 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3626 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3627
3628 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3629 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3630 } else {
3631 const float ch_exp = 0x1.714000p+0f;
3632 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3633
3634 const float ch_exp10 = 0x1.a92000p+1f;
3635 const float cl_exp10 = 0x1.4f0978p-11f;
3636
3637 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3638 auto XH = B.buildAnd(Ty, X, MaskConst);
3639 auto XL = B.buildFSub(Ty, X, XH, Flags);
3640
3641 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3642 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3643
3644 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3645 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3646
3647 Register Mad0 =
3648 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3649 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3650 }
3651
3652 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3653
3654 // It is unsafe to contract this fsub into the PH multiply.
3655 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3656 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3657 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3658
3659 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3660 .addUse(A.getReg(0))
3661 .setMIFlags(Flags);
3662 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3663
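// Below this input even the scaled result flushes to zero: 2**-149 is the
// smallest f32 denormal, and ln(2**-149) ~= -103.28 (-0x1.9d1da0p+6) while
// log10(2**-149) ~= -44.85 (-0x1.66d3e8p+5). The overflow clamp further down
// similarly uses ln(FLT_MAX) ~= 88.72 and log10(FLT_MAX) ~= 38.53.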
3664 auto UnderflowCheckConst =
3665 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3666 auto Zero = B.buildFConstant(Ty, 0.0);
3667 auto Underflow =
3668 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3669
3670 R = B.buildSelect(Ty, Underflow, Zero, R);
3671
3672 const auto &Options = MF.getTarget().Options;
3673
3674 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3675 auto OverflowCheckConst =
3676 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3677
3678 auto Overflow =
3679 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3680 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3681 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3682 }
3683
3684 B.buildCopy(Dst, R);
3685 MI.eraseFromParent();
3686 return true;
3687}
3688
3689bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3690 MachineIRBuilder &B) const {
3691 Register Dst = MI.getOperand(0).getReg();
3692 Register Src0 = MI.getOperand(1).getReg();
3693 Register Src1 = MI.getOperand(2).getReg();
3694 unsigned Flags = MI.getFlags();
3695 LLT Ty = B.getMRI()->getType(Dst);
3696 const LLT F16 = LLT::float16();
3697 const LLT F32 = LLT::float32();
3698
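// pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply treats
// 0 * anything (including inf/nan) as 0, which keeps e.g. pow(x, 0.0) == 1.0
// even when log2(x) is +/-inf.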
3699 if (Ty == F32) {
3700 auto Log = B.buildFLog2(F32, Src0, Flags);
3701 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3702 .addUse(Log.getReg(0))
3703 .addUse(Src1)
3704 .setMIFlags(Flags);
3705 B.buildFExp2(Dst, Mul, Flags);
3706 } else if (Ty == F16) {
3707 // There's no f16 fmul_legacy, so extend to f32 for it and truncate the result.
3708 auto Log = B.buildFLog2(F16, Src0, Flags);
3709 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3710 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3711 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3712 .addUse(Ext0.getReg(0))
3713 .addUse(Ext1.getReg(0))
3714 .setMIFlags(Flags);
3715 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3716 } else
3717 return false;
3718
3719 MI.eraseFromParent();
3720 return true;
3721}
3722
3723// Find a source register, ignoring any possible source modifiers.
3724static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3725 Register ModSrc = OrigSrc;
3726 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3727 ModSrc = SrcFNeg->getOperand(1).getReg();
3728 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3729 ModSrc = SrcFAbs->getOperand(1).getReg();
3730 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3731 ModSrc = SrcFAbs->getOperand(1).getReg();
3732 return ModSrc;
3733}
3734
3735bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3736 MachineRegisterInfo &MRI,
3737 MachineIRBuilder &B) const {
3738
3739 const LLT S1 = LLT::scalar(1);
3740 const LLT F64 = LLT::float64();
3741 Register Dst = MI.getOperand(0).getReg();
3742 Register OrigSrc = MI.getOperand(1).getReg();
3743 unsigned Flags = MI.getFlags();
3744 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3745 "this should not have been custom lowered");
3746
3747 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3748 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3749 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3750 // V_FRACT bug is:
3751 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3752 //
3753 // Convert floor(x) to (x - fract(x))
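// e.g. for x = -0.25: fract(-0.25) = 0.75, so floor(-0.25) = -0.25 - 0.75
// = -1.0. The 0x3fefffffffffffff constant below is the largest double less
// than 1.0, used as the clamp value for the buggy V_FRACT result.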
3754
3755 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3756 .addUse(OrigSrc)
3757 .setMIFlags(Flags);
3758
3759 // Give source modifier matching some assistance before obscuring a foldable
3760 // pattern.
3761
3762 // TODO: We can avoid the neg on the fract? The input sign to fract
3763 // shouldn't matter?
3764 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3765
3766 auto Const =
3767 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3768
3769 Register Min = MRI.createGenericVirtualRegister(F64);
3770
3771 // We don't need to concern ourselves with the snan handling difference, so
3772 // use the one which will directly select.
3773 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3774 if (MFI->getMode().IEEE)
3775 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3776 else
3777 B.buildFMinNum(Min, Fract, Const, Flags);
3778
3779 Register CorrectedFract = Min;
3780 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3781 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3782 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3783 }
3784
3785 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3786 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3787
3788 MI.eraseFromParent();
3789 return true;
3790}
3791
3792// Turn an illegal packed v2s16 build vector into bit operations.
3793// TODO: This should probably be a bitcast action in LegalizerHelper.
3794bool AMDGPULegalizerInfo::legalizeBuildVector(
3795 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3796 Register Dst = MI.getOperand(0).getReg();
3797 const LLT S32 = LLT::scalar(32);
3798 const LLT S16 = LLT::scalar(16);
3799 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3800
3801 Register Src0 = MI.getOperand(1).getReg();
3802 Register Src1 = MI.getOperand(2).getReg();
3803
3804 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3805 assert(MRI.getType(Src0) == S32);
3806 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3807 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3808 }
3809
3810 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3811 B.buildBitcast(Dst, Merge);
3812
3813 MI.eraseFromParent();
3814 return true;
3815}
3816
3817// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3818//
3819// Source and accumulation registers must all be 32-bits.
3820//
3821// TODO: When the multiply is uniform, we should produce a code sequence
3822// that is better suited to instruction selection on the SALU. Instead of
3823// the outer loop going over parts of the result, the outer loop should go
3824// over parts of one of the factors. This should result in instruction
3825// selection that makes full use of S_ADDC_U32 instructions.
3826void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3827 MutableArrayRef<Register> Accum,
3828 ArrayRef<Register> Src0,
3829 ArrayRef<Register> Src1,
3830 bool UsePartialMad64_32,
3831 bool SeparateOddAlignedProducts) const {
3832 // Use (possibly empty) vectors of S1 registers to represent the set of
3833 // carries from one pair of positions to the next.
3834 using Carry = SmallVector<Register, 2>;
3835
3836 MachineIRBuilder &B = Helper.MIRBuilder;
3837 GISelKnownBits &KB = *Helper.getKnownBits();
3838
3839 const LLT S1 = LLT::scalar(1);
3840 const LLT S32 = LLT::scalar(32);
3841 const LLT S64 = LLT::scalar(64);
3842
3843 Register Zero32;
3844 Register Zero64;
3845
3846 auto getZero32 = [&]() -> Register {
3847 if (!Zero32)
3848 Zero32 = B.buildConstant(S32, 0).getReg(0);
3849 return Zero32;
3850 };
3851 auto getZero64 = [&]() -> Register {
3852 if (!Zero64)
3853 Zero64 = B.buildConstant(S64, 0).getReg(0);
3854 return Zero64;
3855 };
3856
3857 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3858 for (unsigned i = 0; i < Src0.size(); ++i) {
3859 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3860 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3861 }
3862
3863 // Merge the given carries into the 32-bit LocalAccum, which is modified
3864 // in-place.
3865 //
3866 // Returns the carry-out, which is a single S1 register or null.
3867 auto mergeCarry =
3868 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3869 if (CarryIn.empty())
3870 return Register();
3871
3872 bool HaveCarryOut = true;
3873 Register CarryAccum;
3874 if (CarryIn.size() == 1) {
3875 if (!LocalAccum) {
3876 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3877 return Register();
3878 }
3879
3880 CarryAccum = getZero32();
3881 } else {
3882 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3883 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3884 CarryAccum =
3885 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3886 .getReg(0);
3887 }
3888
3889 if (!LocalAccum) {
3890 LocalAccum = getZero32();
3891 HaveCarryOut = false;
3892 }
3893 }
3894
3895 auto Add =
3896 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3897 LocalAccum = Add.getReg(0);
3898 return HaveCarryOut ? Add.getReg(1) : Register();
3899 };
3900
3901 // Build a multiply-add chain to compute
3902 //
3903 // LocalAccum + (partial products at DstIndex)
3904 // + (opportunistic subset of CarryIn)
3905 //
3906 // LocalAccum is an array of one or two 32-bit registers that are updated
3907 // in-place. The incoming registers may be null.
3908 //
3909 // In some edge cases, carry-ins can be consumed "for free". In that case,
3910 // the consumed carry bits are removed from CarryIn in-place.
3911 auto buildMadChain =
3912 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3913 -> Carry {
3914 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3915 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3916
3917 Carry CarryOut;
3918 unsigned j0 = 0;
3919
3920 // Use plain 32-bit multiplication for the most significant part of the
3921 // result by default.
3922 if (LocalAccum.size() == 1 &&
3923 (!UsePartialMad64_32 || !CarryIn.empty())) {
3924 do {
3925 // Skip multiplication if one of the operands is 0
3926 unsigned j1 = DstIndex - j0;
3927 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3928 ++j0;
3929 continue;
3930 }
3931 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3932 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3933 LocalAccum[0] = Mul.getReg(0);
3934 } else {
3935 if (CarryIn.empty()) {
3936 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3937 } else {
3938 LocalAccum[0] =
3939 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3940 .getReg(0);
3941 CarryIn.pop_back();
3942 }
3943 }
3944 ++j0;
3945 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3946 }
3947
3948 // Build full 64-bit multiplies.
3949 if (j0 <= DstIndex) {
3950 bool HaveSmallAccum = false;
3951 Register Tmp;
3952
3953 if (LocalAccum[0]) {
3954 if (LocalAccum.size() == 1) {
3955 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3956 HaveSmallAccum = true;
3957 } else if (LocalAccum[1]) {
3958 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3959 HaveSmallAccum = false;
3960 } else {
3961 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3962 HaveSmallAccum = true;
3963 }
3964 } else {
3965 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3966 Tmp = getZero64();
3967 HaveSmallAccum = true;
3968 }
3969
3970 do {
3971 unsigned j1 = DstIndex - j0;
3972 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3973 ++j0;
3974 continue;
3975 }
3976 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3977 {Src0[j0], Src1[j1], Tmp});
3978 Tmp = Mad.getReg(0);
3979 if (!HaveSmallAccum)
3980 CarryOut.push_back(Mad.getReg(1));
3981 HaveSmallAccum = false;
3982
3983 ++j0;
3984 } while (j0 <= DstIndex);
3985
3986 auto Unmerge = B.buildUnmerge(S32, Tmp);
3987 LocalAccum[0] = Unmerge.getReg(0);
3988 if (LocalAccum.size() > 1)
3989 LocalAccum[1] = Unmerge.getReg(1);
3990 }
3991
3992 return CarryOut;
3993 };
3994
3995 // Outer multiply loop, iterating over destination parts from least
3996 // significant to most significant parts.
3997 //
3998 // The columns of the following diagram correspond to the destination parts
3999 // affected by one iteration of the outer loop (ignoring boundary
4000 // conditions).
4001 //
4002 // Dest index relative to 2 * i: 1 0 -1
4003 // ------
4004 // Carries from previous iteration: e o
4005 // Even-aligned partial product sum: E E .
4006 // Odd-aligned partial product sum: O O
4007 //
4008 // 'o' is OddCarry, 'e' is EvenCarry.
4009 // EE and OO are computed from partial products via buildMadChain and use
4010 // accumulation where possible and appropriate.
4011 //
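// For example, a 64 x 64 -> 64 bit multiply has Accum.size() == 2 (ignoring
// the UsePartialMad64_32 / SeparateOddAlignedProducts variations): iteration
// i == 0 writes the full 64-bit product Src0[0]*Src1[0] into Accum[0..1], and
// iteration i == 1 adds the 32-bit (low half) products Src0[0]*Src1[1] and
// Src0[1]*Src1[0] into Accum[1].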
4012 Register SeparateOddCarry;
4013 Carry EvenCarry;
4014 Carry OddCarry;
4015
4016 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4017 Carry OddCarryIn = std::move(OddCarry);
4018 Carry EvenCarryIn = std::move(EvenCarry);
4019 OddCarry.clear();
4020 EvenCarry.clear();
4021
4022 // Partial products at offset 2 * i.
4023 if (2 * i < Accum.size()) {
4024 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4025 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4026 }
4027
4028 // Partial products at offset 2 * i - 1.
4029 if (i > 0) {
4030 if (!SeparateOddAlignedProducts) {
4031 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4032 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4033 } else {
4034 bool IsHighest = 2 * i >= Accum.size();
4035 Register SeparateOddOut[2];
4036 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4037 .take_front(IsHighest ? 1 : 2);
4038 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4039
4040 MachineInstr *Lo;
4041
4042 if (i == 1) {
4043 if (!IsHighest)
4044 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4045 else
4046 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4047 } else {
4048 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4049 SeparateOddCarry);
4050 }
4051 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4052
4053 if (!IsHighest) {
4054 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4055 Lo->getOperand(1).getReg());
4056 Accum[2 * i] = Hi.getReg(0);
4057 SeparateOddCarry = Hi.getReg(1);
4058 }
4059 }
4060 }
4061
4062 // Add in the carries from the previous iteration
4063 if (i > 0) {
4064 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4065 EvenCarryIn.push_back(CarryOut);
4066
4067 if (2 * i < Accum.size()) {
4068 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4069 OddCarry.push_back(CarryOut);
4070 }
4071 }
4072 }
4073}
4074
4075// Custom narrowing of wide multiplies using wide multiply-add instructions.
4076//
4077// TODO: If the multiply is followed by an addition, we should attempt to
4078// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4079bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4080 MachineInstr &MI) const {
4081 assert(ST.hasMad64_32());
4082 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4083
4084 MachineIRBuilder &B = Helper.MIRBuilder;
4085 MachineRegisterInfo &MRI = *B.getMRI();
4086
4087 Register DstReg = MI.getOperand(0).getReg();
4088 Register Src0 = MI.getOperand(1).getReg();
4089 Register Src1 = MI.getOperand(2).getReg();
4090
4091 LLT Ty = MRI.getType(DstReg);
4092 assert(Ty.isScalar());
4093
4094 unsigned Size = Ty.getSizeInBits();
4095 unsigned NumParts = Size / 32;
4096 assert((Size % 32) == 0);
4097 assert(NumParts >= 2);
4098
4099 // Whether to use MAD_64_32 for partial products whose high half is
4100 // discarded. This avoids some ADD instructions but risks false dependency
4101 // stalls on some subtargets in some cases.
4102 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4103
4104 // Whether to compute odd-aligned partial products separately. This is
4105 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4106 // in an even-aligned VGPR.
4107 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4108
4109 LLT S32 = LLT::scalar(32);
4110 SmallVector<Register, 2> Src0Parts, Src1Parts;
4111 for (unsigned i = 0; i < NumParts; ++i) {
4112 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4113 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4114 }
4115 B.buildUnmerge(Src0Parts, Src0);
4116 B.buildUnmerge(Src1Parts, Src1);
4117
4118 SmallVector<Register, 2> AccumRegs(NumParts);
4119 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4120 SeparateOddAlignedProducts);
4121
4122 B.buildMergeLikeInstr(DstReg, AccumRegs);
4123 MI.eraseFromParent();
4124 return true;
4125}
4126
4127// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4128// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4129// case with a single min instruction instead of a compare+select.
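// e.g. for a 32-bit G_CTLZ, FFBH_U32 returns 0xffffffff for a zero input, and
// umin(0xffffffff, 32) == 32, which is exactly the defined ctlz(0) result.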
4130bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4131 MachineRegisterInfo &MRI,
4132 MachineIRBuilder &B) const {
4133 Register Dst = MI.getOperand(0).getReg();
4134 Register Src = MI.getOperand(1).getReg();
4135 LLT DstTy = MRI.getType(Dst);
4136 LLT SrcTy = MRI.getType(Src);
4137
4138 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4139 ? AMDGPU::G_AMDGPU_FFBH_U32
4140 : AMDGPU::G_AMDGPU_FFBL_B32;
4141 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4142 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4143
4144 MI.eraseFromParent();
4145 return true;
4146}
4147
4148// Check that this is a G_XOR x, -1
4149static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4150 if (MI.getOpcode() != TargetOpcode::G_XOR)
4151 return false;
4152 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4153 return ConstVal && *ConstVal == -1;
4154}
4155
4156// Return the use branch instruction, otherwise null if the usage is invalid.
4157static MachineInstr *
4158verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4159 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4160 Register CondDef = MI.getOperand(0).getReg();
4161 if (!MRI.hasOneNonDBGUse(CondDef))
4162 return nullptr;
4163
4164 MachineBasicBlock *Parent = MI.getParent();
4165 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4166
4167 if (isNot(MRI, *UseMI)) {
4168 Register NegatedCond = UseMI->getOperand(0).getReg();
4169 if (!MRI.hasOneNonDBGUse(NegatedCond))
4170 return nullptr;
4171
4172 // We're deleting the def of this value, so we need to remove it.
4173 eraseInstr(*UseMI, MRI);
4174
4175 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4176 Negated = true;
4177 }
4178
4179 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4180 return nullptr;
4181
4182 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4183 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4184 if (Next == Parent->end()) {
4185 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4186 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4187 return nullptr;
4188 UncondBrTarget = &*NextMBB;
4189 } else {
4190 if (Next->getOpcode() != AMDGPU::G_BR)
4191 return nullptr;
4192 Br = &*Next;
4193 UncondBrTarget = Br->getOperand(0).getMBB();
4194 }
4195
4196 return UseMI;
4197}
4198
4199bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4200 const ArgDescriptor *Arg,
4201 const TargetRegisterClass *ArgRC,
4202 LLT ArgTy) const {
4203 MCRegister SrcReg = Arg->getRegister();
4204 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4205 assert(DstReg.isVirtual() && "Virtual register expected");
4206
4207 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4208 *ArgRC, B.getDebugLoc(), ArgTy);
4209 if (Arg->isMasked()) {
4210 // TODO: Should we try to emit this once in the entry block?
4211 const LLT S32 = LLT::scalar(32);
4212 const unsigned Mask = Arg->getMask();
4213 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4214
4215 Register AndMaskSrc = LiveIn;
4216
4217 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4218 // 0.
4219 if (Shift != 0) {
4220 auto ShiftAmt = B.buildConstant(S32, Shift);
4221 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4222 }
4223
4224 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4225 } else {
4226 B.buildCopy(DstReg, LiveIn);
4227 }
4228
4229 return true;
4230}
4231
4232bool AMDGPULegalizerInfo::loadInputValue(
4233 Register DstReg, MachineIRBuilder &B,
4234 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4235 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4236 const ArgDescriptor *Arg = nullptr;
4237 const TargetRegisterClass *ArgRC;
4238 LLT ArgTy;
4239
4240 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4241 const ArgDescriptor WorkGroupIDX =
4242 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4243 // If GridZ is not programmed in an entry function then the hardware will set
4244 // it to all zeros, so there is no need to mask the GridY value in the low
4245 // order bits.
4246 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4247 AMDGPU::TTMP7,
4248 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4249 const ArgDescriptor WorkGroupIDZ =
4250 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4252 switch (ArgType) {
4253 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4254 Arg = &WorkGroupIDX;
4255 ArgRC = &AMDGPU::SReg_32RegClass;
4256 ArgTy = LLT::scalar(32);
4257 break;
4258 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4259 Arg = &WorkGroupIDY;
4260 ArgRC = &AMDGPU::SReg_32RegClass;
4261 ArgTy = LLT::scalar(32);
4262 break;
4263 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4264 Arg = &WorkGroupIDZ;
4265 ArgRC = &AMDGPU::SReg_32RegClass;
4266 ArgTy = LLT::scalar(32);
4267 break;
4268 default:
4269 break;
4270 }
4271 }
4272
4273 if (!Arg)
4274 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4275
4276 if (!Arg) {
4277 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4278 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4279 // case the pointer argument may be missing and we use null.
4280 B.buildConstant(DstReg, 0);
4281 return true;
4282 }
4283
4284 // It's undefined behavior if a function marked with the amdgpu-no-*
4285 // attributes uses the corresponding intrinsic.
4286 B.buildUndef(DstReg);
4287 return true;
4288 }
4289
4290 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4291 return false; // TODO: Handle these
4292 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4293}
4294
4295bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4296 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4297 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4298 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4299 return false;
4300
4301 MI.eraseFromParent();
4302 return true;
4303}
4304
4305static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4306 int64_t C) {
4307 B.buildConstant(MI.getOperand(0).getReg(), C);
4308 MI.eraseFromParent();
4309 return true;
4310}
4311
4312bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4313 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4314 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4315 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4316 if (MaxID == 0)
4317 return replaceWithConstant(B, MI, 0);
4318
4319 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4320 const ArgDescriptor *Arg;
4321 const TargetRegisterClass *ArgRC;
4322 LLT ArgTy;
4323 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4324
4325 Register DstReg = MI.getOperand(0).getReg();
4326 if (!Arg) {
4327 // It's undefined behavior if a function marked with the amdgpu-no-*
4328 // attributes uses the corresponding intrinsic.
4329 B.buildUndef(DstReg);
4330 MI.eraseFromParent();
4331 return true;
4332 }
4333
4334 if (Arg->isMasked()) {
4335 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4336 // masking operations anyway.
4337 //
4338 // TODO: We could assert the top bit is 0 for the source copy.
4339 if (!loadInputValue(DstReg, B, ArgType))
4340 return false;
4341 } else {
4342 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4343 if (!loadInputValue(TmpReg, B, ArgType))
4344 return false;
4345 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4346 }
4347
4348 MI.eraseFromParent();
4349 return true;
4350}
4351
4352Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4353 int64_t Offset) const {
4354 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4355 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4356
4357 // TODO: If we passed in the base kernel offset we could have a better
4358 // alignment than 4, but we don't really need it.
4359 if (!loadInputValue(KernArgReg, B,
4360 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4361 llvm_unreachable("failed to find kernarg segment ptr");
4362
4363 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4364 // TODO: Should get nuw
4365 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4366}
4367
4368/// Legalize a value that's loaded from kernel arguments. This is only used by
4369/// legacy intrinsics.
4373 Align Alignment) const {
4374 Register DstReg = MI.getOperand(0).getReg();
4375
4376 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4377 "unexpected kernarg parameter type");
4378
4381 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4384 MI.eraseFromParent();
4385 return true;
4386}
4387
4388bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4389 MachineRegisterInfo &MRI,
4390 MachineIRBuilder &B) const {
4391 Register Dst = MI.getOperand(0).getReg();
4392 LLT DstTy = MRI.getType(Dst);
4393 LLT S16 = LLT::scalar(16);
4394 LLT S32 = LLT::scalar(32);
4395 LLT S64 = LLT::scalar(64);
4396
4397 if (DstTy == S16)
4398 return legalizeFDIV16(MI, MRI, B);
4399 if (DstTy == S32)
4400 return legalizeFDIV32(MI, MRI, B);
4401 if (DstTy == S64)
4402 return legalizeFDIV64(MI, MRI, B);
4403
4404 return false;
4405}
4406
4407void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4408 Register DstDivReg,
4409 Register DstRemReg,
4410 Register X,
4411 Register Y) const {
4412 const LLT S1 = LLT::scalar(1);
4413 const LLT S32 = LLT::scalar(32);
4414
4415 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4416 // algorithm used here.
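// In outline: z ~= 2**32 / y comes from v_rcp_f32 (the 0x4f7ffffe scale is
// just under 2**32), one Newton-Raphson step z += mulhi(z, z * -y) refines
// it, q = mulhi(x, z) and r = x - q * y form the first estimates, and at most
// two conditional "if (r >= y) { ++q; r -= y; }" corrections finish the job.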
4417
4418 // Initial estimate of inv(y).
4419 auto FloatY = B.buildUITOFP(S32, Y);
4420 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4421 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4422 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4423 auto Z = B.buildFPTOUI(S32, ScaledY);
4424
4425 // One round of UNR.
4426 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4427 auto NegYZ = B.buildMul(S32, NegY, Z);
4428 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4429
4430 // Quotient/remainder estimate.
4431 auto Q = B.buildUMulH(S32, X, Z);
4432 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4433
4434 // First quotient/remainder refinement.
4435 auto One = B.buildConstant(S32, 1);
4436 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4437 if (DstDivReg)
4438 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4439 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4440
4441 // Second quotient/remainder refinement.
4442 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4443 if (DstDivReg)
4444 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4445
4446 if (DstRemReg)
4447 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4448}
4449
4450// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4451//
4452// Return lo, hi of result
4453//
4454// %cvt.lo = G_UITOFP Val.lo
4455// %cvt.hi = G_UITOFP Val.hi
4456// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4457// %rcp = G_AMDGPU_RCP_IFLAG %mad
4458// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4459// %mul2 = G_FMUL %mul1, 2**(-32)
4460// %trunc = G_INTRINSIC_TRUNC %mul2
4461// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4462// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
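// i.e. this approximates 2**64 / Val in fixed point: the FMAD folds the two
// 32-bit halves into one float, 0x5f7ffffc is just under 2**64, and the
// trunc / second FMAD split the scaled reciprocal back into hi / lo words.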
4463static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4464 Register Val) {
4465 const LLT S32 = LLT::scalar(32);
4466 auto Unmerge = B.buildUnmerge(S32, Val);
4467
4468 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4469 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4470
4471 auto Mad = B.buildFMAD(
4472 S32, CvtHi, // 2**32
4473 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4474
4475 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4476 auto Mul1 = B.buildFMul(
4477 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4478
4479 // 2**(-32)
4480 auto Mul2 = B.buildFMul(
4481 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4482 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4483
4484 // -(2**32)
4485 auto Mad2 = B.buildFMAD(
4486 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4487 Mul1);
4488
4489 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4490 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4491
4492 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4493}
4494
4495void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4496 Register DstDivReg,
4497 Register DstRemReg,
4498 Register Numer,
4499 Register Denom) const {
4500 const LLT S32 = LLT::scalar(32);
4501 const LLT S64 = LLT::scalar(64);
4502 const LLT S1 = LLT::scalar(1);
4503 Register RcpLo, RcpHi;
4504
4505 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4506
4507 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4508
4509 auto Zero64 = B.buildConstant(S64, 0);
4510 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4511
4512 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4513 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4514
4515 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4516 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4517 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4518
4519 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4520 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4521 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4522
4523 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4524 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4525 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4526 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4527 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4528
4529 auto Zero32 = B.buildConstant(S32, 0);
4530 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4531 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4532 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4533
4534 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4535 Register NumerLo = UnmergeNumer.getReg(0);
4536 Register NumerHi = UnmergeNumer.getReg(1);
4537
4538 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4539 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4540 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4541 Register Mul3_Lo = UnmergeMul3.getReg(0);
4542 Register Mul3_Hi = UnmergeMul3.getReg(1);
4543 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4544 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4545 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4546 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4547
4548 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4549 Register DenomLo = UnmergeDenom.getReg(0);
4550 Register DenomHi = UnmergeDenom.getReg(1);
4551
4552 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4553 auto C1 = B.buildSExt(S32, CmpHi);
4554
4555 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4556 auto C2 = B.buildSExt(S32, CmpLo);
4557
4558 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4559 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4560
4561 // TODO: Here and below portions of the code can be enclosed into if/endif.
4562 // Currently control flow is unconditional and we have 4 selects after
4563 // potential endif to substitute PHIs.
4564
4565 // if C3 != 0 ...
4566 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4567 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4568 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4569 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4570
4571 auto One64 = B.buildConstant(S64, 1);
4572 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4573
4574 auto C4 =
4575 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4576 auto C5 =
4577 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4578 auto C6 = B.buildSelect(
4579 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4580
4581 // if (C6 != 0)
4582 auto Add4 = B.buildAdd(S64, Add3, One64);
4583 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4584
4585 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4586 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4587 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4588
4589 // endif C6
4590 // endif C3
4591
4592 if (DstDivReg) {
4593 auto Sel1 = B.buildSelect(
4594 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4595 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4596 Sel1, MulHi3);
4597 }
4598
4599 if (DstRemReg) {
4600 auto Sel2 = B.buildSelect(
4601 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4602 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4603 Sel2, Sub1);
4604 }
4605}
4606
4607bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4608 MachineRegisterInfo &MRI,
4609 MachineIRBuilder &B) const {
4610 Register DstDivReg, DstRemReg;
4611 switch (MI.getOpcode()) {
4612 default:
4613 llvm_unreachable("Unexpected opcode!");
4614 case AMDGPU::G_UDIV: {
4615 DstDivReg = MI.getOperand(0).getReg();
4616 break;
4617 }
4618 case AMDGPU::G_UREM: {
4619 DstRemReg = MI.getOperand(0).getReg();
4620 break;
4621 }
4622 case AMDGPU::G_UDIVREM: {
4623 DstDivReg = MI.getOperand(0).getReg();
4624 DstRemReg = MI.getOperand(1).getReg();
4625 break;
4626 }
4627 }
4628
4629 const LLT S64 = LLT::scalar(64);
4630 const LLT S32 = LLT::scalar(32);
4631 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4632 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4633 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4634 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4635
4636 if (Ty == S32)
4637 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4638 else if (Ty == S64)
4639 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4640 else
4641 return false;
4642
4643 MI.eraseFromParent();
4644 return true;
4645}
4646
4647bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4648 MachineRegisterInfo &MRI,
4649 MachineIRBuilder &B) const {
4650 const LLT S64 = LLT::scalar(64);
4651 const LLT S32 = LLT::scalar(32);
4652
4653 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4654 if (Ty != S32 && Ty != S64)
4655 return false;
4656
4657 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4658 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4659 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4660
4661 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4662 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4663 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4664
4665 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4666 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4667
4668 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4669 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4670
4671 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4672 switch (MI.getOpcode()) {
4673 default:
4674 llvm_unreachable("Unexpected opcode!");
4675 case AMDGPU::G_SDIV: {
4676 DstDivReg = MI.getOperand(0).getReg();
4677 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4678 break;
4679 }
4680 case AMDGPU::G_SREM: {
4681 DstRemReg = MI.getOperand(0).getReg();
4682 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4683 break;
4684 }
4685 case AMDGPU::G_SDIVREM: {
4686 DstDivReg = MI.getOperand(0).getReg();
4687 DstRemReg = MI.getOperand(1).getReg();
4688 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4689 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4690 break;
4691 }
4692 }
4693
4694 if (Ty == S32)
4695 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4696 else
4697 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4698
4699 if (DstDivReg) {
4700 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4701 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4702 B.buildSub(DstDivReg, SignXor, Sign);
4703 }
4704
4705 if (DstRemReg) {
4706 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4707 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4708 B.buildSub(DstRemReg, SignXor, Sign);
4709 }
4710
4711 MI.eraseFromParent();
4712 return true;
4713}
4714
4715bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4716 MachineRegisterInfo &MRI,
4717 MachineIRBuilder &B) const {
4718 Register Res = MI.getOperand(0).getReg();
4719 Register LHS = MI.getOperand(1).getReg();
4720 Register RHS = MI.getOperand(2).getReg();
4721 uint16_t Flags = MI.getFlags();
4722 LLT ResTy = MRI.getType(Res);
4723
4724 const MachineFunction &MF = B.getMF();
4725 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4726 MF.getTarget().Options.UnsafeFPMath;
4727
4728 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4729 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4730 return false;
4731
4732 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4733 // the CI documentation they have a worst case error of 1 ulp.
4734 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4735 // use it as long as we aren't trying to use denormals.
4736 //
4737 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
4738
4739 // 1 / x -> RCP(x)
4740 if (CLHS->isExactlyValue(1.0)) {
4741 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4742 .addUse(RHS)
4743 .setMIFlags(Flags);
4744
4745 MI.eraseFromParent();
4746 return true;
4747 }
4748
4749 // -1 / x -> RCP( FNEG(x) )
4750 if (CLHS->isExactlyValue(-1.0)) {
4751 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4752 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4753 .addUse(FNeg.getReg(0))
4754 .setMIFlags(Flags);
4755
4756 MI.eraseFromParent();
4757 return true;
4758 }
4759 }
4760
4761 // For f16 require afn or arcp.
4762 // For f32 require afn.
4763 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4764 !MI.getFlag(MachineInstr::FmArcp)))
4765 return false;
4766
4767 // x / y -> x * (1.0 / y)
4768 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4769 .addUse(RHS)
4770 .setMIFlags(Flags);
4771 B.buildFMul(Res, LHS, RCP, Flags);
4772
4773 MI.eraseFromParent();
4774 return true;
4775}
4776
4777bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4778 MachineRegisterInfo &MRI,
4779 MachineIRBuilder &B) const {
4780 Register Res = MI.getOperand(0).getReg();
4781 Register X = MI.getOperand(1).getReg();
4782 Register Y = MI.getOperand(2).getReg();
4783 uint16_t Flags = MI.getFlags();
4784 LLT ResTy = MRI.getType(Res);
4785
4786 const MachineFunction &MF = B.getMF();
4787 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4788 MI.getFlag(MachineInstr::FmAfn);
4789
4790 if (!AllowInaccurateRcp)
4791 return false;
4792
4793 auto NegY = B.buildFNeg(ResTy, Y);
4794 auto One = B.buildFConstant(ResTy, 1.0);
4795
4796 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4797 .addUse(Y)
4798 .setMIFlags(Flags);
4799
4800 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4801 R = B.buildFMA(ResTy, Tmp0, R, R);
4802
4803 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4804 R = B.buildFMA(ResTy, Tmp1, R, R);
4805
4806 auto Ret = B.buildFMul(ResTy, X, R);
4807 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4808
4809 B.buildFMA(Res, Tmp2, R, Ret);
4810 MI.eraseFromParent();
4811 return true;
4812}
4813
4814bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4815 MachineRegisterInfo &MRI,
4816 MachineIRBuilder &B) const {
4817 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4818 return true;
4819
4820 Register Res = MI.getOperand(0).getReg();
4821 Register LHS = MI.getOperand(1).getReg();
4822 Register RHS = MI.getOperand(2).getReg();
4823
4824 uint16_t Flags = MI.getFlags();
4825
4826 LLT S16 = LLT::scalar(16);
4827 LLT S32 = LLT::scalar(32);
4828
4829 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4830 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4831
4832 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4833 .addUse(RHSExt.getReg(0))
4834 .setMIFlags(Flags);
4835
4836 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4837 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4838
4839 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4840 .addUse(RDst.getReg(0))
4841 .addUse(RHS)
4842 .addUse(LHS)
4843 .setMIFlags(Flags);
4844
4845 MI.eraseFromParent();
4846 return true;
4847}
4848
4849static constexpr unsigned SPDenormModeBitField =
4851
4852// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4853// to enable denorm mode. When 'Enable' is false, disable denorm mode.
4854static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4855 const GCNSubtarget &ST,
4856 SIModeRegisterDefaults Mode) {
4857 // Set SP denorm mode to this value.
4858 unsigned SPDenormMode =
4859 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4860
4861 if (ST.hasDenormModeInst()) {
4862 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4863 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4864
4865 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4866 B.buildInstr(AMDGPU::S_DENORM_MODE)
4867 .addImm(NewDenormModeValue);
4868
4869 } else {
4870 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4871 .addImm(SPDenormMode)
4872 .addImm(SPDenormModeBitField);
4873 }
4874}
4875
4876bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4877 MachineRegisterInfo &MRI,
4878 MachineIRBuilder &B) const {
4879 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4880 return true;
4881
4882 Register Res = MI.getOperand(0).getReg();
4883 Register LHS = MI.getOperand(1).getReg();
4884 Register RHS = MI.getOperand(2).getReg();
4885 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4886 SIModeRegisterDefaults Mode = MFI->getMode();
4887
4888 uint16_t Flags = MI.getFlags();
4889
4890 LLT S32 = LLT::scalar(32);
4891 LLT S1 = LLT::scalar(1);
4892
4893 auto One = B.buildFConstant(S32, 1.0f);
4894
4895 auto DenominatorScaled =
4896 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4897 .addUse(LHS)
4898 .addUse(RHS)
4899 .addImm(0)
4900 .setMIFlags(Flags);
4901 auto NumeratorScaled =
4902 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4903 .addUse(LHS)
4904 .addUse(RHS)
4905 .addImm(1)
4906 .setMIFlags(Flags);
4907
4908 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4909 .addUse(DenominatorScaled.getReg(0))
4910 .setMIFlags(Flags);
4911 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4912
4913 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4914 const bool HasDynamicDenormals =
4915 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4916 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4917
4918 Register SavedSPDenormMode;
4919 if (!PreservesDenormals) {
4920 if (HasDynamicDenormals) {
4921 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4922 B.buildInstr(AMDGPU::S_GETREG_B32)
4923 .addDef(SavedSPDenormMode)
4924 .addImm(SPDenormModeBitField);
4925 }
4926 toggleSPDenormMode(true, B, ST, Mode);
4927 }
4928
4929 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4930 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4931 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4932 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4933 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4934 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4935
4936 if (!PreservesDenormals) {
4937 if (HasDynamicDenormals) {
4938 assert(SavedSPDenormMode);
4939 B.buildInstr(AMDGPU::S_SETREG_B32)
4940 .addReg(SavedSPDenormMode)
4941 .addImm(SPDenormModeBitField);
4942 } else
4943 toggleSPDenormMode(false, B, ST, Mode);
4944 }
4945
4946 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4947 .addUse(Fma4.getReg(0))
4948 .addUse(Fma1.getReg(0))
4949 .addUse(Fma3.getReg(0))
4950 .addUse(NumeratorScaled.getReg(1))
4951 .setMIFlags(Flags);
4952
4953 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4954 .addUse(Fmas.getReg(0))
4955 .addUse(RHS)
4956 .addUse(LHS)
4957 .setMIFlags(Flags);
4958
4959 MI.eraseFromParent();
4960 return true;
4961}
4962
4963bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4964 MachineRegisterInfo &MRI,
4965 MachineIRBuilder &B) const {
4966 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4967 return true;
4968
4969 Register Res = MI.getOperand(0).getReg();
4970 Register LHS = MI.getOperand(1).getReg();
4971 Register RHS = MI.getOperand(2).getReg();
4972
4973 uint16_t Flags = MI.getFlags();
4974
4975 LLT S64 = LLT::scalar(64);
4976 LLT S1 = LLT::scalar(1);
4977
4978 auto One = B.buildFConstant(S64, 1.0);
4979
4980 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4981 .addUse(LHS)
4982 .addUse(RHS)
4983 .addImm(0)
4984 .setMIFlags(Flags);
4985
4986 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4987
4988 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4989 .addUse(DivScale0.getReg(0))
4990 .setMIFlags(Flags);
4991
4992 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4993 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4994 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4995
4996 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4997 .addUse(LHS)
4998 .addUse(RHS)
4999 .addImm(1)
5000 .setMIFlags(Flags);
5001
5002 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5003 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5004 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5005
5006 Register Scale;
5007 if (!ST.hasUsableDivScaleConditionOutput()) {
5008 // Workaround a hardware bug on SI where the condition output from div_scale
5009 // is not usable.
5010
5011 LLT S32 = LLT::scalar(32);
5012
5013 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5014 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5015 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5016 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5017
5018 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5019 Scale1Unmerge.getReg(1));
5020 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5021 Scale0Unmerge.getReg(1));
5022 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5023 } else {
5024 Scale = DivScale1.getReg(1);
5025 }
5026
5027 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5028 .addUse(Fma4.getReg(0))
5029 .addUse(Fma3.getReg(0))
5030 .addUse(Mul.getReg(0))
5031 .addUse(Scale)
5032 .setMIFlags(Flags);
5033
5034 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5035 .addUse(Fmas.getReg(0))
5036 .addUse(RHS)
5037 .addUse(LHS)
5038 .setMIFlags(Flags);
5039
5040 MI.eraseFromParent();
5041 return true;
5042}
5043
5044bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5045 MachineRegisterInfo &MRI,
5046 MachineIRBuilder &B) const {
5047 Register Res0 = MI.getOperand(0).getReg();
5048 Register Res1 = MI.getOperand(1).getReg();
5049 Register Val = MI.getOperand(2).getReg();
5050 uint16_t Flags = MI.getFlags();
5051
5052 LLT Ty = MRI.getType(Res0);
5053 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5054
5055 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5056 .addUse(Val)
5057 .setMIFlags(Flags);
5058 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5059 .addUse(Val)
5060 .setMIFlags(Flags);
5061
5062 if (ST.hasFractBug()) {
5063 auto Fabs = B.buildFAbs(Ty, Val);
5064 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5065 auto IsFinite =
5066 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5067 auto Zero = B.buildConstant(InstrExpTy, 0);
5068 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5069 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5070 }
5071
5072 B.buildCopy(Res0, Mant);
5073 B.buildSExtOrTrunc(Res1, Exp);
5074
5075 MI.eraseFromParent();
5076 return true;
5077}
5078
5079bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5080 MachineRegisterInfo &MRI,
5081 MachineIRBuilder &B) const {
5082 Register Res = MI.getOperand(0).getReg();
5083 Register LHS = MI.getOperand(2).getReg();
5084 Register RHS = MI.getOperand(3).getReg();
5085 uint16_t Flags = MI.getFlags();
5086
5087 LLT S32 = LLT::scalar(32);
5088 LLT S1 = LLT::scalar(1);
5089
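// If |y| is very large (> 2**96), pre-scale it by 2**-32 so the v_rcp_f32
// result stays out of the denormal range, and apply the same factor to the
// final product: s * (x * rcp(y * s)) ~= x / y for s in {1.0, 2**-32}.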
5090 auto Abs = B.buildFAbs(S32, RHS, Flags);
5091 const APFloat C0Val(1.0f);
5092
5093 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5094 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5095 auto C2 = B.buildFConstant(S32, 1.0f);
5096
5097 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5098 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5099
5100 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5101
5102 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5103 .addUse(Mul0.getReg(0))
5104 .setMIFlags(Flags);
5105
5106 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5107
5108 B.buildFMul(Res, Sel, Mul1, Flags);
5109
5110 MI.eraseFromParent();
5111 return true;
5112}
5113
5114bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5115 MachineRegisterInfo &MRI,
5116 MachineIRBuilder &B) const {
5117 // Bypass the correct expansion that a standard promotion through G_FSQRT would
5118 // get. The f32 op is accurate enough for the f16 case.
5119 unsigned Flags = MI.getFlags();
5120 assert(!ST.has16BitInsts());
5121 const LLT F32 = LLT::scalar(32);
5122 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5123 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5124 .addUse(Ext.getReg(0))
5125 .setMIFlags(Flags);
5126 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5127 MI.eraseFromParent();
5128 return true;
5129}
5130
5131bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5132 MachineRegisterInfo &MRI,
5133 MachineIRBuilder &B) const {
5134 MachineFunction &MF = B.getMF();
5135 Register Dst = MI.getOperand(0).getReg();
5136 Register X = MI.getOperand(1).getReg();
5137 const unsigned Flags = MI.getFlags();
5138 const LLT S1 = LLT::scalar(1);
5139 const LLT F32 = LLT::scalar(32);
5140 const LLT I32 = LLT::scalar(32);
5141
5142 if (allowApproxFunc(MF, Flags)) {
5143 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5144 .addUse(X)
5145 .setMIFlags(Flags);
5146 MI.eraseFromParent();
5147 return true;
5148 }
5149
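// Inputs below 2**-96 are scaled up by 2**32 first; since
// sqrt(x * 2**32) == sqrt(x) * 2**16, the result is scaled back down by
// 2**-16 at the end.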
5150 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5151 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5152 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5153 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5154 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5155
5156 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5157 if (needsDenormHandlingF32(MF, X, Flags)) {
5158 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5159 .addUse(SqrtX.getReg(0))
5160 .setMIFlags(Flags);
5161
5162 auto NegOne = B.buildConstant(I32, -1);
5163 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5164
5165 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5166 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5167
5168 auto PosOne = B.buildConstant(I32, 1);
5169 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5170
5171 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5172 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5173
5174 auto Zero = B.buildFConstant(F32, 0.0f);
5175 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5176
5177 SqrtS =
5178 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5179
5180 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5181 SqrtS =
5182 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5183 } else {
5184 auto SqrtR =
5185 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5186 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5187
5188 auto Half = B.buildFConstant(F32, 0.5f);
5189 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5190 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5191 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5192 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5193 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5194 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5195 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5196 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5197 }
5198
5199 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5200
5201 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5202
5203 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5204
5205 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5206 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5207
5208 MI.eraseFromParent();
5209 return true;
5210}
5211
5212bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5213 MachineRegisterInfo &MRI,
5214 MachineIRBuilder &B) const {
5215 // For double type, the SQRT and RSQ instructions don't have the required
5216 // precision, so we apply Goldschmidt's algorithm to improve the result:
5217 //
5218 // y0 = rsq(x)
5219 // g0 = x * y0
5220 // h0 = 0.5 * y0
5221 //
5222 // r0 = 0.5 - h0 * g0
5223 // g1 = g0 * r0 + g0
5224 // h1 = h0 * r0 + h0
5225 //
5226 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5227 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5228 // h2 = h1 * r1 + h1
5229 //
5230 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5231 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5232 //
5233 // sqrt(x) = g3
5234
5235 const LLT S1 = LLT::scalar(1);
5236 const LLT S32 = LLT::scalar(32);
5237 const LLT F64 = LLT::scalar(64);
5238
5239 Register Dst = MI.getOperand(0).getReg();
5240 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5241
5242 Register X = MI.getOperand(1).getReg();
5243 unsigned Flags = MI.getFlags();
5244
5245 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5246
5247 auto ZeroInt = B.buildConstant(S32, 0);
5248 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5249
5250 // Scale up input if it is too small.
5251 auto ScaleUpFactor = B.buildConstant(S32, 256);
5252 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5253 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5254
5255 auto SqrtY =
5256 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5257
5258 auto Half = B.buildFConstant(F64, 0.5);
5259 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5260 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5261
5262 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5263 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5264
5265 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5266 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5267
5268 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5269 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5270
5271 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5272
5273 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5274 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5275
5276 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5277
5278 // Scale down the result.
5279 auto ScaleDownFactor = B.buildConstant(S32, -128);
5280 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5281 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5282
5283 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5284 // with finite only or nsz because rsq(+/-0) = +/-inf
5285
5286 // TODO: Check for DAZ and expand to subnormals
5287 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5288
5289 // If x is +INF, +0, or -0, use its original value
5290 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5291
5292 MI.eraseFromParent();
5293 return true;
5294}
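// Illustrative sketch (not part of the in-tree lowering): the Goldschmidt
// refinement documented above, written as plain scalar arithmetic.
// 'RsqEstimate' is a hypothetical stand-in for the hardware rsq approximation
// y0; the ldexp-based input scaling is omitted for brevity.
static double goldschmidtSqrtSketch(double X, double RsqEstimate) {
  double G = X * RsqEstimate;   // g0 = x * y0
  double H = 0.5 * RsqEstimate; // h0 = 0.5 * y0
  double R = 0.5 - H * G;       // r0 = 0.5 - h0 * g0
  G = G * R + G;                // g1 = g0 * r0 + g0
  H = H * R + H;                // h1 = h0 * r0 + h0
  double D = X - G * G;         // d0 = x - g1 * g1
  G = D * H + G;                // g2 = d0 * h1 + g1
  D = X - G * G;                // d1 = x - g2 * g2
  return D * H + G;             // g3 ~= sqrt(x)
}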
5295
5296bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5297 MachineRegisterInfo &MRI,
5298 MachineIRBuilder &B) const {
5299 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5300 if (Ty == LLT::scalar(32))
5301 return legalizeFSQRTF32(MI, MRI, B);
5302 if (Ty == LLT::scalar(64))
5303 return legalizeFSQRTF64(MI, MRI, B);
5304 if (Ty == LLT::scalar(16))
5305 return legalizeFSQRTF16(MI, MRI, B);
5306 return false;
5307}
5308
5309// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5310// FIXME: Why do we handle this one but not other removed instructions?
5311//
5312// Reciprocal square root. The clamp prevents infinite results, clamping
5313// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5314// +-max_float.
5315bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5316 MachineRegisterInfo &MRI,
5317 MachineIRBuilder &B) const {
5319 return true;
5320
5321 Register Dst = MI.getOperand(0).getReg();
5322 Register Src = MI.getOperand(2).getReg();
5323 auto Flags = MI.getFlags();
5324
5325 LLT Ty = MRI.getType(Dst);
5326
5327 const fltSemantics *FltSemantics;
5328 if (Ty == LLT::scalar(32))
5329 FltSemantics = &APFloat::IEEEsingle();
5330 else if (Ty == LLT::scalar(64))
5331 FltSemantics = &APFloat::IEEEdouble();
5332 else
5333 return false;
5334
5335 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5336 .addUse(Src)
5337 .setMIFlags(Flags);
5338
5339 // We don't need to concern ourselves with the snan handling difference, since
5340 // the rsq either quieted the snan or it didn't; use the form that selects directly.
5341 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5342 const bool UseIEEE = MFI->getMode().IEEE;
5343
5344 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5345 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5346 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5347
5348 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5349
5350 if (UseIEEE)
5351 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5352 else
5353 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5354 MI.eraseFromParent();
5355 return true;
5356}
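// Illustrative sketch (not part of the in-tree lowering): the expansion above
// computes max(min(rsq(x), +max_float), -max_float), so
//   rsq(+0) == +inf is clamped to +max_float
//   rsq(-0) == -inf is clamped to -max_float
static double rsqClampSketch(double Rsq, double MaxFlt) {
  double Clamped = Rsq < MaxFlt ? Rsq : MaxFlt; // min with +max_float
  return Clamped > -MaxFlt ? Clamped : -MaxFlt; // max with -max_float
}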
5357
5358static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
5359 switch (IID) {
5360 case Intrinsic::amdgcn_ds_fadd:
5361 return AMDGPU::G_ATOMICRMW_FADD;
5362 case Intrinsic::amdgcn_ds_fmin:
5363 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5364 case Intrinsic::amdgcn_ds_fmax:
5365 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5366 default:
5367 llvm_unreachable("not a DS FP intrinsic");
5368 }
5369}
5370
5371bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
5372 MachineInstr &MI,
5373 Intrinsic::ID IID) const {
5374 GISelChangeObserver &Observer = Helper.Observer;
5375 Observer.changingInstr(MI);
5376
5377 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
5378
5379 // The remaining operands were used to set fields in the MemOperand on
5380 // construction.
5381 for (int I = 6; I > 3; --I)
5382 MI.removeOperand(I);
5383
5384 MI.removeOperand(1); // Remove the intrinsic ID.
5385 Observer.changedInstr(MI);
5386 return true;
5387}
5388
5389bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5390 MachineRegisterInfo &MRI,
5391 MachineIRBuilder &B) const {
5392 uint64_t Offset =
5393 ST.getTargetLowering()->getImplicitParameterOffset(
5394 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5395 LLT DstTy = MRI.getType(DstReg);
5396 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5397
5398 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5399 if (!loadInputValue(KernargPtrReg, B,
5400 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5401 return false;
5402
5403 // FIXME: This should be nuw
5404 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5405 return true;
5406}
5407
5408/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5409/// bits of the pointer and replace them with the stride argument, then
5410/// merge_values everything together. In the common case of a raw buffer (the
5411/// stride component is 0), we can just AND off the upper half.
5412bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5413 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5414 Register Result = MI.getOperand(0).getReg();
5415 Register Pointer = MI.getOperand(2).getReg();
5416 Register Stride = MI.getOperand(3).getReg();
5417 Register NumRecords = MI.getOperand(4).getReg();
5418 Register Flags = MI.getOperand(5).getReg();
5419
5420 LLT S32 = LLT::scalar(32);
5421
5422 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5423 auto Unmerge = B.buildUnmerge(S32, Pointer);
5424 Register LowHalf = Unmerge.getReg(0);
5425 Register HighHalf = Unmerge.getReg(1);
5426
5427 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5428 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5429
5430 MachineInstrBuilder NewHighHalf = Masked;
5431 std::optional<ValueAndVReg> StrideConst =
5432 getIConstantVRegValWithLookThrough(Stride, *B.getMRI());
5433 if (!StrideConst || !StrideConst->Value.isZero()) {
5434 MachineInstrBuilder ShiftedStride;
5435 if (StrideConst) {
5436 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5437 uint32_t ShiftedStrideVal = StrideVal << 16;
5438 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5439 } else {
5440 auto ExtStride = B.buildAnyExt(S32, Stride);
5441 auto ShiftConst = B.buildConstant(S32, 16);
5442 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5443 }
5444 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5445 }
5446 Register NewHighHalfReg = NewHighHalf.getReg(0);
5447 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5448 MI.eraseFromParent();
5449 return true;
5450}
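// Illustrative sketch (not part of the in-tree lowering): the merge_values
// sequence above, expressed on plain integers. Stride is a 16-bit value; the
// upper 16 bits of the 64-bit pointer are replaced by it.
static void buildRsrcWordsSketch(uint64_t Pointer, uint32_t Stride,
                                 uint32_t NumRecords, uint32_t Flags,
                                 uint32_t Words[4]) {
  Words[0] = static_cast<uint32_t>(Pointer);                 // low half of base
  Words[1] = (static_cast<uint32_t>(Pointer >> 32) & 0xffffu) // keep low 16 bits
             | (Stride << 16);                               // stride in [31:16]
  Words[2] = NumRecords;
  Words[3] = Flags;
}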
5451
5452bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5453 MachineRegisterInfo &MRI,
5454 MachineIRBuilder &B) const {
5455 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5456 if (!MFI->isEntryFunction()) {
5457 return legalizePreloadedArgIntrin(MI, MRI, B,
5458 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5459 }
5460
5461 Register DstReg = MI.getOperand(0).getReg();
5462 if (!getImplicitArgPtr(DstReg, MRI, B))
5463 return false;
5464
5465 MI.eraseFromParent();
5466 return true;
5467}
5468
5469bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5470 MachineRegisterInfo &MRI,
5471 MachineIRBuilder &B) const {
5472 Function &F = B.getMF().getFunction();
5473 std::optional<uint32_t> KnownSize =
5474 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5475 if (KnownSize.has_value())
5476 B.buildConstant(DstReg, *KnownSize);
5477 return false;
5478}
5479
5480bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5481 MachineRegisterInfo &MRI,
5482 MachineIRBuilder &B) const {
5483
5484 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5485 if (!MFI->isEntryFunction()) {
5486 return legalizePreloadedArgIntrin(MI, MRI, B,
5487 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5488 }
5489
5490 Register DstReg = MI.getOperand(0).getReg();
5491 if (!getLDSKernelId(DstReg, MRI, B))
5492 return false;
5493
5494 MI.eraseFromParent();
5495 return true;
5496}
5497
5498bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5499 MachineRegisterInfo &MRI,
5500 MachineIRBuilder &B,
5501 unsigned AddrSpace) const {
5502 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5503 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5504 Register Hi32 = Unmerge.getReg(1);
5505
5506 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5507 MI.eraseFromParent();
5508 return true;
5509}
5510
5511// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5512// offset (the offset that is included in bounds checking and swizzling, to be
5513// split between the instruction's voffset and immoffset fields) and soffset
5514// (the offset that is excluded from bounds checking and swizzling, to go in
5515// the instruction's soffset field). This function takes the first kind of
5516// offset and figures out how to split it between voffset and immoffset.
5517std::pair<Register, unsigned>
5518AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5519 Register OrigOffset) const {
5520 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5521 Register BaseReg;
5522 unsigned ImmOffset;
5523 const LLT S32 = LLT::scalar(32);
5524 MachineRegisterInfo &MRI = *B.getMRI();
5525
5526 std::tie(BaseReg, ImmOffset) =
5527 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5528
5529 // If BaseReg is a pointer, convert it to int.
5530 if (MRI.getType(BaseReg).isPointer())
5531 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5532
5533 // If the immediate value is too big for the immoffset field, put only bits
5534 // that would normally fit in the immoffset field. The remaining value that
5535 // is copied/added for the voffset field is a large power of 2, and it
5536 // stands more chance of being CSEd with the copy/add for another similar
5537 // load/store.
5538 // However, do not do that rounding down if the part left for the vgpr would
5539 // be a negative number, as it appears to be illegal to have a negative
5540 // offset in the vgpr, even if adding the immediate offset makes it positive.
5541 unsigned Overflow = ImmOffset & ~MaxImm;
5542 ImmOffset -= Overflow;
5543 if ((int32_t)Overflow < 0) {
5544 Overflow += ImmOffset;
5545 ImmOffset = 0;
5546 }
5547
5548 if (Overflow != 0) {
5549 if (!BaseReg) {
5550 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5551 } else {
5552 auto OverflowVal = B.buildConstant(S32, Overflow);
5553 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5554 }
5555 }
5556
5557 if (!BaseReg)
5558 BaseReg = B.buildConstant(S32, 0).getReg(0);
5559
5560 return std::pair(BaseReg, ImmOffset);
5561}
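// Illustrative sketch (not part of the in-tree lowering): the constant-offset
// split above on plain integers. MaxImm is the subtarget's immoffset limit
// (SIInstrInfo::getMaxMUBUFImmOffset); OutImm receives the part kept in the
// immediate field, and the return value is what must be added to the voffset.
static uint32_t splitImmOffsetSketch(uint32_t ImmOffset, uint32_t MaxImm,
                                     uint32_t &OutImm) {
  uint32_t Overflow = ImmOffset & ~MaxImm; // high bits go to the voffset add
  OutImm = ImmOffset - Overflow;           // low bits stay in the immoffset
  if (static_cast<int32_t>(Overflow) < 0) {
    // Don't leave a negative value for the vgpr; fold everything into it.
    Overflow += OutImm;
    OutImm = 0;
  }
  return Overflow;
}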
5562
5563/// Handle register layout difference for f16 images for some subtargets.
5564Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5565 MachineRegisterInfo &MRI,
5566 Register Reg,
5567 bool ImageStore) const {
5568 const LLT S16 = LLT::scalar(16);
5569 const LLT S32 = LLT::scalar(32);
5570 LLT StoreVT = MRI.getType(Reg);
5571 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5572
5573 if (ST.hasUnpackedD16VMem()) {
5574 auto Unmerge = B.buildUnmerge(S16, Reg);
5575
5576 SmallVector<Register, 4> WideRegs;
5577 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5578 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5579
5580 int NumElts = StoreVT.getNumElements();
5581
5582 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5583 .getReg(0);
5584 }
5585
5586 if (ImageStore && ST.hasImageStoreD16Bug()) {
5587 if (StoreVT.getNumElements() == 2) {
5588 SmallVector<Register, 4> PackedRegs;
5589 Reg = B.buildBitcast(S32, Reg).getReg(0);
5590 PackedRegs.push_back(Reg);
5591 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5592 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5593 .getReg(0);
5594 }
5595
5596 if (StoreVT.getNumElements() == 3) {
5597 SmallVector<Register, 4> PackedRegs;
5598 auto Unmerge = B.buildUnmerge(S16, Reg);
5599 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5600 PackedRegs.push_back(Unmerge.getReg(I));
5601 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5602 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5603 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5604 }
5605
5606 if (StoreVT.getNumElements() == 4) {
5607 SmallVector<Register, 4> PackedRegs;
5608 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5609 auto Unmerge = B.buildUnmerge(S32, Reg);
5610 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5611 PackedRegs.push_back(Unmerge.getReg(I));
5612 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5613 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5614 .getReg(0);
5615 }
5616
5617 llvm_unreachable("invalid data type");
5618 }
5619
5620 if (StoreVT == LLT::fixed_vector(3, S16)) {
5621 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5622 .getReg(0);
5623 }
5624 return Reg;
5625}
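// Illustrative sketch (not part of the in-tree lowering): on subtargets with
// the unpacked d16 layout, each 16-bit element of the data is any-extended
// into its own 32-bit register, as in this plain-integer picture (the high
// halves are don't-care, shown here as zero):
static void unpackedD16Sketch(const uint16_t *Elts, unsigned NumElts,
                              uint32_t *WideRegs) {
  for (unsigned I = 0; I != NumElts; ++I)
    WideRegs[I] = Elts[I]; // value lives in the low 16 bits of each dword
}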
5626
5627Register AMDGPULegalizerInfo::fixStoreSourceType(
5628 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5629 MachineRegisterInfo *MRI = B.getMRI();
5630 LLT Ty = MRI->getType(VData);
5631
5632 const LLT S16 = LLT::scalar(16);
5633
5634 // Fixup buffer resources themselves needing to be v4i32.
5635 if (hasBufferRsrcWorkaround(Ty))
5636 return castBufferRsrcToV4I32(VData, B);
5637
5638 // Fixup illegal register types for i8 stores.
5639 if (Ty == LLT::scalar(8) || Ty == S16) {
5640 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5641 return AnyExt;
5642 }
5643
5644 if (Ty.isVector()) {
5645 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5646 if (IsFormat)
5647 return handleD16VData(B, *MRI, VData);
5648 }
5649 }
5650
5651 return VData;
5652}
5653
5654bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5655 MachineRegisterInfo &MRI,
5656 MachineIRBuilder &B,
5657 bool IsTyped,
5658 bool IsFormat) const {
5659 Register VData = MI.getOperand(1).getReg();
5660 LLT Ty = MRI.getType(VData);
5661 LLT EltTy = Ty.getScalarType();
5662 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5663 const LLT S32 = LLT::scalar(32);
5664
5665 VData = fixStoreSourceType(B, VData, IsFormat);
5666 castBufferRsrcArgToV4I32(MI, B, 2);
5667 Register RSrc = MI.getOperand(2).getReg();
5668
5669 MachineMemOperand *MMO = *MI.memoperands_begin();
5670 const int MemSize = MMO->getSize().getValue();
5671
5672 unsigned ImmOffset;
5673
5674 // The typed intrinsics add an immediate after the registers.
5675 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5676
5677 // The struct intrinsic variants add one additional operand over raw.
5678 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5679 Register VIndex;
5680 int OpOffset = 0;
5681 if (HasVIndex) {
5682 VIndex = MI.getOperand(3).getReg();
5683 OpOffset = 1;
5684 } else {
5685 VIndex = B.buildConstant(S32, 0).getReg(0);
5686 }
5687
5688 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5689 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5690
5691 unsigned Format = 0;
5692 if (IsTyped) {
5693 Format = MI.getOperand(5 + OpOffset).getImm();
5694 ++OpOffset;
5695 }
5696
5697 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5698
5699 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5700
5701 unsigned Opc;
5702 if (IsTyped) {
5703 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5704 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5705 } else if (IsFormat) {
5706 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5707 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5708 } else {
5709 switch (MemSize) {
5710 case 1:
5711 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5712 break;
5713 case 2:
5714 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5715 break;
5716 default:
5717 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5718 break;
5719 }
5720 }
5721
5722 auto MIB = B.buildInstr(Opc)
5723 .addUse(VData) // vdata
5724 .addUse(RSrc) // rsrc
5725 .addUse(VIndex) // vindex
5726 .addUse(VOffset) // voffset
5727 .addUse(SOffset) // soffset
5728 .addImm(ImmOffset); // offset(imm)
5729
5730 if (IsTyped)
5731 MIB.addImm(Format);
5732
5733 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5734 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5735 .addMemOperand(MMO);
5736
5737 MI.eraseFromParent();
5738 return true;
5739}
5740
5741static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5742 Register VIndex, Register VOffset, Register SOffset,
5743 unsigned ImmOffset, unsigned Format,
5744 unsigned AuxiliaryData, MachineMemOperand *MMO,
5745 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5746 auto MIB = B.buildInstr(Opc)
5747 .addDef(LoadDstReg) // vdata
5748 .addUse(RSrc) // rsrc
5749 .addUse(VIndex) // vindex
5750 .addUse(VOffset) // voffset
5751 .addUse(SOffset) // soffset
5752 .addImm(ImmOffset); // offset(imm)
5753
5754 if (IsTyped)
5755 MIB.addImm(Format);
5756
5757 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5758 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5759 .addMemOperand(MMO);
5760}
5761
5762bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5763 MachineRegisterInfo &MRI,
5764 MachineIRBuilder &B,
5765 bool IsFormat,
5766 bool IsTyped) const {
5767 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5768 MachineMemOperand *MMO = *MI.memoperands_begin();
5769 const LLT MemTy = MMO->getMemoryType();
5770 const LLT S32 = LLT::scalar(32);
5771
5772 Register Dst = MI.getOperand(0).getReg();
5773
5774 Register StatusDst;
5775 int OpOffset = 0;
5776 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5777 bool IsTFE = MI.getNumExplicitDefs() == 2;
5778 if (IsTFE) {
5779 StatusDst = MI.getOperand(1).getReg();
5780 ++OpOffset;
5781 }
5782
5783 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5784 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5785
5786 // The typed intrinsics add an immediate after the registers.
5787 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5788
5789 // The struct intrinsic variants add one additional operand over raw.
5790 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5791 Register VIndex;
5792 if (HasVIndex) {
5793 VIndex = MI.getOperand(3 + OpOffset).getReg();
5794 ++OpOffset;
5795 } else {
5796 VIndex = B.buildConstant(S32, 0).getReg(0);
5797 }
5798
5799 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5800 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5801
5802 unsigned Format = 0;
5803 if (IsTyped) {
5804 Format = MI.getOperand(5 + OpOffset).getImm();
5805 ++OpOffset;
5806 }
5807
5808 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5809 unsigned ImmOffset;
5810
5811 LLT Ty = MRI.getType(Dst);
5812 // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
5813 // logic doesn't have to handle that case.
5814 if (hasBufferRsrcWorkaround(Ty)) {
5815 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5816 Dst = MI.getOperand(0).getReg();
5817 }
5818 LLT EltTy = Ty.getScalarType();
5819 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5820 const bool Unpacked = ST.hasUnpackedD16VMem();
5821
5822 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5823
5824 unsigned Opc;
5825
5826 // TODO: Support TFE for typed and narrow loads.
5827 if (IsTyped) {
5828 if (IsTFE)
5829 return false;
5830 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5831 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5832 } else if (IsFormat) {
5833 if (IsD16) {
5834 if (IsTFE)
5835 return false;
5836 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5837 } else {
5838 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5839 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5840 }
5841 } else {
5842 if (IsTFE)
5843 return false;
5844 switch (MemTy.getSizeInBits()) {
5845 case 8:
5846 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5847 break;
5848 case 16:
5849 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5850 break;
5851 default:
5852 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
5853 break;
5854 }
5855 }
5856
5857 if (IsTFE) {
5858 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5859 unsigned NumLoadDWords = NumValueDWords + 1;
5860 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5861 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5862 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5863 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5864 if (NumValueDWords == 1) {
5865 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5866 } else {
5867 SmallVector<Register, 5> LoadElts;
5868 for (unsigned I = 0; I != NumValueDWords; ++I)
5869 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5870 LoadElts.push_back(StatusDst);
5871 B.buildUnmerge(LoadElts, LoadDstReg);
5872 LoadElts.truncate(NumValueDWords);
5873 B.buildMergeLikeInstr(Dst, LoadElts);
5874 }
5875 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5876 (IsD16 && !Ty.isVector())) {
5877 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5878 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5879 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5880 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5881 B.buildTrunc(Dst, LoadDstReg);
5882 } else if (Unpacked && IsD16 && Ty.isVector()) {
5883 LLT UnpackedTy = Ty.changeElementSize(32);
5884 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5885 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5886 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5887 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5888 // FIXME: G_TRUNC should work, but legalization currently fails
5889 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
5890 SmallVector<Register, 4> Repack;
5891 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
5892 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5893 B.buildMergeLikeInstr(Dst, Repack);
5894 } else {
5895 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5896 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5897 }
5898
5899 MI.eraseFromParent();
5900 return true;
5901}
5902
5903static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
5904 switch (IntrID) {
5905 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5907 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5908 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5909 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5910 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5911 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5912 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5913 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5914 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5915 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5916 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5917 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5918 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5919 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5920 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5921 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5922 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5923 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5924 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5925 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5926 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5927 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5929 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5930 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5931 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5932 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5933 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5934 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5935 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5936 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5937 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5938 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5939 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5940 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5941 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5942 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5943 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5944 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5945 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5946 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5947 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5949 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5950 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5951 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5952 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5954 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5955 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5956 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5957 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5959 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5960 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5961 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5962 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5963 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5964 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5965 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5966 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5967 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5968 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5969 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5970 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5971 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5972 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5973 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5974 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5975 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
5976 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
5977 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
5978 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5979 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5980 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5981 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5982 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5983 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5984 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5985 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5986 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5987 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5988 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
5989 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
5990 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
5991 default:
5992 llvm_unreachable("unhandled atomic opcode");
5993 }
5994}
5995
5996bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5997 MachineIRBuilder &B,
5998 Intrinsic::ID IID) const {
5999 const bool IsCmpSwap =
6000 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6001 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6002 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6003 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6004
6005 Register Dst = MI.getOperand(0).getReg();
6006 // Since we don't have 128-bit atomics, we don't need to handle the case of
6007 // p8 arguments to the atomic itself.
6008 Register VData = MI.getOperand(2).getReg();
6009
6010 Register CmpVal;
6011 int OpOffset = 0;
6012
6013 if (IsCmpSwap) {
6014 CmpVal = MI.getOperand(3).getReg();
6015 ++OpOffset;
6016 }
6017
6018 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6019 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6020 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6021
6022 // The struct intrinsic variants add one additional operand over raw.
6023 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6024 Register VIndex;
6025 if (HasVIndex) {
6026 VIndex = MI.getOperand(4 + OpOffset).getReg();
6027 ++OpOffset;
6028 } else {
6029 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6030 }
6031
6032 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6033 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6034 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6035
6036 MachineMemOperand *MMO = *MI.memoperands_begin();
6037
6038 unsigned ImmOffset;
6039 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6040
6041 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6042 .addDef(Dst)
6043 .addUse(VData); // vdata
6044
6045 if (IsCmpSwap)
6046 MIB.addReg(CmpVal);
6047
6048 MIB.addUse(RSrc) // rsrc
6049 .addUse(VIndex) // vindex
6050 .addUse(VOffset) // voffset
6051 .addUse(SOffset) // soffset
6052 .addImm(ImmOffset) // offset(imm)
6053 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6054 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6055 .addMemOperand(MMO);
6056
6057 MI.eraseFromParent();
6058 return true;
6059}
6060
6061/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6062/// vector with s16 typed elements.
6063static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6064 SmallVectorImpl<Register> &PackedAddrs,
6065 unsigned ArgOffset,
6066 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6067 bool IsA16, bool IsG16) {
6068 const LLT S16 = LLT::scalar(16);
6069 const LLT V2S16 = LLT::fixed_vector(2, 16);
6070 auto EndIdx = Intr->VAddrEnd;
6071
6072 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6073 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6074 if (!SrcOp.isReg())
6075 continue; // _L to _LZ may have eliminated this.
6076
6077 Register AddrReg = SrcOp.getReg();
6078
6079 if ((I < Intr->GradientStart) ||
6080 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6081 (I >= Intr->CoordStart && !IsA16)) {
6082 if ((I < Intr->GradientStart) && IsA16 &&
6083 (B.getMRI()->getType(AddrReg) == S16)) {
6084 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6085 // Special handling of bias when A16 is on. Bias is of type half but
6086 // occupies full 32-bit.
6087 PackedAddrs.push_back(
6088 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6089 .getReg(0));
6090 } else {
6091 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6092 "Bias needs to be converted to 16 bit in A16 mode");
6093 // Handle any gradient or coordinate operands that should not be packed
6094 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6095 PackedAddrs.push_back(AddrReg);
6096 }
6097 } else {
6098 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6099 // derivatives dx/dh and dx/dv are packed with undef.
6100 if (((I + 1) >= EndIdx) ||
6101 ((Intr->NumGradients / 2) % 2 == 1 &&
6102 (I == static_cast<unsigned>(Intr->GradientStart +
6103 (Intr->NumGradients / 2) - 1) ||
6104 I == static_cast<unsigned>(Intr->GradientStart +
6105 Intr->NumGradients - 1))) ||
6106 // Check for _L to _LZ optimization
6107 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6108 PackedAddrs.push_back(
6109 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6110 .getReg(0));
6111 } else {
6112 PackedAddrs.push_back(
6113 B.buildBuildVector(
6114 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6115 .getReg(0));
6116 ++I;
6117 }
6118 }
6119 }
6120}
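// Illustrative sketch (not part of the in-tree lowering): the packing above
// places two 16-bit address components per dword and pads an odd trailing
// component with an undefined high half (zero in this plain-integer sketch).
// The special cases for the bias operand and odd gradient counts handled
// above are intentionally ignored here.
static unsigned packAddrComponentsSketch(const uint16_t *Coords, unsigned N,
                                         uint32_t *Dwords) {
  unsigned NumDwords = 0;
  for (unsigned I = 0; I < N; I += 2) {
    uint32_t Lo = Coords[I];
    uint32_t Hi = (I + 1 < N) ? Coords[I + 1] : 0; // undef in the real lowering
    Dwords[NumDwords++] = Lo | (Hi << 16);
  }
  return NumDwords;
}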
6121
6122/// Convert from separate vaddr components to a single vector address register,
6123/// and replace the remaining operands with $noreg.
6124static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6125 int DimIdx, int NumVAddrs) {
6126 const LLT S32 = LLT::scalar(32);
6127 (void)S32;
6128 SmallVector<Register, 8> AddrRegs;
6129 for (int I = 0; I != NumVAddrs; ++I) {
6130 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6131 if (SrcOp.isReg()) {
6132 AddrRegs.push_back(SrcOp.getReg());
6133 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6134 }
6135 }
6136
6137 int NumAddrRegs = AddrRegs.size();
6138 if (NumAddrRegs != 1) {
6139 auto VAddr =
6140 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6141 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6142 }
6143
6144 for (int I = 1; I != NumVAddrs; ++I) {
6145 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6146 if (SrcOp.isReg())
6147 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6148 }
6149}
6150
6151/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6152///
6153/// Depending on the subtarget, load/store with 16-bit element data need to be
6154/// rewritten to use the low half of 32-bit registers, or directly use a packed
6155/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6156/// registers.
6157///
6158/// We don't want to directly select image instructions just yet, but also want
6159/// to expose all register repacking to the legalizer/combiners. We also don't
6160/// want a selected instruction entering RegBankSelect. In order to avoid
6161/// defining a multitude of intermediate image instructions, directly hack on
6162/// the intrinsic's arguments. In cases like a16 addresses, this requires
6163/// padding now unnecessary arguments with $noreg.
6164bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6165 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6166 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6167
6168 const MachineFunction &MF = *MI.getMF();
6169 const unsigned NumDefs = MI.getNumExplicitDefs();
6170 const unsigned ArgOffset = NumDefs + 1;
6171 bool IsTFE = NumDefs == 2;
6172 // We are only processing the operands of d16 image operations on subtargets
6173 // that use the unpacked register layout, or need to repack the TFE result.
6174
6175 // TODO: Do we need to guard against already legalized intrinsics?
6176 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6178 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6179 MachineRegisterInfo *MRI = B.getMRI();
6180 const LLT S32 = LLT::scalar(32);
6181 const LLT S16 = LLT::scalar(16);
6182 const LLT V2S16 = LLT::fixed_vector(2, 16);
6183
6184 unsigned DMask = 0;
6185 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6186 LLT Ty = MRI->getType(VData);
6187
6188 const bool IsAtomicPacked16Bit =
6189 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6190 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6191
6192 // Check for 16 bit addresses and pack if true.
6193 LLT GradTy =
6194 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6195 LLT AddrTy =
6196 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6197 const bool IsG16 =
6198 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6199 const bool IsA16 = AddrTy == S16;
6200 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6201
6202 int DMaskLanes = 0;
6203 if (!BaseOpcode->Atomic) {
6204 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6205 if (BaseOpcode->Gather4) {
6206 DMaskLanes = 4;
6207 } else if (DMask != 0) {
6208 DMaskLanes = llvm::popcount(DMask);
6209 } else if (!IsTFE && !BaseOpcode->Store) {
6210 // If dmask is 0, this is a no-op load. This can be eliminated.
6211 B.buildUndef(MI.getOperand(0));
6212 MI.eraseFromParent();
6213 return true;
6214 }
6215 }
6216
6217 Observer.changingInstr(MI);
6218 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6219
6220 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6221 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6222 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6223 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6224 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
6225
6226 // Track that we legalized this
6227 MI.setDesc(B.getTII().get(NewOpcode));
6228
6229 // Expecting to get an error flag since TFC is on and dmask is 0. Force
6230 // dmask to be at least 1, otherwise the instruction will fail.
6231 if (IsTFE && DMask == 0) {
6232 DMask = 0x1;
6233 DMaskLanes = 1;
6234 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6235 }
6236
6237 if (BaseOpcode->Atomic) {
6238 Register VData0 = MI.getOperand(2).getReg();
6239 LLT Ty = MRI->getType(VData0);
6240
6241 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6242 if (Ty.isVector() && !IsAtomicPacked16Bit)
6243 return false;
6244
6245 if (BaseOpcode->AtomicX2) {
6246 Register VData1 = MI.getOperand(3).getReg();
6247 // The two values are packed in one register.
6248 LLT PackedTy = LLT::fixed_vector(2, Ty);
6249 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6250 MI.getOperand(2).setReg(Concat.getReg(0));
6251 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6252 }
6253 }
6254
6255 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6256
6257 // Rewrite the addressing register layout before doing anything else.
6258 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6259 // 16 bit gradients are supported, but are tied to the A16 control
6260 // so both gradients and addresses must be 16 bit
6261 return false;
6262 }
6263
6264 if (IsA16 && !ST.hasA16()) {
6265 // A16 not supported
6266 return false;
6267 }
6268
6269 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6270 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6271
6272 if (IsA16 || IsG16) {
6273 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6274 // instructions expect VGPR_32
6275 SmallVector<Register, 4> PackedRegs;
6276
6277 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6278
6279 // See also below in the non-a16 branch
6280 const bool UseNSA = ST.hasNSAEncoding() &&
6281 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6282 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6283 const bool UsePartialNSA =
6284 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6285
6286 if (UsePartialNSA) {
6287 // Pack registers that would go over NSAMaxSize into last VAddr register
6288 LLT PackedAddrTy =
6289 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6290 auto Concat = B.buildConcatVectors(
6291 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6292 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6293 PackedRegs.resize(NSAMaxSize);
6294 } else if (!UseNSA && PackedRegs.size() > 1) {
6295 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6296 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6297 PackedRegs[0] = Concat.getReg(0);
6298 PackedRegs.resize(1);
6299 }
6300
6301 const unsigned NumPacked = PackedRegs.size();
6302 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6303 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6304 if (!SrcOp.isReg()) {
6305 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6306 continue;
6307 }
6308
6309 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6310
6311 if (I - Intr->VAddrStart < NumPacked)
6312 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6313 else
6314 SrcOp.setReg(AMDGPU::NoRegister);
6315 }
6316 } else {
6317 // If the register allocator cannot place the address registers contiguously
6318 // without introducing moves, then using the non-sequential address encoding
6319 // is always preferable, since it saves VALU instructions and is usually a
6320 // wash in terms of code size or even better.
6321 //
6322 // However, we currently have no way of hinting to the register allocator
6323 // that MIMG addresses should be placed contiguously when it is possible to
6324 // do so, so force non-NSA for the common 2-address case as a heuristic.
6325 //
6326 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6327 // allocation when possible.
6328 //
6329 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6330 // set of the remaining addresses.
6331 const bool UseNSA = ST.hasNSAEncoding() &&
6332 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6333 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6334 const bool UsePartialNSA =
6335 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6336
6337 if (UsePartialNSA) {
6338 convertImageAddrToPacked(B, MI,
6339 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6340 Intr->NumVAddrs - NSAMaxSize + 1);
6341 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6342 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6343 Intr->NumVAddrs);
6344 }
6345 }
6346
6347 int Flags = 0;
6348 if (IsA16)
6349 Flags |= 1;
6350 if (IsG16)
6351 Flags |= 2;
6352 MI.addOperand(MachineOperand::CreateImm(Flags));
6353
6354 if (BaseOpcode->Store) { // No TFE for stores?
6355 // TODO: Handle dmask trim
6356 if (!Ty.isVector() || !IsD16)
6357 return true;
6358
6359 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6360 if (RepackedReg != VData) {
6361 MI.getOperand(1).setReg(RepackedReg);
6362 }
6363
6364 return true;
6365 }
6366
6367 Register DstReg = MI.getOperand(0).getReg();
6368 const LLT EltTy = Ty.getScalarType();
6369 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6370
6371 // Confirm that the return type is large enough for the dmask specified
6372 if (NumElts < DMaskLanes)
6373 return false;
6374
6375 if (NumElts > 4 || DMaskLanes > 4)
6376 return false;
6377
6378 // Image atomic instructions use DMask to specify how many bits the
6379 // input/output data will have: 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
6380 // DMaskLanes for image atomic has default value '0'.
6381 // We must be sure that atomic variants (especially packed) will not be
6382 // truncated from v2s16 or v4s16 to s16 type.
6383 //
6384 // ChangeElementCount will be needed for image load where Ty is always scalar.
6385 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6386 const LLT AdjustedTy =
6387 DMaskLanes == 0
6388 ? Ty
6389 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6390
6391 // The raw dword aligned data component of the load. The only legal cases
6392 // where this matters should be when using the packed D16 format, for
6393 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
6394 LLT RoundedTy;
6395
6396 // S32 vector to cover all data, plus TFE result element.
6397 LLT TFETy;
6398
6399 // Register type to use for each loaded component. Will be S32 or V2S16.
6400 LLT RegTy;
6401
6402 if (IsD16 && ST.hasUnpackedD16VMem()) {
6403 RoundedTy =
6404 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6405 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6406 RegTy = S32;
6407 } else {
6408 unsigned EltSize = EltTy.getSizeInBits();
6409 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6410 unsigned RoundedSize = 32 * RoundedElts;
6411 RoundedTy = LLT::scalarOrVector(
6412 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6413 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6414 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6415 }
6416
6417 // The return type does not need adjustment.
6418 // TODO: Should we change s16 case to s32 or <2 x s16>?
6419 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6420 return true;
6421
6422 Register Dst1Reg;
6423
6424 // Insert after the instruction.
6425 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6426
6427 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6428 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6429 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6430 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6431
6432 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6433
6434 MI.getOperand(0).setReg(NewResultReg);
6435
6436 // In the IR, TFE is supposed to be used with a 2 element struct return
6437 // type. The instruction really returns these two values in one contiguous
6438 // register, with one additional dword beyond the loaded data. Rewrite the
6439 // return type to use a single register result.
6440
6441 if (IsTFE) {
6442 Dst1Reg = MI.getOperand(1).getReg();
6443 if (MRI->getType(Dst1Reg) != S32)
6444 return false;
6445
6446 // TODO: Make sure the TFE operand bit is set.
6447 MI.removeOperand(1);
6448
6449 // Handle the easy case that requires no repack instructions.
6450 if (Ty == S32) {
6451 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6452 return true;
6453 }
6454 }
6455
6456 // Now figure out how to copy the new result register back into the old
6457 // result.
6458 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6459
6460 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6461
6462 if (ResultNumRegs == 1) {
6463 assert(!IsTFE);
6464 ResultRegs[0] = NewResultReg;
6465 } else {
6466 // We have to repack into a new vector of some kind.
6467 for (int I = 0; I != NumDataRegs; ++I)
6468 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6469 B.buildUnmerge(ResultRegs, NewResultReg);
6470
6471 // Drop the final TFE element to get the data part. The TFE result is
6472 // directly written to the right place already.
6473 if (IsTFE)
6474 ResultRegs.resize(NumDataRegs);
6475 }
6476
6477 // For an s16 scalar result, we form an s32 result with a truncate regardless
6478 // of packed vs. unpacked.
6479 if (IsD16 && !Ty.isVector()) {
6480 B.buildTrunc(DstReg, ResultRegs[0]);
6481 return true;
6482 }
6483
6484 // Avoid a build/concat_vector of 1 entry.
6485 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6486 B.buildBitcast(DstReg, ResultRegs[0]);
6487 return true;
6488 }
6489
6490 assert(Ty.isVector());
6491
6492 if (IsD16) {
6493 // For packed D16 results with TFE enabled, all the data components are
6494 // S32. Cast back to the expected type.
6495 //
6496 // TODO: We don't really need to load s32 elements. We would only need one
6497 // cast for the TFE result if a multiple of v2s16 was used.
6498 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6499 for (Register &Reg : ResultRegs)
6500 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6501 } else if (ST.hasUnpackedD16VMem()) {
6502 for (Register &Reg : ResultRegs)
6503 Reg = B.buildTrunc(S16, Reg).getReg(0);
6504 }
6505 }
6506
6507 auto padWithUndef = [&](LLT Ty, int NumElts) {
6508 if (NumElts == 0)
6509 return;
6510 Register Undef = B.buildUndef(Ty).getReg(0);
6511 for (int I = 0; I != NumElts; ++I)
6512 ResultRegs.push_back(Undef);
6513 };
6514
6515 // Pad out any elements eliminated due to the dmask.
6516 LLT ResTy = MRI->getType(ResultRegs[0]);
6517 if (!ResTy.isVector()) {
6518 padWithUndef(ResTy, NumElts - ResultRegs.size());
6519 B.buildBuildVector(DstReg, ResultRegs);
6520 return true;
6521 }
6522
6523 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6524 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6525
6526 // Deal with the one annoying legal case.
6527 const LLT V3S16 = LLT::fixed_vector(3, 16);
6528 if (Ty == V3S16) {
6529 if (IsTFE) {
6530 if (ResultRegs.size() == 1) {
6531 NewResultReg = ResultRegs[0];
6532 } else if (ResultRegs.size() == 2) {
6533 LLT V4S16 = LLT::fixed_vector(4, 16);
6534 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6535 } else {
6536 return false;
6537 }
6538 }
6539
6540 if (MRI->getType(DstReg).getNumElements() <
6541 MRI->getType(NewResultReg).getNumElements()) {
6542 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6543 } else {
6544 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6545 }
6546 return true;
6547 }
6548
6549 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6550 B.buildConcatVectors(DstReg, ResultRegs);
6551 return true;
6552}
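// Illustrative sketch (not part of the in-tree lowering): with TFE the
// instruction returns the loaded data dwords plus one trailing status dword in
// a single contiguous result, which the code above splits back into the IR's
// two struct fields. In plain-integer form:
static void splitTfeResultSketch(const uint32_t *LoadedDwords,
                                 unsigned NumDataDwords, uint32_t *Data,
                                 uint32_t &TfeStatus) {
  for (unsigned I = 0; I != NumDataDwords; ++I)
    Data[I] = LoadedDwords[I];             // data component of the struct
  TfeStatus = LoadedDwords[NumDataDwords]; // trailing error/status dword
}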
6553
6554bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6555 MachineInstr &MI) const {
6556 MachineIRBuilder &B = Helper.MIRBuilder;
6557 GISelChangeObserver &Observer = Helper.Observer;
6558
6559 Register OrigDst = MI.getOperand(0).getReg();
6560 Register Dst;
6561 LLT Ty = B.getMRI()->getType(OrigDst);
6562 unsigned Size = Ty.getSizeInBits();
6563 MachineFunction &MF = B.getMF();
6564 unsigned Opc = 0;
6565 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6566 assert(Size == 8 || Size == 16);
6567 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6568 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6569 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6570 // destination register.
6571 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6572 } else {
6573 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6574 Dst = OrigDst;
6575 }
6576
6577 Observer.changingInstr(MI);
6578
6579 // Handle needing to s.buffer.load() a p8 value.
6580 if (hasBufferRsrcWorkaround(Ty)) {
6581 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6582 B.setInsertPt(B.getMBB(), MI);
6583 }
6584 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6585 Ty = getBitcastRegisterType(Ty);
6586 Helper.bitcastDst(MI, Ty, 0);
6587 B.setInsertPt(B.getMBB(), MI);
6588 }
6589
6590 // FIXME: We don't really need this intermediate instruction. The intrinsic
6591 // should be fixed to have a memory operand. Since it's readnone, we're not
6592 // allowed to add one.
6593 MI.setDesc(B.getTII().get(Opc));
6594 MI.removeOperand(1); // Remove intrinsic ID
6595
6596 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6597 // TODO: Should this use datalayout alignment?
6598 const unsigned MemSize = (Size + 7) / 8;
6599 const Align MemAlign(std::min(MemSize, 4u));
6600 MachineMemOperand *MMO = MF.getMachineMemOperand(
6601 MachinePointerInfo(),
6602 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6603 MachineMemOperand::MOInvariant,
6604 MemSize, MemAlign);
6605 MI.addMemOperand(MF, MMO);
6606 if (Dst != OrigDst) {
6607 MI.getOperand(0).setReg(Dst);
6608 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6609 B.buildTrunc(OrigDst, Dst);
6610 }
6611
6612 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6613 // always be legal. We may need to restore this to a 96-bit result if it turns
6614 // out this needs to be converted to a vector load during RegBankSelect.
6615 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6616 if (Ty.isVector())
6617 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6618 else
6619 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6620 }
6621
6622 Observer.changedInstr(MI);
6623 return true;
6624}
6625
6626// TODO: Move to selection
6627bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6628 MachineRegisterInfo &MRI,
6629 MachineIRBuilder &B) const {
6630 if (!ST.isTrapHandlerEnabled() ||
6631 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6632 return legalizeTrapEndpgm(MI, MRI, B);
6633
6634 return ST.supportsGetDoorbellID() ?
6635 legalizeTrapHsaQueuePtr(MI, MRI, B) : legalizeTrapHsa(MI, MRI, B);
6636}
6637
6638bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6639 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6640 const DebugLoc &DL = MI.getDebugLoc();
6641 MachineBasicBlock &BB = B.getMBB();
6642 MachineFunction *MF = BB.getParent();
6643
6644 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6645 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6646 .addImm(0);
6647 MI.eraseFromParent();
6648 return true;
6649 }
6650
6651 // We need a block split to make the real endpgm a terminator. We also don't
6652 // want to break phis in successor blocks, so we can't just delete to the
6653 // end of the block.
6654 BB.splitAt(MI, false /*UpdateLiveIns*/);
6655 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6656 MF->push_back(TrapBB);
6657 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6658 .addImm(0);
6659 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6660 .addMBB(TrapBB);
6661
6662 BB.addSuccessor(TrapBB);
6663 MI.eraseFromParent();
6664 return true;
6665}
6666
6667bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6668 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6669 MachineFunction &MF = B.getMF();
6670 const LLT S64 = LLT::scalar(64);
6671
6672 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6673 // For code object version 5, queue_ptr is passed through implicit kernarg.
6679 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6680
6681 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6682 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6683
6684 if (!loadInputValue(KernargPtrReg, B,
6685 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6686 return false;
6687
6688 // TODO: can we be smarter about machine pointer info?
6691 PtrInfo,
6695
6696 // Pointer address
6697 Register LoadAddr = MRI.createGenericVirtualRegister(
6698 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6699 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6700 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6701 // Load address
6702 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6703 B.buildCopy(SGPR01, Temp);
6704 B.buildInstr(AMDGPU::S_TRAP)
6705 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6706 .addReg(SGPR01, RegState::Implicit);
6707 MI.eraseFromParent();
6708 return true;
6709 }
6710
6711 // Pass queue pointer to trap handler as input, and insert trap instruction
6712 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6713 Register LiveIn =
6714 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6715 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6716 return false;
6717
6718 B.buildCopy(SGPR01, LiveIn);
6719 B.buildInstr(AMDGPU::S_TRAP)
6720 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6721 .addReg(SGPR01, RegState::Implicit);
6722
6723 MI.eraseFromParent();
6724 return true;
6725}
6726
6727bool AMDGPULegalizerInfo::legalizeTrapHsa(
6728 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6729 B.buildInstr(AMDGPU::S_TRAP)
6730 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6731 MI.eraseFromParent();
6732 return true;
6733}
6734
6735bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
6736 MachineInstr &MI, MachineRegisterInfo &MRI,
6737 MachineIRBuilder &B) const {
6738 // If this is a non-HSA path or the trap handler is disabled, report a
6739 // warning accordingly.
6740 if (!ST.isTrapHandlerEnabled() ||
6742 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6743 "debugtrap handler not supported",
6744 MI.getDebugLoc(), DS_Warning);
6745 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6746 Ctx.diagnose(NoTrap);
6747 } else {
6748 // Insert debug-trap instruction
6749 B.buildInstr(AMDGPU::S_TRAP)
6750 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6751 }
6752
6753 MI.eraseFromParent();
6754 return true;
6755}
6756
6757bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6758 MachineIRBuilder &B) const {
6759 MachineRegisterInfo &MRI = *B.getMRI();
6760 const LLT S16 = LLT::scalar(16);
6761 const LLT S32 = LLT::scalar(32);
6762 const LLT V2S16 = LLT::fixed_vector(2, 16);
6763 const LLT V3S32 = LLT::fixed_vector(3, 32);
6764
6765 Register DstReg = MI.getOperand(0).getReg();
6766 Register NodePtr = MI.getOperand(2).getReg();
6767 Register RayExtent = MI.getOperand(3).getReg();
6768 Register RayOrigin = MI.getOperand(4).getReg();
6769 Register RayDir = MI.getOperand(5).getReg();
6770 Register RayInvDir = MI.getOperand(6).getReg();
6771 Register TDescr = MI.getOperand(7).getReg();
6772
6773 if (!ST.hasGFX10_AEncoding()) {
6774 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6775 "intrinsic not supported on subtarget",
6776 MI.getDebugLoc());
6777 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6778 return false;
6779 }
6780
6781 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6782 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6783 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6784 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6785 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6786 const unsigned NumVDataDwords = 4;
6787 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6788 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6789 const bool UseNSA =
6790 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6791
6792 const unsigned BaseOpcodes[2][2] = {
6793 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6794 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6795 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6796 int Opcode;
6797 if (UseNSA) {
6798 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6799 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6800 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6801 : AMDGPU::MIMGEncGfx10NSA,
6802 NumVDataDwords, NumVAddrDwords);
6803 } else {
6804 assert(!IsGFX12Plus);
6805 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6806 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6807 : AMDGPU::MIMGEncGfx10Default,
6808 NumVDataDwords, NumVAddrDwords);
6809 }
6810 assert(Opcode != -1);
6811
6813 if (UseNSA && IsGFX11Plus) {
6814 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6815 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6816 auto Merged = B.buildMergeLikeInstr(
6817 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6818 Ops.push_back(Merged.getReg(0));
6819 };
6820
6821 Ops.push_back(NodePtr);
6822 Ops.push_back(RayExtent);
6823 packLanes(RayOrigin);
6824
6825 if (IsA16) {
6826 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6827 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6828 auto MergedDir = B.buildMergeLikeInstr(
6829 V3S32,
6830 {B.buildBitcast(
6831 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
6832 UnmergeRayDir.getReg(0)}))
6833 .getReg(0),
6834 B.buildBitcast(
6835 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
6836 UnmergeRayDir.getReg(1)}))
6837 .getReg(0),
6838 B.buildBitcast(
6839 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
6840 UnmergeRayDir.getReg(2)}))
6841 .getReg(0)});
6842 Ops.push_back(MergedDir.getReg(0));
6843 } else {
6844 packLanes(RayDir);
6845 packLanes(RayInvDir);
6846 }
6847 } else {
6848 if (Is64) {
6849 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6850 Ops.push_back(Unmerge.getReg(0));
6851 Ops.push_back(Unmerge.getReg(1));
6852 } else {
6853 Ops.push_back(NodePtr);
6854 }
6855 Ops.push_back(RayExtent);
6856
6857 auto packLanes = [&Ops, &S32, &B](Register Src) {
6858 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6859 Ops.push_back(Unmerge.getReg(0));
6860 Ops.push_back(Unmerge.getReg(1));
6861 Ops.push_back(Unmerge.getReg(2));
6862 };
6863
6864 packLanes(RayOrigin);
6865 if (IsA16) {
6866 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6867 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6868 Register R1 = MRI.createGenericVirtualRegister(S32);
6869 Register R2 = MRI.createGenericVirtualRegister(S32);
6870 Register R3 = MRI.createGenericVirtualRegister(S32);
6871 B.buildMergeLikeInstr(R1,
6872 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6873 B.buildMergeLikeInstr(
6874 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6875 B.buildMergeLikeInstr(
6876 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6877 Ops.push_back(R1);
6878 Ops.push_back(R2);
6879 Ops.push_back(R3);
6880 } else {
6881 packLanes(RayDir);
6882 packLanes(RayInvDir);
6883 }
6884 }
6885
6886 if (!UseNSA) {
6887 // Build a single vector containing all the operands prepared so far.
6888 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6889 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6890 Ops.clear();
6891 Ops.push_back(MergedOps);
6892 }
6893
6894 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6895 .addDef(DstReg)
6896 .addImm(Opcode);
6897
6898 for (Register R : Ops) {
6899 MIB.addUse(R);
6900 }
6901
6902 MIB.addUse(TDescr)
6903 .addImm(IsA16 ? 1 : 0)
6904 .cloneMemRefs(MI);
6905
6906 MI.eraseFromParent();
6907 return true;
6908}
6909
6910bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
6911 MachineIRBuilder &B) const {
6912 unsigned Opc;
6913 int RoundMode = MI.getOperand(2).getImm();
6914
6915 if (RoundMode == (int)RoundingMode::TowardPositive)
6916 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
6917 else if (RoundMode == (int)RoundingMode::TowardNegative)
6918 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
6919 else
6920 return false;
6921
6922 B.buildInstr(Opc)
6923 .addDef(MI.getOperand(0).getReg())
6924 .addUse(MI.getOperand(1).getReg());
6925
6926 MI.eraseFromParent();
6927
6928 return true;
6929}
6930
6931bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
6932 MachineIRBuilder &B) const {
6933 const SITargetLowering *TLI = ST.getTargetLowering();
6934 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
6935 Register DstReg = MI.getOperand(0).getReg();
6936 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
6937 MI.eraseFromParent();
6938 return true;
6939}
6940
6941bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
6942 MachineIRBuilder &B) const {
6943 // With architected SGPRs, the wave ID within the group is in TTMP8[29:25].
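// Equivalently wave_id = (TTMP8 >> 25) & 0x1f, emitted below as a copy from
// TTMP8 followed by G_UBFX with lsb = 25 and width = 5.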
6944 if (!ST.hasArchitectedSGPRs())
6945 return false;
6946 LLT S32 = LLT::scalar(32);
6947 Register DstReg = MI.getOperand(0).getReg();
6948 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
6949 auto LSB = B.buildConstant(S32, 25);
6950 auto Width = B.buildConstant(S32, 5);
6951 B.buildUbfx(DstReg, TTMP8, LSB, Width);
6952 MI.eraseFromParent();
6953 return true;
6954}
6955
6956static constexpr unsigned FPEnvModeBitField =
6958
6959static constexpr unsigned FPEnvTrapBitField =
6961
6962bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
6963 MachineRegisterInfo &MRI,
6964 MachineIRBuilder &B) const {
6965 Register Src = MI.getOperand(0).getReg();
6966 if (MRI.getType(Src) != S64)
6967 return false;
6968
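// The s64 FP environment value is assembled from two s_getreg reads: the MODE
// register bits form the low half and the trap-status bits the high half.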
6969 auto ModeReg =
6970 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
6971 /*HasSideEffects=*/true, /*isConvergent=*/false)
6972 .addImm(FPEnvModeBitField);
6973 auto TrapReg =
6974 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
6975 /*HasSideEffects=*/true, /*isConvergent=*/false)
6976 .addImm(FPEnvTrapBitField);
6977 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
6978 MI.eraseFromParent();
6979 return true;
6980}
6981
6982bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
6983 MachineRegisterInfo &MRI,
6984 MachineIRBuilder &B) const {
6985 Register Src = MI.getOperand(0).getReg();
6986 if (MRI.getType(Src) != S64)
6987 return false;
6988
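// Mirror of legalizeGetFPEnv above: split the s64 value into its two halves
// and write them back with s_setreg, mode bits first, then the trap bits.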
6989 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
6990 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
6991 /*HasSideEffects=*/true, /*isConvergent=*/false)
6992 .addImm(static_cast<int16_t>(FPEnvModeBitField))
6993 .addReg(Unmerge.getReg(0));
6994 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
6995 /*HasSideEffects=*/true, /*isConvergent=*/false)
6996 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
6997 .addReg(Unmerge.getReg(1));
6998 MI.eraseFromParent();
6999 return true;
7000}
7001
7002bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7003 MachineInstr &MI) const {
7004 MachineIRBuilder &B = Helper.MIRBuilder;
7005 MachineRegisterInfo &MRI = *B.getMRI();
7006
7007 // Replace the G_BRCOND use with the exec-manipulating and branch pseudos.
7008 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7009 switch (IntrID) {
7010 case Intrinsic::amdgcn_if:
7011 case Intrinsic::amdgcn_else: {
7012 MachineInstr *Br = nullptr;
7013 MachineBasicBlock *UncondBrTarget = nullptr;
7014 bool Negated = false;
7015 if (MachineInstr *BrCond =
7016 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7017 const SIRegisterInfo *TRI
7018 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7019
7020 Register Def = MI.getOperand(1).getReg();
7021 Register Use = MI.getOperand(3).getReg();
7022
7023 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7024
7025 if (Negated)
7026 std::swap(CondBrTarget, UncondBrTarget);
7027
7028 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7029 if (IntrID == Intrinsic::amdgcn_if) {
7030 B.buildInstr(AMDGPU::SI_IF)
7031 .addDef(Def)
7032 .addUse(Use)
7033 .addMBB(UncondBrTarget);
7034 } else {
7035 B.buildInstr(AMDGPU::SI_ELSE)
7036 .addDef(Def)
7037 .addUse(Use)
7038 .addMBB(UncondBrTarget);
7039 }
7040
7041 if (Br) {
7042 Br->getOperand(0).setMBB(CondBrTarget);
7043 } else {
7044 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7045 // since we're swapping branch targets it needs to be reinserted.
7046 // FIXME: IRTranslator should probably not do this
7047 B.buildBr(*CondBrTarget);
7048 }
7049
7050 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7051 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7052 MI.eraseFromParent();
7053 BrCond->eraseFromParent();
7054 return true;
7055 }
7056
7057 return false;
7058 }
7059 case Intrinsic::amdgcn_loop: {
7060 MachineInstr *Br = nullptr;
7061 MachineBasicBlock *UncondBrTarget = nullptr;
7062 bool Negated = false;
7063 if (MachineInstr *BrCond =
7064 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7065 const SIRegisterInfo *TRI
7066 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7067
7068 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7069 Register Reg = MI.getOperand(2).getReg();
7070
7071 if (Negated)
7072 std::swap(CondBrTarget, UncondBrTarget);
7073
7074 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7075 B.buildInstr(AMDGPU::SI_LOOP)
7076 .addUse(Reg)
7077 .addMBB(UncondBrTarget);
7078
7079 if (Br)
7080 Br->getOperand(0).setMBB(CondBrTarget);
7081 else
7082 B.buildBr(*CondBrTarget);
7083
7084 MI.eraseFromParent();
7085 BrCond->eraseFromParent();
7086 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7087 return true;
7088 }
7089
7090 return false;
7091 }
7092 case Intrinsic::amdgcn_addrspacecast_nonnull:
7093 return legalizeAddrSpaceCast(MI, MRI, B);
7094 case Intrinsic::amdgcn_make_buffer_rsrc:
7095 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7096 case Intrinsic::amdgcn_kernarg_segment_ptr:
7097 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7098 // This only makes sense to call in a kernel, so just lower to null.
7099 B.buildConstant(MI.getOperand(0).getReg(), 0);
7100 MI.eraseFromParent();
7101 return true;
7102 }
7103
7104 return legalizePreloadedArgIntrin(
7105 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7106 case Intrinsic::amdgcn_implicitarg_ptr:
7107 return legalizeImplicitArgPtr(MI, MRI, B);
7108 case Intrinsic::amdgcn_workitem_id_x:
7109 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7110 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7111 case Intrinsic::amdgcn_workitem_id_y:
7112 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7113 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7114 case Intrinsic::amdgcn_workitem_id_z:
7115 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7116 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7117 case Intrinsic::amdgcn_workgroup_id_x:
7118 return legalizePreloadedArgIntrin(MI, MRI, B,
7119 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7120 case Intrinsic::amdgcn_workgroup_id_y:
7121 return legalizePreloadedArgIntrin(MI, MRI, B,
7122 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7123 case Intrinsic::amdgcn_workgroup_id_z:
7124 return legalizePreloadedArgIntrin(MI, MRI, B,
7125 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7126 case Intrinsic::amdgcn_wave_id:
7127 return legalizeWaveID(MI, B);
7128 case Intrinsic::amdgcn_lds_kernel_id:
7129 return legalizePreloadedArgIntrin(MI, MRI, B,
7130 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7131 case Intrinsic::amdgcn_dispatch_ptr:
7132 return legalizePreloadedArgIntrin(MI, MRI, B,
7133 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7134 case Intrinsic::amdgcn_queue_ptr:
7135 return legalizePreloadedArgIntrin(MI, MRI, B,
7136 AMDGPUFunctionArgInfo::QUEUE_PTR);
7137 case Intrinsic::amdgcn_implicit_buffer_ptr:
7138 return legalizePreloadedArgIntrin(MI, MRI, B,
7139 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7140 case Intrinsic::amdgcn_dispatch_id:
7141 return legalizePreloadedArgIntrin(MI, MRI, B,
7142 AMDGPUFunctionArgInfo::DISPATCH_ID);
7143 case Intrinsic::r600_read_ngroups_x:
7144 // TODO: Emit error for hsa
7145 return legalizeKernargMemParameter(MI, B,
7146 SI::KernelInputOffsets::NGROUPS_X);
7147 case Intrinsic::r600_read_ngroups_y:
7148 return legalizeKernargMemParameter(MI, B,
7149 SI::KernelInputOffsets::NGROUPS_Y);
7150 case Intrinsic::r600_read_ngroups_z:
7151 return legalizeKernargMemParameter(MI, B,
7152 SI::KernelInputOffsets::NGROUPS_Z);
7153 case Intrinsic::r600_read_local_size_x:
7154 // TODO: Could insert G_ASSERT_ZEXT from s16
7155 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7156 case Intrinsic::r600_read_local_size_y:
7157 // TODO: Could insert G_ASSERT_ZEXT from s16
7158 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7159 // TODO: Could insert G_ASSERT_ZEXT from s16
7160 case Intrinsic::r600_read_local_size_z:
7161 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7162 case Intrinsic::r600_read_global_size_x:
7163 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7164 case Intrinsic::r600_read_global_size_y:
7165 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7166 case Intrinsic::r600_read_global_size_z:
7167 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7168 case Intrinsic::amdgcn_fdiv_fast:
7169 return legalizeFDIVFastIntrin(MI, MRI, B);
7170 case Intrinsic::amdgcn_is_shared:
7171 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7172 case Intrinsic::amdgcn_is_private:
7173 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7174 case Intrinsic::amdgcn_wavefrontsize: {
7175 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7176 MI.eraseFromParent();
7177 return true;
7178 }
7179 case Intrinsic::amdgcn_s_buffer_load:
7180 return legalizeSBufferLoad(Helper, MI);
7181 case Intrinsic::amdgcn_raw_buffer_store:
7182 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7183 case Intrinsic::amdgcn_struct_buffer_store:
7184 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7185 return legalizeBufferStore(MI, MRI, B, false, false);
7186 case Intrinsic::amdgcn_raw_buffer_store_format:
7187 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7188 case Intrinsic::amdgcn_struct_buffer_store_format:
7189 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7190 return legalizeBufferStore(MI, MRI, B, false, true);
7191 case Intrinsic::amdgcn_raw_tbuffer_store:
7192 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7193 case Intrinsic::amdgcn_struct_tbuffer_store:
7194 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7195 return legalizeBufferStore(MI, MRI, B, true, true);
7196 case Intrinsic::amdgcn_raw_buffer_load:
7197 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7198 case Intrinsic::amdgcn_struct_buffer_load:
7199 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7200 return legalizeBufferLoad(MI, MRI, B, false, false);
7201 case Intrinsic::amdgcn_raw_buffer_load_format:
7202 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7203 case Intrinsic::amdgcn_struct_buffer_load_format:
7204 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7205 return legalizeBufferLoad(MI, MRI, B, true, false);
7206 case Intrinsic::amdgcn_raw_tbuffer_load:
7207 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7208 case Intrinsic::amdgcn_struct_tbuffer_load:
7209 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7210 return legalizeBufferLoad(MI, MRI, B, true, true);
7211 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7212 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7213 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7214 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7215 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7216 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7217 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7218 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7219 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7220 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7221 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7222 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7223 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7224 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7225 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7226 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7227 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7228 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7229 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7230 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7231 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7232 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7233 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7234 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7235 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7236 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7237 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7239 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7240 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7241 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7242 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7243 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7244 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7245 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7246 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7247 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7248 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7249 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7250 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7251 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7252 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7253 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7254 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7255 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7256 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7257 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7258 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7259 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7260 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7261 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7262 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7263 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7264 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7265 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7266 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7267 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7268 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7269 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7270 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7271 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7272 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7273 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7274 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7275 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
7276 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
7277 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
7278 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
7279 return legalizeBufferAtomic(MI, B, IntrID);
7280 case Intrinsic::amdgcn_rsq_clamp:
7281 return legalizeRsqClampIntrinsic(MI, MRI, B);
7282 case Intrinsic::amdgcn_ds_fadd:
7283 case Intrinsic::amdgcn_ds_fmin:
7284 case Intrinsic::amdgcn_ds_fmax:
7285 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
7286 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7287 return legalizeBVHIntrinsic(MI, B);
7288 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7289 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7290 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7291 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7292 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7293 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7294 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7295 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
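// The sparsity index operand (operand 5 here, operand 7 for the iu variants
// below) is any-extended to s32 if it is not already s32.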
7296 Register Index = MI.getOperand(5).getReg();
7297 LLT S32 = LLT::scalar(32);
7298 if (MRI.getType(Index) != S32)
7299 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7300 return true;
7301 }
7302 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7303 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7304 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7305 Register Index = MI.getOperand(7).getReg();
7306 LLT S32 = LLT::scalar(32);
7307 if (MRI.getType(Index) != S32)
7308 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7309 return true;
7310 }
7311 case Intrinsic::amdgcn_fmed3: {
7312 GISelChangeObserver &Observer = Helper.Observer;
7313
7314 // FIXME: This is to work around the inability of tablegen match combiners to
7315 // match intrinsics in patterns.
7316 Observer.changingInstr(MI);
7317 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7318 MI.removeOperand(1);
7319 Observer.changedInstr(MI);
7320 return true;
7321 }
7322 default: {
7323 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7324 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7325 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7326 return true;
7327 }
7328 }
7329
7330 return true;
7331}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
unsigned Intr
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static const LLT V3S64
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
static const LLT V16S16
static const LLT S128
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static const LLT V4S32
static const LLT V2S32
static const LLT V8S64
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static const LLT V12S32
static const LLT V8S32
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static const LLT V2S16
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static const LLT V4S64
static const LLT S1
static const LLT V3S32
static const LLT S64
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterType(LLT Ty)
static bool isRegisterVectorElementType(LLT EltTy)
static const LLT S32
static bool isRegisterSize(unsigned Size)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
static const LLT V6S32
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static std::initializer_list< LLT > AllS32Vectors
static const LLT V7S32
static const LLT V5S32
static const LLT V4S16
static const LLT V11S32
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static std::initializer_list< LLT > AllS16Vectors
static const LLT V32S32
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static const LLT V9S32
static const LLT V10S32
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static const LLT V12S16
static const LLT V16S64
static const LLT S512
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V16S32
static const LLT V7S64
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static bool isRegisterClassType(LLT Ty)
static const LLT V5S64
static const LLT S160
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static const LLT V4S128
static constexpr unsigned FPEnvTrapBitField
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx)
static const LLT V6S64
static constexpr unsigned MaxRegisterSize
static const LLT V2S8
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static const LLT MaxScalar
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
static std::initializer_list< LLT > AllS64Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static const LLT S96
static const LLT V2S64
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static const LLT S16
static const LLT V10S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static const LLT V2S128
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static std::initializer_list< LLT > AllScalarTypes
static const LLT S256
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static const LLT S8
static const LLT V6S16
static bool isRegisterVectorType(LLT Ty)
static const LLT S224
static const LLT V8S16
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static Error unsupported(const char *Str, const Triple &T)
Definition: MachO.cpp:71
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
#define R2(n)
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
ppc ctr loops verify
const char LLVMTargetMachineRef TM
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1174
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
Value * RHS
Value * LHS
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsTyped, bool IsFormat) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat, bool IsTyped) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1026
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1006
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:999
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:997
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:1017
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:1001
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:1000
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:1002
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:160
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
bool hasA16() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:463
bool hasArchitectedSGPRs() const
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:252
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:441
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:447
bool hasMad64_32() const
Definition: GCNSubtarget.h:730
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:443
bool hasIntClamp() const
Definition: GCNSubtarget.h:343
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:260
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:993
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:363
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:587
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:719
bool hasNSAEncoding() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:953
Generation getGeneration() const
Definition: GCNSubtarget.h:303
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:717
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:721
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:713
bool hasFractBug() const
Definition: GCNSubtarget.h:381
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
static constexpr LLT float64()
Get a 64-bit IEEE double value.
Definition: LowLevelType.h:94
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
Definition: LowLevelType.h:214
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelType.h:64
constexpr bool isPointerVector() const
Definition: LowLevelType.h:152
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelType.h:159
constexpr bool isVector() const
Definition: LowLevelType.h:148
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr bool isPointer() const
Definition: LowLevelType.h:149
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition: LowLevelType.h:290
constexpr ElementCount getElementCount() const
Definition: LowLevelType.h:184
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
static constexpr LLT float16()
Get a 16-bit IEEE half value.
Definition: LowLevelType.h:84
constexpr unsigned getAddressSpace() const
Definition: LowLevelType.h:280
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr bool isPointerOrPointerVector() const
Definition: LowLevelType.h:153
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
Definition: LowLevelType.h:230
constexpr LLT getScalarType() const
Definition: LowLevelType.h:208
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
Definition: LowLevelType.h:124
static constexpr LLT float32()
Get a 32-bit IEEE float value.
Definition: LowLevelType.h:89
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:546
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:329
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:556
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:387
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:412
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void truncate(size_type N)
Like resize, but requires that N is less than size().
Definition: SmallVector.h:657
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:164
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:271
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double inv_pi
Definition: MathExtras.h:38
constexpr double ln2
Definition: MathExtras.h:33
constexpr double ln10
Definition: MathExtras.h:34
constexpr float log2ef
Definition: MathExtras.h:50
constexpr double log2e
Definition: MathExtras.h:35
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:882
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:326
@ Offset
Definition: DWP.cpp:456
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition: Utils.cpp:625
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition: Utils.cpp:438
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
void * PointerTy
Definition: GenericValue.h:21
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:305
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
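Worked values for the bit utilities above (illustrative only):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void bitUtilityExamples() {
  assert(llvm::popcount(0b1011u) == 3);     // three set bits
  assert(llvm::bit_width(5u) == 3);         // 0b101 needs 3 bits
  assert(llvm::isPowerOf2_32(64) && !llvm::isPowerOf2_32(48));
  assert(llvm::PowerOf2Ceil(48) == 64);     // round up to a power of two
}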
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Add
Sum of integers.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition: Utils.cpp:1645
@ DS_Warning
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
Definition: Utils.cpp:413
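A hedged sketch of the constant-lookup helpers in a legalizer-style check. MI and MRI are assumed to come from surrounding code; treating operand 2 of G_SHL as the shift amount and the 32 threshold are illustrative choices:

#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

static bool hasSmallConstantShiftAmount(const llvm::MachineInstr &MI,
                                        const llvm::MachineRegisterInfo &MRI) {
  // Look through copies to a G_CONSTANT feeding the shift-amount operand.
  if (auto ValAndReg = llvm::getIConstantVRegValWithLookThrough(
          MI.getOperand(2).getReg(), MRI))
    return ValAndReg->Value.ult(32);
  return false;
}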
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
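Worked values for the alignment helpers above (illustrative):

#include "llvm/Support/Alignment.h"
#include <cassert>

void alignmentExamples() {
  llvm::Align Base(16);
  assert(llvm::commonAlignment(Base, 4) == llvm::Align(4));   // 16-aligned + 4 is only 4-aligned
  assert(llvm::commonAlignment(Base, 32) == llvm::Align(16)); // offset 32 keeps 16-byte alignment
  assert(llvm::Log2(Base) == 4);                              // 2^4 == 16
}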
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:349
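The power-of-two helpers differ in their rounding direction; worked values (illustrative):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void pow2Examples() {
  assert(llvm::bit_floor(48u) == 32u);    // largest power of two <= 48
  assert(llvm::PowerOf2Ceil(48) == 64);   // smallest power of two >= 48
  assert(llvm::NextPowerOf2(64) == 128);  // strictly greater than 64
  assert(llvm::PowerOf2Ceil(64) == 64);   // >= keeps an exact power of two
}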
@ Enable
Enable colors.
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static constexpr uint64_t encode(Fields... Values)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
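A small sketch tying together the fltSemantics and rounding-mode entries: constructing an APFloat in one semantics and converting it to another. The value 0.5 is an arbitrary choice:

#include "llvm/ADT/APFloat.h"
#include <cassert>

void fltSemanticsExample() {
  llvm::APFloat Val(llvm::APFloat::IEEEsingle(), "0.5");
  bool LosesInfo = false;
  // 0.5 is exactly representable in double, so the conversion is lossless.
  Val.convert(llvm::APFloat::IEEEdouble(),
              llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
  assert(!LosesInfo && !Val.isZero());
}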
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
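A hedged sketch querying a DenormalMode built from the helpers above; the question being asked (whether output denormals are flushed) is an illustrative example:

#include "llvm/ADT/FloatingPointMode.h"

bool flushesOutputDenormals(llvm::DenormalMode Mode) {
  // getIEEE() keeps denormals; PreserveSign/PositiveZero flush them to zero.
  return Mode.Output == llvm::DenormalMode::PreserveSign ||
         Mode.Output == llvm::DenormalMode::PositiveZero;
}
// flushesOutputDenormals(llvm::DenormalMode::getPreserveSign()) -> true
// flushesOutputDenormals(llvm::DenormalMode::getIEEE())         -> false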
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:77
The LegalityQuery object bundles together all the information that's needed to decide whether a given operation is legal or not.
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
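A hedged sketch of a custom predicate over a LegalityQuery that uses both the Types array and the MMODescrs memory descriptors listed above; the 32-bit size and alignment thresholds are made up:

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"

static bool isAlignedWordAccess(const llvm::LegalityQuery &Query) {
  if (Query.MMODescrs.empty())
    return false;
  const auto &MMO = Query.MMODescrs[0];
  return Query.Types[0].getSizeInBits() == 32 &&
         MMO.MemoryTy.getSizeInBits() == 32 && MMO.AlignInBits >= 32;
}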
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
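A hedged sketch pairing MachinePointerInfo::getGOT with MachineFunction::getMachineMemOperand to describe an invariant, dereferenceable 64-bit load from the GOT; the size and alignment are illustrative:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

llvm::MachineMemOperand *buildGOTLoadMMO(llvm::MachineFunction &MF) {
  return MF.getMachineMemOperand(
      llvm::MachinePointerInfo::getGOT(MF),
      llvm::MachineMemOperand::MOLoad |
          llvm::MachineMemOperand::MODereferenceable |
          llvm::MachineMemOperand::MOInvariant,
      llvm::LLT::scalar(64), llvm::Align(8));
}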
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering, and that quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.