1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
35#include "llvm/IR/IntrinsicsAMDGPU.h"
36#include "llvm/IR/IntrinsicsR600.h"
37
38#define DEBUG_TYPE "amdgpu-legalinfo"
39
40using namespace llvm;
41using namespace LegalizeActions;
42using namespace LegalizeMutations;
43using namespace LegalityPredicates;
44using namespace MIPatternMatch;
45
46// Hack until load/store selection patterns support any tuple of legal types.
47static cl::opt<bool> EnableNewLegality(
48 "amdgpu-global-isel-new-legality",
49 cl::desc("Use GlobalISel desired legality, rather than try to use "
50 "rules compatible with selection patterns"),
51 cl::init(false),
52 cl::ReallyHidden);
53
54static constexpr unsigned MaxRegisterSize = 1024;
55
56// Round the number of elements to the next power of two elements
57static LLT getPow2VectorType(LLT Ty) {
58 unsigned NElts = Ty.getNumElements();
59 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
60 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
61}
62
63// Round the number of bits to the next power of two bits
64static LLT getPow2ScalarType(LLT Ty) {
65 unsigned Bits = Ty.getSizeInBits();
66 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
67 return LLT::scalar(Pow2Bits);
68}
69
70/// \returns true if this is an odd sized vector which should widen by adding an
71/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
72/// excludes s1 vectors, which should always be scalarized.
73static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
74 return [=](const LegalityQuery &Query) {
75 const LLT Ty = Query.Types[TypeIdx];
76 if (!Ty.isVector())
77 return false;
78
79 const LLT EltTy = Ty.getElementType();
80 const unsigned EltSize = EltTy.getSizeInBits();
81 return Ty.getNumElements() % 2 != 0 &&
82 EltSize > 1 && EltSize < 32 &&
83 Ty.getSizeInBits() % 32 != 0;
84 };
85}
86
87static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 return Ty.getSizeInBits() % 32 == 0;
91 };
92}
93
94static LegalityPredicate isWideVec16(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getScalarType();
98 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
99 };
100}
101
102static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
103 return [=](const LegalityQuery &Query) {
104 const LLT Ty = Query.Types[TypeIdx];
105 const LLT EltTy = Ty.getElementType();
106 return std::pair(TypeIdx,
107 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
108 };
109}
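// Illustrative pairing (not from the original file): predicates and mutations
// such as isSmallOddVector and oneMoreElement are consumed together by the
// rule builder, e.g.
//
//   getActionDefinitionsBuilder(G_FMINNUM_IEEE)
//       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0));
//
// so a query with Types[0] == <3 x s16> matches the predicate and type index 0
// is rewritten to <4 x s16> before any later rule is consulted.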
110
111static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
112 return [=](const LegalityQuery &Query) {
113 const LLT Ty = Query.Types[TypeIdx];
114 const LLT EltTy = Ty.getElementType();
115 unsigned Size = Ty.getSizeInBits();
116 unsigned Pieces = (Size + 63) / 64;
117 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
118 return std::pair(TypeIdx, LLT::scalarOrVector(
119 ElementCount::getFixed(NewNumElts), EltTy));
120 };
121}
122
123// Increase the number of vector elements to reach the next multiple of 32-bit
124// type.
125static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
126 return [=](const LegalityQuery &Query) {
127 const LLT Ty = Query.Types[TypeIdx];
128
129 const LLT EltTy = Ty.getElementType();
130 const int Size = Ty.getSizeInBits();
131 const int EltSize = EltTy.getSizeInBits();
132 const int NextMul32 = (Size + 31) / 32;
133
134 assert(EltSize < 32);
135
136 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
137 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
138 };
139}
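// Worked example for moreEltsToNext32Bit: for <3 x s8>, Size = 24 and
// EltSize = 8, so NextMul32 = 1 and NewNumElts = (32 + 7) / 8 = 4, i.e. the
// type is padded to <4 x s8> (exactly 32 bits). For <5 x s16>, NextMul32 = 3
// and NewNumElts = (96 + 15) / 16 = 6, giving <6 x s16>.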
140
141// Retrieves the scalar type that's the same size as the mem desc
143 return [=](const LegalityQuery &Query) {
144 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
145 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
146 };
147}
148
149// Increase the number of vector elements to reach the next legal RegClass.
150static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
151 return [=](const LegalityQuery &Query) {
152 const LLT Ty = Query.Types[TypeIdx];
153 const unsigned NumElts = Ty.getNumElements();
154 const unsigned EltSize = Ty.getElementType().getSizeInBits();
155 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
156
157 assert(EltSize == 32 || EltSize == 64);
158 assert(Ty.getSizeInBits() < MaxRegisterSize);
159
160 unsigned NewNumElts;
161 // Find the nearest legal RegClass that is larger than the current type.
162 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
163 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
164 break;
165 }
166 return std::pair(TypeIdx,
167 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
168 };
169}
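// Example (assuming the usual set of SGPR register class widths): a
// <13 x s32> value is 416 bits, which has no SGPR class, so it would be padded
// out to <16 x s32> (512 bits), the next width for which
// getSGPRClassForBitWidth succeeds.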
170
171static LLT getBufferRsrcScalarType(const LLT Ty) {
172 if (!Ty.isVector())
173 return LLT::scalar(128);
174 const ElementCount NumElems = Ty.getElementCount();
175 return LLT::vector(NumElems, LLT::scalar(128));
176}
177
178static LLT getBufferRsrcRegisterType(const LLT Ty) {
179 if (!Ty.isVector())
180 return LLT::fixed_vector(4, LLT::scalar(32));
181 const unsigned NumElems = Ty.getElementCount().getFixedValue();
182 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
183}
184
185static LLT getBitcastRegisterType(const LLT Ty) {
186 const unsigned Size = Ty.getSizeInBits();
187
188 if (Size <= 32) {
189 // <2 x s8> -> s16
190 // <4 x s8> -> s32
191 return LLT::scalar(Size);
192 }
193
194 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
195}
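// With the rule above, <2 x s8> maps to s16, <4 x s8> to s32, and anything
// wider than 32 bits becomes a vector of s32, e.g. <8 x s16> (128 bits) maps
// to <4 x s32>.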
196
197static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
198 return [=](const LegalityQuery &Query) {
199 const LLT Ty = Query.Types[TypeIdx];
200 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
201 };
202}
203
204static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
205 return [=](const LegalityQuery &Query) {
206 const LLT Ty = Query.Types[TypeIdx];
207 unsigned Size = Ty.getSizeInBits();
208 assert(Size % 32 == 0);
209 return std::pair(
210 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
211 };
212}
213
214static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
215 return [=](const LegalityQuery &Query) {
216 const LLT QueryTy = Query.Types[TypeIdx];
217 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
218 };
219}
220
221static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
222 return [=](const LegalityQuery &Query) {
223 const LLT QueryTy = Query.Types[TypeIdx];
224 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
225 };
226}
227
228static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
229 return [=](const LegalityQuery &Query) {
230 const LLT QueryTy = Query.Types[TypeIdx];
231 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
232 };
233}
234
235static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
236 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
237 Size <= MaxRegisterSize;
238}
239
240static bool isRegisterVectorElementType(LLT EltTy) {
241 const int EltSize = EltTy.getSizeInBits();
242 return EltSize == 16 || EltSize % 32 == 0;
243}
244
245static bool isRegisterVectorType(LLT Ty) {
246 const int EltSize = Ty.getElementType().getSizeInBits();
247 return EltSize == 32 || EltSize == 64 ||
248 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
249 EltSize == 128 || EltSize == 256;
250}
251
252// TODO: replace all uses of isRegisterType with isRegisterClassType
253static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
254 if (!isRegisterSize(ST, Ty.getSizeInBits()))
255 return false;
256
257 if (Ty.isVector())
258 return isRegisterVectorType(Ty);
259
260 return true;
261}
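// Examples: <4 x s16> is a register type (64 bits, an even number of 16-bit
// elements), while <3 x s16> is not, since 48 bits is not a multiple of 32.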
262
263// Any combination of 32 or 64-bit elements up to the maximum register size,
264// and multiples of v2s16.
265static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
266 unsigned TypeIdx) {
267 return [=, &ST](const LegalityQuery &Query) {
268 return isRegisterType(ST, Query.Types[TypeIdx]);
269 };
270}
271
272// RegisterType that doesn't have a corresponding RegClass.
273// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
274// should be removed.
275static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
276 unsigned TypeIdx) {
277 return [=, &ST](const LegalityQuery &Query) {
278 LLT Ty = Query.Types[TypeIdx];
279 return isRegisterType(ST, Ty) &&
280 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
281 };
282}
283
284static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
285 return [=](const LegalityQuery &Query) {
286 const LLT QueryTy = Query.Types[TypeIdx];
287 if (!QueryTy.isVector())
288 return false;
289 const LLT EltTy = QueryTy.getElementType();
290 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
291 };
292}
293
294constexpr LLT S1 = LLT::scalar(1);
295constexpr LLT S8 = LLT::scalar(8);
296constexpr LLT S16 = LLT::scalar(16);
297constexpr LLT S32 = LLT::scalar(32);
298constexpr LLT F32 = LLT::float32();
299constexpr LLT S64 = LLT::scalar(64);
300constexpr LLT F64 = LLT::float64();
301constexpr LLT S96 = LLT::scalar(96);
302constexpr LLT S128 = LLT::scalar(128);
303constexpr LLT S160 = LLT::scalar(160);
304constexpr LLT S192 = LLT::scalar(192);
305constexpr LLT S224 = LLT::scalar(224);
306constexpr LLT S256 = LLT::scalar(256);
307constexpr LLT S512 = LLT::scalar(512);
308constexpr LLT S1024 = LLT::scalar(1024);
310
311constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
312constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
313constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
314constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
315constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
316constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
317constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
318constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
319
320constexpr LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
321constexpr LLT V2BF16 = V2F16; // FIXME
322
323constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
324constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
325constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
326constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
327constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
328constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
329constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
330constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
331constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
332constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
333constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
334constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
335constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
336
337constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
338constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
339constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
340constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
341constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
342constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
343constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
344constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
345
346constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
347constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
348
349constexpr std::initializer_list<LLT> AllScalarTypes = {
351
352constexpr std::initializer_list<LLT> AllS16Vectors{
354
355constexpr std::initializer_list<LLT> AllS32Vectors = {
358
359constexpr std::initializer_list<LLT> AllS64Vectors = {
361
367
368// Checks whether a type is in the list of legal register types.
369static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
370 if (Ty.isPointerOrPointerVector())
371 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
372
373 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
374 is_contained(AllScalarTypes, Ty) ||
375 (ST.useRealTrue16Insts() && Ty == S16) ||
376 is_contained(AllS16Vectors, Ty);
377}
378
380 unsigned TypeIdx) {
381 return [&ST, TypeIdx](const LegalityQuery &Query) {
382 return isRegisterClassType(ST, Query.Types[TypeIdx]);
383 };
384}
385
386// If we have a truncating store or an extending load with a data size larger
387// than 32-bits, we need to reduce to a 32-bit type.
388static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
389 return [=](const LegalityQuery &Query) {
390 const LLT Ty = Query.Types[TypeIdx];
391 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
392 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
393 };
394}
395
396// If we have a truncating store or an extending load with a data size larger
397// than 32-bits and mem location is a power of 2
398static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
399 return [=](const LegalityQuery &Query) {
400 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
401 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
402 isPowerOf2_64(MemSize);
403 };
404}
405
406// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
407// handle some operations by just promoting the register during
408// selection. There are also d16 loads on GFX9+ which preserve the high bits.
409static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
410 bool IsLoad, bool IsAtomic) {
411 switch (AS) {
412 case AMDGPUAS::PRIVATE_ADDRESS:
413 // FIXME: Private element size.
414 return ST.enableFlatScratch() ? 128 : 32;
415 case AMDGPUAS::LOCAL_ADDRESS:
416 return ST.useDS128() ? 128 : 64;
421 // Treat constant and global as identical. SMRD loads are sometimes usable for
422 // global loads (ideally constant address space should be eliminated)
423 // depending on the context. Legality cannot be context dependent, but
424 // RegBankSelect can split the load as necessary depending on the pointer
425 // register bank/uniformity and if the memory is invariant or not written in a
426 // kernel.
427 return IsLoad ? 512 : 128;
428 default:
429 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
430 // if they may alias scratch depending on the subtarget. This needs to be
431 // moved to custom handling to use addressMayBeAccessedAsPrivate
432 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
433 }
434}
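// For example, without flat-scratch a private access is capped at a single
// dword (32 bits), while LDS allows 128-bit accesses only when useDS128() is
// true; global and constant loads may be as wide as 512 bits and are split
// later if the context requires it.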
435
436static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
437 const LegalityQuery &Query) {
438 const LLT Ty = Query.Types[0];
439
440 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
441 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
442
443 unsigned RegSize = Ty.getSizeInBits();
444 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
445 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
446 unsigned AS = Query.Types[1].getAddressSpace();
447
448 // All of these need to be custom lowered to cast the pointer operand.
450 return false;
451
452 // Do not handle extending vector loads.
453 if (Ty.isVector() && MemSize != RegSize)
454 return false;
455
456 // TODO: We should be able to widen loads if the alignment is high enough, but
457 // we also need to modify the memory access size.
458#if 0
459 // Accept widening loads based on alignment.
460 if (IsLoad && MemSize < Size)
461 MemSize = std::max(MemSize, Align);
462#endif
463
464 // Only 1-byte and 2-byte to 32-bit extloads are valid.
465 if (MemSize != RegSize && RegSize != 32)
466 return false;
467
468 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
469 Query.MMODescrs[0].Ordering !=
470 AtomicOrdering::NotAtomic))
471 return false;
472
473 switch (MemSize) {
474 case 8:
475 case 16:
476 case 32:
477 case 64:
478 case 128:
479 break;
480 case 96:
481 if (!ST.hasDwordx3LoadStores())
482 return false;
483 break;
484 case 256:
485 case 512:
486 // These may contextually need to be broken down.
487 break;
488 default:
489 return false;
490 }
491
492 assert(RegSize >= MemSize);
493
494 if (AlignBits < MemSize) {
495 const SITargetLowering *TLI = ST.getTargetLowering();
496 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
497 Align(AlignBits / 8)))
498 return false;
499 }
500
501 return true;
502}
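// Examples: an extending 16-bit load into s32 passes (MemSize 16, RegSize 32),
// a 96-bit access is only size-legal when the subtarget has DWORDX3
// load/stores, and a misaligned access additionally has to be accepted by
// allowsMisalignedMemoryAccessesImpl.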
503
504// The newer buffer intrinsic forms take their resource arguments as
505// pointers in address space 8, aka s128 values. However, in order to not break
506// SelectionDAG, the underlying operations have to continue to take v4i32
507// arguments. Therefore, we convert resource pointers - or vectors of them -
508// to integer values here.
509static bool hasBufferRsrcWorkaround(const LLT Ty) {
510 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
511 return true;
512 if (Ty.isVector()) {
513 const LLT ElemTy = Ty.getElementType();
514 return hasBufferRsrcWorkaround(ElemTy);
515 }
516 return false;
517}
518
519// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
520// workaround this. Eventually it should ignore the type for loads and only care
521// about the size. Return true in cases where we will workaround this for now by
522// bitcasting.
523static bool loadStoreBitcastWorkaround(const LLT Ty) {
524 if (EnableNewLegality)
525 return false;
526
527 const unsigned Size = Ty.getSizeInBits();
528 if (Ty.isPointerVector())
529 return true;
530 if (Size <= 64)
531 return false;
532 // Address space 8 pointers get their own workaround.
533 if (hasBufferRsrcWorkaround(Ty))
534 return false;
535 if (!Ty.isVector())
536 return true;
537
538 unsigned EltSize = Ty.getScalarSizeInBits();
539 return EltSize != 32 && EltSize != 64;
540}
541
542static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
543 const LLT Ty = Query.Types[0];
544 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
545 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
546}
547
548/// Return true if a load or store of the type should be lowered with a bitcast
549/// to a different type.
550static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
551 const LLT MemTy) {
552 const unsigned MemSizeInBits = MemTy.getSizeInBits();
553 const unsigned Size = Ty.getSizeInBits();
554 if (Size != MemSizeInBits)
555 return Size <= 32 && Ty.isVector();
556
558 return true;
559
560 // Don't try to handle bitcasting vector ext loads for now.
561 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
562 (Size <= 32 || isRegisterSize(ST, Size)) &&
563 !isRegisterVectorElementType(Ty.getElementType());
564}
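// Example: an <8 x s8> load (64 bits, matching memory size) is bitcast, and
// getBitcastRegisterType turns it into <2 x s32>; a <6 x s16> load is left
// alone here because s16 is a register vector element type.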
565
566/// Return true if we should legalize a load by widening an odd sized memory
567/// access up to the alignment. Note this is the case where the memory access
568/// itself changes, not the size of the result register.
569static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
570 uint64_t AlignInBits, unsigned AddrSpace,
571 unsigned Opcode) {
572 unsigned SizeInBits = MemoryTy.getSizeInBits();
573 // We don't want to widen cases that are naturally legal.
574 if (isPowerOf2_32(SizeInBits))
575 return false;
576
577 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
578 // end up widening these for a scalar load during RegBankSelect, if we don't
579 // have 96-bit scalar loads.
580 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
581 return false;
582
583 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
584 return false;
585
586 // A load is known dereferenceable up to the alignment, so it's legal to widen
587 // to it.
588 //
589 // TODO: Could check dereferenceable for less aligned cases.
590 unsigned RoundedSize = NextPowerOf2(SizeInBits);
591 if (AlignInBits < RoundedSize)
592 return false;
593
594 // Do not widen if it would introduce a slow unaligned load.
595 const SITargetLowering *TLI = ST.getTargetLowering();
596 unsigned Fast = 0;
597 return TLI->allowsMisalignedMemoryAccessesImpl(
598 RoundedSize, AddrSpace, Align(AlignInBits / 8),
599 MachineMemOperand::MOLoad, &Fast) &&
600 Fast;
601}
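// Example: a 96-bit (three dword) global load on a subtarget without DWORDX3
// load/stores, known to be 128-bit aligned, rounds up to RoundedSize = 128 and
// is widened to a single 128-bit load, provided the fast-unaligned-access
// check above also passes.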
602
603static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
604 unsigned Opcode) {
605 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
606 return false;
607
608 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
609 Query.MMODescrs[0].AlignInBits,
610 Query.Types[1].getAddressSpace(), Opcode);
611}
612
613/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
614/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
615/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
616static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
617 MachineRegisterInfo &MRI, unsigned Idx) {
618 MachineOperand &MO = MI.getOperand(Idx);
618 MachineOperand &MO = MI.getOperand(Idx);
619
620 const LLT PointerTy = MRI.getType(MO.getReg());
621
622 // Paranoidly prevent us from doing this multiple times.
623 if (!hasBufferRsrcWorkaround(PointerTy))
624 return PointerTy;
625
626 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
627 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
628 if (!PointerTy.isVector()) {
629 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
630 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
631 const LLT S32 = LLT::scalar(32);
632
633 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
634 std::array<Register, 4> VectorElems;
635 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
636 for (unsigned I = 0; I < NumParts; ++I)
637 VectorElems[I] =
638 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
639 B.buildMergeValues(MO, VectorElems);
640 MO.setReg(VectorReg);
641 return VectorTy;
642 }
643 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
644 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
645 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
646 B.buildIntToPtr(MO, Scalar);
647 MO.setReg(BitcastReg);
648
649 return VectorTy;
650}
651
652/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
653/// the form in which the value must be in order to be passed to the low-level
654/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
655/// needed in order to account for the fact that we can't define a register
656/// class for s128 without breaking SelectionDAG.
657static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
658 MachineRegisterInfo &MRI = *B.getMRI();
659 const LLT PointerTy = MRI.getType(Pointer);
660 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
661 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
662
663 if (!PointerTy.isVector()) {
664 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
665 SmallVector<Register, 4> PointerParts;
666 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
667 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
668 for (unsigned I = 0; I < NumParts; ++I)
669 PointerParts.push_back(Unmerged.getReg(I));
670 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
671 }
672 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
673 return B.buildBitcast(VectorTy, Scalar).getReg(0);
674}
675
676static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
677 unsigned Idx) {
678 MachineOperand &MO = MI.getOperand(Idx);
679
680 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
681 // Paranoidly prevent us from doing this multiple times.
682 if (!hasBufferRsrcWorkaround(PointerTy))
683 return;
684 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
685}
686
687AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
688 const GCNTargetMachine &TM)
689 : ST(ST_) {
690 using namespace TargetOpcode;
691
692 auto GetAddrSpacePtr = [&TM](unsigned AS) {
693 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
694 };
695
696 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
697 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
698 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
699 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
700 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
701 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
702 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
703 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
704 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
705 const LLT BufferStridedPtr =
706 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
707
708 const LLT CodePtr = FlatPtr;
709
710 const std::initializer_list<LLT> AddrSpaces64 = {
711 GlobalPtr, ConstantPtr, FlatPtr
712 };
713
714 const std::initializer_list<LLT> AddrSpaces32 = {
715 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
716 };
717
718 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
719
720 const std::initializer_list<LLT> FPTypesBase = {
721 S32, S64
722 };
723
724 const std::initializer_list<LLT> FPTypes16 = {
725 S32, S64, S16
726 };
727
728 const std::initializer_list<LLT> FPTypesPK16 = {
729 S32, S64, S16, V2S16
730 };
731
732 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
733
734 // s1 for VCC branches, s32 for SCC branches.
735 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
736
737 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
738 // elements for v3s16
741 .legalFor(AllS32Vectors)
743 .legalFor(AddrSpaces64)
744 .legalFor(AddrSpaces32)
745 .legalFor(AddrSpaces128)
746 .legalIf(isPointer(0))
747 .clampScalar(0, S16, S256)
749 .clampMaxNumElements(0, S32, 16)
751 .scalarize(0);
752
753 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
754 // Full set of gfx9 features.
755 if (ST.hasScalarAddSub64()) {
756 getActionDefinitionsBuilder({G_ADD, G_SUB})
757 .legalFor({S64, S32, S16, V2S16})
758 .clampMaxNumElementsStrict(0, S16, 2)
759 .scalarize(0)
760 .minScalar(0, S16)
762 .maxScalar(0, S32);
763 } else {
764 getActionDefinitionsBuilder({G_ADD, G_SUB})
765 .legalFor({S32, S16, V2S16})
766 .clampMaxNumElementsStrict(0, S16, 2)
767 .scalarize(0)
768 .minScalar(0, S16)
770 .maxScalar(0, S32);
771 }
772
773 if (ST.hasScalarSMulU64()) {
775 .legalFor({S64, S32, S16, V2S16})
776 .clampMaxNumElementsStrict(0, S16, 2)
777 .scalarize(0)
778 .minScalar(0, S16)
780 .custom();
781 } else {
783 .legalFor({S32, S16, V2S16})
784 .clampMaxNumElementsStrict(0, S16, 2)
785 .scalarize(0)
786 .minScalar(0, S16)
788 .custom();
789 }
790 assert(ST.hasMad64_32());
791
792 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
793 .legalFor({S32, S16, V2S16}) // Clamp modifier
794 .minScalarOrElt(0, S16)
796 .scalarize(0)
798 .lower();
799 } else if (ST.has16BitInsts()) {
800 getActionDefinitionsBuilder({G_ADD, G_SUB})
801 .legalFor({S32, S16})
802 .minScalar(0, S16)
804 .maxScalar(0, S32)
805 .scalarize(0);
806
808 .legalFor({S32, S16})
809 .scalarize(0)
810 .minScalar(0, S16)
812 .custom();
813 assert(ST.hasMad64_32());
814
815 // Technically the saturating operations require clamp bit support, but this
816 // was introduced at the same time as 16-bit operations.
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32, S16}) // Clamp modifier
819 .minScalar(0, S16)
820 .scalarize(0)
822 .lower();
823
824 // We're just lowering this, but it helps get a better result to try to
825 // coerce to the desired type first.
826 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
827 .minScalar(0, S16)
828 .scalarize(0)
829 .lower();
830 } else {
831 getActionDefinitionsBuilder({G_ADD, G_SUB})
832 .legalFor({S32})
833 .widenScalarToNextMultipleOf(0, 32)
834 .clampScalar(0, S32, S32)
835 .scalarize(0);
836
837 auto &Mul = getActionDefinitionsBuilder(G_MUL)
838 .legalFor({S32})
839 .scalarize(0)
840 .minScalar(0, S32)
842
843 if (ST.hasMad64_32())
844 Mul.custom();
845 else
846 Mul.maxScalar(0, S32);
847
848 if (ST.hasIntClamp()) {
849 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
850 .legalFor({S32}) // Clamp modifier.
851 .scalarize(0)
853 .lower();
854 } else {
855 // Clamp bit support was added in VI, along with 16-bit operations.
856 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
857 .minScalar(0, S32)
858 .scalarize(0)
859 .lower();
860 }
861
862 // FIXME: DAG expansion gets better results. The widening uses the smaller
863 // range values and goes for the min/max lowering directly.
864 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
865 .minScalar(0, S32)
866 .scalarize(0)
867 .lower();
868 }
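// Example of how these rules compose: on a target with 16-bit instructions, an
// s8 G_ADD first hits minScalar(0, S16) and is widened to s16, which is then
// legal; without 16-bit instructions the same operation is widened straight to
// s32.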
869
871 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
872 .customFor({S32, S64})
873 .clampScalar(0, S32, S64)
875 .scalarize(0);
876
877 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
878 .legalFor({S32})
879 .maxScalar(0, S32);
880
881 if (ST.hasVOP3PInsts()) {
882 Mulh
883 .clampMaxNumElements(0, S8, 2)
884 .lowerFor({V2S8});
885 }
886
887 Mulh
888 .scalarize(0)
889 .lower();
890
891 // Report legal for any types we can handle anywhere. For the cases only legal
892 // on the SALU, RegBankSelect will be able to re-legalize.
893 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
894 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
895 .clampScalar(0, S32, S64)
901 .scalarize(0);
902
904 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
905 .legalFor({{S32, S1}, {S32, S32}})
906 .clampScalar(0, S32, S32)
907 .scalarize(0);
908
910 // Don't worry about the size constraint.
912 .lower();
913
915 .legalFor({S1, S32, S64, S16, GlobalPtr,
916 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
917 .legalIf(isPointer(0))
918 .clampScalar(0, S32, S64)
920
921 getActionDefinitionsBuilder(G_FCONSTANT)
922 .legalFor({S32, S64, S16})
923 .clampScalar(0, S16, S64);
924
925 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
926 .legalIf(isRegisterClassType(ST, 0))
927 // s1 and s16 are special cases because they have legal operations on
928 // them, but don't really occupy registers in the normal way.
929 .legalFor({S1, S16})
930 .clampNumElements(0, V16S32, V32S32)
934 .clampMaxNumElements(0, S32, 16);
935
936 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
937
938 // If the amount is divergent, we have to do a wave reduction to get the
939 // maximum value, so this is expanded during RegBankSelect.
940 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
941 .legalFor({{PrivatePtr, S32}});
942
943 getActionDefinitionsBuilder(G_STACKSAVE)
944 .customFor({PrivatePtr});
945 getActionDefinitionsBuilder(G_STACKRESTORE)
946 .legalFor({PrivatePtr});
947
948 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
949
950 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
951 .customIf(typeIsNot(0, PrivatePtr));
952
953 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
954
955 auto &FPOpActions = getActionDefinitionsBuilder(
956 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
957 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
958 .legalFor({S32, S64});
959 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
960 .customFor({S32, S64});
961 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
962 .customFor({S32, S64});
963
964 if (ST.has16BitInsts()) {
965 if (ST.hasVOP3PInsts())
966 FPOpActions.legalFor({S16, V2S16});
967 else
968 FPOpActions.legalFor({S16});
969
970 TrigActions.customFor({S16});
971 FDIVActions.customFor({S16});
972 }
973
974 if (ST.hasPackedFP32Ops()) {
975 FPOpActions.legalFor({V2S32});
976 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
977 }
978
979 auto &MinNumMaxNumIeee =
980 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
981
982 if (ST.hasVOP3PInsts()) {
983 MinNumMaxNumIeee.legalFor(FPTypesPK16)
984 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
985 .clampMaxNumElements(0, S16, 2)
986 .clampScalar(0, S16, S64)
987 .scalarize(0);
988 } else if (ST.has16BitInsts()) {
989 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
990 } else {
991 MinNumMaxNumIeee.legalFor(FPTypesBase)
992 .clampScalar(0, S32, S64)
993 .scalarize(0);
994 }
995
996 auto &MinNumMaxNum = getActionDefinitionsBuilder(
997 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
998
999 if (ST.hasVOP3PInsts()) {
1000 MinNumMaxNum.customFor(FPTypesPK16)
1001 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1002 .clampMaxNumElements(0, S16, 2)
1003 .clampScalar(0, S16, S64)
1004 .scalarize(0);
1005 } else if (ST.has16BitInsts()) {
1006 MinNumMaxNum.customFor(FPTypes16)
1007 .clampScalar(0, S16, S64)
1008 .scalarize(0);
1009 } else {
1010 MinNumMaxNum.customFor(FPTypesBase)
1011 .clampScalar(0, S32, S64)
1012 .scalarize(0);
1013 }
1014
1015 if (ST.hasVOP3PInsts())
1016 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1017
1018 FPOpActions
1019 .scalarize(0)
1020 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1021
1022 TrigActions
1023 .scalarize(0)
1024 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1025
1026 FDIVActions
1027 .scalarize(0)
1028 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1029
1030 getActionDefinitionsBuilder({G_FNEG, G_FABS})
1031 .legalFor(FPTypesPK16)
1033 .scalarize(0)
1034 .clampScalar(0, S16, S64);
1035
1036 if (ST.has16BitInsts()) {
1038 .legalFor({S16})
1039 .customFor({S32, S64})
1040 .scalarize(0)
1041 .unsupported();
1043 .legalFor({S32, S64, S16})
1044 .scalarize(0)
1045 .clampScalar(0, S16, S64);
1046
1047 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1048 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1049 .scalarize(0)
1050 .maxScalarIf(typeIs(0, S16), 1, S16)
1051 .clampScalar(1, S32, S32)
1052 .lower();
1053
1055 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1056 .scalarize(0)
1057 .lower();
1058 } else {
1060 .customFor({S32, S64, S16})
1061 .scalarize(0)
1062 .unsupported();
1063
1064
1065 if (ST.hasFractBug()) {
1067 .customFor({S64})
1068 .legalFor({S32, S64})
1069 .scalarize(0)
1070 .clampScalar(0, S32, S64);
1071 } else {
1073 .legalFor({S32, S64})
1074 .scalarize(0)
1075 .clampScalar(0, S32, S64);
1076 }
1077
1078 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1079 .legalFor({{S32, S32}, {S64, S32}})
1080 .scalarize(0)
1081 .clampScalar(0, S32, S64)
1082 .clampScalar(1, S32, S32)
1083 .lower();
1084
1086 .customFor({{S32, S32}, {S64, S32}})
1087 .scalarize(0)
1088 .minScalar(0, S32)
1089 .clampScalar(1, S32, S32)
1090 .lower();
1091 }
1092
1093 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1094 if (ST.hasCvtPkF16F32Inst()) {
1095 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1096 .clampMaxNumElements(0, S16, 2);
1097 } else {
1098 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1099 }
1100 FPTruncActions.scalarize(0).lower();
1101
1103 .legalFor({{S64, S32}, {S32, S16}})
1104 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1105 .scalarize(0);
1106
1107 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1108 if (ST.has16BitInsts()) {
1109 FSubActions
1110 // Use actual fsub instruction
1111 .legalFor({S32, S16})
1112 // Must use fadd + fneg
1113 .lowerFor({S64, V2S16});
1114 } else {
1115 FSubActions
1116 // Use actual fsub instruction
1117 .legalFor({S32})
1118 // Must use fadd + fneg
1119 .lowerFor({S64, S16, V2S16});
1120 }
1121
1122 FSubActions
1123 .scalarize(0)
1124 .clampScalar(0, S32, S64);
1125
1126 // Whether this is legal depends on the floating point mode for the function.
1127 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1128 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1129 FMad.customFor({S32, S16});
1130 else if (ST.hasMadMacF32Insts())
1131 FMad.customFor({S32});
1132 else if (ST.hasMadF16())
1133 FMad.customFor({S16});
1134 FMad.scalarize(0)
1135 .lower();
1136
1137 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1138 if (ST.has16BitInsts()) {
1139 FRem.customFor({S16, S32, S64});
1140 } else {
1141 FRem.minScalar(0, S32)
1142 .customFor({S32, S64});
1143 }
1144 FRem.scalarize(0);
1145
1146 // TODO: Do we need to clamp maximum bitwidth?
1148 .legalIf(isScalar(0))
1149 .legalFor({{V2S16, V2S32}})
1150 .clampMaxNumElements(0, S16, 2)
1151 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1152 // situations (like an invalid implicit use), we don't want to infinite loop
1153 // in the legalizer.
1155 .alwaysLegal();
1156
1157 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1158 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1159 {S32, S1}, {S64, S1}, {S16, S1}})
1160 .scalarize(0)
1161 .clampScalar(0, S32, S64)
1162 .widenScalarToNextPow2(1, 32);
1163
1164 // TODO: Split s1->s64 during regbankselect for VALU.
1165 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1166 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1167 .lowerIf(typeIs(1, S1))
1168 .customFor({{S32, S64}, {S64, S64}});
1169 if (ST.has16BitInsts())
1170 IToFP.legalFor({{S16, S16}});
1171 IToFP.clampScalar(1, S32, S64)
1172 .minScalar(0, S32)
1173 .scalarize(0)
1175
1176 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1177 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1178 .customFor({{S64, S32}, {S64, S64}})
1179 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1180 if (ST.has16BitInsts())
1181 FPToI.legalFor({{S16, S16}});
1182 else
1183 FPToI.minScalar(1, S32);
1184
1185 FPToI.minScalar(0, S32)
1186 .widenScalarToNextPow2(0, 32)
1187 .scalarize(0)
1188 .lower();
1189
1190 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1191 .clampScalar(0, S16, S64)
1192 .scalarize(0)
1193 .lower();
1194
1195 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1196 .legalFor({S16, S32})
1197 .scalarize(0)
1198 .lower();
1199
1200 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1201 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1202 .scalarize(0)
1203 .lower();
1204
1205 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1206 .clampScalar(0, S16, S64)
1207 .scalarize(0)
1208 .lower();
1209
1210 if (ST.has16BitInsts()) {
1212 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1213 .legalFor({S16, S32, S64})
1214 .clampScalar(0, S16, S64)
1215 .scalarize(0);
1216 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1218 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1219 .legalFor({S32, S64})
1220 .clampScalar(0, S32, S64)
1221 .scalarize(0);
1222 } else {
1224 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1225 .legalFor({S32})
1226 .customFor({S64})
1227 .clampScalar(0, S32, S64)
1228 .scalarize(0);
1229 }
1230
1232 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1233 .legalIf(all(isPointer(0), sameSize(0, 1)))
1234 .scalarize(0)
1235 .scalarSameSizeAs(1, 0);
1236
1238 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1239 .scalarSameSizeAs(1, 0)
1240 .scalarize(0);
1241
1242 auto &CmpBuilder =
1244 // The compare output type differs based on the register bank of the output,
1245 // so make both s1 and s32 legal.
1246 //
1247 // Scalar compares producing output in scc will be promoted to s32, as that
1248 // is the allocatable register type that will be needed for the copy from
1249 // scc. This will be promoted during RegBankSelect, and we assume something
1250 // before that won't try to use s32 result types.
1251 //
1252 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1253 // bank.
1255 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1256 .legalForCartesianProduct(
1257 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1258 if (ST.has16BitInsts()) {
1259 CmpBuilder.legalFor({{S1, S16}});
1260 }
1261
1262 CmpBuilder
1264 .clampScalar(1, S32, S64)
1265 .scalarize(0)
1266 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1267
1268 auto &FCmpBuilder =
1270 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1271
1272 if (ST.hasSALUFloatInsts())
1273 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1274
1275 FCmpBuilder
1277 .clampScalar(1, S32, S64)
1278 .scalarize(0);
1279
1280 // FIXME: fpow has a selection pattern that should move to custom lowering.
1281 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1282 if (ST.has16BitInsts())
1283 ExpOps.customFor({{S32}, {S16}});
1284 else
1285 ExpOps.customFor({S32});
1286 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1287 .scalarize(0);
1288
1290 .clampScalar(0, MinScalarFPTy, S32)
1291 .lower();
1292
1293 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1294 Log2Ops.customFor({S32});
1295 if (ST.has16BitInsts())
1296 Log2Ops.legalFor({S16});
1297 else
1298 Log2Ops.customFor({S16});
1299 Log2Ops.scalarize(0)
1300 .lower();
1301
1302 auto &LogOps =
1303 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1304 LogOps.customFor({S32, S16});
1305 LogOps.clampScalar(0, MinScalarFPTy, S32)
1306 .scalarize(0);
1307
1308 // The 64-bit versions produce 32-bit results, but only on the SALU.
1310 .legalFor({{S32, S32}, {S32, S64}})
1311 .clampScalar(0, S32, S32)
1312 .widenScalarToNextPow2(1, 32)
1313 .clampScalar(1, S32, S64)
1314 .scalarize(0)
1315 .widenScalarToNextPow2(0, 32);
1316
1317 // If no 16 bit instr is available, lower into different instructions.
1318 if (ST.has16BitInsts())
1319 getActionDefinitionsBuilder(G_IS_FPCLASS)
1320 .legalForCartesianProduct({S1}, FPTypes16)
1321 .widenScalarToNextPow2(1)
1322 .scalarize(0)
1323 .lower();
1324 else
1325 getActionDefinitionsBuilder(G_IS_FPCLASS)
1326 .legalForCartesianProduct({S1}, FPTypesBase)
1327 .lowerFor({S1, S16})
1328 .widenScalarToNextPow2(1)
1329 .scalarize(0)
1330 .lower();
1331
1332 // The hardware instructions return a different result on 0 than the generic
1333 // instructions expect. The hardware produces -1, but these produce the
1334 // bitwidth.
1335 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1336 .scalarize(0)
1337 .clampScalar(0, S32, S32)
1338 .clampScalar(1, S32, S64)
1339 .widenScalarToNextPow2(0, 32)
1340 .widenScalarToNextPow2(1, 32)
1341 .custom();
1342
1343 // The 64-bit versions produce 32-bit results, but only on the SALU.
1344 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1345 .legalFor({{S32, S32}, {S32, S64}})
1346 .customIf(scalarNarrowerThan(1, 32))
1347 .clampScalar(0, S32, S32)
1348 .clampScalar(1, S32, S64)
1349 .scalarize(0)
1350 .widenScalarToNextPow2(0, 32)
1351 .widenScalarToNextPow2(1, 32);
1352
1353 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1354 .legalFor({{S32, S32}, {S32, S64}})
1355 .clampScalar(0, S32, S32)
1356 .clampScalar(1, S32, S64)
1357 .scalarize(0)
1358 .widenScalarToNextPow2(0, 32)
1359 .widenScalarToNextPow2(1, 32);
1360
1361 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1362 // RegBankSelect.
1363 getActionDefinitionsBuilder(G_BITREVERSE)
1364 .legalFor({S32, S64})
1365 .clampScalar(0, S32, S64)
1366 .scalarize(0)
1368
1369 if (ST.has16BitInsts()) {
1371 .legalFor({S16, S32, V2S16})
1372 .clampMaxNumElementsStrict(0, S16, 2)
1373 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1374 // narrowScalar limitation.
1376 .clampScalar(0, S16, S32)
1377 .scalarize(0);
1378
1379 if (ST.hasVOP3PInsts()) {
1381 .legalFor({S32, S16, V2S16})
1382 .clampMaxNumElements(0, S16, 2)
1383 .minScalar(0, S16)
1385 .scalarize(0)
1386 .lower();
1387 if (ST.hasIntMinMax64()) {
1388 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1389 .legalFor({S32, S16, S64, V2S16})
1390 .clampMaxNumElements(0, S16, 2)
1391 .minScalar(0, S16)
1393 .scalarize(0)
1394 .lower();
1395 } else {
1396 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1397 .legalFor({S32, S16, V2S16})
1398 .clampMaxNumElements(0, S16, 2)
1399 .minScalar(0, S16)
1401 .scalarize(0)
1402 .lower();
1403 }
1404 } else {
1405 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1406 .legalFor({S32, S16})
1407 .widenScalarToNextPow2(0)
1408 .minScalar(0, S16)
1409 .scalarize(0)
1410 .lower();
1411 }
1412 } else {
1413 // TODO: Should have same legality without v_perm_b32
1415 .legalFor({S32})
1416 .lowerIf(scalarNarrowerThan(0, 32))
1417 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1418 // narrowScalar limitation.
1420 .maxScalar(0, S32)
1421 .scalarize(0)
1422 .lower();
1423
1424 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1425 .legalFor({S32})
1426 .minScalar(0, S32)
1428 .scalarize(0)
1429 .lower();
1430 }
1431
1432 getActionDefinitionsBuilder(G_INTTOPTR)
1433 // List the common cases
1434 .legalForCartesianProduct(AddrSpaces64, {S64})
1435 .legalForCartesianProduct(AddrSpaces32, {S32})
1436 .scalarize(0)
1437 // Accept any address space as long as the size matches
1438 .legalIf(sameSize(0, 1))
1440 [](const LegalityQuery &Query) {
1441 return std::pair(
1442 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1443 })
1444 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1445 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1446 });
1447
1448 getActionDefinitionsBuilder(G_PTRTOINT)
1449 // List the common cases
1450 .legalForCartesianProduct(AddrSpaces64, {S64})
1451 .legalForCartesianProduct(AddrSpaces32, {S32})
1452 .scalarize(0)
1453 // Accept any address space as long as the size matches
1454 .legalIf(sameSize(0, 1))
1456 [](const LegalityQuery &Query) {
1457 return std::pair(
1458 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1459 })
1460 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1461 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1462 });
1463
1464 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1465 .scalarize(0)
1466 .custom();
1467
1468 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1469 bool IsLoad) -> bool {
1470 const LLT DstTy = Query.Types[0];
1471
1472 // Split vector extloads.
1473 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1474
1475 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1476 return true;
1477
1478 const LLT PtrTy = Query.Types[1];
1479 unsigned AS = PtrTy.getAddressSpace();
1480 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1481 Query.MMODescrs[0].Ordering !=
1483 return true;
1484
1485 // Catch weird sized loads that don't evenly divide into the access sizes
1486 // TODO: May be able to widen depending on alignment etc.
1487 unsigned NumRegs = (MemSize + 31) / 32;
1488 if (NumRegs == 3) {
1489 if (!ST.hasDwordx3LoadStores())
1490 return true;
1491 } else {
1492 // If the alignment allows, these should have been widened.
1493 if (!isPowerOf2_32(NumRegs))
1494 return true;
1495 }
1496
1497 return false;
1498 };
1499
1500 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1501 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1502 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1503
1504 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1505 // LDS
1506 // TODO: Unsupported flat for SI.
1507
1508 for (unsigned Op : {G_LOAD, G_STORE}) {
1509 const bool IsStore = Op == G_STORE;
1510
1511 auto &Actions = getActionDefinitionsBuilder(Op);
1512 // Explicitly list some common cases.
1513 // TODO: Does this help compile time at all?
1514 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1515 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1516 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1517 {S64, GlobalPtr, S64, GlobalAlign32},
1518 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1519 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1520 {S32, GlobalPtr, S8, GlobalAlign8},
1521 {S32, GlobalPtr, S16, GlobalAlign16},
1522
1523 {S32, LocalPtr, S32, 32},
1524 {S64, LocalPtr, S64, 32},
1525 {V2S32, LocalPtr, V2S32, 32},
1526 {S32, LocalPtr, S8, 8},
1527 {S32, LocalPtr, S16, 16},
1528 {V2S16, LocalPtr, S32, 32},
1529
1530 {S32, PrivatePtr, S32, 32},
1531 {S32, PrivatePtr, S8, 8},
1532 {S32, PrivatePtr, S16, 16},
1533 {V2S16, PrivatePtr, S32, 32},
1534
1535 {S32, ConstantPtr, S32, GlobalAlign32},
1536 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1537 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1538 {S64, ConstantPtr, S64, GlobalAlign32},
1539 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1540 Actions.legalIf(
1541 [=](const LegalityQuery &Query) -> bool {
1542 return isLoadStoreLegal(ST, Query);
1543 });
1544
1545 // The custom pointers (fat pointers, buffer resources) don't work with load
1546 // and store at this level. Fat pointers should have been lowered to
1547 // intrinsics before the translation to MIR.
1548 Actions.unsupportedIf(
1549 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1550
1551 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1552 // ptrtoint. This is needed to account for the fact that we can't have i128
1553 // as a register class for SelectionDAG reasons.
1554 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1555 return hasBufferRsrcWorkaround(Query.Types[0]);
1556 });
1557
1558 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1559 // 64-bits.
1560 //
1561 // TODO: Should generalize bitcast action into coerce, which will also cover
1562 // inserting addrspacecasts.
1563 Actions.customIf(typeIs(1, Constant32Ptr));
1564
1565 // Turn any illegal element vectors into something easier to deal
1566 // with. These will ultimately produce 32-bit scalar shifts to extract the
1567 // parts anyway.
1568 //
1569 // For odd 16-bit element vectors, prefer to split those into pieces with
1570 // 16-bit vector parts.
1571 Actions.bitcastIf(
1572 [=](const LegalityQuery &Query) -> bool {
1573 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1574 Query.MMODescrs[0].MemoryTy);
1575 }, bitcastToRegisterType(0));
1576
1577 if (!IsStore) {
1578 // Widen suitably aligned loads by loading extra bytes. The standard
1579 // legalization actions can't properly express widening memory operands.
1580 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1581 return shouldWidenLoad(ST, Query, G_LOAD);
1582 });
1583 }
1584
1585 // FIXME: load/store narrowing should be moved to lower action
1586 Actions
1587 .narrowScalarIf(
1588 [=](const LegalityQuery &Query) -> bool {
1589 return !Query.Types[0].isVector() &&
1590 needToSplitMemOp(Query, Op == G_LOAD);
1591 },
1592 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1593 const LLT DstTy = Query.Types[0];
1594 const LLT PtrTy = Query.Types[1];
1595
1596 const unsigned DstSize = DstTy.getSizeInBits();
1597 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1598
1599 // Split extloads.
1600 if (DstSize > MemSize)
1601 return std::pair(0, LLT::scalar(MemSize));
1602
1603 unsigned MaxSize = maxSizeForAddrSpace(
1604 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1605 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1606 if (MemSize > MaxSize)
1607 return std::pair(0, LLT::scalar(MaxSize));
1608
1609 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1610 return std::pair(0, LLT::scalar(Align));
1611 })
1612 .fewerElementsIf(
1613 [=](const LegalityQuery &Query) -> bool {
1614 return Query.Types[0].isVector() &&
1615 needToSplitMemOp(Query, Op == G_LOAD);
1616 },
1617 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1618 const LLT DstTy = Query.Types[0];
1619 const LLT PtrTy = Query.Types[1];
1620
1621 LLT EltTy = DstTy.getElementType();
1622 unsigned MaxSize = maxSizeForAddrSpace(
1623 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1624 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1625
1626 // FIXME: Handle widened to power of 2 results better. This ends
1627 // up scalarizing.
1628 // FIXME: 3 element stores scalarized on SI
1629
1630 // Split if it's too large for the address space.
1631 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1632 if (MemSize > MaxSize) {
1633 unsigned NumElts = DstTy.getNumElements();
1634 unsigned EltSize = EltTy.getSizeInBits();
1635
1636 if (MaxSize % EltSize == 0) {
1637 return std::pair(
1639 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1640 }
1641
1642 unsigned NumPieces = MemSize / MaxSize;
1643
1644 // FIXME: Refine when odd breakdowns handled
1645 // The scalars will need to be re-legalized.
1646 if (NumPieces == 1 || NumPieces >= NumElts ||
1647 NumElts % NumPieces != 0)
1648 return std::pair(0, EltTy);
1649
1650 return std::pair(0,
1651 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1652 }
1653
1654 // FIXME: We could probably handle weird extending loads better.
1655 if (DstTy.getSizeInBits() > MemSize)
1656 return std::pair(0, EltTy);
1657
1658 unsigned EltSize = EltTy.getSizeInBits();
1659 unsigned DstSize = DstTy.getSizeInBits();
1660 if (!isPowerOf2_32(DstSize)) {
1661 // We're probably decomposing an odd sized store. Try to split
1662 // to the widest type. TODO: Account for alignment. As-is it
1663 // should be OK, since the new parts will be further legalized.
1664 unsigned FloorSize = llvm::bit_floor(DstSize);
1665 return std::pair(
1667 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1668 }
1669
1670 // May need relegalization for the scalars.
1671 return std::pair(0, EltTy);
1672 })
1673 .minScalar(0, S32)
1674 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1676 .widenScalarToNextPow2(0)
1677 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1678 .lower();
1679 }
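// Example of the narrowing rules above: a <3 x s32> global load on a subtarget
// without DWORDX3 load/stores has NumRegs == 3, so needToSplitMemOp returns
// true; 96 bits is not a power of two, so the fewerElementsIf mutation splits
// off a <2 x s32> piece (FloorSize = 64) and the remainder is re-legalized.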
1680
1681 // FIXME: Unaligned accesses not lowered.
1682 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1683 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1684 {S32, GlobalPtr, S16, 2 * 8},
1685 {S32, LocalPtr, S8, 8},
1686 {S32, LocalPtr, S16, 16},
1687 {S32, PrivatePtr, S8, 8},
1688 {S32, PrivatePtr, S16, 16},
1689 {S32, ConstantPtr, S8, 8},
1690 {S32, ConstantPtr, S16, 2 * 8}})
1691 .legalIf(
1692 [=](const LegalityQuery &Query) -> bool {
1693 return isLoadStoreLegal(ST, Query);
1694 });
1695
1696 if (ST.hasFlatAddressSpace()) {
1697 ExtLoads.legalForTypesWithMemDesc(
1698 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1699 }
1700
1701 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1702 // 64-bits.
1703 //
1704 // TODO: Should generalize bitcast action into coerce, which will also cover
1705 // inserting addrspacecasts.
1706 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1707
1708 ExtLoads.clampScalar(0, S32, S32)
1710 .lower();
1711
1712 auto &Atomics = getActionDefinitionsBuilder(
1713 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1714 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1715 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1716 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1717 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1718 {S64, GlobalPtr}, {S64, LocalPtr},
1719 {S32, RegionPtr}, {S64, RegionPtr}});
1720 if (ST.hasFlatAddressSpace()) {
1721 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1722 }
1723
1724 // TODO: v2bf16 operations, and fat buffer pointer support.
1725 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1726 if (ST.hasLDSFPAtomicAddF32()) {
1727 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1728 if (ST.hasLdsAtomicAddF64())
1729 Atomic.legalFor({{S64, LocalPtr}});
1730 if (ST.hasAtomicDsPkAdd16Insts())
1731 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1732 }
1733 if (ST.hasAtomicFaddInsts())
1734 Atomic.legalFor({{S32, GlobalPtr}});
1735 if (ST.hasFlatAtomicFaddF32Inst())
1736 Atomic.legalFor({{S32, FlatPtr}});
1737
1738 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1739 // These are legal with some caveats, and should have undergone expansion in
1740 // the IR in most situations
1741 // TODO: Move atomic expansion into legalizer
1742 Atomic.legalFor({
1743 {S32, GlobalPtr},
1744 {S64, GlobalPtr},
1745 {S64, FlatPtr}
1746 });
1747 }
1748
1749 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1750 ST.hasAtomicBufferGlobalPkAddF16Insts())
1751 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1752 if (ST.hasAtomicGlobalPkAddBF16Inst())
1753 Atomic.legalFor({{V2BF16, GlobalPtr}});
1754 if (ST.hasAtomicFlatPkAdd16Insts())
1755 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1756
1757
1758 // Most of the legalization work here is done by AtomicExpand. We could
1759 // probably use a simpler legality rule that just assumes anything is OK.
1760 auto &AtomicFMinFMax =
1761 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1762 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1763
1764 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1765 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1766 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1767 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1768 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1769 AtomicFMinFMax.legalFor({F32, FlatPtr});
1770 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1771 AtomicFMinFMax.legalFor({F64, FlatPtr});
1772
1773 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1774 // demarshalling
1775 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1776 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1777 {S32, FlatPtr}, {S64, FlatPtr}})
1778 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1779 {S32, RegionPtr}, {S64, RegionPtr}});
1780 // TODO: Pointer types, any 32-bit or 64-bit vector
1781
1782 // Condition should be s32 for scalar, s1 for vector.
1785 LocalPtr, FlatPtr, PrivatePtr,
1786 LLT::fixed_vector(2, LocalPtr),
1787 LLT::fixed_vector(2, PrivatePtr)},
1788 {S1, S32})
1789 .clampScalar(0, S16, S64)
1790 .scalarize(1)
1793 .clampMaxNumElements(0, S32, 2)
1794 .clampMaxNumElements(0, LocalPtr, 2)
1795 .clampMaxNumElements(0, PrivatePtr, 2)
1796 .scalarize(0)
1798 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1799
1800 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1801 // be more flexible with the shift amount type.
1802 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1803 .legalFor({{S32, S32}, {S64, S32}});
1804 if (ST.has16BitInsts()) {
1805 if (ST.hasVOP3PInsts()) {
1806 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1807 .clampMaxNumElements(0, S16, 2);
1808 } else
1809 Shifts.legalFor({{S16, S16}});
1810
1811 // TODO: Support 16-bit shift amounts for all types
1812 Shifts.widenScalarIf(
1813 [=](const LegalityQuery &Query) {
1814 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1815 // 32-bit amount.
1816 const LLT ValTy = Query.Types[0];
1817 const LLT AmountTy = Query.Types[1];
1818 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1819 AmountTy.getSizeInBits() < 16;
1820 }, changeTo(1, S16));
1821 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1822 Shifts.clampScalar(1, S32, S32);
1823 Shifts.widenScalarToNextPow2(0, 16);
1824 Shifts.clampScalar(0, S16, S64);
1825
1826 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1827 .minScalar(0, S16)
1828 .scalarize(0)
1829 .lower();
1830 } else {
1831 // Make sure we legalize the shift amount type first, as the general
1832 // expansion for the shifted type will produce much worse code if it hasn't
1833 // been truncated already.
1834 Shifts.clampScalar(1, S32, S32);
1835 Shifts.widenScalarToNextPow2(0, 32);
1836 Shifts.clampScalar(0, S32, S64);
1837
1838 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1839 .minScalar(0, S32)
1840 .scalarize(0)
1841 .lower();
1842 }
1843 Shifts.scalarize(0);
1844
1845 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1846 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1847 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1848 unsigned IdxTypeIdx = 2;
1849
1850     getActionDefinitionsBuilder(Op)
1851         .customIf([=](const LegalityQuery &Query) {
1852 const LLT EltTy = Query.Types[EltTypeIdx];
1853 const LLT VecTy = Query.Types[VecTypeIdx];
1854 const LLT IdxTy = Query.Types[IdxTypeIdx];
1855 const unsigned EltSize = EltTy.getSizeInBits();
1856 const bool isLegalVecType =
1858 // Address space 8 pointers are 128-bit wide values, but the logic
1859 // below will try to bitcast them to 2N x s64, which will fail.
1860           // Therefore, as an intermediate step, ptrtoint the vector and scalar
1861           // arguments (and inttoptr the extraction result) so that the operation
1862           // becomes one on a vector of integers, which the logic below can
1863           // handle.
1864 if (EltTy.isPointer() && EltSize > 64)
1865 return true;
1866 return (EltSize == 32 || EltSize == 64) &&
1867 VecTy.getSizeInBits() % 32 == 0 &&
1868 VecTy.getSizeInBits() <= MaxRegisterSize &&
1869 IdxTy.getSizeInBits() == 32 &&
1870 isLegalVecType;
1871 })
1872 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1873 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1874 bitcastToVectorElement32(VecTypeIdx))
1875 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1876 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1877 scalarOrEltWiderThan(VecTypeIdx, 64)),
1878 [=](const LegalityQuery &Query) {
1879 // For > 64-bit element types, try to turn this into a
1880 // 64-bit element vector since we may be able to do better
1881 // indexing if this is scalar. If not, fall back to 32.
1882 const LLT EltTy = Query.Types[EltTypeIdx];
1883 const LLT VecTy = Query.Types[VecTypeIdx];
1884 const unsigned DstEltSize = EltTy.getSizeInBits();
1885 const unsigned VecSize = VecTy.getSizeInBits();
1886
1887 const unsigned TargetEltSize =
1888 DstEltSize % 64 == 0 ? 64 : 32;
1889 return std::pair(VecTypeIdx,
1890 LLT::fixed_vector(VecSize / TargetEltSize,
1891 TargetEltSize));
1892 })
1893 .clampScalar(EltTypeIdx, S32, S64)
1894 .clampScalar(VecTypeIdx, S32, S64)
1895 .clampScalar(IdxTypeIdx, S32, S32)
1896 .clampMaxNumElements(VecTypeIdx, S32, 32)
1897 // TODO: Clamp elements for 64-bit vectors?
1898 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1900 // It should only be necessary with variable indexes.
1901 // As a last resort, lower to the stack
1902 .lower();
1903 }
1904
1905 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1906 .unsupportedIf([=](const LegalityQuery &Query) {
1907 const LLT &EltTy = Query.Types[1].getElementType();
1908 return Query.Types[0] != EltTy;
1909 });
1910
1911 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1912 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1913 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1914
1915 // FIXME: Doesn't handle extract of illegal sizes.
1916     getActionDefinitionsBuilder(Op)
1917         .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1918 .lowerIf([=](const LegalityQuery &Query) {
1919         // Sub-vector (or single element) insert and extract.
1920 // TODO: verify immediate offset here since lower only works with
1921 // whole elements.
1922 const LLT BigTy = Query.Types[BigTyIdx];
1923 return BigTy.isVector();
1924 })
1925 // FIXME: Multiples of 16 should not be legal.
1926 .legalIf([=](const LegalityQuery &Query) {
1927 const LLT BigTy = Query.Types[BigTyIdx];
1928 const LLT LitTy = Query.Types[LitTyIdx];
1929 return (BigTy.getSizeInBits() % 32 == 0) &&
1930 (LitTy.getSizeInBits() % 16 == 0);
1931 })
1932 .widenScalarIf(
1933 [=](const LegalityQuery &Query) {
1934 const LLT BigTy = Query.Types[BigTyIdx];
1935 return (BigTy.getScalarSizeInBits() < 16);
1936 },
1938 .widenScalarIf(
1939 [=](const LegalityQuery &Query) {
1940 const LLT LitTy = Query.Types[LitTyIdx];
1941 return (LitTy.getScalarSizeInBits() < 16);
1942 },
1944 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1945 .widenScalarToNextPow2(BigTyIdx, 32);
1946
1947 }
1948
1949 auto &BuildVector =
1950 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1952 .legalForCartesianProduct(AllS64Vectors, {S64})
1953 .clampNumElements(0, V16S32, V32S32)
1958
1959 if (ST.hasScalarPackInsts()) {
1960 BuildVector
1961 // FIXME: Should probably widen s1 vectors straight to s32
1962 .minScalarOrElt(0, S16)
1963 .minScalar(1, S16);
1964
1965 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1966 .legalFor({V2S16, S32})
1967 .lower();
1968 } else {
1969 BuildVector.customFor({V2S16, S16});
1970 BuildVector.minScalarOrElt(0, S32);
1971
1972 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1973 .customFor({V2S16, S32})
1974 .lower();
1975 }
1976
1977 BuildVector.legalIf(isRegisterType(ST, 0));
1978
1979 // FIXME: Clamp maximum size
1980 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1981 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1982 .clampMaxNumElements(0, S32, 32)
1983 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1984 .clampMaxNumElements(0, S16, 64);
1985
1986 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1987
1988 // Merge/Unmerge
1989 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1990 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1991 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1992
1993 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1994 const LLT Ty = Query.Types[TypeIdx];
1995 if (Ty.isVector()) {
1996 const LLT &EltTy = Ty.getElementType();
1997 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1998 return true;
2000 return true;
2001 }
2002 return false;
2003 };
2004
2005 auto &Builder =
2006       getActionDefinitionsBuilder(Op)
2007           .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2008 .lowerFor({{S16, V2S16}})
2009 .lowerIf([=](const LegalityQuery &Query) {
2010 const LLT BigTy = Query.Types[BigTyIdx];
2011 return BigTy.getSizeInBits() == 32;
2012 })
2013 // Try to widen to s16 first for small types.
2014 // TODO: Only do this on targets with legal s16 shifts
2015 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2016 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2017           .moreElementsIf(isSmallOddVector(BigTyIdx),
2018                           oneMoreElement(BigTyIdx))
2020 elementTypeIs(1, S16)),
2021 changeTo(1, V2S16))
2022 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2023 // not worth considering the multiples of 64 since 2*192 and 2*384
2024 // are not valid.
2025 .clampScalar(LitTyIdx, S32, S512)
2026 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2027 // Break up vectors with weird elements into scalars
2029 [=](const LegalityQuery &Query) {
2030 return notValidElt(Query, LitTyIdx);
2031 },
2032 scalarize(0))
2033 .fewerElementsIf(
2034 [=](const LegalityQuery &Query) {
2035 return notValidElt(Query, BigTyIdx);
2036 },
2037 scalarize(1))
2038 .clampScalar(BigTyIdx, S32, MaxScalar);
2039
2040 if (Op == G_MERGE_VALUES) {
2041 Builder.widenScalarIf(
2042 // TODO: Use 16-bit shifts if legal for 8-bit values?
2043 [=](const LegalityQuery &Query) {
2044 const LLT Ty = Query.Types[LitTyIdx];
2045 return Ty.getSizeInBits() < 32;
2046 },
2047 changeTo(LitTyIdx, S32));
2048 }
2049
2050 Builder.widenScalarIf(
2051 [=](const LegalityQuery &Query) {
2052 const LLT Ty = Query.Types[BigTyIdx];
2053 return Ty.getSizeInBits() % 16 != 0;
2054 },
2055 [=](const LegalityQuery &Query) {
2056             // Round up to the next power of 2; once that is 256 bits or more,
2057             // use the next multiple of 64 instead if it is smaller.
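            // For example: a 90-bit type widens to 128 bits (next power of 2),
            // while a 270-bit type widens to 320 bits (next multiple of 64)
            // rather than 512.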
2058 const LLT &Ty = Query.Types[BigTyIdx];
2059 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2060 if (NewSizeInBits >= 256) {
2061 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2062 if (RoundedTo < NewSizeInBits)
2063 NewSizeInBits = RoundedTo;
2064 }
2065 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2066 })
2067 // Any vectors left are the wrong size. Scalarize them.
2068 .scalarize(0)
2069 .scalarize(1);
2070 }
2071
2072 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2073 // RegBankSelect.
2074 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2075 .legalFor({{S32}, {S64}})
2076 .clampScalar(0, S32, S64);
2077
2078 if (ST.hasVOP3PInsts()) {
2079 SextInReg.lowerFor({{V2S16}})
2080 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2081 // get more vector shift opportunities, since we'll get those when
2082 // expanded.
2083 .clampMaxNumElementsStrict(0, S16, 2);
2084 } else if (ST.has16BitInsts()) {
2085 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2086 } else {
2087 // Prefer to promote to s32 before lowering if we don't have 16-bit
2088     // shifts. This avoids a lot of intermediate truncate and extend operations.
2089 SextInReg.lowerFor({{S32}, {S64}});
2090 }
2091
2092 SextInReg
2093 .scalarize(0)
2094 .clampScalar(0, S32, S64)
2095 .lower();
2096
2097 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2098 .scalarize(0)
2099 .lower();
2100
2101 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2102 FSHRActionDefs.legalFor({{S32, S32}})
2103 .clampMaxNumElementsStrict(0, S16, 2);
2104 if (ST.hasVOP3PInsts())
2105 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2106 FSHRActionDefs.scalarize(0).lower();
2107
2108 if (ST.hasVOP3PInsts()) {
2109     getActionDefinitionsBuilder(G_FSHL)
2110         .lowerFor({{V2S16, V2S16}})
2111 .clampMaxNumElementsStrict(0, S16, 2)
2112 .scalarize(0)
2113 .lower();
2114 } else {
2115     getActionDefinitionsBuilder(G_FSHL)
2116         .scalarize(0)
2117 .lower();
2118 }
2119
2120 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2121 .legalFor({S64});
2122
2123 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2124
2126 .alwaysLegal();
2127
2128 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2129 .scalarize(0)
2130 .minScalar(0, S32)
2131 .lower();
2132
2133 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2134 .legalFor({{S32, S32}, {S64, S32}})
2135 .clampScalar(1, S32, S32)
2136 .clampScalar(0, S32, S64)
2138 .scalarize(0);
2139
2140   getActionDefinitionsBuilder(
2141       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2142 G_FCOPYSIGN,
2143
2144 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2145 G_READ_REGISTER, G_WRITE_REGISTER,
2146
2147 G_SADDO, G_SSUBO})
2148 .lower();
2149
2150 if (ST.hasIEEEMinimumMaximumInsts()) {
2151 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2152 .legalFor(FPTypesPK16)
2153 .clampMaxNumElements(0, S16, 2)
2154 .scalarize(0);
2155 } else if (ST.hasVOP3PInsts()) {
2156 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2157 .lowerFor({V2S16})
2158 .clampMaxNumElementsStrict(0, S16, 2)
2159 .scalarize(0)
2160 .lower();
2161 } else {
2162 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2163 .scalarize(0)
2164 .clampScalar(0, S32, S64)
2165 .lower();
2166 }
2167
2168 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2169 .lower();
2170
2171 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2172
2173 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2174 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2175 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2176 .unsupported();
2177
2179
2180   getActionDefinitionsBuilder(
2181       {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2182 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2183 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2184 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2185 .legalFor(AllVectors)
2186 .scalarize(1)
2187 .lower();
2188
2189   getLegacyLegalizerInfo().computeTables();
2190   verify(*ST.getInstrInfo());
2191}
2192
2193bool AMDGPULegalizerInfo::legalizeCustom(
2194     LegalizerHelper &Helper, MachineInstr &MI,
2195     LostDebugLocObserver &LocObserver) const {
2196 MachineIRBuilder &B = Helper.MIRBuilder;
2197 MachineRegisterInfo &MRI = *B.getMRI();
2198
2199 switch (MI.getOpcode()) {
2200 case TargetOpcode::G_ADDRSPACE_CAST:
2201 return legalizeAddrSpaceCast(MI, MRI, B);
2202 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2203 return legalizeFroundeven(MI, MRI, B);
2204 case TargetOpcode::G_FCEIL:
2205 return legalizeFceil(MI, MRI, B);
2206 case TargetOpcode::G_FREM:
2207 return legalizeFrem(MI, MRI, B);
2208 case TargetOpcode::G_INTRINSIC_TRUNC:
2209 return legalizeIntrinsicTrunc(MI, MRI, B);
2210 case TargetOpcode::G_SITOFP:
2211 return legalizeITOFP(MI, MRI, B, true);
2212 case TargetOpcode::G_UITOFP:
2213 return legalizeITOFP(MI, MRI, B, false);
2214 case TargetOpcode::G_FPTOSI:
2215 return legalizeFPTOI(MI, MRI, B, true);
2216 case TargetOpcode::G_FPTOUI:
2217 return legalizeFPTOI(MI, MRI, B, false);
2218 case TargetOpcode::G_FMINNUM:
2219 case TargetOpcode::G_FMAXNUM:
2220 case TargetOpcode::G_FMINIMUMNUM:
2221 case TargetOpcode::G_FMAXIMUMNUM:
2222 return legalizeMinNumMaxNum(Helper, MI);
2223 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2224 return legalizeExtractVectorElt(MI, MRI, B);
2225 case TargetOpcode::G_INSERT_VECTOR_ELT:
2226 return legalizeInsertVectorElt(MI, MRI, B);
2227 case TargetOpcode::G_FSIN:
2228 case TargetOpcode::G_FCOS:
2229 return legalizeSinCos(MI, MRI, B);
2230 case TargetOpcode::G_GLOBAL_VALUE:
2231 return legalizeGlobalValue(MI, MRI, B);
2232 case TargetOpcode::G_LOAD:
2233 case TargetOpcode::G_SEXTLOAD:
2234 case TargetOpcode::G_ZEXTLOAD:
2235 return legalizeLoad(Helper, MI);
2236 case TargetOpcode::G_STORE:
2237 return legalizeStore(Helper, MI);
2238 case TargetOpcode::G_FMAD:
2239 return legalizeFMad(MI, MRI, B);
2240 case TargetOpcode::G_FDIV:
2241 return legalizeFDIV(MI, MRI, B);
2242 case TargetOpcode::G_FFREXP:
2243 return legalizeFFREXP(MI, MRI, B);
2244 case TargetOpcode::G_FSQRT:
2245 return legalizeFSQRT(MI, MRI, B);
2246 case TargetOpcode::G_UDIV:
2247 case TargetOpcode::G_UREM:
2248 case TargetOpcode::G_UDIVREM:
2249 return legalizeUnsignedDIV_REM(MI, MRI, B);
2250 case TargetOpcode::G_SDIV:
2251 case TargetOpcode::G_SREM:
2252 case TargetOpcode::G_SDIVREM:
2253 return legalizeSignedDIV_REM(MI, MRI, B);
2254 case TargetOpcode::G_ATOMIC_CMPXCHG:
2255 return legalizeAtomicCmpXChg(MI, MRI, B);
2256 case TargetOpcode::G_FLOG2:
2257 return legalizeFlog2(MI, B);
2258 case TargetOpcode::G_FLOG:
2259 case TargetOpcode::G_FLOG10:
2260 return legalizeFlogCommon(MI, B);
2261 case TargetOpcode::G_FEXP2:
2262 return legalizeFExp2(MI, B);
2263 case TargetOpcode::G_FEXP:
2264 case TargetOpcode::G_FEXP10:
2265 return legalizeFExp(MI, B);
2266 case TargetOpcode::G_FPOW:
2267 return legalizeFPow(MI, B);
2268 case TargetOpcode::G_FFLOOR:
2269 return legalizeFFloor(MI, MRI, B);
2270 case TargetOpcode::G_BUILD_VECTOR:
2271 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2272 return legalizeBuildVector(MI, MRI, B);
2273 case TargetOpcode::G_MUL:
2274 return legalizeMul(Helper, MI);
2275 case TargetOpcode::G_CTLZ:
2276 case TargetOpcode::G_CTTZ:
2277 return legalizeCTLZ_CTTZ(MI, MRI, B);
2278 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2279 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2280 case TargetOpcode::G_STACKSAVE:
2281 return legalizeStackSave(MI, B);
2282 case TargetOpcode::G_GET_FPENV:
2283 return legalizeGetFPEnv(MI, MRI, B);
2284 case TargetOpcode::G_SET_FPENV:
2285 return legalizeSetFPEnv(MI, MRI, B);
2286 case TargetOpcode::G_TRAP:
2287 return legalizeTrap(MI, MRI, B);
2288 case TargetOpcode::G_DEBUGTRAP:
2289 return legalizeDebugTrap(MI, MRI, B);
2290 default:
2291 return false;
2292 }
2293
2294 llvm_unreachable("expected switch to return");
2295}
2296
2297Register AMDGPULegalizerInfo::getSegmentAperture(
2298     unsigned AS,
2299     MachineRegisterInfo &MRI,
2300     MachineIRBuilder &B) const {
2301 MachineFunction &MF = B.getMF();
2302 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2303 const LLT S32 = LLT::scalar(32);
2304 const LLT S64 = LLT::scalar(64);
2305
2307
2308 if (ST.hasApertureRegs()) {
2309 // Note: this register is somewhat broken. When used as a 32-bit operand,
2310 // it only returns zeroes. The real value is in the upper 32 bits.
2311     // Thus, we must emit an extract of the high 32 bits.
2312 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2313 ? AMDGPU::SRC_SHARED_BASE
2314 : AMDGPU::SRC_PRIVATE_BASE;
2315 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2316 !ST.hasGloballyAddressableScratch()) &&
2317 "Cannot use src_private_base with globally addressable scratch!");
2318 Register Dst = MRI.createGenericVirtualRegister(S64);
2319 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2320 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2321 return B.buildUnmerge(S32, Dst).getReg(1);
2322 }
2323
2324 // TODO: can we be smarter about machine pointer info?
2326 Register LoadAddr = MRI.createGenericVirtualRegister(
2328 // For code object version 5, private_base and shared_base are passed through
2329 // implicit kernargs.
2336 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2337
2338 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2340
2341 if (!loadInputValue(KernargPtrReg, B,
2343 return Register();
2344
2346 PtrInfo,
2350
2351 // Pointer address
2352 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2353 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2354 // Load address
2355 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2356 }
2357
2358 Register QueuePtr = MRI.createGenericVirtualRegister(
2360
2362 return Register();
2363
2364 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2365 // private_segment_aperture_base_hi.
2366 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2367
2369 PtrInfo,
2372 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2373
2374 B.buildObjectPtrOffset(
2375 LoadAddr, QueuePtr,
2376 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2377 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2378}
2379
2380/// Return true if the value is a known valid address, such that a null check is
2381/// not necessary.
2382static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2383                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2384 MachineInstr *Def = MRI.getVRegDef(Val);
2385 switch (Def->getOpcode()) {
2386 case AMDGPU::G_FRAME_INDEX:
2387 case AMDGPU::G_GLOBAL_VALUE:
2388 case AMDGPU::G_BLOCK_ADDR:
2389 return true;
2390 case AMDGPU::G_CONSTANT: {
2391 const ConstantInt *CI = Def->getOperand(1).getCImm();
2392 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2393 }
2394 default:
2395 return false;
2396 }
2397
2398 return false;
2399}
2400
2401bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2402     MachineInstr &MI, MachineRegisterInfo &MRI,
2403     MachineIRBuilder &B) const {
2404 MachineFunction &MF = B.getMF();
2405
2406 // MI can either be a G_ADDRSPACE_CAST or a
2407 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2408 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2409 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2410 Intrinsic::amdgcn_addrspacecast_nonnull));
2411
2412 const LLT S32 = LLT::scalar(32);
2413 Register Dst = MI.getOperand(0).getReg();
2414 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2415 : MI.getOperand(1).getReg();
2416 LLT DstTy = MRI.getType(Dst);
2417 LLT SrcTy = MRI.getType(Src);
2418 unsigned DestAS = DstTy.getAddressSpace();
2419 unsigned SrcAS = SrcTy.getAddressSpace();
2420
2421 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2422 // vector element.
2423 assert(!DstTy.isVector());
2424
2425 const AMDGPUTargetMachine &TM
2426 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2427
2428 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2429 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2430 return true;
2431 }
2432
2433 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2434 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2435 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2436 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2437 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2438 ST.hasGloballyAddressableScratch()) {
2439 // flat -> private with globally addressable scratch: subtract
2440 // src_flat_scratch_base_lo.
2441 const LLT S32 = LLT::scalar(32);
2442 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2443 Register FlatScratchBaseLo =
2444 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2445 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2446 .getReg(0);
2447 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2448 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2449 return B.buildIntToPtr(Dst, Sub).getReg(0);
2450 }
2451
2452 // Extract low 32-bits of the pointer.
2453 return B.buildExtract(Dst, Src, 0).getReg(0);
2454 };
2455
2456 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2457 // G_ADDRSPACE_CAST we need to guess.
2458 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2459 castFlatToLocalOrPrivate(Dst);
2460 MI.eraseFromParent();
2461 return true;
2462 }
2463
2464 unsigned NullVal = TM.getNullPointerValue(DestAS);
2465
2466 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2467 auto FlatNull = B.buildConstant(SrcTy, 0);
2468
2469 // Extract low 32-bits of the pointer.
2470 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2471
2472 auto CmpRes =
2473 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2474 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2475
2476 MI.eraseFromParent();
2477 return true;
2478 }
2479
2480 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2481 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2482 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2483 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2484 // Coerce the type of the low half of the result so we can use
2485 // merge_values.
2486 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2487
2488 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2489 ST.hasGloballyAddressableScratch()) {
2490 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2491 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2492 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2493 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2494 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2495 .addUse(AllOnes)
2496 .addUse(ThreadID)
2497 .getReg(0);
2498 if (ST.isWave64()) {
2499 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2500 .addUse(AllOnes)
2501 .addUse(ThreadID)
2502 .getReg(0);
2503 }
2504 Register ShAmt =
2505 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2506 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2507 Register CvtPtr =
2508 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2509 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2510 // 64-bit hi:lo value.
2511 Register FlatScratchBase =
2512 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2513 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2514 .getReg(0);
2515 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2516 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2517 }
2518
2519 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2520 if (!ApertureReg.isValid())
2521 return false;
2522
2523 // TODO: Should we allow mismatched types but matching sizes in merges to
2524 // avoid the ptrtoint?
2525 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2526 };
2527
2528 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2529 // G_ADDRSPACE_CAST we need to guess.
2530 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2531 castLocalOrPrivateToFlat(Dst);
2532 MI.eraseFromParent();
2533 return true;
2534 }
2535
2536 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2537
2538 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2539 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2540
2541 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2542 SegmentNull.getReg(0));
2543
2544 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2545
2546 MI.eraseFromParent();
2547 return true;
2548 }
2549
2550 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2551 SrcTy.getSizeInBits() == 64) {
2552 // Truncate.
2553 B.buildExtract(Dst, Src, 0);
2554 MI.eraseFromParent();
2555 return true;
2556 }
2557
2558 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2559 DstTy.getSizeInBits() == 64) {
2561 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2562 auto PtrLo = B.buildPtrToInt(S32, Src);
2563 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2564 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2565 MI.eraseFromParent();
2566 return true;
2567 }
2568
2569 // Invalid casts are poison.
2570 // TODO: Should return poison
2571 B.buildUndef(Dst);
2572 MI.eraseFromParent();
2573 return true;
2574}
2575
2578 MachineIRBuilder &B) const {
2579 Register Src = MI.getOperand(1).getReg();
2580 LLT Ty = MRI.getType(Src);
2581 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2582
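  // Rounding trick: adding copysign(2^52, src) forces the hardware to round
  // src to an integer (every double >= 2^52 is integral), and subtracting the
  // same constant back recovers that integer with round-to-nearest-even in the
  // default rounding mode. Inputs with |src| > 0x1.fffffffffffffp+51 are
  // already integral, so the select below returns them unchanged.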
2583 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2584 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2585
2586 auto C1 = B.buildFConstant(Ty, C1Val);
2587 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2588
2589 // TODO: Should this propagate fast-math-flags?
2590 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2591 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2592
2593 auto C2 = B.buildFConstant(Ty, C2Val);
2594 auto Fabs = B.buildFAbs(Ty, Src);
2595
2596 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2597 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2598 MI.eraseFromParent();
2599 return true;
2600}
2601
2604 MachineIRBuilder &B) const {
2605
2606 const LLT S1 = LLT::scalar(1);
2607 const LLT S64 = LLT::scalar(64);
2608
2609 Register Src = MI.getOperand(1).getReg();
2610 assert(MRI.getType(Src) == S64);
2611
2612 // result = trunc(src)
2613 // if (src > 0.0 && src != result)
2614 // result += 1.0
2615
2616 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2617
2618 const auto Zero = B.buildFConstant(S64, 0.0);
2619 const auto One = B.buildFConstant(S64, 1.0);
2620 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2621 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2622 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2623 auto Add = B.buildSelect(S64, And, One, Zero);
2624
2625 // TODO: Should this propagate fast-math-flags?
2626 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2627 MI.eraseFromParent();
2628 return true;
2629}
2630
2633 MachineIRBuilder &B) const {
2634 Register DstReg = MI.getOperand(0).getReg();
2635 Register Src0Reg = MI.getOperand(1).getReg();
2636 Register Src1Reg = MI.getOperand(2).getReg();
2637 auto Flags = MI.getFlags();
2638 LLT Ty = MRI.getType(DstReg);
2639
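  // A minimal sketch of this expansion: rem = x - trunc(x / y) * y, computed
  // below as fma(-trunc(x / y), y, x).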
2640 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2641 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2642 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2643 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2644 MI.eraseFromParent();
2645 return true;
2646}
2647
2650 const unsigned FractBits = 52;
2651 const unsigned ExpBits = 11;
2652 LLT S32 = LLT::scalar(32);
2653
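  // In an IEEE-754 double the exponent field occupies bits [62:52], i.e. bits
  // [30:20] of the high 32-bit word, so ubfe(hi, 20, 11) minus the bias (1023)
  // yields the unbiased exponent.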
2654 auto Const0 = B.buildConstant(S32, FractBits - 32);
2655 auto Const1 = B.buildConstant(S32, ExpBits);
2656
2657 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2658 .addUse(Hi)
2659 .addUse(Const0.getReg(0))
2660 .addUse(Const1.getReg(0));
2661
2662 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2663}
2664
2667 MachineIRBuilder &B) const {
2668 const LLT S1 = LLT::scalar(1);
2669 const LLT S32 = LLT::scalar(32);
2670 const LLT S64 = LLT::scalar(64);
2671
2672 Register Src = MI.getOperand(1).getReg();
2673 assert(MRI.getType(Src) == S64);
2674
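  // Rough shape of the expansion below: compute the unbiased exponent from the
  // high word, shift the fraction mask right by it to find the bits below the
  // binary point, and clear those bits from the source. An exponent < 0 means
  // |src| < 1, so only the sign bit survives; an exponent > 51 means the value
  // is already integral and is returned unchanged.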
2675 // TODO: Should this use extract since the low half is unused?
2676 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2677 Register Hi = Unmerge.getReg(1);
2678
2679 // Extract the upper half, since this is where we will find the sign and
2680 // exponent.
2681 auto Exp = extractF64Exponent(Hi, B);
2682
2683 const unsigned FractBits = 52;
2684
2685 // Extract the sign bit.
2686 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2687 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2688
2689 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2690
2691 const auto Zero32 = B.buildConstant(S32, 0);
2692
2693 // Extend back to 64-bits.
2694 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2695
2696 auto Shr = B.buildAShr(S64, FractMask, Exp);
2697 auto Not = B.buildNot(S64, Shr);
2698 auto Tmp0 = B.buildAnd(S64, Src, Not);
2699 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2700
2701 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2702 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2703
2704 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2705 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2706 MI.eraseFromParent();
2707 return true;
2708}
2709
2712 MachineIRBuilder &B, bool Signed) const {
2713
2714 Register Dst = MI.getOperand(0).getReg();
2715 Register Src = MI.getOperand(1).getReg();
2716
2717 const LLT S64 = LLT::scalar(64);
2718 const LLT S32 = LLT::scalar(32);
2719
2720 assert(MRI.getType(Src) == S64);
2721
2722 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2723 auto ThirtyTwo = B.buildConstant(S32, 32);
2724
2725 if (MRI.getType(Dst) == S64) {
2726 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2727 : B.buildUITOFP(S64, Unmerge.getReg(1));
2728
2729 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2730 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2731
2732 // TODO: Should this propagate fast-math-flags?
2733 B.buildFAdd(Dst, LdExp, CvtLo);
2734 MI.eraseFromParent();
2735 return true;
2736 }
2737
2738 assert(MRI.getType(Dst) == S32);
2739
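  // Sketch of the 64-bit integer to f32 path below: normalize the value so its
  // significant bits land in the high word (remembering the shift amount),
  // convert that word, and rescale with ldexp(32 - shift). The low-word bits
  // folded in through Adjust only act as a sticky bit for correct rounding.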
2740 auto One = B.buildConstant(S32, 1);
2741
2742 MachineInstrBuilder ShAmt;
2743 if (Signed) {
2744 auto ThirtyOne = B.buildConstant(S32, 31);
2745 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2746 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2747 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2748 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2749 .addUse(Unmerge.getReg(1));
2750 auto LS2 = B.buildSub(S32, LS, One);
2751 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2752 } else
2753 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2754 auto Norm = B.buildShl(S64, Src, ShAmt);
2755 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2756 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2757 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2758 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2759 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2760 B.buildFLdexp(Dst, FVal, Scale);
2761 MI.eraseFromParent();
2762 return true;
2763}
2764
2765// TODO: Copied from DAG implementation. Verify logic and document how this
2766// actually works.
2770 bool Signed) const {
2771
2772 Register Dst = MI.getOperand(0).getReg();
2773 Register Src = MI.getOperand(1).getReg();
2774
2775 const LLT S64 = LLT::scalar(64);
2776 const LLT S32 = LLT::scalar(32);
2777
2778 const LLT SrcLT = MRI.getType(Src);
2779 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2780
2781 unsigned Flags = MI.getFlags();
2782
2783 // The basic idea of converting a floating point number into a pair of 32-bit
2784 // integers is illustrated as follows:
2785 //
2786 // tf := trunc(val);
2787 // hif := floor(tf * 2^-32);
2788 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2789 // hi := fptoi(hif);
2790 // lo := fptoi(lof);
2791 //
2792 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2794 if (Signed && SrcLT == S32) {
2795     // However, a 32-bit floating point number has only a 23-bit mantissa,
2796     // which is not enough to hold all the significant bits of `lof` if val is
2797     // negative. To avoid the loss of precision, we need to take the absolute
2798 // value after truncating and flip the result back based on the original
2799 // signedness.
2800 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2801 Trunc = B.buildFAbs(S32, Trunc, Flags);
2802 }
2803 MachineInstrBuilder K0, K1;
2804 if (SrcLT == S64) {
2805 K0 = B.buildFConstant(
2806 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2807 K1 = B.buildFConstant(
2808 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2809 } else {
2810 K0 = B.buildFConstant(
2811 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2812 K1 = B.buildFConstant(
2813 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2814 }
2815
2816 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2817 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2818 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2819
2820 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2821 : B.buildFPTOUI(S32, FloorMul);
2822 auto Lo = B.buildFPTOUI(S32, Fma);
2823
2824 if (Signed && SrcLT == S32) {
2825 // Flip the result based on the signedness, which is either all 0s or 1s.
2826 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2827 // r := xor({lo, hi}, sign) - sign;
2828 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2829 Sign);
2830 } else
2831 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2832 MI.eraseFromParent();
2833
2834 return true;
2835}
2836
2838 MachineInstr &MI) const {
2839 MachineFunction &MF = Helper.MIRBuilder.getMF();
2840   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2841
2842 // With ieee_mode disabled, the instructions have the correct behavior.
2843 if (!MFI->getMode().IEEE)
2844 return true;
2845
2846   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2847}
2848
2851 MachineIRBuilder &B) const {
2852 // TODO: Should move some of this into LegalizerHelper.
2853
2854 // TODO: Promote dynamic indexing of s16 to s32
2855
2856 Register Dst = MI.getOperand(0).getReg();
2857 Register Vec = MI.getOperand(1).getReg();
2858
2859 LLT VecTy = MRI.getType(Vec);
2860 LLT EltTy = VecTy.getElementType();
2861 assert(EltTy == MRI.getType(Dst));
2862
2863 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2864   // but we can't go directly to that logic because you can't bitcast a vector
2865 // of pointers to a vector of integers. Therefore, introduce an intermediate
2866 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2867 // drive the legalization forward.
2868 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2869 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2870 LLT IntVecTy = VecTy.changeElementType(IntTy);
2871
2872 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2873 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2874 B.buildIntToPtr(Dst, IntElt);
2875
2876 MI.eraseFromParent();
2877 return true;
2878 }
2879
2880 // FIXME: Artifact combiner probably should have replaced the truncated
2881 // constant before this, so we shouldn't need
2882 // getIConstantVRegValWithLookThrough.
2883 std::optional<ValueAndVReg> MaybeIdxVal =
2884 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2885 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2886 return true;
2887 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2888
2889 if (IdxVal < VecTy.getNumElements()) {
2890 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2891 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2892 } else {
2893 B.buildUndef(Dst);
2894 }
2895
2896 MI.eraseFromParent();
2897 return true;
2898}
2899
2902 MachineIRBuilder &B) const {
2903 // TODO: Should move some of this into LegalizerHelper.
2904
2905 // TODO: Promote dynamic indexing of s16 to s32
2906
2907 Register Dst = MI.getOperand(0).getReg();
2908 Register Vec = MI.getOperand(1).getReg();
2909 Register Ins = MI.getOperand(2).getReg();
2910
2911 LLT VecTy = MRI.getType(Vec);
2912 LLT EltTy = VecTy.getElementType();
2913 assert(EltTy == MRI.getType(Ins));
2914
2915 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2916   // but we can't go directly to that logic because you can't bitcast a vector
2917 // of pointers to a vector of integers. Therefore, make the pointer vector
2918 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2919 // new value, and then inttoptr the result vector back. This will then allow
2920 // the rest of legalization to take over.
2921 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2922 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2923 LLT IntVecTy = VecTy.changeElementType(IntTy);
2924
2925 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2926 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2927 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2928 MI.getOperand(3));
2929 B.buildIntToPtr(Dst, IntVecDest);
2930 MI.eraseFromParent();
2931 return true;
2932 }
2933
2934 // FIXME: Artifact combiner probably should have replaced the truncated
2935 // constant before this, so we shouldn't need
2936 // getIConstantVRegValWithLookThrough.
2937 std::optional<ValueAndVReg> MaybeIdxVal =
2938 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2939 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2940 return true;
2941
2942 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2943
2944 unsigned NumElts = VecTy.getNumElements();
2945 if (IdxVal < NumElts) {
2947 for (unsigned i = 0; i < NumElts; ++i)
2948 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2949 B.buildUnmerge(SrcRegs, Vec);
2950
2951 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2952 B.buildMergeLikeInstr(Dst, SrcRegs);
2953 } else {
2954 B.buildUndef(Dst);
2955 }
2956
2957 MI.eraseFromParent();
2958 return true;
2959}
2960
2963 MachineIRBuilder &B) const {
2964
2965 Register DstReg = MI.getOperand(0).getReg();
2966 Register SrcReg = MI.getOperand(1).getReg();
2967 LLT Ty = MRI.getType(DstReg);
2968 unsigned Flags = MI.getFlags();
2969
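  // The hardware sin/cos intrinsics take their operand pre-scaled by 1/(2*pi),
  // i.e. in units of full turns. On subtargets where only a reduced input
  // range is valid, the scaled operand is first reduced into [0, 1) with the
  // fract intrinsic.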
2970 Register TrigVal;
2971 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2972 if (ST.hasTrigReducedRange()) {
2973 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2974 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2975 .addUse(MulVal.getReg(0))
2976 .setMIFlags(Flags)
2977 .getReg(0);
2978 } else
2979 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2980
2981 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2982 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2983 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2984 .addUse(TrigVal)
2985 .setMIFlags(Flags);
2986 MI.eraseFromParent();
2987 return true;
2988}
2989
2992 const GlobalValue *GV,
2993 int64_t Offset,
2994 unsigned GAFlags) const {
2995 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2996 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2997 // to the following code sequence:
2998 //
2999 // For constant address space:
3000 // s_getpc_b64 s[0:1]
3001 // s_add_u32 s0, s0, $symbol
3002 // s_addc_u32 s1, s1, 0
3003 //
3004 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3005 // a fixup or relocation is emitted to replace $symbol with a literal
3006 // constant, which is a pc-relative offset from the encoding of the $symbol
3007 // operand to the global variable.
3008 //
3009 // For global address space:
3010 // s_getpc_b64 s[0:1]
3011 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3012 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3013 //
3014 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3015 // fixups or relocations are emitted to replace $symbol@*@lo and
3016 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3017 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3018 // operand to the global variable.
3019
3021
3022 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3023 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3024
3025 if (ST.has64BitLiterals()) {
3026 assert(GAFlags != SIInstrInfo::MO_NONE);
3027
3029 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3030 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3031 } else {
3033 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3034
3035 MIB.addGlobalAddress(GV, Offset, GAFlags);
3036 if (GAFlags == SIInstrInfo::MO_NONE)
3037 MIB.addImm(0);
3038 else
3039 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3040 }
3041
3042 if (!B.getMRI()->getRegClassOrNull(PCReg))
3043 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3044
3045 if (PtrTy.getSizeInBits() == 32)
3046 B.buildExtract(DstReg, PCReg, 0);
3047 return true;
3048}
3049
3050// Emit an ABS32_LO / ABS32_HI relocation stub.
3052 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3053 MachineRegisterInfo &MRI) const {
3054 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3055
3056 if (RequiresHighHalf && ST.has64BitLiterals()) {
3057 if (!MRI.getRegClassOrNull(DstReg))
3058 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3059 B.buildInstr(AMDGPU::S_MOV_B64)
3060 .addDef(DstReg)
3061 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3062 return;
3063 }
3064
3065 LLT S32 = LLT::scalar(32);
3066
3067 // Use the destination directly, if and only if we store the lower address
3068 // part only and we don't have a register class being set.
3069 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3070 ? DstReg
3071 : MRI.createGenericVirtualRegister(S32);
3072
3073 if (!MRI.getRegClassOrNull(AddrLo))
3074 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3075
3076 // Write the lower half.
3077 B.buildInstr(AMDGPU::S_MOV_B32)
3078 .addDef(AddrLo)
3079 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3080
3081 // If required, write the upper half as well.
3082 if (RequiresHighHalf) {
3083 assert(PtrTy.getSizeInBits() == 64 &&
3084 "Must provide a 64-bit pointer type!");
3085
3086 Register AddrHi = MRI.createGenericVirtualRegister(S32);
3087 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3088
3089 B.buildInstr(AMDGPU::S_MOV_B32)
3090 .addDef(AddrHi)
3091 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3092
3093 // Use the destination directly, if and only if we don't have a register
3094 // class being set.
3095 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3096 ? DstReg
3097 : MRI.createGenericVirtualRegister(LLT::scalar(64));
3098
3099 if (!MRI.getRegClassOrNull(AddrDst))
3100 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3101
3102 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3103
3104 // If we created a new register for the destination, cast the result into
3105 // the final output.
3106 if (AddrDst != DstReg)
3107 B.buildCast(DstReg, AddrDst);
3108 } else if (AddrLo != DstReg) {
3109 // If we created a new register for the destination, cast the result into
3110 // the final output.
3111 B.buildCast(DstReg, AddrLo);
3112 }
3113}
3114
3117 MachineIRBuilder &B) const {
3118 Register DstReg = MI.getOperand(0).getReg();
3119 LLT Ty = MRI.getType(DstReg);
3120 unsigned AS = Ty.getAddressSpace();
3121
3122 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3123 MachineFunction &MF = B.getMF();
3125
3127 if (!MFI->isModuleEntryFunction() &&
3128 GV->getName() != "llvm.amdgcn.module.lds" &&
3130 const Function &Fn = MF.getFunction();
3132 Fn, "local memory global used by non-kernel function",
3133 MI.getDebugLoc(), DS_Warning));
3134
3135 // We currently don't have a way to correctly allocate LDS objects that
3136 // aren't directly associated with a kernel. We do force inlining of
3137 // functions that use local objects. However, if these dead functions are
3138 // not eliminated, we don't want a compile time error. Just emit a warning
3139 // and a trap, since there should be no callable path here.
3140 B.buildTrap();
3141 B.buildUndef(DstReg);
3142 MI.eraseFromParent();
3143 return true;
3144 }
3145
3146 // TODO: We could emit code to handle the initialization somewhere.
3147 // We ignore the initializer for now and legalize it to allow selection.
3148   // The initializer will be diagnosed as an error during assembly emission anyway.
3149 const SITargetLowering *TLI = ST.getTargetLowering();
3150 if (!TLI->shouldUseLDSConstAddress(GV)) {
3151 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3152 return true; // Leave in place;
3153 }
3154
3155 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3156 Type *Ty = GV->getValueType();
3157 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3158 // zero-sized type in other languages to declare the dynamic shared
3159     // memory, whose size is not known at compile time. They will be
3160     // allocated by the runtime and placed directly after the statically
3161     // allocated ones. They all share the same offset.
3162 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3163 // Adjust alignment for that dynamic shared memory array.
3165 LLT S32 = LLT::scalar(32);
3166 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3167 B.buildIntToPtr(DstReg, Sz);
3168 MI.eraseFromParent();
3169 return true;
3170 }
3171 }
3172
3173 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3174 *cast<GlobalVariable>(GV)));
3175 MI.eraseFromParent();
3176 return true;
3177 }
3178
3179 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3180 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3181 MI.eraseFromParent();
3182 return true;
3183 }
3184
3185 const SITargetLowering *TLI = ST.getTargetLowering();
3186
3187 if (TLI->shouldEmitFixup(GV)) {
3188 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3189 MI.eraseFromParent();
3190 return true;
3191 }
3192
3193 if (TLI->shouldEmitPCReloc(GV)) {
3194 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3195 MI.eraseFromParent();
3196 return true;
3197 }
3198
3200 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3201
3202 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3207 LoadTy, Align(8));
3208
3209 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3210
3211 if (Ty.getSizeInBits() == 32) {
3212 // Truncate if this is a 32-bit constant address.
3213 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3214 B.buildExtract(DstReg, Load, 0);
3215 } else
3216 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3217
3218 MI.eraseFromParent();
3219 return true;
3220}
3221
3223 if (Ty.isVector())
3224 return Ty.changeElementCount(
3225 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3226 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3227}
3228
3230 MachineInstr &MI) const {
3231 MachineIRBuilder &B = Helper.MIRBuilder;
3232 MachineRegisterInfo &MRI = *B.getMRI();
3233 GISelChangeObserver &Observer = Helper.Observer;
3234
3235 Register PtrReg = MI.getOperand(1).getReg();
3236 LLT PtrTy = MRI.getType(PtrReg);
3237 unsigned AddrSpace = PtrTy.getAddressSpace();
3238
3239 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3241 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3242 Observer.changingInstr(MI);
3243 MI.getOperand(1).setReg(Cast.getReg(0));
3244 Observer.changedInstr(MI);
3245 return true;
3246 }
3247
3248 if (MI.getOpcode() != AMDGPU::G_LOAD)
3249 return false;
3250
3251 Register ValReg = MI.getOperand(0).getReg();
3252 LLT ValTy = MRI.getType(ValReg);
3253
3254 if (hasBufferRsrcWorkaround(ValTy)) {
3255 Observer.changingInstr(MI);
3257 Observer.changedInstr(MI);
3258 return true;
3259 }
3260
3261 MachineMemOperand *MMO = *MI.memoperands_begin();
3262 const unsigned ValSize = ValTy.getSizeInBits();
3263 const LLT MemTy = MMO->getMemoryType();
3264 const Align MemAlign = MMO->getAlign();
3265 const unsigned MemSize = MemTy.getSizeInBits();
3266 const uint64_t AlignInBits = 8 * MemAlign.value();
3267
3268 // Widen non-power-of-2 loads to the alignment if needed
3269 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3270 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3271
3272 // This was already the correct extending load result type, so just adjust
3273 // the memory type.
3274 if (WideMemSize == ValSize) {
3275 MachineFunction &MF = B.getMF();
3276
3277 MachineMemOperand *WideMMO =
3278 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3279 Observer.changingInstr(MI);
3280 MI.setMemRefs(MF, {WideMMO});
3281 Observer.changedInstr(MI);
3282 return true;
3283 }
3284
3285     // Don't bother handling an edge case that should probably never be produced.
3286 if (ValSize > WideMemSize)
3287 return false;
3288
3289 LLT WideTy = widenToNextPowerOf2(ValTy);
3290
3291 Register WideLoad;
3292 if (!WideTy.isVector()) {
3293 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3294 B.buildTrunc(ValReg, WideLoad).getReg(0);
3295 } else {
3296 // Extract the subvector.
3297
3298 if (isRegisterType(ST, ValTy)) {
3299 // If this a case where G_EXTRACT is legal, use it.
3300 // (e.g. <3 x s32> -> <4 x s32>)
3301 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3302 B.buildExtract(ValReg, WideLoad, 0);
3303 } else {
3304 // For cases where the widened type isn't a nice register value, unmerge
3305 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3306 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3307 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3308 }
3309 }
3310
3311 MI.eraseFromParent();
3312 return true;
3313 }
3314
3315 return false;
3316}
3317
3319 MachineInstr &MI) const {
3320 MachineIRBuilder &B = Helper.MIRBuilder;
3321 MachineRegisterInfo &MRI = *B.getMRI();
3322 GISelChangeObserver &Observer = Helper.Observer;
3323
3324 Register DataReg = MI.getOperand(0).getReg();
3325 LLT DataTy = MRI.getType(DataReg);
3326
3327 if (hasBufferRsrcWorkaround(DataTy)) {
3328 Observer.changingInstr(MI);
3330 Observer.changedInstr(MI);
3331 return true;
3332 }
3333 return false;
3334}
3335
3338 MachineIRBuilder &B) const {
3339 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3340 assert(Ty.isScalar());
3341
3342 MachineFunction &MF = B.getMF();
3344
3345 // TODO: Always legal with future ftz flag.
3346 // FIXME: Do we need just output?
3347 if (Ty == LLT::float32() &&
3349 return true;
3350 if (Ty == LLT::float16() &&
3352 return true;
3353
3354 MachineIRBuilder HelperBuilder(MI);
3355 GISelObserverWrapper DummyObserver;
3356 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3357 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3358}
3359
3362 Register DstReg = MI.getOperand(0).getReg();
3363 Register PtrReg = MI.getOperand(1).getReg();
3364 Register CmpVal = MI.getOperand(2).getReg();
3365 Register NewVal = MI.getOperand(3).getReg();
3366
3367 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3368 "this should not have been custom lowered");
3369
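  // Marshal the data operands: the underlying cmpswap instructions consume the
  // new value and the compare value as one packed pair, so build a two-element
  // vector <new, cmp> and hand it to the AMDGPU-specific cmpxchg opcode.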
3370 LLT ValTy = MRI.getType(CmpVal);
3371 LLT VecTy = LLT::fixed_vector(2, ValTy);
3372
3373 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3374
3375 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3376 .addDef(DstReg)
3377 .addUse(PtrReg)
3378 .addUse(PackedVal)
3379 .setMemRefs(MI.memoperands());
3380
3381 MI.eraseFromParent();
3382 return true;
3383}
3384
3385/// Return true if it's known that \p Src can never be an f32 denormal value.
3387 Register Src) {
3388 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3389 switch (DefMI->getOpcode()) {
3390 case TargetOpcode::G_INTRINSIC: {
3392 case Intrinsic::amdgcn_frexp_mant:
3393 return true;
3394 default:
3395 break;
3396 }
3397
3398 break;
3399 }
3400 case TargetOpcode::G_FFREXP: {
3401 if (DefMI->getOperand(0).getReg() == Src)
3402 return true;
3403 break;
3404 }
3405 case TargetOpcode::G_FPEXT: {
3406 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3407 }
3408 default:
3409 return false;
3410 }
3411
3412 return false;
3413}
3414
3415static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3416 return Flags & MachineInstr::FmAfn;
3417}
3418
3420 unsigned Flags) {
3421 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3424}
3425
3426std::pair<Register, Register>
3428 unsigned Flags) const {
3429 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3430 return {};
3431
3432 const LLT F32 = LLT::scalar(32);
3433 auto SmallestNormal = B.buildFConstant(
3435 auto IsLtSmallestNormal =
3436 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3437
3438 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3439 auto One = B.buildFConstant(F32, 1.0);
3440 auto ScaleFactor =
3441 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3442 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3443
3444 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3445}
3446
3448 MachineIRBuilder &B) const {
3449 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3450 // If we have to handle denormals, scale up the input and adjust the result.
3451
3452 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3453 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3454
3455 Register Dst = MI.getOperand(0).getReg();
3456 Register Src = MI.getOperand(1).getReg();
3457 LLT Ty = B.getMRI()->getType(Dst);
3458 unsigned Flags = MI.getFlags();
3459
3460 if (Ty == LLT::scalar(16)) {
3461 const LLT F32 = LLT::scalar(32);
3462 // Nothing in half is a denormal when promoted to f32.
3463 auto Ext = B.buildFPExt(F32, Src, Flags);
3464 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3465 .addUse(Ext.getReg(0))
3466 .setMIFlags(Flags);
3467 B.buildFPTrunc(Dst, Log2, Flags);
3468 MI.eraseFromParent();
3469 return true;
3470 }
3471
3472 assert(Ty == LLT::scalar(32));
3473
3474 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3475 if (!ScaledInput) {
3476 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3477 .addUse(Src)
3478 .setMIFlags(Flags);
3479 MI.eraseFromParent();
3480 return true;
3481 }
3482
3483 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3484 .addUse(ScaledInput)
3485 .setMIFlags(Flags);
3486
3487 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3488 auto Zero = B.buildFConstant(Ty, 0.0);
3489 auto ResultOffset =
3490 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3491 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3492
3493 MI.eraseFromParent();
3494 return true;
3495}
3496
3498 Register Z, unsigned Flags) {
3499 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3500 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3501}
3502
3504 MachineIRBuilder &B) const {
3505 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3506 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3507
3508 MachineRegisterInfo &MRI = *B.getMRI();
3509 Register Dst = MI.getOperand(0).getReg();
3510 Register X = MI.getOperand(1).getReg();
3511 unsigned Flags = MI.getFlags();
3512 const LLT Ty = MRI.getType(X);
3513 MachineFunction &MF = B.getMF();
3514
3515 const LLT F32 = LLT::scalar(32);
3516 const LLT F16 = LLT::scalar(16);
3517
3518 const AMDGPUTargetMachine &TM =
3519 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3520
3521 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3522 if (Ty == F16 && !ST.has16BitInsts()) {
3523 Register LogVal = MRI.createGenericVirtualRegister(F32);
3524 auto PromoteSrc = B.buildFPExt(F32, X);
3525 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3526 B.buildFPTrunc(Dst, LogVal);
3527 } else {
3528 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3529 }
3530
3531 MI.eraseFromParent();
3532 return true;
3533 }
3534
3535 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3536 if (ScaledInput)
3537 X = ScaledInput;
3538
3539 auto Y =
3540 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3541
3542 Register R;
3543 if (ST.hasFastFMAF32()) {
3544 // c+cc are ln(2)/ln(10) to more than 49 bits
3545 const float c_log10 = 0x1.344134p-2f;
3546 const float cc_log10 = 0x1.09f79ep-26f;
3547
3548 // c + cc is ln(2) to more than 49 bits
3549 const float c_log = 0x1.62e42ep-1f;
3550 const float cc_log = 0x1.efa39ep-25f;
3551
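    // The product y * (c + cc) is evaluated in extended precision: R is the
    // rounded product y*c, the first FMA recovers its rounding error
    // (y*c - R), the second folds in y*cc, and the final add reassembles the
    // result (a compensated two-product step).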
3552 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3553 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3554
3555 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3556 auto NegR = B.buildFNeg(Ty, R, Flags);
3557 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3558 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3559 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3560 } else {
3561 // ch+ct is ln(2)/ln(10) to more than 36 bits
3562 const float ch_log10 = 0x1.344000p-2f;
3563 const float ct_log10 = 0x1.3509f6p-18f;
3564
3565 // ch + ct is ln(2) to more than 36 bits
3566 const float ch_log = 0x1.62e000p-1f;
3567 const float ct_log = 0x1.0bfbe8p-15f;
3568
3569 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3570 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3571
3572 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3573 auto YH = B.buildAnd(Ty, Y, MaskConst);
3574 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3575 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3576
3577 Register Mad0 =
3578 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3579 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3580 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3581 }
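  // In either path R approximates Y * log10(2) (or Y * ln(2)), i.e. log10(x)
  // or ln(x); splitting the constant lets the product keep extra precision.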
3582
3583 const bool IsFiniteOnly =
3584 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3585 MI.getFlag(MachineInstr::FmNoInfs);
3586
3587 if (!IsFiniteOnly) {
3588 // Expand isfinite(x) => fabs(x) < inf
3589 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3590 auto Fabs = B.buildFAbs(Ty, Y);
3591 auto IsFinite =
3592 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3593 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3594 }
3595
3596 if (ScaledInput) {
3597 auto Zero = B.buildFConstant(Ty, 0.0);
3598 auto ShiftK =
3599 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3600 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3601 B.buildFSub(Dst, R, Shift, Flags);
3602 } else {
3603 B.buildCopy(Dst, R);
3604 }
3605
3606 MI.eraseFromParent();
3607 return true;
3608}
3609
3610bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3611                                             Register Src, bool IsLog10,
3612 unsigned Flags) const {
3613  const double Log2BaseInverted =
3614      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3615
3616 LLT Ty = B.getMRI()->getType(Dst);
3617
3618 if (Ty == LLT::scalar(32)) {
3619 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3620 if (ScaledInput) {
3621 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3622 .addUse(Src)
3623 .setMIFlags(Flags);
3624 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3625 auto Zero = B.buildFConstant(Ty, 0.0);
3626 auto ResultOffset =
3627 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3628 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3629
3630 if (ST.hasFastFMAF32())
3631 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3632 else {
3633 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3634 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3635 }
3636
3637 return true;
3638 }
3639 }
3640
3641 auto Log2Operand = Ty == LLT::scalar(16)
3642 ? B.buildFLog2(Ty, Src, Flags)
3643 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3644 .addUse(Src)
3645 .setMIFlags(Flags);
3646 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3647 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3648 return true;
3649}
3650
3651bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3652                                        MachineIRBuilder &B) const {
3653 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3654 // If we have to handle denormals, scale up the input and adjust the result.
3655
3656 Register Dst = MI.getOperand(0).getReg();
3657 Register Src = MI.getOperand(1).getReg();
3658 unsigned Flags = MI.getFlags();
3659 LLT Ty = B.getMRI()->getType(Dst);
3660 const LLT F16 = LLT::scalar(16);
3661 const LLT F32 = LLT::scalar(32);
3662
3663 if (Ty == F16) {
3664 // Nothing in half is a denormal when promoted to f32.
3665 auto Ext = B.buildFPExt(F32, Src, Flags);
3666 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3667 .addUse(Ext.getReg(0))
3668 .setMIFlags(Flags);
3669 B.buildFPTrunc(Dst, Log2, Flags);
3670 MI.eraseFromParent();
3671 return true;
3672 }
3673
3674 assert(Ty == F32);
3675
3676 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3677 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3678 .addUse(Src)
3679 .setMIFlags(Flags);
3680 MI.eraseFromParent();
3681 return true;
3682 }
3683
3684 // bool needs_scaling = x < -0x1.f80000p+6f;
3685 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
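  // That is: when x is small enough that exp2(x) would be denormal, evaluate
  // exp2(x + 64) instead and rescale the result by 2**-64 afterwards.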
3686
3687 // -nextafter(128.0, -1)
3688 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3689 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3690 RangeCheckConst, Flags);
3691
3692 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3693 auto Zero = B.buildFConstant(Ty, 0.0);
3694 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3695 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3696
3697 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3698 .addUse(AddInput.getReg(0))
3699 .setMIFlags(Flags);
3700
3701 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3702 auto One = B.buildFConstant(Ty, 1.0);
3703 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3704 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3705 MI.eraseFromParent();
3706 return true;
3707}
3708
3709bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3710                                             Register X, unsigned Flags) const {
3711 LLT Ty = B.getMRI()->getType(Dst);
3712 LLT F32 = LLT::scalar(32);
3713
3714 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3715 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3716 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3717
3718 if (Ty == F32) {
3719 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3720 .addUse(Mul.getReg(0))
3721 .setMIFlags(Flags);
3722 } else {
3723 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3724 }
3725
3726 return true;
3727 }
3728
3729 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3730 auto NeedsScaling =
3731 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3732 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3733 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3734 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3735
3736 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3737 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3738
3739 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3740 .addUse(ExpInput.getReg(0))
3741 .setMIFlags(Flags);
3742
3743 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3744 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3745 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3746 return true;
3747}
3748
3749bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3750                                       MachineIRBuilder &B) const {
3751 Register Dst = MI.getOperand(0).getReg();
3752 Register X = MI.getOperand(1).getReg();
3753 const unsigned Flags = MI.getFlags();
3754 MachineFunction &MF = B.getMF();
3755 MachineRegisterInfo &MRI = *B.getMRI();
3756 LLT Ty = MRI.getType(Dst);
3757 const LLT F16 = LLT::scalar(16);
3758 const LLT F32 = LLT::scalar(32);
3759 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3760
3761 if (Ty == F16) {
3762 // v_exp_f16 (fmul x, log2e)
3763 if (allowApproxFunc(MF, Flags)) {
3764 // TODO: Does this really require fast?
3765 legalizeFExpUnsafe(B, Dst, X, Flags);
3766 MI.eraseFromParent();
3767 return true;
3768 }
3769
3770 // exp(f16 x) ->
3771 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3772
3773 // Nothing in half is a denormal when promoted to f32.
3774 auto Ext = B.buildFPExt(F32, X, Flags);
3775 Register Lowered = MRI.createGenericVirtualRegister(F32);
3776 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3777 B.buildFPTrunc(Dst, Lowered, Flags);
3778 MI.eraseFromParent();
3779 return true;
3780 }
3781
3782 assert(Ty == F32);
3783
3784 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3785 // library behavior. Also, is known-not-daz source sufficient?
3786 if (allowApproxFunc(MF, Flags)) {
3787 legalizeFExpUnsafe(B, Dst, X, Flags);
3788 MI.eraseFromParent();
3789 return true;
3790 }
3791
3792 // Algorithm:
3793 //
3794 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3795 //
3796 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3797 // n = 64*m + j, 0 <= j < 64
3798 //
3799 // e^x = 2^((64*m + j + f)/64)
3800 // = (2^m) * (2^(j/64)) * 2^(f/64)
3801 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3802 //
3803 // f = x*(64/ln(2)) - n
3804 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3805 //
3806 // e^x = (2^m) * (2^(j/64)) * e^r
3807 //
3808 // (2^(j/64)) is precomputed
3809 //
3810 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3811 // e^r = 1 + q
3812 //
3813 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3814 //
3815 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
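  // In this lowering PH approximates x * log2(e) (x * log2(10) for exp10), PL
  // holds the low-order remainder of that product, E = roundeven(PH), and the
  // result is ldexp(exp2(PH - E + PL), E).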
3816 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3817 Register PH, PL;
3818
3819 if (ST.hasFastFMAF32()) {
3820 const float c_exp = numbers::log2ef;
3821 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3822 const float c_exp10 = 0x1.a934f0p+1f;
3823 const float cc_exp10 = 0x1.2f346ep-24f;
3824
3825 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3826 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3827 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3828 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3829
3830 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3831 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3832 } else {
3833 const float ch_exp = 0x1.714000p+0f;
3834 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3835
3836 const float ch_exp10 = 0x1.a92000p+1f;
3837 const float cl_exp10 = 0x1.4f0978p-11f;
3838
3839 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3840 auto XH = B.buildAnd(Ty, X, MaskConst);
3841 auto XL = B.buildFSub(Ty, X, XH, Flags);
3842
3843 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3844 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3845
3846 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3847 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3848
3849 Register Mad0 =
3850 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3851 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3852 }
3853
3854 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3855
3856 // It is unsafe to contract this fsub into the PH multiply.
3857 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3858 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3859 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3860
3861 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3862 .addUse(A.getReg(0))
3863 .setMIFlags(Flags);
3864 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3865
3866 auto UnderflowCheckConst =
3867 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3868 auto Zero = B.buildFConstant(Ty, 0.0);
3869 auto Underflow =
3870 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3871
3872 R = B.buildSelect(Ty, Underflow, Zero, R);
3873
3874 if (!(Flags & MachineInstr::FmNoInfs)) {
3875 auto OverflowCheckConst =
3876 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3877
3878 auto Overflow =
3879 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3880 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3881 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3882 }
3883
3884 B.buildCopy(Dst, R);
3885 MI.eraseFromParent();
3886 return true;
3887}
3888
3889bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3890                                       MachineIRBuilder &B) const {
3891 Register Dst = MI.getOperand(0).getReg();
3892 Register Src0 = MI.getOperand(1).getReg();
3893 Register Src1 = MI.getOperand(2).getReg();
3894 unsigned Flags = MI.getFlags();
3895 LLT Ty = B.getMRI()->getType(Dst);
3896 const LLT F16 = LLT::float16();
3897 const LLT F32 = LLT::float32();
3898
3899 if (Ty == F32) {
3900 auto Log = B.buildFLog2(F32, Src0, Flags);
3901 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3902 .addUse(Log.getReg(0))
3903 .addUse(Src1)
3904 .setMIFlags(Flags);
3905 B.buildFExp2(Dst, Mul, Flags);
3906 } else if (Ty == F16) {
3907 // There's no f16 fmul_legacy, so we need to convert for it.
3908 auto Log = B.buildFLog2(F16, Src0, Flags);
3909 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3910 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3911 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3912 .addUse(Ext0.getReg(0))
3913 .addUse(Ext1.getReg(0))
3914 .setMIFlags(Flags);
3915 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3916 } else
3917 return false;
3918
3919 MI.eraseFromParent();
3920 return true;
3921}
3922
3923// Find a source register, ignoring any possible source modifiers.
3924static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3925  Register ModSrc = OrigSrc;
3926 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3927 ModSrc = SrcFNeg->getOperand(1).getReg();
3928 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3929 ModSrc = SrcFAbs->getOperand(1).getReg();
3930 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3931 ModSrc = SrcFAbs->getOperand(1).getReg();
3932 return ModSrc;
3933}
3934
3935bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3936                                         MachineRegisterInfo &MRI,
3937                                         MachineIRBuilder &B) const {
3938
3939 const LLT S1 = LLT::scalar(1);
3940 const LLT F64 = LLT::float64();
3941 Register Dst = MI.getOperand(0).getReg();
3942 Register OrigSrc = MI.getOperand(1).getReg();
3943 unsigned Flags = MI.getFlags();
3944 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3945 "this should not have been custom lowered");
3946
3947 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3948 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3949 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3950 // V_FRACT bug is:
3951 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3952 //
3953 // Convert floor(x) to (x - fract(x))
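  // The clamp constant used below, 0x3fefffffffffffff, is the largest f64
  // strictly less than 1.0, i.e. the 0.99999999999999999 in the formula above.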
3954
3955 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3956 .addUse(OrigSrc)
3957 .setMIFlags(Flags);
3958
3959 // Give source modifier matching some assistance before obscuring a foldable
3960 // pattern.
3961
3962 // TODO: We can avoid the neg on the fract? The input sign to fract
3963 // shouldn't matter?
3964 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3965
3966 auto Const =
3967 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3968
3969 Register Min = MRI.createGenericVirtualRegister(F64);
3970
3971 // We don't need to concern ourselves with the snan handling difference, so
3972 // use the one which will directly select.
3973 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3974 if (MFI->getMode().IEEE)
3975 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3976 else
3977 B.buildFMinNum(Min, Fract, Const, Flags);
3978
3979 Register CorrectedFract = Min;
3980 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3981 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3982 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3983 }
3984
3985 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3986 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3987
3988 MI.eraseFromParent();
3989 return true;
3990}
3991
3992// Turn an illegal packed v2s16 build vector into bit operations.
3993// TODO: This should probably be a bitcast action in LegalizerHelper.
3994bool AMDGPULegalizerInfo::legalizeBuildVector(
3995    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3996  Register Dst = MI.getOperand(0).getReg();
3997 const LLT S32 = LLT::scalar(32);
3998 const LLT S16 = LLT::scalar(16);
3999 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4000
4001 Register Src0 = MI.getOperand(1).getReg();
4002 Register Src1 = MI.getOperand(2).getReg();
4003
4004 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4005 assert(MRI.getType(Src0) == S32);
4006 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4007 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4008 }
4009
4010 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4011 B.buildBitcast(Dst, Merge);
4012
4013 MI.eraseFromParent();
4014 return true;
4015}
4016
4017// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4018//
4019// Source and accumulation registers must all be 32-bits.
4020//
4021// TODO: When the multiply is uniform, we should produce a code sequence
4022// that is better suited to instruction selection on the SALU. Instead of
4023// the outer loop going over parts of the result, the outer loop should go
4024// over parts of one of the factors. This should result in instruction
4025// selection that makes full use of S_ADDC_U32 instructions.
4026void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
4027                                        MutableArrayRef<Register> Accum,
4028                                        ArrayRef<Register> Src0,
4029 ArrayRef<Register> Src1,
4030 bool UsePartialMad64_32,
4031 bool SeparateOddAlignedProducts) const {
4032 // Use (possibly empty) vectors of S1 registers to represent the set of
4033 // carries from one pair of positions to the next.
4034 using Carry = SmallVector<Register, 2>;
4035
4036 MachineIRBuilder &B = Helper.MIRBuilder;
4037 GISelValueTracking &VT = *Helper.getValueTracking();
4038
4039 const LLT S1 = LLT::scalar(1);
4040 const LLT S32 = LLT::scalar(32);
4041 const LLT S64 = LLT::scalar(64);
4042
4043 Register Zero32;
4044 Register Zero64;
4045
4046 auto getZero32 = [&]() -> Register {
4047 if (!Zero32)
4048 Zero32 = B.buildConstant(S32, 0).getReg(0);
4049 return Zero32;
4050 };
4051 auto getZero64 = [&]() -> Register {
4052 if (!Zero64)
4053 Zero64 = B.buildConstant(S64, 0).getReg(0);
4054 return Zero64;
4055 };
4056
4057 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4058 for (unsigned i = 0; i < Src0.size(); ++i) {
4059 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4060 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4061 }
4062
4063 // Merge the given carries into the 32-bit LocalAccum, which is modified
4064 // in-place.
4065 //
4066 // Returns the carry-out, which is a single S1 register or null.
4067 auto mergeCarry =
4068 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4069 if (CarryIn.empty())
4070 return Register();
4071
4072 bool HaveCarryOut = true;
4073 Register CarryAccum;
4074 if (CarryIn.size() == 1) {
4075 if (!LocalAccum) {
4076 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4077 return Register();
4078 }
4079
4080 CarryAccum = getZero32();
4081 } else {
4082 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4083 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4084 CarryAccum =
4085 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4086 .getReg(0);
4087 }
4088
4089 if (!LocalAccum) {
4090 LocalAccum = getZero32();
4091 HaveCarryOut = false;
4092 }
4093 }
4094
4095 auto Add =
4096 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4097 LocalAccum = Add.getReg(0);
4098 return HaveCarryOut ? Add.getReg(1) : Register();
4099 };
4100
4101 // Build a multiply-add chain to compute
4102 //
4103 // LocalAccum + (partial products at DstIndex)
4104 // + (opportunistic subset of CarryIn)
4105 //
4106 // LocalAccum is an array of one or two 32-bit registers that are updated
4107 // in-place. The incoming registers may be null.
4108 //
4109 // In some edge cases, carry-ins can be consumed "for free". In that case,
4110 // the consumed carry bits are removed from CarryIn in-place.
4111 auto buildMadChain =
4112 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4113 -> Carry {
4114 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4115 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4116
4117 Carry CarryOut;
4118 unsigned j0 = 0;
4119
4120 // Use plain 32-bit multiplication for the most significant part of the
4121 // result by default.
4122 if (LocalAccum.size() == 1 &&
4123 (!UsePartialMad64_32 || !CarryIn.empty())) {
4124 do {
4125 // Skip multiplication if one of the operands is 0
4126 unsigned j1 = DstIndex - j0;
4127 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4128 ++j0;
4129 continue;
4130 }
4131 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4132 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4133 LocalAccum[0] = Mul.getReg(0);
4134 } else {
4135 if (CarryIn.empty()) {
4136 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4137 } else {
4138 LocalAccum[0] =
4139 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4140 .getReg(0);
4141 CarryIn.pop_back();
4142 }
4143 }
4144 ++j0;
4145 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4146 }
4147
4148 // Build full 64-bit multiplies.
4149 if (j0 <= DstIndex) {
4150 bool HaveSmallAccum = false;
4151 Register Tmp;
4152
4153 if (LocalAccum[0]) {
4154 if (LocalAccum.size() == 1) {
4155 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4156 HaveSmallAccum = true;
4157 } else if (LocalAccum[1]) {
4158 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4159 HaveSmallAccum = false;
4160 } else {
4161 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4162 HaveSmallAccum = true;
4163 }
4164 } else {
4165 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4166 Tmp = getZero64();
4167 HaveSmallAccum = true;
4168 }
4169
4170 do {
4171 unsigned j1 = DstIndex - j0;
4172 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4173 ++j0;
4174 continue;
4175 }
4176 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4177 {Src0[j0], Src1[j1], Tmp});
4178 Tmp = Mad.getReg(0);
4179 if (!HaveSmallAccum)
4180 CarryOut.push_back(Mad.getReg(1));
4181 HaveSmallAccum = false;
4182
4183 ++j0;
4184 } while (j0 <= DstIndex);
4185
4186 auto Unmerge = B.buildUnmerge(S32, Tmp);
4187 LocalAccum[0] = Unmerge.getReg(0);
4188 if (LocalAccum.size() > 1)
4189 LocalAccum[1] = Unmerge.getReg(1);
4190 }
4191
4192 return CarryOut;
4193 };
4194
4195 // Outer multiply loop, iterating over destination parts from least
4196 // significant to most significant parts.
4197 //
4198 // The columns of the following diagram correspond to the destination parts
4199 // affected by one iteration of the outer loop (ignoring boundary
4200 // conditions).
4201 //
4202 // Dest index relative to 2 * i: 1 0 -1
4203 // ------
4204 // Carries from previous iteration: e o
4205 // Even-aligned partial product sum: E E .
4206 // Odd-aligned partial product sum: O O
4207 //
4208 // 'o' is OddCarry, 'e' is EvenCarry.
4209 // EE and OO are computed from partial products via buildMadChain and use
4210 // accumulation where possible and appropriate.
4211 //
4212 Register SeparateOddCarry;
4213 Carry EvenCarry;
4214 Carry OddCarry;
4215
4216 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4217 Carry OddCarryIn = std::move(OddCarry);
4218 Carry EvenCarryIn = std::move(EvenCarry);
4219 OddCarry.clear();
4220 EvenCarry.clear();
4221
4222 // Partial products at offset 2 * i.
4223 if (2 * i < Accum.size()) {
4224 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4225 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4226 }
4227
4228 // Partial products at offset 2 * i - 1.
4229 if (i > 0) {
4230 if (!SeparateOddAlignedProducts) {
4231 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4232 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4233 } else {
4234 bool IsHighest = 2 * i >= Accum.size();
4235 Register SeparateOddOut[2];
4236 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4237 .take_front(IsHighest ? 1 : 2);
4238 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4239
4240        MachineInstr *Lo;
4241
4242 if (i == 1) {
4243 if (!IsHighest)
4244 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4245 else
4246 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4247 } else {
4248 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4249 SeparateOddCarry);
4250 }
4251 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4252
4253 if (!IsHighest) {
4254 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4255 Lo->getOperand(1).getReg());
4256 Accum[2 * i] = Hi.getReg(0);
4257 SeparateOddCarry = Hi.getReg(1);
4258 }
4259 }
4260 }
4261
4262 // Add in the carries from the previous iteration
4263 if (i > 0) {
4264 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4265 EvenCarryIn.push_back(CarryOut);
4266
4267 if (2 * i < Accum.size()) {
4268 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4269 OddCarry.push_back(CarryOut);
4270 }
4271 }
4272 }
4273}
4274
4275// Custom narrowing of wide multiplies using wide multiply-add instructions.
4276//
4277// TODO: If the multiply is followed by an addition, we should attempt to
4278// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4279bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4280                                      MachineInstr &MI) const {
4281 assert(ST.hasMad64_32());
4282 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4283
4284 MachineIRBuilder &B = Helper.MIRBuilder;
4285 MachineRegisterInfo &MRI = *B.getMRI();
4286
4287 Register DstReg = MI.getOperand(0).getReg();
4288 Register Src0 = MI.getOperand(1).getReg();
4289 Register Src1 = MI.getOperand(2).getReg();
4290
4291 LLT Ty = MRI.getType(DstReg);
4292 assert(Ty.isScalar());
4293
4294 unsigned Size = Ty.getSizeInBits();
4295 if (ST.hasVectorMulU64() && Size == 64)
4296 return true;
4297
4298 unsigned NumParts = Size / 32;
4299 assert((Size % 32) == 0);
4300 assert(NumParts >= 2);
4301
4302 // Whether to use MAD_64_32 for partial products whose high half is
4303 // discarded. This avoids some ADD instructions but risks false dependency
4304 // stalls on some subtargets in some cases.
4305 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4306
4307 // Whether to compute odd-aligned partial products separately. This is
4308 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4309 // in an even-aligned VGPR.
4310 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4311
4312 LLT S32 = LLT::scalar(32);
4313 SmallVector<Register, 2> Src0Parts, Src1Parts;
4314 for (unsigned i = 0; i < NumParts; ++i) {
4315 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4316 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4317 }
4318 B.buildUnmerge(Src0Parts, Src0);
4319 B.buildUnmerge(Src1Parts, Src1);
4320
4321 SmallVector<Register, 2> AccumRegs(NumParts);
4322 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4323 SeparateOddAlignedProducts);
4324
4325 B.buildMergeLikeInstr(DstReg, AccumRegs);
4326 MI.eraseFromParent();
4327 return true;
4328}
4329
4330// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4331// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4332// case with a single min instruction instead of a compare+select.
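// For example, FFBH/FFBL return ~0u when the input is zero; the unsigned min
// with the source bit width folds that back to the operand size, which is the
// defined ctlz/cttz result for zero.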
4333bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4334                                            MachineRegisterInfo &MRI,
4335                                            MachineIRBuilder &B) const {
4336 Register Dst = MI.getOperand(0).getReg();
4337 Register Src = MI.getOperand(1).getReg();
4338 LLT DstTy = MRI.getType(Dst);
4339 LLT SrcTy = MRI.getType(Src);
4340
4341 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4342 ? AMDGPU::G_AMDGPU_FFBH_U32
4343 : AMDGPU::G_AMDGPU_FFBL_B32;
4344 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4345 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4346
4347 MI.eraseFromParent();
4348 return true;
4349}
4350
4351bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4352                                                  MachineRegisterInfo &MRI,
4353                                                  MachineIRBuilder &B) const {
4354 Register Dst = MI.getOperand(0).getReg();
4355 Register Src = MI.getOperand(1).getReg();
4356 LLT SrcTy = MRI.getType(Src);
4357 TypeSize NumBits = SrcTy.getSizeInBits();
4358
4359 assert(NumBits < 32u);
4360
4361 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4362 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4363 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4364 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4365 B.buildTrunc(Dst, Ctlz);
4366 MI.eraseFromParent();
4367 return true;
4368}
4369
4370// Check that this is a G_XOR x, -1
4371static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4372 if (MI.getOpcode() != TargetOpcode::G_XOR)
4373 return false;
4374 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4375 return ConstVal == -1;
4376}
4377
4378// Return the use branch instruction, otherwise null if the usage is invalid.
4379static MachineInstr *
4380verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4381                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4382 Register CondDef = MI.getOperand(0).getReg();
4383 if (!MRI.hasOneNonDBGUse(CondDef))
4384 return nullptr;
4385
4386 MachineBasicBlock *Parent = MI.getParent();
4387 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4388
4389 if (isNot(MRI, *UseMI)) {
4390 Register NegatedCond = UseMI->getOperand(0).getReg();
4391 if (!MRI.hasOneNonDBGUse(NegatedCond))
4392 return nullptr;
4393
4394 // We're deleting the def of this value, so we need to remove it.
4395 eraseInstr(*UseMI, MRI);
4396
4397 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4398 Negated = true;
4399 }
4400
4401 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4402 return nullptr;
4403
4404 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4405 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4406 if (Next == Parent->end()) {
4407 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4408 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4409 return nullptr;
4410 UncondBrTarget = &*NextMBB;
4411 } else {
4412 if (Next->getOpcode() != AMDGPU::G_BR)
4413 return nullptr;
4414 Br = &*Next;
4415 UncondBrTarget = Br->getOperand(0).getMBB();
4416 }
4417
4418 return UseMI;
4419}
4420
4421void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4422                                              MachineIRBuilder &B,
4423                                              const ArgDescriptor *Arg,
4424 const TargetRegisterClass *ArgRC,
4425 LLT ArgTy) const {
4426 MCRegister SrcReg = Arg->getRegister();
4427 assert(SrcReg.isPhysical() && "Physical register expected");
4428 assert(DstReg.isVirtual() && "Virtual register expected");
4429
4430 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4431 *ArgRC, B.getDebugLoc(), ArgTy);
4432 if (Arg->isMasked()) {
4433 // TODO: Should we try to emit this once in the entry block?
4434 const LLT S32 = LLT::scalar(32);
4435 const unsigned Mask = Arg->getMask();
4436 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4437
4438 Register AndMaskSrc = LiveIn;
4439
4440 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4441 // 0.
4442 if (Shift != 0) {
4443 auto ShiftAmt = B.buildConstant(S32, Shift);
4444 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4445 }
4446
4447 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4448 } else {
4449 B.buildCopy(DstReg, LiveIn);
4450 }
4451}
4452
4457 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4458 Register DstReg = MI.getOperand(0).getReg();
4459 if (!ST.hasClusters()) {
4460 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4461 return false;
4462 MI.eraseFromParent();
4463 return true;
4464 }
4465
4466 // Clusters are supported. Return the global position in the grid. If clusters
4467 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4468
4469 // WorkGroupIdXYZ = ClusterId == 0 ?
4470 // ClusterIdXYZ :
4471 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4472 MachineRegisterInfo &MRI = *B.getMRI();
4473 const LLT S32 = LLT::scalar(32);
4474 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4475 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4476 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4477 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4478 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4479 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4480 return false;
4481
4482 auto One = B.buildConstant(S32, 1);
4483 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4484 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4485 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4486
4487 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4488
4489 switch (MFI->getClusterDims().getKind()) {
4492 B.buildCopy(DstReg, GlobalIdXYZ);
4493 MI.eraseFromParent();
4494 return true;
4495 }
4497 B.buildCopy(DstReg, ClusterIdXYZ);
4498 MI.eraseFromParent();
4499 return true;
4500 }
4502 using namespace AMDGPU::Hwreg;
4503 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4504 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4505 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4506 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4507 .addDef(ClusterId)
4508 .addImm(ClusterIdField);
4509 auto Zero = B.buildConstant(S32, 0);
4510 auto NoClusters =
4511 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4512 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4513 MI.eraseFromParent();
4514 return true;
4515 }
4516 }
4517
4518 llvm_unreachable("nothing should reach here");
4519}
4520
4521bool AMDGPULegalizerInfo::loadInputValue(
4522                                         Register DstReg, MachineIRBuilder &B,
4523                                         AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4524 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4525 const ArgDescriptor *Arg = nullptr;
4526 const TargetRegisterClass *ArgRC;
4527 LLT ArgTy;
4528
4529 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4530 const ArgDescriptor WorkGroupIDX =
4531 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4532 // If GridZ is not programmed in an entry function then the hardware will set
4533 // it to all zeros, so there is no need to mask the GridY value in the low
4534 // order bits.
4535 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4536 AMDGPU::TTMP7,
4537 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4538 const ArgDescriptor WorkGroupIDZ =
4539 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4540 const ArgDescriptor ClusterWorkGroupIDX =
4541 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4542 const ArgDescriptor ClusterWorkGroupIDY =
4543 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4544 const ArgDescriptor ClusterWorkGroupIDZ =
4545 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4546 const ArgDescriptor ClusterWorkGroupMaxIDX =
4547 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4548 const ArgDescriptor ClusterWorkGroupMaxIDY =
4549 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4550 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4551 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4552 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4553 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4554
4555 auto LoadConstant = [&](unsigned N) {
4556 B.buildConstant(DstReg, N);
4557 return true;
4558 };
4559
4560 if (ST.hasArchitectedSGPRs() &&
4562 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4563 bool HasFixedDims = ClusterDims.isFixedDims();
4564
4565 switch (ArgType) {
4566    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4567      Arg = &WorkGroupIDX;
4568 ArgRC = &AMDGPU::SReg_32RegClass;
4569 ArgTy = LLT::scalar(32);
4570 break;
4571    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4572      Arg = &WorkGroupIDY;
4573 ArgRC = &AMDGPU::SReg_32RegClass;
4574 ArgTy = LLT::scalar(32);
4575 break;
4576    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4577      Arg = &WorkGroupIDZ;
4578 ArgRC = &AMDGPU::SReg_32RegClass;
4579 ArgTy = LLT::scalar(32);
4580 break;
4582 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4583 return LoadConstant(0);
4584 Arg = &ClusterWorkGroupIDX;
4585 ArgRC = &AMDGPU::SReg_32RegClass;
4586 ArgTy = LLT::scalar(32);
4587 break;
4589 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4590 return LoadConstant(0);
4591 Arg = &ClusterWorkGroupIDY;
4592 ArgRC = &AMDGPU::SReg_32RegClass;
4593 ArgTy = LLT::scalar(32);
4594 break;
4596 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4597 return LoadConstant(0);
4598 Arg = &ClusterWorkGroupIDZ;
4599 ArgRC = &AMDGPU::SReg_32RegClass;
4600 ArgTy = LLT::scalar(32);
4601 break;
4603 if (HasFixedDims)
4604 return LoadConstant(ClusterDims.getDims()[0] - 1);
4605 Arg = &ClusterWorkGroupMaxIDX;
4606 ArgRC = &AMDGPU::SReg_32RegClass;
4607 ArgTy = LLT::scalar(32);
4608 break;
4610 if (HasFixedDims)
4611 return LoadConstant(ClusterDims.getDims()[1] - 1);
4612 Arg = &ClusterWorkGroupMaxIDY;
4613 ArgRC = &AMDGPU::SReg_32RegClass;
4614 ArgTy = LLT::scalar(32);
4615 break;
4617 if (HasFixedDims)
4618 return LoadConstant(ClusterDims.getDims()[2] - 1);
4619 Arg = &ClusterWorkGroupMaxIDZ;
4620 ArgRC = &AMDGPU::SReg_32RegClass;
4621 ArgTy = LLT::scalar(32);
4622 break;
4624 Arg = &ClusterWorkGroupMaxFlatID;
4625 ArgRC = &AMDGPU::SReg_32RegClass;
4626 ArgTy = LLT::scalar(32);
4627 break;
4628 default:
4629 break;
4630 }
4631 }
4632
4633 if (!Arg)
4634 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4635
4636 if (!Arg) {
4637    if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4638      // The intrinsic may appear when we have a 0 sized kernarg segment, in
4639 // which case the pointer argument may be missing and we use null.
4640 return LoadConstant(0);
4641 }
4642
4643 // It's undefined behavior if a function marked with the amdgpu-no-*
4644 // attributes uses the corresponding intrinsic.
4645 B.buildUndef(DstReg);
4646 return true;
4647 }
4648
4649 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4650 return false; // TODO: Handle these
4651 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4652 return true;
4653}
4654
4655bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4656    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4657    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4658  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4659 return false;
4660
4661 MI.eraseFromParent();
4662 return true;
4663}
4664
4665static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4666                                int64_t C) {
4667 B.buildConstant(MI.getOperand(0).getReg(), C);
4668 MI.eraseFromParent();
4669 return true;
4670}
4671
4672bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4673    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4674    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4675 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4676 if (MaxID == 0)
4677 return replaceWithConstant(B, MI, 0);
4678
4679 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4680 const ArgDescriptor *Arg;
4681 const TargetRegisterClass *ArgRC;
4682 LLT ArgTy;
4683 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4684
4685 Register DstReg = MI.getOperand(0).getReg();
4686 if (!Arg) {
4687 // It's undefined behavior if a function marked with the amdgpu-no-*
4688 // attributes uses the corresponding intrinsic.
4689 B.buildUndef(DstReg);
4690 MI.eraseFromParent();
4691 return true;
4692 }
4693
4694 if (Arg->isMasked()) {
4695 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4696 // masking operations anyway.
4697 //
4698 // TODO: We could assert the top bit is 0 for the source copy.
4699 if (!loadInputValue(DstReg, B, ArgType))
4700 return false;
4701 } else {
4702 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4703 if (!loadInputValue(TmpReg, B, ArgType))
4704 return false;
4705 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4706 }
4707
4708 MI.eraseFromParent();
4709 return true;
4710}
4711
4712Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4713                                                     int64_t Offset) const {
4714  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4715  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4716
4717 // TODO: If we passed in the base kernel offset we could have a better
4718 // alignment than 4, but we don't really need it.
4719 if (!loadInputValue(KernArgReg, B,
4720                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4721    llvm_unreachable("failed to find kernarg segment ptr");
4722
4723 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4724 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
4725}
4726
4727/// Legalize a value that's loaded from kernel arguments. This is only used by
4728/// legacy intrinsics.
4729bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4730                                                      MachineIRBuilder &B,
4731                                                      uint64_t Offset,
4732                                                      Align Alignment) const {
4733 Register DstReg = MI.getOperand(0).getReg();
4734
4735 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4736 "unexpected kernarg parameter type");
4737
4738  Register Ptr = getKernargParameterPtr(B, Offset);
4739  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4740  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4741              MachineMemOperand::MODereferenceable |
4742                  MachineMemOperand::MOInvariant);
4743 MI.eraseFromParent();
4744 return true;
4745}
4746
4747bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4748                                       MachineRegisterInfo &MRI,
4749                                       MachineIRBuilder &B) const {
4750 Register Dst = MI.getOperand(0).getReg();
4751 LLT DstTy = MRI.getType(Dst);
4752 LLT S16 = LLT::scalar(16);
4753 LLT S32 = LLT::scalar(32);
4754 LLT S64 = LLT::scalar(64);
4755
4756 if (DstTy == S16)
4757 return legalizeFDIV16(MI, MRI, B);
4758 if (DstTy == S32)
4759 return legalizeFDIV32(MI, MRI, B);
4760 if (DstTy == S64)
4761 return legalizeFDIV64(MI, MRI, B);
4762
4763 return false;
4764}
4765
4766void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4767                                                        Register DstDivReg,
4768 Register DstRemReg,
4769 Register X,
4770 Register Y) const {
4771 const LLT S1 = LLT::scalar(1);
4772 const LLT S32 = LLT::scalar(32);
4773
4774 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4775 // algorithm used here.
4776
4777 // Initial estimate of inv(y).
4778 auto FloatY = B.buildUITOFP(S32, Y);
4779 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4780 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4781 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4782 auto Z = B.buildFPTOUI(S32, ScaledY);
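  // Scale (0x4f7ffffe) is the largest f32 strictly below 2**32 (2**32 - 512),
  // so Z holds an unsigned fixed-point estimate of 2**32 / Y.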
4783
4784 // One round of UNR.
4785 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4786 auto NegYZ = B.buildMul(S32, NegY, Z);
4787 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4788
4789 // Quotient/remainder estimate.
4790 auto Q = B.buildUMulH(S32, X, Z);
4791 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4792
4793 // First quotient/remainder refinement.
4794 auto One = B.buildConstant(S32, 1);
4795 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4796 if (DstDivReg)
4797 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4798 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4799
4800 // Second quotient/remainder refinement.
4801 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4802 if (DstDivReg)
4803 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4804
4805 if (DstRemReg)
4806 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4807}
4808
4809// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4810//
4811// Return lo, hi of result
4812//
4813// %cvt.lo = G_UITOFP Val.lo
4814// %cvt.hi = G_UITOFP Val.hi
4815// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4816// %rcp = G_AMDGPU_RCP_IFLAG %mad
4817// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4818// %mul2 = G_FMUL %mul1, 2**(-32)
4819// %trunc = G_INTRINSIC_TRUNC %mul2
4820// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4821// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
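// i.e. an initial 64-bit fixed-point estimate of 2**64 / Val, returned as
// {lo32, hi32} and refined further by the 64-bit division expansion below.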
4822static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4823 Register Val) {
4824 const LLT S32 = LLT::scalar(32);
4825 auto Unmerge = B.buildUnmerge(S32, Val);
4826
4827 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4828 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4829
4830 auto Mad = B.buildFMAD(
4831 S32, CvtHi, // 2**32
4832 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4833
4834 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4835 auto Mul1 = B.buildFMul(
4836 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4837
4838 // 2**(-32)
4839 auto Mul2 = B.buildFMul(
4840 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4841 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4842
4843 // -(2**32)
4844 auto Mad2 = B.buildFMAD(
4845 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4846 Mul1);
4847
4848 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4849 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4850
4851 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4852}
4853
4854void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4855                                                        Register DstDivReg,
4856 Register DstRemReg,
4857 Register Numer,
4858 Register Denom) const {
4859 const LLT S32 = LLT::scalar(32);
4860 const LLT S64 = LLT::scalar(64);
4861 const LLT S1 = LLT::scalar(1);
4862 Register RcpLo, RcpHi;
4863
4864 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4865
4866 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4867
4868 auto Zero64 = B.buildConstant(S64, 0);
4869 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4870
4871 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4872 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4873
4874 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4875 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4876 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4877
4878 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4879 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4880 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4881
4882 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4883 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4884 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4885 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4886 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4887
4888 auto Zero32 = B.buildConstant(S32, 0);
4889 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4890 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4891 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4892
4893 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4894 Register NumerLo = UnmergeNumer.getReg(0);
4895 Register NumerHi = UnmergeNumer.getReg(1);
4896
4897 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4898 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4899 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4900 Register Mul3_Lo = UnmergeMul3.getReg(0);
4901 Register Mul3_Hi = UnmergeMul3.getReg(1);
4902 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4903 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4904 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4905 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4906
4907 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4908 Register DenomLo = UnmergeDenom.getReg(0);
4909 Register DenomHi = UnmergeDenom.getReg(1);
4910
4911 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4912 auto C1 = B.buildSExt(S32, CmpHi);
4913
4914 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4915 auto C2 = B.buildSExt(S32, CmpLo);
4916
4917 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4918 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4919
4920 // TODO: Here and below portions of the code can be enclosed into if/endif.
4921 // Currently control flow is unconditional and we have 4 selects after
4922 // potential endif to substitute PHIs.
4923
4924 // if C3 != 0 ...
4925 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4926 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4927 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4928 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4929
4930 auto One64 = B.buildConstant(S64, 1);
4931 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4932
4933 auto C4 =
4934 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4935 auto C5 =
4936 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4937 auto C6 = B.buildSelect(
4938 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4939
4940 // if (C6 != 0)
4941 auto Add4 = B.buildAdd(S64, Add3, One64);
4942 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4943
4944 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4945 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4946 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4947
4948 // endif C6
4949 // endif C3
4950
4951 if (DstDivReg) {
4952 auto Sel1 = B.buildSelect(
4953 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4954 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4955 Sel1, MulHi3);
4956 }
4957
4958 if (DstRemReg) {
4959 auto Sel2 = B.buildSelect(
4960 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4961 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4962 Sel2, Sub1);
4963 }
4964}
4965
4966bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4967                                                  MachineRegisterInfo &MRI,
4968                                                  MachineIRBuilder &B) const {
4969 Register DstDivReg, DstRemReg;
4970 switch (MI.getOpcode()) {
4971 default:
4972 llvm_unreachable("Unexpected opcode!");
4973 case AMDGPU::G_UDIV: {
4974 DstDivReg = MI.getOperand(0).getReg();
4975 break;
4976 }
4977 case AMDGPU::G_UREM: {
4978 DstRemReg = MI.getOperand(0).getReg();
4979 break;
4980 }
4981 case AMDGPU::G_UDIVREM: {
4982 DstDivReg = MI.getOperand(0).getReg();
4983 DstRemReg = MI.getOperand(1).getReg();
4984 break;
4985 }
4986 }
4987
4988 const LLT S64 = LLT::scalar(64);
4989 const LLT S32 = LLT::scalar(32);
4990 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4991 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4992 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4993 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4994
4995 if (Ty == S32)
4996 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4997 else if (Ty == S64)
4998 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4999 else
5000 return false;
5001
5002 MI.eraseFromParent();
5003 return true;
5004}
5005
5006bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
5007                                                MachineRegisterInfo &MRI,
5008                                                MachineIRBuilder &B) const {
5009 const LLT S64 = LLT::scalar(64);
5010 const LLT S32 = LLT::scalar(32);
5011
5012 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5013 if (Ty != S32 && Ty != S64)
5014 return false;
5015
5016 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5017 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5018 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5019
5020 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5021 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5022 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5023
5024 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5025 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5026
5027 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5028 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5029
5030 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5031 switch (MI.getOpcode()) {
5032 default:
5033 llvm_unreachable("Unexpected opcode!");
5034 case AMDGPU::G_SDIV: {
5035 DstDivReg = MI.getOperand(0).getReg();
5036 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5037 break;
5038 }
5039 case AMDGPU::G_SREM: {
5040 DstRemReg = MI.getOperand(0).getReg();
5041 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5042 break;
5043 }
5044 case AMDGPU::G_SDIVREM: {
5045 DstDivReg = MI.getOperand(0).getReg();
5046 DstRemReg = MI.getOperand(1).getReg();
5047 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5048 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5049 break;
5050 }
5051 }
5052
5053 if (Ty == S32)
5054 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5055 else
5056 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5057
5058 if (DstDivReg) {
5059 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5060 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5061 B.buildSub(DstDivReg, SignXor, Sign);
5062 }
5063
5064 if (DstRemReg) {
5065 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5066 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5067 B.buildSub(DstRemReg, SignXor, Sign);
5068 }
5069
5070 MI.eraseFromParent();
5071 return true;
5072}
5073
5074bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
5075                                                 MachineRegisterInfo &MRI,
5076                                                 MachineIRBuilder &B) const {
5077 Register Res = MI.getOperand(0).getReg();
5078 Register LHS = MI.getOperand(1).getReg();
5079 Register RHS = MI.getOperand(2).getReg();
5080 uint16_t Flags = MI.getFlags();
5081 LLT ResTy = MRI.getType(Res);
5082
5083 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5084
5085 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5086 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5087 return false;
5088
5089 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5090 // the CI documentation has a worst case error of 1 ulp.
5091 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5092 // use it as long as we aren't trying to use denormals.
5093 //
5094 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5095
5096 // 1 / x -> RCP(x)
5097 if (CLHS->isExactlyValue(1.0)) {
5098 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5099 .addUse(RHS)
5100 .setMIFlags(Flags);
5101
5102 MI.eraseFromParent();
5103 return true;
5104 }
5105
5106 // -1 / x -> RCP( FNEG(x) )
5107 if (CLHS->isExactlyValue(-1.0)) {
5108 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5109 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5110 .addUse(FNeg.getReg(0))
5111 .setMIFlags(Flags);
5112
5113 MI.eraseFromParent();
5114 return true;
5115 }
5116 }
5117
5118 // For f16 require afn or arcp.
5119 // For f32 require afn.
5120 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5121 !MI.getFlag(MachineInstr::FmArcp)))
5122 return false;
5123
5124 // x / y -> x * (1.0 / y)
5125 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5126 .addUse(RHS)
5127 .setMIFlags(Flags);
5128 B.buildFMul(Res, LHS, RCP, Flags);
5129
5130 MI.eraseFromParent();
5131 return true;
5132}
5133
5134bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5135                                                   MachineRegisterInfo &MRI,
5136                                                   MachineIRBuilder &B) const {
5137 Register Res = MI.getOperand(0).getReg();
5138 Register X = MI.getOperand(1).getReg();
5139 Register Y = MI.getOperand(2).getReg();
5140 uint16_t Flags = MI.getFlags();
5141 LLT ResTy = MRI.getType(Res);
5142
5143 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5144
5145 if (!AllowInaccurateRcp)
5146 return false;
5147
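  // Newton-Raphson: each FMA pair below computes R' = R + R * (1 - Y * R),
  // roughly doubling the accurate bits of the reciprocal; the final FMA
  // corrects X * R by the residual X - Y * Ret.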
5148 auto NegY = B.buildFNeg(ResTy, Y);
5149 auto One = B.buildFConstant(ResTy, 1.0);
5150
5151 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5152 .addUse(Y)
5153 .setMIFlags(Flags);
5154
5155 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5156 R = B.buildFMA(ResTy, Tmp0, R, R);
5157
5158 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5159 R = B.buildFMA(ResTy, Tmp1, R, R);
5160
5161 auto Ret = B.buildFMul(ResTy, X, R);
5162 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5163
5164 B.buildFMA(Res, Tmp2, R, Ret);
5165 MI.eraseFromParent();
5166 return true;
5167}
5168
5169bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
5170                                         MachineRegisterInfo &MRI,
5171                                         MachineIRBuilder &B) const {
5172  if (legalizeFastUnsafeFDIV(MI, MRI, B))
5173    return true;
5174
5175 Register Res = MI.getOperand(0).getReg();
5176 Register LHS = MI.getOperand(1).getReg();
5177 Register RHS = MI.getOperand(2).getReg();
5178
5179 uint16_t Flags = MI.getFlags();
5180
5181 LLT S16 = LLT::scalar(16);
5182 LLT S32 = LLT::scalar(32);
5183
5184 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5185 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5186 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5187 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5188 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5189 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5190 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5191 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5192 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5193 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5194 // q16.u = opx(V_CVT_F16_F32, q32.u);
5195 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5196
5197 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5198 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5199 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5200 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5201 .addUse(RHSExt.getReg(0))
5202 .setMIFlags(Flags);
5203 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5204  MachineInstrBuilder Err;
5205  if (ST.hasMadMacF32Insts()) {
5206 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5207 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5208 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5209 } else {
5210 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5211 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5212 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5213 }
5214 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5215 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5216 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5217 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5218 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5219 .addUse(RDst.getReg(0))
5220 .addUse(RHS)
5221 .addUse(LHS)
5222 .setMIFlags(Flags);
5223
5224 MI.eraseFromParent();
5225 return true;
5226}
5227
5228static constexpr unsigned SPDenormModeBitField =
5229    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
5230
5231// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5232// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5233static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5234                               const GCNSubtarget &ST,
5235                               SIModeRegisterDefaults Mode) {
5236  // Set SP denorm mode to this value.
5237 unsigned SPDenormMode =
5238 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5239
5240 if (ST.hasDenormModeInst()) {
5241 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5242 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5243
5244 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
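    // S_DENORM_MODE's 4-bit immediate packs the FP32 denorm mode in bits [1:0]
    // and the FP64/FP16 mode in bits [3:2], hence the shift by 2.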
5245 B.buildInstr(AMDGPU::S_DENORM_MODE)
5246 .addImm(NewDenormModeValue);
5247
5248 } else {
5249 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5250 .addImm(SPDenormMode)
5251 .addImm(SPDenormModeBitField);
5252 }
5253}
5254
5255bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5256                                         MachineRegisterInfo &MRI,
5257                                         MachineIRBuilder &B) const {
5258  if (legalizeFastUnsafeFDIV(MI, MRI, B))
5259    return true;
5260
5261 Register Res = MI.getOperand(0).getReg();
5262 Register LHS = MI.getOperand(1).getReg();
5263 Register RHS = MI.getOperand(2).getReg();
5264 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5265 SIModeRegisterDefaults Mode = MFI->getMode();
5266
5267 uint16_t Flags = MI.getFlags();
5268
5269 LLT S32 = LLT::scalar(32);
5270 LLT S1 = LLT::scalar(1);
5271
5272 auto One = B.buildFConstant(S32, 1.0f);
5273
5274 auto DenominatorScaled =
5275 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5276 .addUse(LHS)
5277 .addUse(RHS)
5278 .addImm(0)
5279 .setMIFlags(Flags);
5280 auto NumeratorScaled =
5281 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5282 .addUse(LHS)
5283 .addUse(RHS)
5284 .addImm(1)
5285 .setMIFlags(Flags);
5286
5287 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5288 .addUse(DenominatorScaled.getReg(0))
5289 .setMIFlags(Flags);
5290 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5291
5292 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5293 const bool HasDynamicDenormals =
5294 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5295 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5296
5297 Register SavedSPDenormMode;
5298 if (!PreservesDenormals) {
5299 if (HasDynamicDenormals) {
5300 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5301 B.buildInstr(AMDGPU::S_GETREG_B32)
5302 .addDef(SavedSPDenormMode)
5303 .addImm(SPDenormModeBitField);
5304 }
5305 toggleSPDenormMode(true, B, ST, Mode);
5306 }
5307
5308 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5309 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5310 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5311 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5312 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5313 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5314
5315 if (!PreservesDenormals) {
5316 if (HasDynamicDenormals) {
5317 assert(SavedSPDenormMode);
5318 B.buildInstr(AMDGPU::S_SETREG_B32)
5319 .addReg(SavedSPDenormMode)
5320 .addImm(SPDenormModeBitField);
5321 } else
5322 toggleSPDenormMode(false, B, ST, Mode);
5323 }
5324
5325 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5326 .addUse(Fma4.getReg(0))
5327 .addUse(Fma1.getReg(0))
5328 .addUse(Fma3.getReg(0))
5329 .addUse(NumeratorScaled.getReg(1))
5330 .setMIFlags(Flags);
5331
5332 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5333 .addUse(Fmas.getReg(0))
5334 .addUse(RHS)
5335 .addUse(LHS)
5336 .setMIFlags(Flags);
5337
5338 MI.eraseFromParent();
5339 return true;
5340}
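// A sketch of the f32 path above: div_scale pre-scales the numerator and
// denominator so the reciprocal stays in range, Fma0/Fma1 refine the
// approximate reciprocal (folding 1 - d*r back into r), Fma2..Fma4 refine the
// quotient through the residual n - d*q, div_fmas applies the last correction
// using the scale flag produced by div_scale, and div_fixup restores the
// correct result for infinities, zeros and NaNs.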
5341
5342 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5343 MachineRegisterInfo &MRI,
5344 MachineIRBuilder &B) const {
5345 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5346 return true;
5347
5348 Register Res = MI.getOperand(0).getReg();
5349 Register LHS = MI.getOperand(1).getReg();
5350 Register RHS = MI.getOperand(2).getReg();
5351
5352 uint16_t Flags = MI.getFlags();
5353
5354 LLT S64 = LLT::scalar(64);
5355 LLT S1 = LLT::scalar(1);
5356
5357 auto One = B.buildFConstant(S64, 1.0);
5358
5359 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5360 .addUse(LHS)
5361 .addUse(RHS)
5362 .addImm(0)
5363 .setMIFlags(Flags);
5364
5365 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5366
5367 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5368 .addUse(DivScale0.getReg(0))
5369 .setMIFlags(Flags);
5370
5371 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5372 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5373 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5374
5375 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5376 .addUse(LHS)
5377 .addUse(RHS)
5378 .addImm(1)
5379 .setMIFlags(Flags);
5380
5381 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5382 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5383 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5384
5385 Register Scale;
5386 if (!ST.hasUsableDivScaleConditionOutput()) {
5387 // Workaround a hardware bug on SI where the condition output from div_scale
5388 // is not usable.
5389
5390 LLT S32 = LLT::scalar(32);
5391
5392 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5393 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5394 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5395 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5396
5397 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5398 Scale1Unmerge.getReg(1));
5399 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5400 Scale0Unmerge.getReg(1));
5401 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5402 } else {
5403 Scale = DivScale1.getReg(1);
5404 }
5405
5406 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5407 .addUse(Fma4.getReg(0))
5408 .addUse(Fma3.getReg(0))
5409 .addUse(Mul.getReg(0))
5410 .addUse(Scale)
5411 .setMIFlags(Flags);
5412
5413 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5414 .addUse(Fmas.getReg(0))
5415 .addUse(RHS)
5416 .addUse(LHS)
5417 .setMIFlags(Flags);
5418
5419 MI.eraseFromParent();
5420 return true;
5421}
5422
5423 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5424 MachineRegisterInfo &MRI,
5425 MachineIRBuilder &B) const {
5426 Register Res0 = MI.getOperand(0).getReg();
5427 Register Res1 = MI.getOperand(1).getReg();
5428 Register Val = MI.getOperand(2).getReg();
5429 uint16_t Flags = MI.getFlags();
5430
5431 LLT Ty = MRI.getType(Res0);
5432 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5433
5434 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5435 .addUse(Val)
5436 .setMIFlags(Flags);
5437 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5438 .addUse(Val)
5439 .setMIFlags(Flags);
5440
5441 if (ST.hasFractBug()) {
5442 auto Fabs = B.buildFAbs(Ty, Val);
5443 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5444 auto IsFinite =
5445 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5446 auto Zero = B.buildConstant(InstrExpTy, 0);
5447 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5448 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5449 }
5450
5451 B.buildCopy(Res0, Mant);
5452 B.buildSExtOrTrunc(Res1, Exp);
5453
5454 MI.eraseFromParent();
5455 return true;
5456}
5457
5458 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5459 MachineRegisterInfo &MRI,
5460 MachineIRBuilder &B) const {
5461 Register Res = MI.getOperand(0).getReg();
5462 Register LHS = MI.getOperand(2).getReg();
5463 Register RHS = MI.getOperand(3).getReg();
5464 uint16_t Flags = MI.getFlags();
5465
5466 LLT S32 = LLT::scalar(32);
5467 LLT S1 = LLT::scalar(1);
5468
5469 auto Abs = B.buildFAbs(S32, RHS, Flags);
5470 const APFloat C0Val(1.0f);
5471
5472 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5473 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5474 auto C2 = B.buildFConstant(S32, 1.0f);
5475
5476 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5477 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5478
5479 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5480
5481 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5482 .addUse(Mul0.getReg(0))
5483 .setMIFlags(Flags);
5484
5485 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5486
5487 B.buildFMul(Res, Sel, Mul1, Flags);
5488
5489 MI.eraseFromParent();
5490 return true;
5491}
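// Scaling note: when |rhs| > 0x1p+96 the denominator is pre-multiplied by
// 0x1p-32 so that the rcp of a very large value does not underflow into the
// denormal range; the quotient is then multiplied by the same selected factor,
// so Sel * (lhs * rcp(rhs * Sel)) ~= lhs / rhs for Sel in {1.0, 0x1p-32}.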
5492
5493 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5494 MachineRegisterInfo &MRI,
5495 MachineIRBuilder &B) const {
5496 // Bypass the correct expansion that a standard promotion through G_FSQRT
5497 // would get. The f32 op is accurate enough for the f16 case.
5498 unsigned Flags = MI.getFlags();
5499 assert(!ST.has16BitInsts());
5500 const LLT F32 = LLT::scalar(32);
5501 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5502 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5503 .addUse(Ext.getReg(0))
5504 .setMIFlags(Flags);
5505 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5506 MI.eraseFromParent();
5507 return true;
5508}
5509
5510 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5511 MachineRegisterInfo &MRI,
5512 MachineIRBuilder &B) const {
5513 MachineFunction &MF = B.getMF();
5514 Register Dst = MI.getOperand(0).getReg();
5515 Register X = MI.getOperand(1).getReg();
5516 const unsigned Flags = MI.getFlags();
5517 const LLT S1 = LLT::scalar(1);
5518 const LLT F32 = LLT::scalar(32);
5519 const LLT I32 = LLT::scalar(32);
5520
5521 if (allowApproxFunc(MF, Flags)) {
5522 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5523 .addUse(X)
5524 .setMIFlags(Flags);
5525 MI.eraseFromParent();
5526 return true;
5527 }
5528
5529 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5530 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5531 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5532 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5533 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5534
5535 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5536 if (needsDenormHandlingF32(MF, X, Flags)) {
5537 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5538 .addUse(SqrtX.getReg(0))
5539 .setMIFlags(Flags);
5540
5541 auto NegOne = B.buildConstant(I32, -1);
5542 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5543
5544 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5545 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5546
5547 auto PosOne = B.buildConstant(I32, 1);
5548 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5549
5550 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5551 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5552
5553 auto Zero = B.buildFConstant(F32, 0.0f);
5554 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5555
5556 SqrtS =
5557 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5558
5559 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5560 SqrtS =
5561 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5562 } else {
5563 auto SqrtR =
5564 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5565 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5566
5567 auto Half = B.buildFConstant(F32, 0.5f);
5568 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5569 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5570 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5571 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5572 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5573 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5574 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5575 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5576 }
5577
5578 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5579
5580 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5581
5582 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5583
5584 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5585 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5586
5587 MI.eraseFromParent();
5588 return true;
5589}
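// Scaling note: inputs below 0x1.0p-96f are multiplied by 0x1.0p+32f before the
// sqrt expansion, and since sqrt(0x1.0p+32 * x) == 0x1.0p+16 * sqrt(x), the
// result is compensated by the 0x1.0p-16f factor selected above.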
5590
5591 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5592 MachineRegisterInfo &MRI,
5593 MachineIRBuilder &B) const {
5594 // For the f64 type, the SQRT and RSQ instructions don't have the required
5595 // precision, so we apply Goldschmidt's algorithm to improve the result:
5596 //
5597 // y0 = rsq(x)
5598 // g0 = x * y0
5599 // h0 = 0.5 * y0
5600 //
5601 // r0 = 0.5 - h0 * g0
5602 // g1 = g0 * r0 + g0
5603 // h1 = h0 * r0 + h0
5604 //
5605 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5606 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5607 // h2 = h1 * r1 + h1
5608 //
5609 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5610 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5611 //
5612 // sqrt(x) = g3
5613
5614 const LLT S1 = LLT::scalar(1);
5615 const LLT S32 = LLT::scalar(32);
5616 const LLT F64 = LLT::scalar(64);
5617
5618 Register Dst = MI.getOperand(0).getReg();
5619 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5620
5621 Register X = MI.getOperand(1).getReg();
5622 unsigned Flags = MI.getFlags();
5623
5624 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5625
5626 auto ZeroInt = B.buildConstant(S32, 0);
5627 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5628
5629 // Scale up input if it is too small.
5630 auto ScaleUpFactor = B.buildConstant(S32, 256);
5631 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5632 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5633
5634 auto SqrtY =
5635 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5636
5637 auto Half = B.buildFConstant(F64, 0.5);
5638 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5639 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5640
5641 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5642 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5643
5644 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5645 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5646
5647 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5648 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5649
5650 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5651
5652 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5653 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5654
5655 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5656
5657 // Scale down the result.
5658 auto ScaleDownFactor = B.buildConstant(S32, -128);
5659 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5660 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5661
5662 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5663 // with finite only or nsz because rsq(+/-0) = +/-inf
5664
5665 // TODO: Check for DAZ and expand to subnormals
5666 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5667
5668 // If x is +INF, +0, or -0, use its original value
5669 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5670
5671 MI.eraseFromParent();
5672 return true;
5673}
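// Scaling note: inputs below 0x1.0p-767 are scaled by ldexp(x, 256) before the
// Goldschmidt iteration, and since sqrt(2^256 * x) == 2^128 * sqrt(x), the
// result is compensated with ldexp(result, -128) when scaling was applied.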
5674
5675 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5676 MachineRegisterInfo &MRI,
5677 MachineIRBuilder &B) const {
5678 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5679 if (Ty == LLT::scalar(32))
5680 return legalizeFSQRTF32(MI, MRI, B);
5681 if (Ty == LLT::scalar(64))
5682 return legalizeFSQRTF64(MI, MRI, B);
5683 if (Ty == LLT::scalar(16))
5684 return legalizeFSQRTF16(MI, MRI, B);
5685 return false;
5686}
5687
5688// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5689// FIXME: Why do we handle this one but not other removed instructions?
5690//
5691// Reciprocal square root. The clamp prevents infinite results, clamping
5692// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5693// +-max_float.
5694 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5695 MachineRegisterInfo &MRI,
5696 MachineIRBuilder &B) const {
5697 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5698 return true;
5699
5700 Register Dst = MI.getOperand(0).getReg();
5701 Register Src = MI.getOperand(2).getReg();
5702 auto Flags = MI.getFlags();
5703
5704 LLT Ty = MRI.getType(Dst);
5705
5706 const fltSemantics *FltSemantics;
5707 if (Ty == LLT::scalar(32))
5708 FltSemantics = &APFloat::IEEEsingle();
5709 else if (Ty == LLT::scalar(64))
5710 FltSemantics = &APFloat::IEEEdouble();
5711 else
5712 return false;
5713
5714 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5715 .addUse(Src)
5716 .setMIFlags(Flags);
5717
5718 // We don't need to concern ourselves with the snan handling difference, since
5719 // the rsq has already quieted the snan (or not); use whichever form will directly select.
5720 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5721 const bool UseIEEE = MFI->getMode().IEEE;
5722
5723 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5724 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5725 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5726
5727 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5728
5729 if (UseIEEE)
5730 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5731 else
5732 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5733 MI.eraseFromParent();
5734 return true;
5735}
5736
5737// TODO: Fix pointer type handling
5738 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5739 MachineInstr &MI,
5740 Intrinsic::ID IID) const {
5741
5742 MachineIRBuilder &B = Helper.MIRBuilder;
5743 MachineRegisterInfo &MRI = *B.getMRI();
5744
5745 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5746 IID == Intrinsic::amdgcn_permlanex16;
5747 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5748 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5749
5750 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5751 Register Src2, LLT VT) -> Register {
5752 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5753 switch (IID) {
5754 case Intrinsic::amdgcn_readfirstlane:
5755 case Intrinsic::amdgcn_permlane64:
5756 return LaneOp.getReg(0);
5757 case Intrinsic::amdgcn_readlane:
5758 case Intrinsic::amdgcn_set_inactive:
5759 case Intrinsic::amdgcn_set_inactive_chain_arg:
5760 return LaneOp.addUse(Src1).getReg(0);
5761 case Intrinsic::amdgcn_writelane:
5762 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5763 case Intrinsic::amdgcn_permlane16:
5764 case Intrinsic::amdgcn_permlanex16: {
5765 Register Src3 = MI.getOperand(5).getReg();
5766 int64_t Src4 = MI.getOperand(6).getImm();
5767 int64_t Src5 = MI.getOperand(7).getImm();
5768 return LaneOp.addUse(Src1)
5769 .addUse(Src2)
5770 .addUse(Src3)
5771 .addImm(Src4)
5772 .addImm(Src5)
5773 .getReg(0);
5774 }
5775 case Intrinsic::amdgcn_mov_dpp8:
5776 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5777 case Intrinsic::amdgcn_update_dpp:
5778 return LaneOp.addUse(Src1)
5779 .addImm(MI.getOperand(4).getImm())
5780 .addImm(MI.getOperand(5).getImm())
5781 .addImm(MI.getOperand(6).getImm())
5782 .addImm(MI.getOperand(7).getImm())
5783 .getReg(0);
5784 default:
5785 llvm_unreachable("unhandled lane op");
5786 }
5787 };
5788
5789 Register DstReg = MI.getOperand(0).getReg();
5790 Register Src0 = MI.getOperand(2).getReg();
5791 Register Src1, Src2;
5792 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5793 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5794 Src1 = MI.getOperand(3).getReg();
5795 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5796 Src2 = MI.getOperand(4).getReg();
5797 }
5798 }
5799
5800 LLT Ty = MRI.getType(DstReg);
5801 unsigned Size = Ty.getSizeInBits();
5802
5803 unsigned SplitSize = 32;
5804 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5805 ST.hasDPALU_DPP() &&
5806 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
5807 SplitSize = 64;
5808
5809 if (Size == SplitSize) {
5810 // Already legal
5811 return true;
5812 }
5813
5814 if (Size < 32) {
5815 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5816
5817 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5818 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5819
5820 if (IID == Intrinsic::amdgcn_writelane)
5821 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5822
5823 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5824 B.buildTrunc(DstReg, LaneOpDst);
5825 MI.eraseFromParent();
5826 return true;
5827 }
5828
5829 if (Size % SplitSize != 0)
5830 return false;
5831
5832 LLT PartialResTy = LLT::scalar(SplitSize);
5833 bool NeedsBitcast = false;
5834 if (Ty.isVector()) {
5835 LLT EltTy = Ty.getElementType();
5836 unsigned EltSize = EltTy.getSizeInBits();
5837 if (EltSize == SplitSize) {
5838 PartialResTy = EltTy;
5839 } else if (EltSize == 16 || EltSize == 32) {
5840 unsigned NElem = SplitSize / EltSize;
5841 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5842 } else {
5843 // Handle all other cases via S32/S64 pieces
5844 NeedsBitcast = true;
5845 }
5846 }
5847
5848 SmallVector<Register, 4> PartialRes;
5849 unsigned NumParts = Size / SplitSize;
5850 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5851 MachineInstrBuilder Src1Parts, Src2Parts;
5852
5853 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5854 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5855
5856 if (IID == Intrinsic::amdgcn_writelane)
5857 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5858
5859 for (unsigned i = 0; i < NumParts; ++i) {
5860 Src0 = Src0Parts.getReg(i);
5861
5862 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5863 Src1 = Src1Parts.getReg(i);
5864
5865 if (IID == Intrinsic::amdgcn_writelane)
5866 Src2 = Src2Parts.getReg(i);
5867
5868 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5869 }
5870
5871 if (NeedsBitcast)
5872 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
5873 LLT::scalar(Ty.getSizeInBits()), PartialRes));
5874 else
5875 B.buildMergeLikeInstr(DstReg, PartialRes);
5876
5877 MI.eraseFromParent();
5878 return true;
5879}
5880
5881 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5882 MachineRegisterInfo &MRI,
5883 MachineIRBuilder &B) const {
5884 uint64_t Offset =
5885 ST.getTargetLowering()->getImplicitParameterOffset(
5886 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5887 LLT DstTy = MRI.getType(DstReg);
5888 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5889
5890 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5891 if (!loadInputValue(KernargPtrReg, B,
5892 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5893 return false;
5894
5895 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
5896 B.buildConstant(IdxTy, Offset).getReg(0));
5897 return true;
5898}
5899
5900/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5901/// bits of the pointer and replace them with the stride argument, then
5902/// merge_values everything together. In the common case of a raw buffer (the
5903 /// stride component is 0), we can just AND off the upper half.
5904 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5905 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5906 Register Result = MI.getOperand(0).getReg();
5907 Register Pointer = MI.getOperand(2).getReg();
5908 Register Stride = MI.getOperand(3).getReg();
5909 Register NumRecords = MI.getOperand(4).getReg();
5910 Register Flags = MI.getOperand(5).getReg();
5911
5912 LLT S32 = LLT::scalar(32);
5913 LLT S64 = LLT::scalar(64);
5914
5915 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5916
5917 auto ExtStride = B.buildAnyExt(S32, Stride);
5918
5919 if (ST.has45BitNumRecordsBufferResource()) {
5920 Register Zero = B.buildConstant(S32, 0).getReg(0);
5921 // Build the lower 64-bit value, which holds the 57-bit base and the low 7 bits
5922 // of num_records.
5923 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
5924 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
5925 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
5926 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
5927 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
5928
5929 // Build the higher 64-bit value, which holds the upper 38 bits of num_records,
5930 // 6 zero bits (omitted), the 16-bit stride and scale, and the 4-bit flags.
5931 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
5932 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
5933 auto ExtShiftedStride =
5934 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
5935 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
5936 auto ExtShiftedFlags =
5937 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
5938 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
5939 Register HighHalf =
5940 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
5941 B.buildMergeValues(Result, {LowHalf, HighHalf});
5942 } else {
5943 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
5944 auto Unmerge = B.buildUnmerge(S32, Pointer);
5945 auto LowHalf = Unmerge.getReg(0);
5946 auto HighHalf = Unmerge.getReg(1);
5947
5948 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5949 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5950 auto ShiftConst = B.buildConstant(S32, 16);
5951 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5952 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5953 Register NewHighHalfReg = NewHighHalf.getReg(0);
5954 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5955 }
5956
5957 MI.eraseFromParent();
5958 return true;
5959}
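// Illustrative layout for the non-45-bit-num_records path above: the resulting
// descriptor dwords are { base[31:0], (base[47:32] & 0xffff) | (stride << 16),
// num_records, flags }, i.e. only the low 16 bits of the pointer's high half
// are kept and the stride lands in the upper 16 bits of the second dword.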
5960
5961 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5962 MachineRegisterInfo &MRI,
5963 MachineIRBuilder &B) const {
5964 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5965 if (!MFI->isEntryFunction()) {
5966 return legalizePreloadedArgIntrin(MI, MRI, B,
5967 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5968 }
5969
5970 Register DstReg = MI.getOperand(0).getReg();
5971 if (!getImplicitArgPtr(DstReg, MRI, B))
5972 return false;
5973
5974 MI.eraseFromParent();
5975 return true;
5976}
5977
5978 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5979 MachineRegisterInfo &MRI,
5980 MachineIRBuilder &B) const {
5981 Function &F = B.getMF().getFunction();
5982 std::optional<uint32_t> KnownSize =
5983 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5984 if (KnownSize.has_value())
5985 B.buildConstant(DstReg, *KnownSize);
5986 return false;
5987}
5988
5989 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5990 MachineRegisterInfo &MRI,
5991 MachineIRBuilder &B) const {
5992
5993 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5994 if (!MFI->isEntryFunction()) {
5995 return legalizePreloadedArgIntrin(MI, MRI, B,
5996 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5997 }
5998
5999 Register DstReg = MI.getOperand(0).getReg();
6000 if (!getLDSKernelId(DstReg, MRI, B))
6001 return false;
6002
6003 MI.eraseFromParent();
6004 return true;
6005}
6006
6007 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
6008 MachineRegisterInfo &MRI,
6009 MachineIRBuilder &B,
6010 unsigned AddrSpace) const {
6011 const LLT S32 = LLT::scalar(32);
6012 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6013 Register Hi32 = Unmerge.getReg(1);
6014
6015 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6016 ST.hasGloballyAddressableScratch()) {
6017 Register FlatScratchBaseHi =
6018 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6019 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6020 .getReg(0);
6021 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6022 // Test bits 63..58 against the aperture address.
6023 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6024 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6025 B.buildConstant(S32, 1u << 26));
6026 } else {
6027 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6028 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6029 }
6030 MI.eraseFromParent();
6031 return true;
6032}
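// Note on the globally-addressable-scratch path above: private addresses share
// their top 6 bits (63..58) with SRC_FLAT_SCRATCH_BASE_HI, so XOR-ing the high
// dword with that base and checking ult (1 << 26) is equivalent to testing that
// those 6 bits match.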
6033
6034// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6035// offset (the offset that is included in bounds checking and swizzling, to be
6036// split between the instruction's voffset and immoffset fields) and soffset
6037// (the offset that is excluded from bounds checking and swizzling, to go in
6038// the instruction's soffset field). This function takes the first kind of
6039// offset and figures out how to split it between voffset and immoffset.
6040std::pair<Register, unsigned>
6041 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
6042 Register OrigOffset) const {
6043 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6044 Register BaseReg;
6045 unsigned ImmOffset;
6046 const LLT S32 = LLT::scalar(32);
6047 MachineRegisterInfo &MRI = *B.getMRI();
6048
6049 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6050 // being added, so we can only safely match a 32-bit addition with no unsigned
6051 // overflow.
6052 bool CheckNUW = AMDGPU::isGFX1250(ST);
6053 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6054 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6055
6056 // If BaseReg is a pointer, convert it to int.
6057 if (MRI.getType(BaseReg).isPointer())
6058 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6059
6060 // If the immediate value is too big for the immoffset field, put only bits
6061 // that would normally fit in the immoffset field. The remaining value that
6062 // is copied/added for the voffset field is a large power of 2, and it
6063 // stands more chance of being CSEd with the copy/add for another similar
6064 // load/store.
6065 // However, do not do that rounding down if that is a negative
6066 // number, as it appears to be illegal to have a negative offset in the
6067 // vgpr, even if adding the immediate offset makes it positive.
6068 unsigned Overflow = ImmOffset & ~MaxImm;
6069 ImmOffset -= Overflow;
6070 if ((int32_t)Overflow < 0) {
6071 Overflow += ImmOffset;
6072 ImmOffset = 0;
6073 }
6074
6075 if (Overflow != 0) {
6076 if (!BaseReg) {
6077 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6078 } else {
6079 auto OverflowVal = B.buildConstant(S32, Overflow);
6080 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6081 }
6082 }
6083
6084 if (!BaseReg)
6085 BaseReg = B.buildConstant(S32, 0).getReg(0);
6086
6087 return std::pair(BaseReg, ImmOffset);
6088}
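// Worked example (assuming MaxImm == 4095): a constant offset of 4100 splits as
// Overflow = 4100 & ~4095 = 4096 and ImmOffset = 4, so 4096 is materialized or
// added into the voffset register and 4 goes into the instruction's immediate
// offset field.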
6089
6090/// Handle register layout difference for f16 images for some subtargets.
6091 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
6092 MachineRegisterInfo &MRI,
6093 Register Reg,
6094 bool ImageStore) const {
6095 const LLT S16 = LLT::scalar(16);
6096 const LLT S32 = LLT::scalar(32);
6097 LLT StoreVT = MRI.getType(Reg);
6098 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6099
6100 if (ST.hasUnpackedD16VMem()) {
6101 auto Unmerge = B.buildUnmerge(S16, Reg);
6102
6103 SmallVector<Register, 4> WideRegs;
6104 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6105 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6106
6107 int NumElts = StoreVT.getNumElements();
6108
6109 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6110 .getReg(0);
6111 }
6112
6113 if (ImageStore && ST.hasImageStoreD16Bug()) {
6114 if (StoreVT.getNumElements() == 2) {
6115 SmallVector<Register, 4> PackedRegs;
6116 Reg = B.buildBitcast(S32, Reg).getReg(0);
6117 PackedRegs.push_back(Reg);
6118 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6119 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6120 .getReg(0);
6121 }
6122
6123 if (StoreVT.getNumElements() == 3) {
6124 SmallVector<Register, 4> PackedRegs;
6125 auto Unmerge = B.buildUnmerge(S16, Reg);
6126 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6127 PackedRegs.push_back(Unmerge.getReg(I));
6128 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6129 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6130 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6131 }
6132
6133 if (StoreVT.getNumElements() == 4) {
6134 SmallVector<Register, 4> PackedRegs;
6135 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6136 auto Unmerge = B.buildUnmerge(S32, Reg);
6137 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6138 PackedRegs.push_back(Unmerge.getReg(I));
6139 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6140 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6141 .getReg(0);
6142 }
6143
6144 llvm_unreachable("invalid data type");
6145 }
6146
6147 if (StoreVT == LLT::fixed_vector(3, S16)) {
6148 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6149 .getReg(0);
6150 }
6151 return Reg;
6152}
6153
6154 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6155 Register VData, LLT MemTy,
6156 bool IsFormat) const {
6157 MachineRegisterInfo *MRI = B.getMRI();
6158 LLT Ty = MRI->getType(VData);
6159
6160 const LLT S16 = LLT::scalar(16);
6161
6162 // Fix up buffer resources that themselves need to be cast to v4i32.
6163 if (hasBufferRsrcWorkaround(Ty))
6164 return castBufferRsrcToV4I32(VData, B);
6165
6166 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6167 Ty = getBitcastRegisterType(Ty);
6168 VData = B.buildBitcast(Ty, VData).getReg(0);
6169 }
6170 // Fixup illegal register types for i8 stores.
6171 if (Ty == LLT::scalar(8) || Ty == S16) {
6172 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6173 return AnyExt;
6174 }
6175
6176 if (Ty.isVector()) {
6177 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6178 if (IsFormat)
6179 return handleD16VData(B, *MRI, VData);
6180 }
6181 }
6182
6183 return VData;
6184}
6185
6186 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
6187 LegalizerHelper &Helper,
6188 bool IsTyped,
6189 bool IsFormat) const {
6190 MachineIRBuilder &B = Helper.MIRBuilder;
6191 MachineRegisterInfo &MRI = *B.getMRI();
6192
6193 Register VData = MI.getOperand(1).getReg();
6194 LLT Ty = MRI.getType(VData);
6195 LLT EltTy = Ty.getScalarType();
6196 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6197 const LLT S32 = LLT::scalar(32);
6198
6199 MachineMemOperand *MMO = *MI.memoperands_begin();
6200 const int MemSize = MMO->getSize().getValue();
6201 LLT MemTy = MMO->getMemoryType();
6202
6203 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6204
6206 Register RSrc = MI.getOperand(2).getReg();
6207
6208 unsigned ImmOffset;
6209
6210 // The typed intrinsics add an immediate after the registers.
6211 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6212
6213 // The struct intrinsic variants add one additional operand over raw.
6214 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6215 Register VIndex;
6216 int OpOffset = 0;
6217 if (HasVIndex) {
6218 VIndex = MI.getOperand(3).getReg();
6219 OpOffset = 1;
6220 } else {
6221 VIndex = B.buildConstant(S32, 0).getReg(0);
6222 }
6223
6224 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6225 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6226
6227 unsigned Format = 0;
6228 if (IsTyped) {
6229 Format = MI.getOperand(5 + OpOffset).getImm();
6230 ++OpOffset;
6231 }
6232
6233 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6234
6235 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6236
6237 unsigned Opc;
6238 if (IsTyped) {
6239 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6240 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6241 } else if (IsFormat) {
6242 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6243 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6244 } else {
6245 switch (MemSize) {
6246 case 1:
6247 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6248 break;
6249 case 2:
6250 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6251 break;
6252 default:
6253 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6254 break;
6255 }
6256 }
6257
6258 auto MIB = B.buildInstr(Opc)
6259 .addUse(VData) // vdata
6260 .addUse(RSrc) // rsrc
6261 .addUse(VIndex) // vindex
6262 .addUse(VOffset) // voffset
6263 .addUse(SOffset) // soffset
6264 .addImm(ImmOffset); // offset(imm)
6265
6266 if (IsTyped)
6267 MIB.addImm(Format);
6268
6269 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6270 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6271 .addMemOperand(MMO);
6272
6273 MI.eraseFromParent();
6274 return true;
6275}
6276
6277static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6278 Register VIndex, Register VOffset, Register SOffset,
6279 unsigned ImmOffset, unsigned Format,
6280 unsigned AuxiliaryData, MachineMemOperand *MMO,
6281 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6282 auto MIB = B.buildInstr(Opc)
6283 .addDef(LoadDstReg) // vdata
6284 .addUse(RSrc) // rsrc
6285 .addUse(VIndex) // vindex
6286 .addUse(VOffset) // voffset
6287 .addUse(SOffset) // soffset
6288 .addImm(ImmOffset); // offset(imm)
6289
6290 if (IsTyped)
6291 MIB.addImm(Format);
6292
6293 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6294 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6295 .addMemOperand(MMO);
6296}
6297
6298 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
6299 LegalizerHelper &Helper,
6300 bool IsFormat,
6301 bool IsTyped) const {
6302 MachineIRBuilder &B = Helper.MIRBuilder;
6303 MachineRegisterInfo &MRI = *B.getMRI();
6304 GISelChangeObserver &Observer = Helper.Observer;
6305
6306 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6307 MachineMemOperand *MMO = *MI.memoperands_begin();
6308 const LLT MemTy = MMO->getMemoryType();
6309 const LLT S32 = LLT::scalar(32);
6310
6311 Register Dst = MI.getOperand(0).getReg();
6312
6313 Register StatusDst;
6314 int OpOffset = 0;
6315 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6316 bool IsTFE = MI.getNumExplicitDefs() == 2;
6317 if (IsTFE) {
6318 StatusDst = MI.getOperand(1).getReg();
6319 ++OpOffset;
6320 }
6321
6322 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6323 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6324
6325 // The typed intrinsics add an immediate after the registers.
6326 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6327
6328 // The struct intrinsic variants add one additional operand over raw.
6329 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6330 Register VIndex;
6331 if (HasVIndex) {
6332 VIndex = MI.getOperand(3 + OpOffset).getReg();
6333 ++OpOffset;
6334 } else {
6335 VIndex = B.buildConstant(S32, 0).getReg(0);
6336 }
6337
6338 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6339 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6340
6341 unsigned Format = 0;
6342 if (IsTyped) {
6343 Format = MI.getOperand(5 + OpOffset).getImm();
6344 ++OpOffset;
6345 }
6346
6347 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6348 unsigned ImmOffset;
6349
6350 LLT Ty = MRI.getType(Dst);
6351 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6352 // logic doesn't have to handle that case.
6353 if (hasBufferRsrcWorkaround(Ty)) {
6354 Observer.changingInstr(MI);
6355 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6356 Observer.changedInstr(MI);
6357 Dst = MI.getOperand(0).getReg();
6358 B.setInsertPt(B.getMBB(), MI);
6359 }
6360 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6361 Ty = getBitcastRegisterType(Ty);
6362 Observer.changingInstr(MI);
6363 Helper.bitcastDst(MI, Ty, 0);
6364 Observer.changedInstr(MI);
6365 Dst = MI.getOperand(0).getReg();
6366 B.setInsertPt(B.getMBB(), MI);
6367 }
6368
6369 LLT EltTy = Ty.getScalarType();
6370 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6371 const bool Unpacked = ST.hasUnpackedD16VMem();
6372
6373 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6374
6375 unsigned Opc;
6376
6377 // TODO: Support TFE for typed and narrow loads.
6378 if (IsTyped) {
6379 if (IsTFE)
6380 return false;
6381 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6382 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6383 } else if (IsFormat) {
6384 if (IsD16) {
6385 if (IsTFE)
6386 return false;
6387 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6388 } else {
6389 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6390 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6391 }
6392 } else {
6393 switch (MemTy.getSizeInBits()) {
6394 case 8:
6395 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6396 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6397 break;
6398 case 16:
6399 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6400 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6401 break;
6402 default:
6403 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6404 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6405 break;
6406 }
6407 }
6408
6409 if (IsTFE) {
6410 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6411 unsigned NumLoadDWords = NumValueDWords + 1;
6412 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6413 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6414 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6415 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6416 if (MemTy.getSizeInBits() < 32) {
6417 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6418 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6419 B.buildTrunc(Dst, ExtDst);
6420 } else if (NumValueDWords == 1) {
6421 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6422 } else {
6423 SmallVector<Register, 5> LoadElts;
6424 for (unsigned I = 0; I != NumValueDWords; ++I)
6425 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6426 LoadElts.push_back(StatusDst);
6427 B.buildUnmerge(LoadElts, LoadDstReg);
6428 LoadElts.truncate(NumValueDWords);
6429 B.buildMergeLikeInstr(Dst, LoadElts);
6430 }
6431 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6432 (IsD16 && !Ty.isVector())) {
6433 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6434 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6435 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6436 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6437 B.buildTrunc(Dst, LoadDstReg);
6438 } else if (Unpacked && IsD16 && Ty.isVector()) {
6439 LLT UnpackedTy = Ty.changeElementSize(32);
6440 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6441 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6442 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6443 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6444 // FIXME: G_TRUNC should work, but legalization currently fails
6445 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6446 SmallVector<Register, 4> Repack;
6447 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6448 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6449 B.buildMergeLikeInstr(Dst, Repack);
6450 } else {
6451 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6452 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6453 }
6454
6455 MI.eraseFromParent();
6456 return true;
6457}
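// TFE note: when the status (TFE) result is requested, the load above is widened
// to NumValueDWords + 1 dwords; the trailing dword is unmerged into StatusDst and
// the leading dwords are re-merged (or truncated, for sub-dword types) into the
// original destination.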
6458
6459static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6460 switch (IntrID) {
6461 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6463 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6464 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6465 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6466 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6468 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6470 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6471 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6473 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6474 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6475 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6476 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6478 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6479 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6480 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6481 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6482 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6483 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6484 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6485 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6486 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6487 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6488 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6489 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6490 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6491 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6492 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6493 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6495 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6496 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6497 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6498 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6500 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6501 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6502 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6503 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6504 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6505 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6506 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6507 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6508 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6509 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6510 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6511 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6512 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6513 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6514 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6515 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6516 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6517 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6518 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6519 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6520 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6521 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6522 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6523 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6525 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6526 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6527 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6528 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6529 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6530 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6531 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6532 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6533 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6534 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6535 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6536 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6537 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6538 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6539 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6540 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6541 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6542 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6543 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6544 default:
6545 llvm_unreachable("unhandled atomic opcode");
6546 }
6547}
6548
6549 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6550 MachineIRBuilder &B,
6551 Intrinsic::ID IID) const {
6552 const bool IsCmpSwap =
6553 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6554 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6555 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6556 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6557
6558 Register Dst = MI.getOperand(0).getReg();
6559 // Since we don't have 128-bit atomics, we don't need to handle the case of
6560 // p8 arguments to the atomic itself.
6561 Register VData = MI.getOperand(2).getReg();
6562
6563 Register CmpVal;
6564 int OpOffset = 0;
6565
6566 if (IsCmpSwap) {
6567 CmpVal = MI.getOperand(3).getReg();
6568 ++OpOffset;
6569 }
6570
6571 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6572 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6573 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6574
6575 // The struct intrinsic variants add one additional operand over raw.
6576 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6577 Register VIndex;
6578 if (HasVIndex) {
6579 VIndex = MI.getOperand(4 + OpOffset).getReg();
6580 ++OpOffset;
6581 } else {
6582 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6583 }
6584
6585 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6586 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6587 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6588
6589 MachineMemOperand *MMO = *MI.memoperands_begin();
6590
6591 unsigned ImmOffset;
6592 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6593
6594 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6595 .addDef(Dst)
6596 .addUse(VData); // vdata
6597
6598 if (IsCmpSwap)
6599 MIB.addReg(CmpVal);
6600
6601 MIB.addUse(RSrc) // rsrc
6602 .addUse(VIndex) // vindex
6603 .addUse(VOffset) // voffset
6604 .addUse(SOffset) // soffset
6605 .addImm(ImmOffset) // offset(imm)
6606 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6607 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6608 .addMemOperand(MMO);
6609
6610 MI.eraseFromParent();
6611 return true;
6612}
6613
6614/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6615/// vector with s16 typed elements.
6616 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6617 SmallVectorImpl<Register> &PackedAddrs,
6618 unsigned ArgOffset,
6619 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6620 bool IsA16, bool IsG16) {
6621 const LLT S16 = LLT::scalar(16);
6622 const LLT V2S16 = LLT::fixed_vector(2, 16);
6623 auto EndIdx = Intr->VAddrEnd;
6624
6625 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6626 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6627 if (!SrcOp.isReg())
6628 continue; // _L to _LZ may have eliminated this.
6629
6630 Register AddrReg = SrcOp.getReg();
6631
6632 if ((I < Intr->GradientStart) ||
6633 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6634 (I >= Intr->CoordStart && !IsA16)) {
6635 if ((I < Intr->GradientStart) && IsA16 &&
6636 (B.getMRI()->getType(AddrReg) == S16)) {
6637 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6638 // Special handling of bias when A16 is on. Bias is of type half but
6639 // occupies full 32-bit.
6640 PackedAddrs.push_back(
6641 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6642 .getReg(0));
6643 } else {
6644 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6645 "Bias needs to be converted to 16 bit in A16 mode");
6646 // Handle any gradient or coordinate operands that should not be packed
6647 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6648 PackedAddrs.push_back(AddrReg);
6649 }
6650 } else {
6651 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6652 // derivatives dx/dh and dx/dv are packed with undef.
6653 if (((I + 1) >= EndIdx) ||
6654 ((Intr->NumGradients / 2) % 2 == 1 &&
6655 (I == static_cast<unsigned>(Intr->GradientStart +
6656 (Intr->NumGradients / 2) - 1) ||
6657 I == static_cast<unsigned>(Intr->GradientStart +
6658 Intr->NumGradients - 1))) ||
6659 // Check for _L to _LZ optimization
6660 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6661 PackedAddrs.push_back(
6662 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6663 .getReg(0));
6664 } else {
6665 PackedAddrs.push_back(
6666 B.buildBuildVector(
6667 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6668 .getReg(0));
6669 ++I;
6670 }
6671 }
6672 }
6673}
6674
6675/// Convert from separate vaddr components to a single vector address register,
6676/// and replace the remaining operands with $noreg.
6677 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6678 int DimIdx, int NumVAddrs) {
6679 const LLT S32 = LLT::scalar(32);
6680 (void)S32;
6681 SmallVector<Register, 8> AddrRegs;
6682 for (int I = 0; I != NumVAddrs; ++I) {
6683 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6684 if (SrcOp.isReg()) {
6685 AddrRegs.push_back(SrcOp.getReg());
6686 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6687 }
6688 }
6689
6690 int NumAddrRegs = AddrRegs.size();
6691 if (NumAddrRegs != 1) {
6692 auto VAddr =
6693 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6694 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6695 }
6696
6697 for (int I = 1; I != NumVAddrs; ++I) {
6698 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6699 if (SrcOp.isReg())
6700 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6701 }
6702}
6703
6704/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6705///
6706/// Depending on the subtarget, load/store with 16-bit element data need to be
6707/// rewritten to use the low half of 32-bit registers, or directly use a packed
6708/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6709/// registers.
6710///
6711/// We don't want to directly select image instructions just yet, but also want
6712 /// to expose all register repacking to the legalizer/combiners. We also don't
6713/// want a selected instruction entering RegBankSelect. In order to avoid
6714/// defining a multitude of intermediate image instructions, directly hack on
6715/// the intrinsic's arguments. In cases like a16 addresses, this requires
6716/// padding now unnecessary arguments with $noreg.
6717 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6718 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6719 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6720
6721 const MachineFunction &MF = *MI.getMF();
6722 const unsigned NumDefs = MI.getNumExplicitDefs();
6723 const unsigned ArgOffset = NumDefs + 1;
6724 bool IsTFE = NumDefs == 2;
6725 // We are only processing the operands of d16 image operations on subtargets
6726 // that use the unpacked register layout, or need to repack the TFE result.
6727
6728 // TODO: Do we need to guard against already legalized intrinsics?
6729 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6730 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6731
6732 MachineRegisterInfo *MRI = B.getMRI();
6733 const LLT S32 = LLT::scalar(32);
6734 const LLT S16 = LLT::scalar(16);
6735 const LLT V2S16 = LLT::fixed_vector(2, 16);
6736
6737 unsigned DMask = 0;
6738 Register VData;
6739 LLT Ty;
6740
6741 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6742 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6743 Ty = MRI->getType(VData);
6744 }
6745
6746 const bool IsAtomicPacked16Bit =
6747 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6748 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6749
6750 // Check for 16 bit addresses and pack if true.
6751 LLT GradTy =
6752 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6753 LLT AddrTy =
6754 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6755 const bool IsG16 =
6756 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6757 const bool IsA16 = AddrTy == S16;
6758 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6759
6760 int DMaskLanes = 0;
6761 if (!BaseOpcode->Atomic) {
6762 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6763 if (BaseOpcode->Gather4) {
6764 DMaskLanes = 4;
6765 } else if (DMask != 0) {
6766 DMaskLanes = llvm::popcount(DMask);
6767 } else if (!IsTFE && !BaseOpcode->Store) {
6768 // If dmask is 0, this is a no-op load. This can be eliminated.
6769 B.buildUndef(MI.getOperand(0));
6770 MI.eraseFromParent();
6771 return true;
6772 }
6773 }
6774
6775 Observer.changingInstr(MI);
6776 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6777
6778 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6779 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6780 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6781 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6782 unsigned NewOpcode = LoadOpcode;
6783 if (BaseOpcode->Store)
6784 NewOpcode = StoreOpcode;
6785 else if (BaseOpcode->NoReturn)
6786 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6787
6788 // Track that we legalized this
6789 MI.setDesc(B.getTII().get(NewOpcode));
6790
6791 // Expecting to get an error flag since TFC is on and dmask is 0. Force
6792 // dmask to be at least 1, otherwise the instruction will fail.
6793 if (IsTFE && DMask == 0) {
6794 DMask = 0x1;
6795 DMaskLanes = 1;
6796 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6797 }
6798
6799 if (BaseOpcode->Atomic) {
6800 Register VData0 = MI.getOperand(2).getReg();
6801 LLT Ty = MRI->getType(VData0);
6802
6803 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6804 if (Ty.isVector() && !IsAtomicPacked16Bit)
6805 return false;
6806
6807 if (BaseOpcode->AtomicX2) {
6808 Register VData1 = MI.getOperand(3).getReg();
6809 // The two values are packed in one register.
6810 LLT PackedTy = LLT::fixed_vector(2, Ty);
6811 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6812 MI.getOperand(2).setReg(Concat.getReg(0));
6813 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6814 }
6815 }
6816
6817 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6818
6819 // Rewrite the addressing register layout before doing anything else.
6820 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6821 // 16-bit gradients are supported, but are tied to the A16 control,
6822 // so both gradients and addresses must be 16-bit.
6823 return false;
6824 }
6825
6826 if (IsA16 && !ST.hasA16()) {
6827 // A16 not supported
6828 return false;
6829 }
6830
6831 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6832 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6833
6834 if (IsA16 || IsG16) {
6835 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6836 // instructions expect VGPR_32
6837 SmallVector<Register, 4> PackedRegs;
6838
6839 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6840
6841 // See also below in the non-a16 branch
6842 const bool UseNSA = ST.hasNSAEncoding() &&
6843 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6844 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6845 const bool UsePartialNSA =
6846 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6847
6848 if (UsePartialNSA) {
6849 // Pack registers that would go over NSAMaxSize into last VAddr register
6850 LLT PackedAddrTy =
6851 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6852 auto Concat = B.buildConcatVectors(
6853 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6854 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6855 PackedRegs.resize(NSAMaxSize);
6856 } else if (!UseNSA && PackedRegs.size() > 1) {
6857 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6858 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6859 PackedRegs[0] = Concat.getReg(0);
6860 PackedRegs.resize(1);
6861 }
6862
6863 const unsigned NumPacked = PackedRegs.size();
6864 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6865 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6866 if (!SrcOp.isReg()) {
6867 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6868 continue;
6869 }
6870
6871 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6872
6873 if (I - Intr->VAddrStart < NumPacked)
6874 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6875 else
6876 SrcOp.setReg(AMDGPU::NoRegister);
6877 }
6878 } else {
6879 // If the register allocator cannot place the address registers contiguously
6880 // without introducing moves, then using the non-sequential address encoding
6881 // is always preferable, since it saves VALU instructions and is usually a
6882 // wash in terms of code size or even better.
6883 //
6884 // However, we currently have no way of hinting to the register allocator
6885 // that MIMG addresses should be placed contiguously when it is possible to
6886 // do so, so force non-NSA for the common 2-address case as a heuristic.
6887 //
6888 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6889 // allocation when possible.
6890 //
6891 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6892 // set of the remaining addresses.
6893 const bool UseNSA = ST.hasNSAEncoding() &&
6894 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6895 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6896 const bool UsePartialNSA =
6897 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6898
6899 if (UsePartialNSA) {
6900 convertImageAddrToPacked(B, MI,
6901 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6902 Intr->NumVAddrs - NSAMaxSize + 1);
6903 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6904 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6905 Intr->NumVAddrs);
6906 }
6907 }
6908
6909 int Flags = 0;
6910 if (IsA16)
6911 Flags |= 1;
6912 if (IsG16)
6913 Flags |= 2;
6914 MI.addOperand(MachineOperand::CreateImm(Flags));
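 // i.e. Flags is 1 for A16 only, 2 for G16 only, and 3 when both are set.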
6915
6916 if (BaseOpcode->NoReturn) { // No TFE for stores?
6917 // TODO: Handle dmask trim
6918 if (!Ty.isVector() || !IsD16)
6919 return true;
6920
6921 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6922 if (RepackedReg != VData) {
6923 MI.getOperand(1).setReg(RepackedReg);
6924 }
6925
6926 return true;
6927 }
6928
6929 Register DstReg = MI.getOperand(0).getReg();
6930 const LLT EltTy = Ty.getScalarType();
6931 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6932
6933 // Confirm that the return type is large enough for the dmask specified
6934 if (NumElts < DMaskLanes)
6935 return false;
6936
6937 if (NumElts > 4 || DMaskLanes > 4)
6938 return false;
6939
6940 // Image atomic instructions use DMask to specify how many bits of
6941 // input/output data they have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6942 // DMaskLanes for image atomics defaults to '0'.
6943 // We must be sure that atomic variants (especially packed ones) are not
6944 // truncated from v2s16 or v4s16 to s16.
6945 //
6946 // ChangeElementCount will be needed for image load where Ty is always scalar.
6947 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6948 const LLT AdjustedTy =
6949 DMaskLanes == 0
6950 ? Ty
6951 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
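 // e.g. for a non-gather load with dmask 0b0101, DMaskLanes is 2, so a
 // <4 x s32> result type is adjusted to <2 x s32> here.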
6952
6953 // The raw dword-aligned data component of the load. The only legal cases
6954 // where this matters should be when using the packed D16 format, for
6955 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6956 LLT RoundedTy;
6957
6958 // S32 vector to cover all data, plus TFE result element.
6959 LLT TFETy;
6960
6961 // Register type to use for each loaded component. Will be S32 or V2S16.
6962 LLT RegTy;
6963
6964 if (IsD16 && ST.hasUnpackedD16VMem()) {
6965 RoundedTy =
6966 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6967 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6968 RegTy = S32;
6969 } else {
6970 unsigned EltSize = EltTy.getSizeInBits();
6971 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6972 unsigned RoundedSize = 32 * RoundedElts;
6973 RoundedTy = LLT::scalarOrVector(
6974 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6975 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6976 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6977 }
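 // For example, a packed D16 load of <3 x s16> (DMaskLanes = 3) rounds up to
 // RoundedTy = <4 x s16>, with TFETy = <3 x s32> and RegTy = <2 x s16>
 // (or s32 if TFE is enabled).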
6978
6979 // The return type does not need adjustment.
6980 // TODO: Should we change s16 case to s32 or <2 x s16>?
6981 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6982 return true;
6983
6984 Register Dst1Reg;
6985
6986 // Insert after the instruction.
6987 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6988
6989 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6990 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6991 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6992 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6993
6994 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6995
6996 MI.getOperand(0).setReg(NewResultReg);
6997
6998 // In the IR, TFE is supposed to be used with a 2 element struct return
6999 // type. The instruction really returns these two values in one contiguous
7000 // register, with one additional dword beyond the loaded data. Rewrite the
7001 // return type to use a single register result.
7002
7003 if (IsTFE) {
7004 Dst1Reg = MI.getOperand(1).getReg();
7005 if (MRI->getType(Dst1Reg) != S32)
7006 return false;
7007
7008 // TODO: Make sure the TFE operand bit is set.
7009 MI.removeOperand(1);
7010
7011 // Handle the easy case that requires no repack instructions.
7012 if (Ty == S32) {
7013 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7014 return true;
7015 }
7016 }
7017
7018 // Now figure out how to copy the new result register back into the old
7019 // result.
7020 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7021
7022 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7023
7024 if (ResultNumRegs == 1) {
7025 assert(!IsTFE);
7026 ResultRegs[0] = NewResultReg;
7027 } else {
7028 // We have to repack into a new vector of some kind.
7029 for (int I = 0; I != NumDataRegs; ++I)
7030 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7031 B.buildUnmerge(ResultRegs, NewResultReg);
7032
7033 // Drop the final TFE element to get the data part. The TFE result is
7034 // directly written to the right place already.
7035 if (IsTFE)
7036 ResultRegs.resize(NumDataRegs);
7037 }
7038
7039 // For an s16 scalar result, we form an s32 result with a truncate regardless
7040 // of packed vs. unpacked.
7041 if (IsD16 && !Ty.isVector()) {
7042 B.buildTrunc(DstReg, ResultRegs[0]);
7043 return true;
7044 }
7045
7046 // Avoid a build/concat_vector of 1 entry.
7047 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7048 B.buildBitcast(DstReg, ResultRegs[0]);
7049 return true;
7050 }
7051
7052 assert(Ty.isVector());
7053
7054 if (IsD16) {
7055 // For packed D16 results with TFE enabled, all the data components are
7056 // S32. Cast back to the expected type.
7057 //
7058 // TODO: We don't really need to load s32 elements. We would only need one
7059 // cast for the TFE result if a multiple of v2s16 was used.
7060 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7061 for (Register &Reg : ResultRegs)
7062 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7063 } else if (ST.hasUnpackedD16VMem()) {
7064 for (Register &Reg : ResultRegs)
7065 Reg = B.buildTrunc(S16, Reg).getReg(0);
7066 }
7067 }
7068
7069 auto padWithUndef = [&](LLT Ty, int NumElts) {
7070 if (NumElts == 0)
7071 return;
7072 Register Undef = B.buildUndef(Ty).getReg(0);
7073 for (int I = 0; I != NumElts; ++I)
7074 ResultRegs.push_back(Undef);
7075 };
7076
7077 // Pad out any elements eliminated due to the dmask.
7078 LLT ResTy = MRI->getType(ResultRegs[0]);
7079 if (!ResTy.isVector()) {
7080 padWithUndef(ResTy, NumElts - ResultRegs.size());
7081 B.buildBuildVector(DstReg, ResultRegs);
7082 return true;
7083 }
7084
7085 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7086 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
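 // e.g. a <3 x s16> result needs RegsToCover = 2 dword-sized pieces.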
7087
7088 // Deal with the one annoying legal case.
7089 const LLT V3S16 = LLT::fixed_vector(3, 16);
7090 if (Ty == V3S16) {
7091 if (IsTFE) {
7092 if (ResultRegs.size() == 1) {
7093 NewResultReg = ResultRegs[0];
7094 } else if (ResultRegs.size() == 2) {
7095 LLT V4S16 = LLT::fixed_vector(4, 16);
7096 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7097 } else {
7098 return false;
7099 }
7100 }
7101
7102 if (MRI->getType(DstReg).getNumElements() <
7103 MRI->getType(NewResultReg).getNumElements()) {
7104 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7105 } else {
7106 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7107 }
7108 return true;
7109 }
7110
7111 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7112 B.buildConcatVectors(DstReg, ResultRegs);
7113 return true;
7114}
7115
7116 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7117 MachineInstr &MI) const {
7118 MachineIRBuilder &B = Helper.MIRBuilder;
7119 GISelChangeObserver &Observer = Helper.Observer;
7120
7121 Register OrigDst = MI.getOperand(0).getReg();
7122 Register Dst;
7123 LLT Ty = B.getMRI()->getType(OrigDst);
7124 unsigned Size = Ty.getSizeInBits();
7125 MachineFunction &MF = B.getMF();
7126 unsigned Opc = 0;
7127 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7128 assert(Size == 8 || Size == 16);
7129 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7130 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7131 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7132 // destination register.
7133 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7134 } else {
7135 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7136 Dst = OrigDst;
7137 }
7138
7139 Observer.changingInstr(MI);
7140
7141 // Handle needing to s.buffer.load() a p8 value.
7142 if (hasBufferRsrcWorkaround(Ty)) {
7143 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7144 B.setInsertPt(B.getMBB(), MI);
7145 }
7146 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
7147 Ty = getBitcastRegisterType(Ty);
7148 Helper.bitcastDst(MI, Ty, 0);
7149 B.setInsertPt(B.getMBB(), MI);
7150 }
7151
7152 // FIXME: We don't really need this intermediate instruction. The intrinsic
7153 // should be fixed to have a memory operand. Since it's readnone, we're not
7154 // allowed to add one.
7155 MI.setDesc(B.getTII().get(Opc));
7156 MI.removeOperand(1); // Remove intrinsic ID
7157
7158 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7159 const unsigned MemSize = (Size + 7) / 8;
7160 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7161 getTypeForLLT(Ty, MF.getFunction().getContext()));
7162 MachineMemOperand *MMO = MF.getMachineMemOperand(
7163 MachinePointerInfo(),
7164 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7165 MachineMemOperand::MOInvariant,
7166 MemSize, MemAlign);
7167 MI.addMemOperand(MF, MMO);
7168 if (Dst != OrigDst) {
7169 MI.getOperand(0).setReg(Dst);
7170 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7171 B.buildTrunc(OrigDst, Dst);
7172 }
7173
7174 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7175 // always be legal. We may need to restore this to a 96-bit result if it turns
7176 // out this needs to be converted to a vector load during RegBankSelect.
7177 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7178 if (Ty.isVector())
7179 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
7180 else
7181 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7182 }
7183
7184 Observer.changedInstr(MI);
7185 return true;
7186}
7187
7188 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7189 MachineInstr &MI) const {
7190 MachineIRBuilder &B = Helper.MIRBuilder;
7191 GISelChangeObserver &Observer = Helper.Observer;
7192 Observer.changingInstr(MI);
7193 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7194 MI.removeOperand(0); // Remove intrinsic ID
7195 castBufferRsrcArgToV4I32(MI, B, 0);
7196 Observer.changedInstr(MI);
7197 return true;
7198}
7199
7200// TODO: Move to selection
7201 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7202 MachineRegisterInfo &MRI,
7203 MachineIRBuilder &B) const {
7204 if (!ST.isTrapHandlerEnabled() ||
7205 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7206 return legalizeTrapEndpgm(MI, MRI, B);
7207
7208 return ST.supportsGetDoorbellID() ?
7209 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7210}
7211
7212 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7213 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7214 const DebugLoc &DL = MI.getDebugLoc();
7215 MachineBasicBlock &BB = B.getMBB();
7216 MachineFunction *MF = BB.getParent();
7217
7218 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7219 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7220 .addImm(0);
7221 MI.eraseFromParent();
7222 return true;
7223 }
7224
7225 // We need a block split to make the real endpgm a terminator. We also don't
7226 // want to break phis in successor blocks, so we can't just delete to the
7227 // end of the block.
7228 BB.splitAt(MI, false /*UpdateLiveIns*/);
7229 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7230 MF->push_back(TrapBB);
7231 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7232 .addImm(0);
7233 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7234 .addMBB(TrapBB);
7235
7236 BB.addSuccessor(TrapBB);
7237 MI.eraseFromParent();
7238 return true;
7239}
7240
7241 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
7242 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7243 MachineFunction &MF = B.getMF();
7244 const LLT S64 = LLT::scalar(64);
7245
7246 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7247 // For code object version 5, queue_ptr is passed through implicit kernarg.
7248 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
7249 AMDGPU::AMDHSA_COV5) {
7250 AMDGPUTargetLowering::ImplicitParameter Param =
7251 AMDGPUTargetLowering::QUEUE_PTR;
7252 uint64_t Offset =
7253 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7254
7255 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7256 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7257
7258 if (!loadInputValue(KernargPtrReg, B,
7259 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
7260 return false;
7261
7262 // TODO: can we be smarter about machine pointer info?
7263 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7264 MachineMemOperand *MMO = MF.getMachineMemOperand(
7265 PtrInfo,
7266 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7267 MachineMemOperand::MOInvariant,
7268 LLT::scalar(64), commonAlignment(Align(64), Offset));
7269
7270 // Pointer address
7271 Register LoadAddr = MRI.createGenericVirtualRegister(
7272 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7273 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7274 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7275 // Load address
7276 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7277 B.buildCopy(SGPR01, Temp);
7278 B.buildInstr(AMDGPU::S_TRAP)
7279 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7280 .addReg(SGPR01, RegState::Implicit);
7281 MI.eraseFromParent();
7282 return true;
7283 }
7284
7285 // Pass queue pointer to trap handler as input, and insert trap instruction
7286 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7287 Register LiveIn =
7288 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7289 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
7290 return false;
7291
7292 B.buildCopy(SGPR01, LiveIn);
7293 B.buildInstr(AMDGPU::S_TRAP)
7294 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7295 .addReg(SGPR01, RegState::Implicit);
7296
7297 MI.eraseFromParent();
7298 return true;
7299}
7300
7301 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7302 MachineRegisterInfo &MRI,
7303 MachineIRBuilder &B) const {
7304 // We need to simulate the 's_trap 2' instruction on targets that run in
7305 // PRIV=1 (where it is treated as a nop).
7306 if (ST.hasPrivEnabledTrap2NopBug()) {
7307 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7308 MI.getDebugLoc());
7309 MI.eraseFromParent();
7310 return true;
7311 }
7312
7313 B.buildInstr(AMDGPU::S_TRAP)
7314 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7315 MI.eraseFromParent();
7316 return true;
7317}
7318
7319 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7320 MachineRegisterInfo &MRI,
7321 MachineIRBuilder &B) const {
7322 // If this is a non-HSA path or the trap handler is disabled, report a
7323 // warning accordingly.
7324 if (!ST.isTrapHandlerEnabled() ||
7325 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7326 Function &Fn = B.getMF().getFunction();
7327 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7328 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7329 } else {
7330 // Insert debug-trap instruction
7331 B.buildInstr(AMDGPU::S_TRAP)
7332 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7333 }
7334
7335 MI.eraseFromParent();
7336 return true;
7337}
7338
7339 bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7340 MachineInstr &MI, MachineIRBuilder &B) const {
7341 MachineRegisterInfo &MRI = *B.getMRI();
7342 const LLT S16 = LLT::scalar(16);
7343 const LLT S32 = LLT::scalar(32);
7344 const LLT V2S16 = LLT::fixed_vector(2, 16);
7345 const LLT V3S32 = LLT::fixed_vector(3, 32);
7346
7347 Register DstReg = MI.getOperand(0).getReg();
7348 Register NodePtr = MI.getOperand(2).getReg();
7349 Register RayExtent = MI.getOperand(3).getReg();
7350 Register RayOrigin = MI.getOperand(4).getReg();
7351 Register RayDir = MI.getOperand(5).getReg();
7352 Register RayInvDir = MI.getOperand(6).getReg();
7353 Register TDescr = MI.getOperand(7).getReg();
7354
7355 if (!ST.hasGFX10_AEncoding()) {
7356 Function &Fn = B.getMF().getFunction();
7357 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7358 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7359 return false;
7360 }
7361
7362 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7363 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7364 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7365 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7366 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7367 const unsigned NumVDataDwords = 4;
7368 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
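 // node_ptr (1 or 2 dwords) + ray_extent (1) + origin (3) + dir (3) +
 // inv_dir (3) gives 11 or 12 dwords; with A16 the dir and inv_dir halves
 // pack into 3 dwords total, giving 8 or 9.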
7369 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7370 const bool UseNSA =
7371 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7372
7373 const unsigned BaseOpcodes[2][2] = {
7374 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7375 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7376 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7377 int Opcode;
7378 if (UseNSA) {
7379 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7380 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7381 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7382 : AMDGPU::MIMGEncGfx10NSA,
7383 NumVDataDwords, NumVAddrDwords);
7384 } else {
7385 assert(!IsGFX12Plus);
7386 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7387 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7388 : AMDGPU::MIMGEncGfx10Default,
7389 NumVDataDwords, NumVAddrDwords);
7390 }
7391 assert(Opcode != -1);
7392
7393 SmallVector<Register, 12> Ops;
7394 if (UseNSA && IsGFX11Plus) {
7395 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7396 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7397 auto Merged = B.buildMergeLikeInstr(
7398 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7399 Ops.push_back(Merged.getReg(0));
7400 };
7401
7402 Ops.push_back(NodePtr);
7403 Ops.push_back(RayExtent);
7404 packLanes(RayOrigin);
7405
7406 if (IsA16) {
7407 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7408 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7409 auto MergedDir = B.buildMergeLikeInstr(
7410 V3S32,
7411 {B.buildBitcast(
7412 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7413 UnmergeRayDir.getReg(0)}))
7414 .getReg(0),
7415 B.buildBitcast(
7416 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7417 UnmergeRayDir.getReg(1)}))
7418 .getReg(0),
7419 B.buildBitcast(
7420 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7421 UnmergeRayDir.getReg(2)}))
7422 .getReg(0)});
7423 Ops.push_back(MergedDir.getReg(0));
7424 } else {
7425 packLanes(RayDir);
7426 packLanes(RayInvDir);
7427 }
7428 } else {
7429 if (Is64) {
7430 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7431 Ops.push_back(Unmerge.getReg(0));
7432 Ops.push_back(Unmerge.getReg(1));
7433 } else {
7434 Ops.push_back(NodePtr);
7435 }
7436 Ops.push_back(RayExtent);
7437
7438 auto packLanes = [&Ops, &S32, &B](Register Src) {
7439 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7440 Ops.push_back(Unmerge.getReg(0));
7441 Ops.push_back(Unmerge.getReg(1));
7442 Ops.push_back(Unmerge.getReg(2));
7443 };
7444
7445 packLanes(RayOrigin);
7446 if (IsA16) {
7447 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7448 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7449 Register R1 = MRI.createGenericVirtualRegister(S32);
7450 Register R2 = MRI.createGenericVirtualRegister(S32);
7451 Register R3 = MRI.createGenericVirtualRegister(S32);
7452 B.buildMergeLikeInstr(R1,
7453 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7454 B.buildMergeLikeInstr(
7455 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7456 B.buildMergeLikeInstr(
7457 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7458 Ops.push_back(R1);
7459 Ops.push_back(R2);
7460 Ops.push_back(R3);
7461 } else {
7462 packLanes(RayDir);
7463 packLanes(RayInvDir);
7464 }
7465 }
7466
7467 if (!UseNSA) {
7468 // Build a single vector containing all the operands so far prepared.
7469 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7470 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7471 Ops.clear();
7472 Ops.push_back(MergedOps);
7473 }
7474
7475 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7476 .addDef(DstReg)
7477 .addImm(Opcode);
7478
7479 for (Register R : Ops) {
7480 MIB.addUse(R);
7481 }
7482
7483 MIB.addUse(TDescr)
7484 .addImm(IsA16 ? 1 : 0)
7485 .cloneMemRefs(MI);
7486
7487 MI.eraseFromParent();
7488 return true;
7489}
7490
7491 bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7492 MachineInstr &MI, MachineIRBuilder &B) const {
7493 const LLT S32 = LLT::scalar(32);
7494 const LLT V2S32 = LLT::fixed_vector(2, 32);
7495
7496 Register DstReg = MI.getOperand(0).getReg();
7497 Register DstOrigin = MI.getOperand(1).getReg();
7498 Register DstDir = MI.getOperand(2).getReg();
7499 Register NodePtr = MI.getOperand(4).getReg();
7500 Register RayExtent = MI.getOperand(5).getReg();
7501 Register InstanceMask = MI.getOperand(6).getReg();
7502 Register RayOrigin = MI.getOperand(7).getReg();
7503 Register RayDir = MI.getOperand(8).getReg();
7504 Register Offsets = MI.getOperand(9).getReg();
7505 Register TDescr = MI.getOperand(10).getReg();
7506
7507 if (!ST.hasBVHDualAndBVH8Insts()) {
7508 Function &Fn = B.getMF().getFunction();
7509 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7510 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7511 return false;
7512 }
7513
7514 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7515 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7516 const unsigned NumVDataDwords = 10;
7517 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7518 int Opcode = AMDGPU::getMIMGOpcode(
7519 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7520 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7521 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7522 assert(Opcode != -1);
7523
7524 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7525 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7526
7527 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7528 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7529 .addDef(DstReg)
7530 .addDef(DstOrigin)
7531 .addDef(DstDir)
7532 .addImm(Opcode)
7533 .addUse(NodePtr)
7534 .addUse(RayExtentInstanceMaskVec.getReg(0))
7535 .addUse(RayOrigin)
7536 .addUse(RayDir)
7537 .addUse(Offsets)
7538 .addUse(TDescr)
7539 .cloneMemRefs(MI);
7540
7541 MI.eraseFromParent();
7542 return true;
7543}
7544
7545 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7546 MachineIRBuilder &B) const {
7547 const SITargetLowering *TLI = ST.getTargetLowering();
7548 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7549 Register DstReg = MI.getOperand(0).getReg();
7550 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7551 MI.eraseFromParent();
7552 return true;
7553}
7554
7555 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7556 MachineIRBuilder &B) const {
7557 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7558 if (!ST.hasArchitectedSGPRs())
7559 return false;
7560 LLT S32 = LLT::scalar(32);
7561 Register DstReg = MI.getOperand(0).getReg();
7562 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7563 auto LSB = B.buildConstant(S32, 25);
7564 auto Width = B.buildConstant(S32, 5);
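 // i.e. DstReg = (TTMP8 >> 25) & 0x1f.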
7565 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7566 MI.eraseFromParent();
7567 return true;
7568}
7569
7570 bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
7571 MachineIRBuilder &B,
7572 AMDGPU::Hwreg::Id HwReg,
7573 unsigned LowBit,
7574 unsigned Width) const {
7575 MachineRegisterInfo &MRI = *B.getMRI();
7576 Register DstReg = MI.getOperand(0).getReg();
7577 if (!MRI.getRegClassOrNull(DstReg))
7578 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7579 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7580 .addDef(DstReg)
7581 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7582 MI.eraseFromParent();
7583 return true;
7584}
7585
7586 static constexpr unsigned FPEnvModeBitField =
7587 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
7588
7589 static constexpr unsigned FPEnvTrapBitField =
7590 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
7591
7592 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7593 MachineRegisterInfo &MRI,
7594 MachineIRBuilder &B) const {
7595 Register Src = MI.getOperand(0).getReg();
7596 if (MRI.getType(Src) != S64)
7597 return false;
7598
7599 auto ModeReg =
7600 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7601 /*HasSideEffects=*/true, /*isConvergent=*/false)
7602 .addImm(FPEnvModeBitField);
7603 auto TrapReg =
7604 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7605 /*HasSideEffects=*/true, /*isConvergent=*/false)
7606 .addImm(FPEnvTrapBitField);
7607 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
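 // The s64 result holds the mode bit field in its low 32 bits and the trap
 // bit field in its high 32 bits.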
7608 MI.eraseFromParent();
7609 return true;
7610}
7611
7612 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7613 MachineRegisterInfo &MRI,
7614 MachineIRBuilder &B) const {
7615 Register Src = MI.getOperand(0).getReg();
7616 if (MRI.getType(Src) != S64)
7617 return false;
7618
7619 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7620 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7621 /*HasSideEffects=*/true, /*isConvergent=*/false)
7622 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7623 .addReg(Unmerge.getReg(0));
7624 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7625 /*HasSideEffects=*/true, /*isConvergent=*/false)
7626 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7627 .addReg(Unmerge.getReg(1));
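 // Mirrors legalizeGetFPEnv: the low half restores the mode bit field and
 // the high half restores the trap bit field.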
7628 MI.eraseFromParent();
7629 return true;
7630}
7631
7632 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7633 MachineInstr &MI) const {
7634 MachineIRBuilder &B = Helper.MIRBuilder;
7635 MachineRegisterInfo &MRI = *B.getMRI();
7636
7637 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7638 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7639 switch (IntrID) {
7640 case Intrinsic::amdgcn_if:
7641 case Intrinsic::amdgcn_else: {
7642 MachineInstr *Br = nullptr;
7643 MachineBasicBlock *UncondBrTarget = nullptr;
7644 bool Negated = false;
7645 if (MachineInstr *BrCond =
7646 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7647 const SIRegisterInfo *TRI
7648 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7649
7650 Register Def = MI.getOperand(1).getReg();
7651 Register Use = MI.getOperand(3).getReg();
7652
7653 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7654
7655 if (Negated)
7656 std::swap(CondBrTarget, UncondBrTarget);
7657
7658 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7659 if (IntrID == Intrinsic::amdgcn_if) {
7660 B.buildInstr(AMDGPU::SI_IF)
7661 .addDef(Def)
7662 .addUse(Use)
7663 .addMBB(UncondBrTarget);
7664 } else {
7665 B.buildInstr(AMDGPU::SI_ELSE)
7666 .addDef(Def)
7667 .addUse(Use)
7668 .addMBB(UncondBrTarget);
7669 }
7670
7671 if (Br) {
7672 Br->getOperand(0).setMBB(CondBrTarget);
7673 } else {
7674 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7675 // since we're swapping branch targets it needs to be reinserted.
7676 // FIXME: IRTranslator should probably not do this
7677 B.buildBr(*CondBrTarget);
7678 }
7679
7680 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7681 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7682 MI.eraseFromParent();
7683 BrCond->eraseFromParent();
7684 return true;
7685 }
7686
7687 return false;
7688 }
7689 case Intrinsic::amdgcn_loop: {
7690 MachineInstr *Br = nullptr;
7691 MachineBasicBlock *UncondBrTarget = nullptr;
7692 bool Negated = false;
7693 if (MachineInstr *BrCond =
7694 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7695 const SIRegisterInfo *TRI
7696 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7697
7698 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7699 Register Reg = MI.getOperand(2).getReg();
7700
7701 if (Negated)
7702 std::swap(CondBrTarget, UncondBrTarget);
7703
7704 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7705 B.buildInstr(AMDGPU::SI_LOOP)
7706 .addUse(Reg)
7707 .addMBB(UncondBrTarget);
7708
7709 if (Br)
7710 Br->getOperand(0).setMBB(CondBrTarget);
7711 else
7712 B.buildBr(*CondBrTarget);
7713
7714 MI.eraseFromParent();
7715 BrCond->eraseFromParent();
7716 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7717 return true;
7718 }
7719
7720 return false;
7721 }
7722 case Intrinsic::amdgcn_addrspacecast_nonnull:
7723 return legalizeAddrSpaceCast(MI, MRI, B);
7724 case Intrinsic::amdgcn_make_buffer_rsrc:
7725 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7726 case Intrinsic::amdgcn_kernarg_segment_ptr:
7727 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7728 // This only makes sense to call in a kernel, so just lower to null.
7729 B.buildConstant(MI.getOperand(0).getReg(), 0);
7730 MI.eraseFromParent();
7731 return true;
7732 }
7733
7734 return legalizePreloadedArgIntrin(
7735 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7736 case Intrinsic::amdgcn_implicitarg_ptr:
7737 return legalizeImplicitArgPtr(MI, MRI, B);
7738 case Intrinsic::amdgcn_workitem_id_x:
7739 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7740 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7741 case Intrinsic::amdgcn_workitem_id_y:
7742 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7743 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7744 case Intrinsic::amdgcn_workitem_id_z:
7745 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7746 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7747 case Intrinsic::amdgcn_workgroup_id_x:
7748 return legalizeWorkGroupId(
7752 case Intrinsic::amdgcn_workgroup_id_y:
7753 return legalizeWorkGroupId(
7757 case Intrinsic::amdgcn_workgroup_id_z:
7758 return legalizeWorkGroupId(
7762 case Intrinsic::amdgcn_cluster_id_x:
7763 return ST.hasClusters() &&
7766 case Intrinsic::amdgcn_cluster_id_y:
7767 return ST.hasClusters() &&
7770 case Intrinsic::amdgcn_cluster_id_z:
7771 return ST.hasClusters() &&
7774 case Intrinsic::amdgcn_cluster_workgroup_id_x:
7775 return ST.hasClusters() &&
7778 case Intrinsic::amdgcn_cluster_workgroup_id_y:
7779 return ST.hasClusters() &&
7782 case Intrinsic::amdgcn_cluster_workgroup_id_z:
7783 return ST.hasClusters() &&
7786 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7787 return ST.hasClusters() &&
7789 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7790 return ST.hasClusters() &&
7793 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7794 return ST.hasClusters() &&
7797 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7798 return ST.hasClusters() &&
7801 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7802 return ST.hasClusters() &&
7804 MI, MRI, B,
7806 case Intrinsic::amdgcn_wave_id:
7807 return legalizeWaveID(MI, B);
7808 case Intrinsic::amdgcn_lds_kernel_id:
7809 return legalizePreloadedArgIntrin(MI, MRI, B,
7810 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7811 case Intrinsic::amdgcn_dispatch_ptr:
7812 return legalizePreloadedArgIntrin(MI, MRI, B,
7813 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7814 case Intrinsic::amdgcn_queue_ptr:
7815 return legalizePreloadedArgIntrin(MI, MRI, B,
7816 AMDGPUFunctionArgInfo::QUEUE_PTR);
7817 case Intrinsic::amdgcn_implicit_buffer_ptr:
7818 return legalizePreloadedArgIntrin(
7819 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7820 case Intrinsic::amdgcn_dispatch_id:
7821 return legalizePreloadedArgIntrin(MI, MRI, B,
7822 AMDGPUFunctionArgInfo::DISPATCH_ID);
7823 case Intrinsic::r600_read_ngroups_x:
7824 // TODO: Emit error for hsa
7827 case Intrinsic::r600_read_ngroups_y:
7830 case Intrinsic::r600_read_ngroups_z:
7833 case Intrinsic::r600_read_local_size_x:
7834 // TODO: Could insert G_ASSERT_ZEXT from s16
7836 case Intrinsic::r600_read_local_size_y:
7837 // TODO: Could insert G_ASSERT_ZEXT from s16
7839 // TODO: Could insert G_ASSERT_ZEXT from s16
7840 case Intrinsic::r600_read_local_size_z:
7843 case Intrinsic::amdgcn_fdiv_fast:
7844 return legalizeFDIVFastIntrin(MI, MRI, B);
7845 case Intrinsic::amdgcn_is_shared:
7846 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7847 case Intrinsic::amdgcn_is_private:
7848 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7849 case Intrinsic::amdgcn_wavefrontsize: {
7850 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7851 MI.eraseFromParent();
7852 return true;
7853 }
7854 case Intrinsic::amdgcn_s_buffer_load:
7855 return legalizeSBufferLoad(Helper, MI);
7856 case Intrinsic::amdgcn_raw_buffer_store:
7857 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7858 case Intrinsic::amdgcn_struct_buffer_store:
7859 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7860 return legalizeBufferStore(MI, Helper, false, false);
7861 case Intrinsic::amdgcn_raw_buffer_store_format:
7862 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7863 case Intrinsic::amdgcn_struct_buffer_store_format:
7864 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7865 return legalizeBufferStore(MI, Helper, false, true);
7866 case Intrinsic::amdgcn_raw_tbuffer_store:
7867 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7868 case Intrinsic::amdgcn_struct_tbuffer_store:
7869 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7870 return legalizeBufferStore(MI, Helper, true, true);
7871 case Intrinsic::amdgcn_raw_buffer_load:
7872 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7873 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7874 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7875 case Intrinsic::amdgcn_struct_buffer_load:
7876 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7877 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7878 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7879 return legalizeBufferLoad(MI, Helper, false, false);
7880 case Intrinsic::amdgcn_raw_buffer_load_format:
7881 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7882 case Intrinsic::amdgcn_struct_buffer_load_format:
7883 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7884 return legalizeBufferLoad(MI, Helper, true, false);
7885 case Intrinsic::amdgcn_raw_tbuffer_load:
7886 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7887 case Intrinsic::amdgcn_struct_tbuffer_load:
7888 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7889 return legalizeBufferLoad(MI, Helper, true, true);
7890 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7891 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7892 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7893 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7894 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7895 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7896 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7897 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7898 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7899 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7900 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7901 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7902 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7904 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7905 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7906 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7908 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7910 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7911 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7912 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7913 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7914 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7916 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7917 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7918 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7919 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7920 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7921 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7922 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7923 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7924 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7925 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7926 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7928 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7930 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7931 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7932 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7933 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7934 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7935 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7936 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7937 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7938 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7939 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7940 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7941 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7942 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7943 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7944 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7945 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7946 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7948 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7950 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7951 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7952 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7954 return legalizeBufferAtomic(MI, B, IntrID);
7955 case Intrinsic::amdgcn_rsq_clamp:
7956 return legalizeRsqClampIntrinsic(MI, MRI, B);
7957 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7958 return legalizeBVHIntersectRayIntrinsic(MI, B);
7959 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
7960 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
7961 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
7962 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
7963 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
7964 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
7965 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
7966 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
7967 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
7968 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
7969 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
7970 Register Index = MI.getOperand(5).getReg();
7971 LLT S64 = LLT::scalar(64);
7972 if (MRI.getType(Index) != S64)
7973 MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
7974 return true;
7975 }
7976 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7977 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7978 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7979 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7980 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7981 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7982 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7983 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7984 Register Index = MI.getOperand(5).getReg();
7985 LLT S32 = LLT::scalar(32);
7986 if (MRI.getType(Index) != S32)
7987 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7988 return true;
7989 }
7990 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
7991 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
7992 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
7993 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
7994 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
7995 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
7996 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7997 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7998 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7999 Register Index = MI.getOperand(7).getReg();
8000 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8001 ? LLT::scalar(64)
8002 : LLT::scalar(32);
8003 if (MRI.getType(Index) != IdxTy)
8004 MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));
8005 return true;
8006 }
8007
8008 case Intrinsic::amdgcn_fmed3: {
8009 GISelChangeObserver &Observer = Helper.Observer;
8010
8011 // FIXME: This is to workaround the inability of tablegen match combiners to
8012 // match intrinsics in patterns.
8013 Observer.changingInstr(MI);
8014 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8015 MI.removeOperand(1);
8016 Observer.changedInstr(MI);
8017 return true;
8018 }
8019 case Intrinsic::amdgcn_readlane:
8020 case Intrinsic::amdgcn_writelane:
8021 case Intrinsic::amdgcn_readfirstlane:
8022 case Intrinsic::amdgcn_permlane16:
8023 case Intrinsic::amdgcn_permlanex16:
8024 case Intrinsic::amdgcn_permlane64:
8025 case Intrinsic::amdgcn_set_inactive:
8026 case Intrinsic::amdgcn_set_inactive_chain_arg:
8027 case Intrinsic::amdgcn_mov_dpp8:
8028 case Intrinsic::amdgcn_update_dpp:
8029 return legalizeLaneOp(Helper, MI, IntrID);
8030 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8031 return legalizeSBufferPrefetch(Helper, MI);
8032 case Intrinsic::amdgcn_dead: {
8033 // TODO: Use poison instead of undef
8034 for (const MachineOperand &Def : MI.defs())
8035 B.buildUndef(Def);
8036 MI.eraseFromParent();
8037 return true;
8038 }
8039 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8040 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8041 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8042 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8043 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8044 MI.eraseFromParent();
8045 return true;
8046 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8047 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8048 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8049 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8050 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8051 MI.eraseFromParent();
8052 return true;
8053 default: {
8054 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8055 AMDGPU::getImageDimIntrinsicInfo(IntrID))
8056 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8057 return true;
8058 }
8059 }
8060
8061 return true;
8062}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:71
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1257
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
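For the preloaded-argument hooks above (legalizePreloadedArgIntrin, loadInputValue, buildLoadInputValue), here is a minimal sketch of what such a lowering reduces to. It is an approximation rather than the in-tree body, and it assumes it runs as an AMDGPULegalizerInfo member with MI's operand 0 as the intrinsic result and B as the MachineIRBuilder.

// Copy the preloaded argument into the intrinsic's result, then erase the
// original instruction. loadInputValue returns false if the argument was
// never preloaded for this function, in which case legalization fails.
Register Dst = MI.getOperand(0).getReg();
if (!loadInputValue(Dst, B, ArgType))
  return false;
MI.eraseFromParent();
return true;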
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1140
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1120
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
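The APFloat factories above are how the legalizer materializes special floating-point constants (clamp bounds, infinities, denormal thresholds). A self-contained example of the calls; the particular values are illustrative only.

#include "llvm/ADT/APFloat.h"
using namespace llvm;

// Largest finite f32, +infinity in f32, and the smallest normalized f64.
APFloat MaxF32 = APFloat::getLargest(APFloat::IEEEsingle());
APFloat InfF32 = APFloat::getInf(APFloat::IEEEsingle(), /*Negative=*/false);
APFloat MinNormF64 =
    APFloat::getSmallestNormalized(APFloat::IEEEdouble(), /*Negative=*/false);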
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
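The predicate enumerators above are what custom expansions pass to MachineIRBuilder's compare builders. A short sketch, assuming B is a MachineIRBuilder and Fabs, SmallestNormalized, Src and Zero are registers defined earlier:

const LLT S1 = LLT::scalar(1);
// Ordered less-than on floats, e.g. checking a magnitude against the
// smallest normalized value.
auto IsTiny = B.buildFCmp(CmpInst::FCMP_OLT, S1, Fabs, SmallestNormalized);
// Signed less-than on integers, e.g. a sign test.
auto IsNeg = B.buildICmp(CmpInst::ICMP_SLT, S1, Src, Zero);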
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:169
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
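The LLT helpers above are the vocabulary the legality rules are written in. A few self-contained constructions and queries, with results noted in the comments:

const LLT S32 = LLT::scalar(32);
const LLT F32 = LLT::float32();
const LLT V2S16 = LLT::fixed_vector(2, 16);

bool IsVec = V2S16.isVector();                                       // true
LLT Elt = V2S16.getElementType();                                    // s16
LLT WideElts = V2S16.changeElementSize(32);                          // <2 x s32>
LLT MoreElts = V2S16.changeElementCount(ElementCount::getFixed(4));  // <4 x s16>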
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & lowerIf(LegalityPredicate Predicate)
The instruction is lowered if predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
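Most of the LegalizeRuleSet methods above are used as a fluent chain inside the AMDGPULegalizerInfo constructor. Below is a representative chain in that style, not necessarily the exact in-tree rule; S16, S32 and V2S16 abbreviate LLT::scalar(16), LLT::scalar(32) and LLT::fixed_vector(2, 16).

getActionDefinitionsBuilder(TargetOpcode::G_ADD)
    .legalFor({S32, S16, V2S16})           // natively supported types
    .clampMaxNumElementsStrict(0, S16, 2)  // express s16 vectors as <2 x s16> pieces
    .scalarize(0)                          // remaining vectors become scalars
    .minScalar(0, S16)
    .widenScalarToNextMultipleOf(0, 32)
    .maxScalar(0, S32);                    // scalars end up in the s16..s32 range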
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
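When a hook edits an instruction in place instead of building a replacement, it goes through the LegalizerHelper members listed above. A minimal sketch, assuming MI and Helper are in scope and that widening the def to 32 bits is the intended change:

GISelChangeObserver &Observer = Helper.Observer;  // in-place edits must be bracketed

Observer.changingInstr(MI);
// Widen the destination (operand 0); the helper inserts a G_TRUNC after MI
// so downstream users still see the original narrow type.
Helper.widenScalarDst(MI, LLT::scalar(32), /*OpIdx=*/0);
Observer.changedInstr(MI);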
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:64
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
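buildInstr returns a MachineInstrBuilder, so the operand helpers above chain directly onto a freshly created instruction. A sketch of the pattern used for PC-relative address materialization; B, MRI, GV and Offset are assumed to be in scope, and the opcode, MO_REL32_* flags, and the +4/+12 offsets are shown only as a plausible example of target-flagged global operands.

Register PCReg = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
MachineInstrBuilder MIB =
    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
MIB.addGlobalAddress(GV, Offset + 4, SIInstrInfo::MO_REL32_LO);   // low half of the reloc
MIB.addGlobalAddress(GV, Offset + 12, SIInstrInfo::MO_REL32_HI);  // high half of the reloc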
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
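Loads created during legalization need an explicit MachineMemOperand describing what they touch; the flags and accessors above combine as in this sketch of a GOT load. DstReg and GOTAddr are assumed to be registers prepared earlier.

MachineFunction &MF = B.getMF();
MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), Align(8));
B.buildLoad(DstReg, GOTAddr, *GOTMMO);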
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:299
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:388
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:314
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
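These address-space numbers parameterize the pointer LLTs and legality checks throughout the file; the pointer widths below follow the AMDGPU data layout (32-bit for local/region/private, 64-bit otherwise).

const LLT GlobalPtr  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
const LLT ConstPtr   = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
const LLT LocalPtr   = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);
const LLT PrivatePtr = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
const LLT FlatPtr    = LLT::pointer(AMDGPUAS::FLAT_ADDRESS, 64);

// Global, constant and flat pointers are flat-addressable.
bool FlatLike = AMDGPU::isFlatGlobalAddrSpace(AMDGPUAS::GLOBAL_ADDRESS);  // true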
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX1250(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
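The two namespaces above supply the predicates and mutations that conditional rules are built from; the file's using-directives let them be written unqualified. A representative composition, not a rule copied from this file; S32 abbreviates LLT::scalar(32).

getActionDefinitionsBuilder(TargetOpcode::G_CTPOP)
    .legalFor({{S32, S32}})
    // If the source (type index 1) is narrower than 32 bits, widen it to s32.
    .widenScalarIf(scalarNarrowerThan(1, 32), changeTo(1, S32))
    // Vectors of s32 results are handled one element at a time.
    .fewerElementsIf(elementTypeIs(0, S32), scalarize(0))
    .clampScalar(0, S32, S32);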
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:915
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:477
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:2034
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:651
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:459
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition Utils.cpp:314
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:202
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
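The integer-math helpers above appear throughout the legality predicates (rounding sizes to powers of two, counting 32-bit pieces, and so on). Self-contained evaluations with their results in the comments:

#include "llvm/Support/MathExtras.h"
using namespace llvm;

unsigned CeilLog2   = Log2_32_Ceil(24);     // 5, since 2^5 = 32 >= 24
uint64_t RoundedUp  = PowerOf2Ceil(96);     // 128
uint64_t StrictNext = NextPowerOf2(64);     // 128 (strictly greater than 64)
bool     IsPow2     = isPowerOf2_32(96);    // false
unsigned NumDwords  = divideCeil(70u, 32u); // 3 dwords needed to hold 70 bits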
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1719
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
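The constant-lookup utilities above are how a hook reads an operand that should be an immediate. A fragment-style sketch, assuming MI and MRI are in scope; the operand index and the 20-bit range are purely illustrative.

Register OffReg = MI.getOperand(2).getReg();
if (std::optional<ValueAndVReg> Cst =
        getIConstantVRegValWithLookThrough(OffReg, MRI)) {
  int64_t Imm = Cst->Value.getSExtValue();
  if (isInt<20>(Imm)) {
    // Small enough: fold Imm straight into the instruction's immediate field.
  }
}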
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering, quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
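The mode and denormal-handling queries above decide how aggressive an FP expansion may be, for example whether an f32 division expansion must scale its operands around flushed denormals. A sketch, assuming MF is the current MachineFunction:

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
SIModeRegisterDefaults Mode = MFI->getMode();

// Per-function default: are f32 denormals preserved (IEEE) or flushed?
bool F32DenormsPreserved = Mode.FP32Denormals == DenormalMode::getIEEE();

// Equivalent query through the function's floating-point environment.
DenormalMode F32Mode = MF.getDenormalMode(APFloat::IEEEsingle());
bool F32InputsFlushed = F32Mode.Input == DenormalMode::PreserveSign;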