AMDGPULegalizerInfo.cpp
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(false),
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
59 static LLT getPow2VectorType(LLT Ty) {
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
66 static LLT getPow2ScalarType(LLT Ty) {
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should be widened by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
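// Predicate: true when the total bit width of the type at TypeIdx is a
// multiple of 32.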
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
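// Predicate: true for vectors with 16-bit elements and more than two of them
// (i.e. wider than v2s16).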
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
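// Mutation: append a single element to the vector at TypeIdx, e.g.
// <3 x s16> -> <4 x s16>.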
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
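// Mutation: reduce the element count of the vector at TypeIdx so that each
// resulting piece is at most 64 bits wide.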
113 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 ElementCount::getFixed(NewNumElts), EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
152 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
170 };
171}
172
173 static LLT getBufferRsrcScalarType(const LLT Ty) {
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
180 static LLT getBufferRsrcRegisterType(const LLT Ty) {
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
187 static LLT getBitcastRegisterType(const LLT Ty) {
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
196 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
197}
198
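// Mutation: bitcast the type at TypeIdx to the register-friendly type chosen
// by getBitcastRegisterType above.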
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
206 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
212 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
213 };
214}
215
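// Predicates on the total bit width of the vector at TypeIdx relative to Size.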
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
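// A size is usable as a register size if it is a multiple of 32 bits (or
// exactly 16 bits when true16 instructions are available) and fits in the
// largest register tuple.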
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
239 Size <= MaxRegisterSize;
240}
241
242 static bool isRegisterVectorElementType(LLT EltTy) {
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265 // Any combination of 32 or 64-bit elements up to the maximum register size, and
266 // multiples of v2s16.
267 static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
277 static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
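// Shorthand LLT constants used by the legalization rules below.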
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::scalar(32); // TODO: Expected float32
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::scalar(64); // TODO: Expected float64
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
322 // TODO: Expected LLT::fixed_vector(2, LLT::float16())
323 constexpr LLT V2F16 = LLT::fixed_vector(2, 16);
324 constexpr LLT V2BF16 = V2F16; // FIXME
325
326constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
327constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
328constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
329constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
330constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
331constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
332constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
333constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
334constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
335constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
336constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
337constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
338constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
339
340constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
341constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
342constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
343constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
344constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
345constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
346constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
347constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
348
349constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
350constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
351
352 constexpr std::initializer_list<LLT> AllScalarTypes = {
353 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
354
355 constexpr std::initializer_list<LLT> AllS16Vectors{
356 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
357
358 constexpr std::initializer_list<LLT> AllS32Vectors = {
359 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
360 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
361
362 constexpr std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64,
363 V5S64, V6S64, V7S64, V8S64, V16S64};
364
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
375
376 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
377 is_contained(AllScalarTypes, Ty) ||
378 (ST.useRealTrue16Insts() && Ty == S16) ||
379 is_contained(AllS16Vectors, Ty);
380}
381
382 static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32-bits, we need to reduce to a 32-bit type.
391 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32-bits and mem location is a power of 2
401 static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
415 case AMDGPUAS::PRIVATE_ADDRESS:
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
418 case AMDGPUAS::LOCAL_ADDRESS:
419 return ST.useDS128() ? 128 : 64;
420 case AMDGPUAS::GLOBAL_ADDRESS:
421 case AMDGPUAS::CONSTANT_ADDRESS:
422 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
423 case AMDGPUAS::BUFFER_FAT_POINTER:
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
438
439static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
440 const LegalityQuery &Query) {
441 const LLT Ty = Query.Types[0];
442
443 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
444 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
445
446 unsigned RegSize = Ty.getSizeInBits();
447 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
448 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
449 unsigned AS = Query.Types[1].getAddressSpace();
450
451 // All of these need to be custom lowered to cast the pointer operand.
453 return false;
454
455 // Do not handle extending vector loads.
456 if (Ty.isVector() && MemSize != RegSize)
457 return false;
458
459 // TODO: We should be able to widen loads if the alignment is high enough, but
460 // we also need to modify the memory access size.
461#if 0
462 // Accept widening loads based on alignment.
463 if (IsLoad && MemSize < Size)
464 MemSize = std::max(MemSize, Align);
465#endif
466
467 // Only 1-byte and 2-byte to 32-bit extloads are valid.
468 if (MemSize != RegSize && RegSize != 32)
469 return false;
470
471 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
472 Query.MMODescrs[0].Ordering !=
473 AtomicOrdering::NotAtomic))
474 return false;
475
476 switch (MemSize) {
477 case 8:
478 case 16:
479 case 32:
480 case 64:
481 case 128:
482 break;
483 case 96:
484 if (!ST.hasDwordx3LoadStores())
485 return false;
486 break;
487 case 256:
488 case 512:
489 // These may contextually need to be broken down.
490 break;
491 default:
492 return false;
493 }
494
495 assert(RegSize >= MemSize);
496
497 if (AlignBits < MemSize) {
498 const SITargetLowering *TLI = ST.getTargetLowering();
499 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
500 Align(AlignBits / 8)))
501 return false;
502 }
503
504 return true;
505}
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510 // arguments. Therefore, we convert resource pointers - or vectors of them -
511 // to integer values here.
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
523 // work around this. Eventually it should ignore the type for loads and only care
524 // about the size. Return true in cases where we will work around this for now by
525// bitcasting.
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
527 if (EnableNewLegality)
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
536 if (hasBufferRsrcWorkaround(Ty))
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
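// Combined legality check for loads and stores: the value must fit a register
// type, the size/alignment checks must pass, and none of the pointer
// workarounds may apply.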
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
548 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
560 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(Ty.getElementType());
567}
568
569/// Return true if we should legalize a load by widening an odd sized memory
570 /// access up to the alignment. Note that it is the memory access itself that
571 /// changes, not the size of the result register.
572static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
573 uint64_t AlignInBits, unsigned AddrSpace,
574 unsigned Opcode) {
575 unsigned SizeInBits = MemoryTy.getSizeInBits();
576 // We don't want to widen cases that are naturally legal.
577 if (isPowerOf2_32(SizeInBits))
578 return false;
579
580 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
581 // end up widening these for a scalar load during RegBankSelect, if we don't
582 // have 96-bit scalar loads.
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
584 return false;
585
586 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
587 return false;
588
589 // A load is known dereferenceable up to the alignment, so it's legal to widen
590 // to it.
591 //
592 // TODO: Could check dereferenceable for less aligned cases.
593 unsigned RoundedSize = NextPowerOf2(SizeInBits);
594 if (AlignInBits < RoundedSize)
595 return false;
596
597 // Do not widen if it would introduce a slow unaligned load.
598 const SITargetLowering *TLI = ST.getTargetLowering();
599 unsigned Fast = 0;
600 return TLI->allowsMisalignedMemoryAccessesImpl(
601 RoundedSize, AddrSpace, Align(AlignInBits / 8),
602 MachineMemOperand::MOLoad, &Fast) &&
603 Fast;
604}
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
612 Query.MMODescrs[0].AlignInBits,
613 Query.Types[1].getAddressSpace(), Opcode);
614}
615
616 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
617/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
618/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
619 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
620 MachineRegisterInfo &MRI, unsigned Idx) {
621 MachineOperand &MO = MI.getOperand(Idx);
622
623 const LLT PointerTy = MRI.getType(MO.getReg());
624
625 // Paranoidly prevent us from doing this multiple times.
626 if (!hasBufferRsrcWorkaround(PointerTy))
627 return PointerTy;
628
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 if (!PointerTy.isVector()) {
632 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
633 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
634 const LLT S32 = LLT::scalar(32);
635
636 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
639 for (unsigned I = 0; I < NumParts; ++I)
640 VectorElems[I] =
641 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
643 MO.setReg(VectorReg);
644 return VectorTy;
645 }
646 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
647 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
648 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
650 MO.setReg(BitcastReg);
651
652 return VectorTy;
653}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
660 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Unmerged.getReg(I));
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
674 }
675 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
677}
678
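// Rewrite operand Idx of MI in place so that a buffer resource (p8) argument
// is presented to the underlying operation as a <4 x s32> value.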
679 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
685 if (!hasBufferRsrcWorkaround(PointerTy))
686 return;
687 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
688}
689
690 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
736
737 // s1 for VCC branches, s32 for SCC branches.
738 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
739
740 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
741 // elements for v3s16
742 getActionDefinitionsBuilder(G_PHI)
743 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
744 .legalFor(AllS32Vectors)
745 .legalFor(AllS64Vectors)
746 .legalFor(AddrSpaces64)
747 .legalFor(AddrSpaces32)
748 .legalFor(AddrSpaces128)
749 .legalIf(isPointer(0))
750 .clampScalar(0, S16, S256)
751 .widenScalarToNextPow2(0, 32)
752 .clampMaxNumElements(0, S32, 16)
753 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
754 .scalarize(0);
755
756 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
757 // Full set of gfx9 features.
758 if (ST.hasScalarAddSub64()) {
759 getActionDefinitionsBuilder({G_ADD, G_SUB})
760 .legalFor({S64, S32, S16, V2S16})
761 .clampMaxNumElementsStrict(0, S16, 2)
762 .scalarize(0)
763 .minScalar(0, S16)
765 .maxScalar(0, S32);
766 } else {
767 getActionDefinitionsBuilder({G_ADD, G_SUB})
768 .legalFor({S32, S16, V2S16})
769 .clampMaxNumElementsStrict(0, S16, 2)
770 .scalarize(0)
771 .minScalar(0, S16)
773 .maxScalar(0, S32);
774 }
775
776 if (ST.hasScalarSMulU64()) {
777 getActionDefinitionsBuilder(G_MUL)
778 .legalFor({S64, S32, S16, V2S16})
779 .clampMaxNumElementsStrict(0, S16, 2)
780 .scalarize(0)
781 .minScalar(0, S16)
783 .custom();
784 } else {
785 getActionDefinitionsBuilder(G_MUL)
786 .legalFor({S32, S16, V2S16})
787 .clampMaxNumElementsStrict(0, S16, 2)
788 .scalarize(0)
789 .minScalar(0, S16)
791 .custom();
792 }
793 assert(ST.hasMad64_32());
794
795 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
796 .legalFor({S32, S16, V2S16}) // Clamp modifier
797 .minScalarOrElt(0, S16)
799 .scalarize(0)
801 .lower();
802 } else if (ST.has16BitInsts()) {
803 getActionDefinitionsBuilder({G_ADD, G_SUB})
804 .legalFor({S32, S16})
805 .minScalar(0, S16)
807 .maxScalar(0, S32)
808 .scalarize(0);
809
810 getActionDefinitionsBuilder(G_MUL)
811 .legalFor({S32, S16})
812 .scalarize(0)
813 .minScalar(0, S16)
815 .custom();
816 assert(ST.hasMad64_32());
817
818 // Technically the saturating operations require clamp bit support, but this
819 // was introduced at the same time as 16-bit operations.
820 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
821 .legalFor({S32, S16}) // Clamp modifier
822 .minScalar(0, S16)
823 .scalarize(0)
825 .lower();
826
827 // We're just lowering this, but it helps get a better result to try to
828 // coerce to the desired type first.
829 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
830 .minScalar(0, S16)
831 .scalarize(0)
832 .lower();
833 } else {
834 getActionDefinitionsBuilder({G_ADD, G_SUB})
835 .legalFor({S32})
836 .widenScalarToNextMultipleOf(0, 32)
837 .clampScalar(0, S32, S32)
838 .scalarize(0);
839
840 auto &Mul = getActionDefinitionsBuilder(G_MUL)
841 .legalFor({S32})
842 .scalarize(0)
843 .minScalar(0, S32)
844 .widenScalarToNextMultipleOf(0, 32);
845
846 if (ST.hasMad64_32())
847 Mul.custom();
848 else
849 Mul.maxScalar(0, S32);
850
851 if (ST.hasIntClamp()) {
852 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
853 .legalFor({S32}) // Clamp modifier.
854 .scalarize(0)
856 .lower();
857 } else {
858 // Clamp bit support was added in VI, along with 16-bit operations.
859 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
860 .minScalar(0, S32)
861 .scalarize(0)
862 .lower();
863 }
864
865 // FIXME: DAG expansion gets better results. The widening uses the smaller
866 // range values and goes for the min/max lowering directly.
867 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
868 .minScalar(0, S32)
869 .scalarize(0)
870 .lower();
871 }
872
873 getActionDefinitionsBuilder(
874 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
875 .customFor({S32, S64})
876 .clampScalar(0, S32, S64)
877 .widenScalarToNextPow2(0, 32)
878 .scalarize(0);
879
880 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
881 .legalFor({S32})
882 .maxScalar(0, S32);
883
884 if (ST.hasVOP3PInsts()) {
885 Mulh
886 .clampMaxNumElements(0, S8, 2)
887 .lowerFor({V2S8});
888 }
889
890 Mulh
891 .scalarize(0)
892 .lower();
893
894 // Report legal for any types we can handle anywhere. For the cases only legal
895 // on the SALU, RegBankSelect will be able to re-legalize.
896 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
897 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
898 .clampScalar(0, S32, S64)
904 .scalarize(0);
905
906 getActionDefinitionsBuilder(
907 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
908 .legalFor({{S32, S1}, {S32, S32}})
909 .clampScalar(0, S32, S32)
910 .scalarize(0);
911
912 getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
913 // Don't worry about the size constraint.
914 .legalIf(all(isScalar(0), isScalar(1)))
915 .lower();
916
917 getActionDefinitionsBuilder(G_CONSTANT)
918 .legalFor({S1, S32, S64, S16, GlobalPtr,
919 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
920 .legalIf(isPointer(0))
921 .clampScalar(0, S32, S64)
922 .widenScalarToNextPow2(0);
923
924 getActionDefinitionsBuilder(G_FCONSTANT)
925 .legalFor({S32, S64, S16})
926 .clampScalar(0, S16, S64);
927
928 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
929 .legalIf(isRegisterClassType(ST, 0))
930 // s1 and s16 are special cases because they have legal operations on
931 // them, but don't really occupy registers in the normal way.
932 .legalFor({S1, S16})
933 .clampNumElements(0, V16S32, V32S32)
937 .clampMaxNumElements(0, S32, 16);
938
939 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
940
941 // If the amount is divergent, we have to do a wave reduction to get the
942 // maximum value, so this is expanded during RegBankSelect.
943 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
944 .legalFor({{PrivatePtr, S32}});
945
946 getActionDefinitionsBuilder(G_STACKSAVE)
947 .customFor({PrivatePtr});
948 getActionDefinitionsBuilder(G_STACKRESTORE)
949 .legalFor({PrivatePtr});
950
951 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
952
953 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
954 .customIf(typeIsNot(0, PrivatePtr));
955
956 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
957
958 auto &FPOpActions = getActionDefinitionsBuilder(
959 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
960 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
961 .legalFor({S32, S64});
962 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
963 .customFor({S32, S64});
964 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
965 .customFor({S32, S64});
966
967 if (ST.has16BitInsts()) {
968 if (ST.hasVOP3PInsts())
969 FPOpActions.legalFor({S16, V2S16});
970 else
971 FPOpActions.legalFor({S16});
972
973 TrigActions.customFor({S16});
974 FDIVActions.customFor({S16});
975 }
976
977 if (ST.hasPackedFP32Ops()) {
978 FPOpActions.legalFor({V2S32});
979 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
980 }
981
982 auto &MinNumMaxNumIeee =
983 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
984
985 if (ST.hasVOP3PInsts()) {
986 MinNumMaxNumIeee.legalFor(FPTypesPK16)
987 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
988 .clampMaxNumElements(0, S16, 2)
989 .clampScalar(0, S16, S64)
990 .scalarize(0);
991 } else if (ST.has16BitInsts()) {
992 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
993 } else {
994 MinNumMaxNumIeee.legalFor(FPTypesBase)
995 .clampScalar(0, S32, S64)
996 .scalarize(0);
997 }
998
999 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1000 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1001
1002 if (ST.hasVOP3PInsts()) {
1003 MinNumMaxNum.customFor(FPTypesPK16)
1004 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1005 .clampMaxNumElements(0, S16, 2)
1006 .clampScalar(0, S16, S64)
1007 .scalarize(0);
1008 } else if (ST.has16BitInsts()) {
1009 MinNumMaxNum.customFor(FPTypes16)
1010 .clampScalar(0, S16, S64)
1011 .scalarize(0);
1012 } else {
1013 MinNumMaxNum.customFor(FPTypesBase)
1014 .clampScalar(0, S32, S64)
1015 .scalarize(0);
1016 }
1017
1018 if (ST.hasVOP3PInsts())
1019 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1020
1021 FPOpActions
1022 .scalarize(0)
1023 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1024
1025 TrigActions
1026 .scalarize(0)
1027 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1028
1029 FDIVActions
1030 .scalarize(0)
1031 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1032
1033 auto &FNegAbs = getActionDefinitionsBuilder({G_FNEG, G_FABS});
1034 FNegAbs.legalFor(FPTypesPK16)
1035 .legalFor(ST.hasPackedFP32Ops(), {V2S32})
1037 if (ST.hasPackedFP32Ops())
1038 FNegAbs.clampMaxNumElementsStrict(0, S32, 2);
1039 FNegAbs.scalarize(0).clampScalar(0, S16, S64);
1040
1041 if (ST.has16BitInsts()) {
1042 getActionDefinitionsBuilder(G_FSQRT)
1043 .legalFor({S16})
1044 .customFor({S32, S64})
1045 .scalarize(0)
1046 .unsupported();
1047 getActionDefinitionsBuilder(G_FFLOOR)
1048 .legalFor({S32, S64, S16})
1049 .scalarize(0)
1050 .clampScalar(0, S16, S64);
1051
1052 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1053 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1054 .scalarize(0)
1055 .maxScalarIf(typeIs(0, S16), 1, S16)
1056 .clampScalar(1, S32, S32)
1057 .lower();
1058
1059 getActionDefinitionsBuilder(G_FFREXP)
1060 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1061 .scalarize(0)
1062 .lower();
1063
1065 .lowerFor({S16, S32, S64})
1066 .scalarize(0)
1067 .lower();
1068 } else {
1069 getActionDefinitionsBuilder(G_FSQRT)
1070 .customFor({S32, S64, S16})
1071 .scalarize(0)
1072 .unsupported();
1073
1074
1075 if (ST.hasFractBug()) {
1076 getActionDefinitionsBuilder(G_FFLOOR)
1077 .customFor({S64})
1078 .legalFor({S32, S64})
1079 .scalarize(0)
1080 .clampScalar(0, S32, S64);
1081 } else {
1082 getActionDefinitionsBuilder(G_FFLOOR)
1083 .legalFor({S32, S64})
1084 .scalarize(0)
1085 .clampScalar(0, S32, S64);
1086 }
1087
1088 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1089 .legalFor({{S32, S32}, {S64, S32}})
1090 .scalarize(0)
1091 .clampScalar(0, S32, S64)
1092 .clampScalar(1, S32, S32)
1093 .lower();
1094
1095 getActionDefinitionsBuilder(G_FFREXP)
1096 .customFor({{S32, S32}, {S64, S32}})
1097 .scalarize(0)
1098 .minScalar(0, S32)
1099 .clampScalar(1, S32, S32)
1100 .lower();
1101
1103 .lowerFor({S32, S64})
1104 .scalarize(0)
1105 .lower();
1106 }
1107
1108 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1109 if (ST.hasCvtPkF16F32Inst()) {
1110 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1111 .clampMaxNumElements(0, S16, 2);
1112 } else {
1113 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1114 }
1115 FPTruncActions.scalarize(0).lower();
1116
1117 getActionDefinitionsBuilder(G_FPEXT)
1118 .legalFor({{S64, S32}, {S32, S16}})
1119 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1120 .scalarize(0);
1121
1122 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1123 if (ST.has16BitInsts()) {
1124 FSubActions
1125 // Use actual fsub instruction
1126 .legalFor({S32, S16})
1127 // Must use fadd + fneg
1128 .lowerFor({S64, V2S16});
1129 } else {
1130 FSubActions
1131 // Use actual fsub instruction
1132 .legalFor({S32})
1133 // Must use fadd + fneg
1134 .lowerFor({S64, S16, V2S16});
1135 }
1136
1137 if (ST.hasPackedFP32Ops())
1138 FSubActions.lowerFor({V2S32}).clampMaxNumElements(0, S32, 2);
1139
1140 FSubActions
1141 .scalarize(0)
1142 .clampScalar(0, S32, S64);
1143
1144 // Whether this is legal depends on the floating point mode for the function.
1145 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1146 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1147 FMad.customFor({S32, S16});
1148 else if (ST.hasMadMacF32Insts())
1149 FMad.customFor({S32});
1150 else if (ST.hasMadF16())
1151 FMad.customFor({S16});
1152 FMad.scalarize(0)
1153 .lower();
1154
1155 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1156 if (ST.has16BitInsts()) {
1157 FRem.customFor({S16, S32, S64});
1158 } else {
1159 FRem.minScalar(0, S32)
1160 .customFor({S32, S64});
1161 }
1162 FRem.scalarize(0);
1163
1164 // TODO: Do we need to clamp maximum bitwidth?
1165 getActionDefinitionsBuilder(G_TRUNC)
1166 .legalIf(isScalar(0))
1167 .legalFor({{V2S16, V2S32}})
1168 .clampMaxNumElements(0, S16, 2)
1169 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1170 // situations (like an invalid implicit use), we don't want to infinite loop
1171 // in the legalizer.
1173 .alwaysLegal();
1174
1175 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1176 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1177 {S32, S1}, {S64, S1}, {S16, S1}})
1178 .scalarize(0)
1179 .clampScalar(0, S32, S64)
1180 .widenScalarToNextPow2(1, 32);
1181
1182 // TODO: Split s1->s64 during regbankselect for VALU.
1183 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1184 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1185 .lowerIf(typeIs(1, S1))
1186 .customFor({{S32, S64}, {S64, S64}});
1187 if (ST.has16BitInsts())
1188 IToFP.legalFor({{S16, S16}});
1189 IToFP.clampScalar(1, S32, S64)
1190 .minScalar(0, S32)
1191 .scalarize(0)
1192 .widenScalarToNextPow2(1);
1193
1194 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1195 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1196 .customFor({{S64, S32}, {S64, S64}})
1197 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1198 if (ST.has16BitInsts())
1199 FPToI.legalFor({{S16, S16}});
1200 else
1201 FPToI.minScalar(1, S32);
1202
1203 FPToI.minScalar(0, S32)
1204 .widenScalarToNextPow2(0, 32)
1205 .scalarize(0)
1206 .lower();
1207
1208 // clang-format off
1209 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1210 .legalFor({{S32, S32}, {S32, S64}})
1211 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1212 if (ST.has16BitInsts())
1213 FPToISat.legalFor({{S16, S16}});
1214
1215 FPToISat.minScalar(1, S32);
1216 FPToISat.minScalar(0, S32)
1217 .widenScalarToNextPow2(0, 32)
1218 .scalarize(0)
1219 .lower();
1220 // clang-format on
1221
1222 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1223 .clampScalar(0, S16, S64)
1224 .scalarize(0)
1225 .lower();
1226
1227 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1228 .legalFor({S16, S32})
1229 .scalarize(0)
1230 .lower();
1231
1232 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1233 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1234 .scalarize(0)
1235 .lower();
1236
1237 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1238 .clampScalar(0, S16, S64)
1239 .scalarize(0)
1240 .lower();
1241
1242 if (ST.has16BitInsts()) {
1244 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1245 .legalFor({S16, S32, S64})
1246 .clampScalar(0, S16, S64)
1247 .scalarize(0);
1248 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1250 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1251 .legalFor({S32, S64})
1252 .clampScalar(0, S32, S64)
1253 .scalarize(0);
1254 } else {
1256 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1257 .legalFor({S32})
1258 .customFor({S64})
1259 .clampScalar(0, S32, S64)
1260 .scalarize(0);
1261 }
1262
1263 getActionDefinitionsBuilder(G_PTR_ADD)
1264 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1265 .legalIf(all(isPointer(0), sameSize(0, 1)))
1266 .scalarize(0)
1267 .scalarSameSizeAs(1, 0);
1268
1269 getActionDefinitionsBuilder(G_PTRMASK)
1270 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1271 .scalarSameSizeAs(1, 0)
1272 .scalarize(0);
1273
1274 auto &CmpBuilder =
1275 getActionDefinitionsBuilder(G_ICMP)
1276 // The compare output type differs based on the register bank of the output,
1277 // so make both s1 and s32 legal.
1278 //
1279 // Scalar compares producing output in scc will be promoted to s32, as that
1280 // is the allocatable register type that will be needed for the copy from
1281 // scc. This will be promoted during RegBankSelect, and we assume something
1282 // before that won't try to use s32 result types.
1283 //
1284 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1285 // bank.
1286 .legalForCartesianProduct(
1287 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1288 .legalForCartesianProduct(
1289 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1290 if (ST.has16BitInsts()) {
1291 CmpBuilder.legalFor({{S1, S16}});
1292 }
1293
1294 CmpBuilder
1295 .widenScalarToNextPow2(1)
1296 .clampScalar(1, S32, S64)
1297 .scalarize(0)
1298 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1299
1300 auto &FCmpBuilder =
1301 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1302 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1303
1304 if (ST.hasSALUFloatInsts())
1305 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1306
1307 FCmpBuilder
1308 .widenScalarToNextPow2(1)
1309 .clampScalar(1, S32, S64)
1310 .scalarize(0);
1311
1312 // FIXME: fpow has a selection pattern that should move to custom lowering.
1313 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1314 if (ST.has16BitInsts())
1315 ExpOps.customFor({{S32}, {S16}});
1316 else
1317 ExpOps.customFor({S32});
1318 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1319 .scalarize(0);
1320
1321 getActionDefinitionsBuilder(G_FPOWI)
1322 .clampScalar(0, MinScalarFPTy, S32)
1323 .lower();
1324
1326 .legalFor(ST.has16BitInsts(), {S16})
1327 .customFor({S32, S16})
1328 .scalarize(0)
1329 .lower();
1330
1332 .legalFor(ST.has16BitInsts(), {S16})
1333 .customFor({S32, S64, S16})
1334 .scalarize(0)
1335 .lower();
1336
1337 auto &LogOps =
1338 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1339 LogOps.customFor({S32, S16, S64});
1340 LogOps.clampScalar(0, MinScalarFPTy, S32)
1341 .scalarize(0);
1342
1343 // The 64-bit versions produce 32-bit results, but only on the SALU.
1344 getActionDefinitionsBuilder(G_CTPOP)
1345 .legalFor({{S32, S32}, {S32, S64}})
1346 .clampScalar(0, S32, S32)
1347 .widenScalarToNextPow2(1, 32)
1348 .clampScalar(1, S32, S64)
1349 .scalarize(0)
1350 .widenScalarToNextPow2(0, 32);
1351
1352 // If no 16-bit instruction is available, lower into different instructions.
1353 if (ST.has16BitInsts())
1354 getActionDefinitionsBuilder(G_IS_FPCLASS)
1355 .legalForCartesianProduct({S1}, FPTypes16)
1356 .widenScalarToNextPow2(1)
1357 .scalarize(0)
1358 .lower();
1359 else
1360 getActionDefinitionsBuilder(G_IS_FPCLASS)
1361 .legalForCartesianProduct({S1}, FPTypesBase)
1362 .lowerFor({S1, S16})
1363 .widenScalarToNextPow2(1)
1364 .scalarize(0)
1365 .lower();
1366
1367 // The hardware instructions return a different result on 0 than the generic
1368 // instructions expect. The hardware produces -1, but these produce the
1369 // bitwidth.
1370 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1371 .scalarize(0)
1372 .clampScalar(0, S32, S32)
1373 .clampScalar(1, S32, S64)
1374 .widenScalarToNextPow2(0, 32)
1375 .widenScalarToNextPow2(1, 32)
1376 .custom();
1377
1378 // The 64-bit versions produce 32-bit results, but only on the SALU.
1379 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1380 .legalFor({{S32, S32}, {S32, S64}})
1381 .customIf(scalarNarrowerThan(1, 32))
1382 .clampScalar(0, S32, S32)
1383 .clampScalar(1, S32, S64)
1384 .scalarize(0)
1385 .widenScalarToNextPow2(0, 32)
1386 .widenScalarToNextPow2(1, 32);
1387
1388 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1389 .legalFor({{S32, S32}, {S32, S64}})
1390 .clampScalar(0, S32, S32)
1391 .clampScalar(1, S32, S64)
1392 .scalarize(0)
1393 .widenScalarToNextPow2(0, 32)
1394 .widenScalarToNextPow2(1, 32);
1395
1397 .customFor({{S32, S32}})
1398 .scalarize(0)
1399 .clampScalar(0, S32, S32)
1400 .clampScalar(1, S32, S32);
1401
1402 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1403 // RegBankSelect.
1404 getActionDefinitionsBuilder(G_BITREVERSE)
1405 .legalFor({S32, S64})
1406 .clampScalar(0, S32, S64)
1407 .scalarize(0)
1408 .widenScalarToNextPow2(0);
1409
1410 if (ST.has16BitInsts()) {
1411 getActionDefinitionsBuilder(G_BSWAP)
1412 .legalFor({S16, S32, V2S16})
1413 .clampMaxNumElementsStrict(0, S16, 2)
1414 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1415 // narrowScalar limitation.
1416 .widenScalarToNextPow2(0)
1417 .clampScalar(0, S16, S32)
1418 .scalarize(0);
1419
1420 if (ST.hasVOP3PInsts()) {
1422 .legalFor({S32, S16, V2S16})
1423 .clampMaxNumElements(0, S16, 2)
1424 .minScalar(0, S16)
1426 .scalarize(0)
1427 .lower();
1428 if (ST.hasIntMinMax64()) {
1429 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1430 .legalFor({S32, S16, S64, V2S16})
1431 .clampMaxNumElements(0, S16, 2)
1432 .minScalar(0, S16)
1434 .scalarize(0)
1435 .lower();
1436 } else {
1437 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1438 .legalFor({S32, S16, V2S16})
1439 .clampMaxNumElements(0, S16, 2)
1440 .minScalar(0, S16)
1442 .scalarize(0)
1443 .lower();
1444 }
1445 } else {
1446 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1447 .legalFor({S32, S16})
1448 .widenScalarToNextPow2(0)
1449 .minScalar(0, S16)
1450 .scalarize(0)
1451 .lower();
1452 }
1453 } else {
1454 // TODO: Should have same legality without v_perm_b32
1455 getActionDefinitionsBuilder(G_BSWAP)
1456 .legalFor({S32})
1457 .lowerIf(scalarNarrowerThan(0, 32))
1458 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1459 // narrowScalar limitation.
1460 .widenScalarToNextPow2(0)
1461 .maxScalar(0, S32)
1462 .scalarize(0)
1463 .lower();
1464
1465 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1466 .legalFor({S32})
1467 .minScalar(0, S32)
1468 .widenScalarToNextPow2(0)
1469 .scalarize(0)
1470 .lower();
1471 }
1472
1473 getActionDefinitionsBuilder(G_INTTOPTR)
1474 // List the common cases
1475 .legalForCartesianProduct(AddrSpaces64, {S64})
1476 .legalForCartesianProduct(AddrSpaces32, {S32})
1477 .scalarize(0)
1478 // Accept any address space as long as the size matches
1479 .legalIf(sameSize(0, 1))
1480 .widenScalarIf(smallerThan(1, 0),
1481 [](const LegalityQuery &Query) {
1482 return std::pair(
1483 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1484 })
1485 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1486 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1487 });
1488
1489 getActionDefinitionsBuilder(G_PTRTOINT)
1490 // List the common cases
1491 .legalForCartesianProduct(AddrSpaces64, {S64})
1492 .legalForCartesianProduct(AddrSpaces32, {S32})
1493 .scalarize(0)
1494 // Accept any address space as long as the size matches
1495 .legalIf(sameSize(0, 1))
1496 .widenScalarIf(smallerThan(0, 1),
1497 [](const LegalityQuery &Query) {
1498 return std::pair(
1499 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1500 })
1501 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1502 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1503 });
1504
1505 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1506 .scalarize(0)
1507 .custom();
1508
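// Decide whether a memory access must be split: vector extloads, accesses
// wider than the address space allows, and sizes needing a non-power-of-two
// number of 32-bit registers (other than dwordx3 where supported).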
1509 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1510 bool IsLoad) -> bool {
1511 const LLT DstTy = Query.Types[0];
1512
1513 // Split vector extloads.
1514 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1515
1516 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1517 return true;
1518
1519 const LLT PtrTy = Query.Types[1];
1520 unsigned AS = PtrTy.getAddressSpace();
1521 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1522 Query.MMODescrs[0].Ordering !=
1524 return true;
1525
1526 // Catch weird sized loads that don't evenly divide into the access sizes
1527 // TODO: May be able to widen depending on alignment etc.
1528 unsigned NumRegs = (MemSize + 31) / 32;
1529 if (NumRegs == 3) {
1530 if (!ST.hasDwordx3LoadStores())
1531 return true;
1532 } else {
1533 // If the alignment allows, these should have been widened.
1534 if (!isPowerOf2_32(NumRegs))
1535 return true;
1536 }
1537
1538 return false;
1539 };
1540
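// Required alignment (in bits) for global/constant accesses of each width;
// 0 means no restriction when unaligned buffer access is enabled.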
1541 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1542 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1543 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1544
1545 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1546 // LDS
1547 // TODO: Unsupported flat for SI.
1548
1549 for (unsigned Op : {G_LOAD, G_STORE}) {
1550 const bool IsStore = Op == G_STORE;
1551
1552 auto &Actions = getActionDefinitionsBuilder(Op);
1553 // Explicitly list some common cases.
1554 // TODO: Does this help compile time at all?
1555 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1556 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1557 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1558 {S64, GlobalPtr, S64, GlobalAlign32},
1559 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1560 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1561 {S32, GlobalPtr, S8, GlobalAlign8},
1562 {S32, GlobalPtr, S16, GlobalAlign16},
1563
1564 {S32, LocalPtr, S32, 32},
1565 {S64, LocalPtr, S64, 32},
1566 {V2S32, LocalPtr, V2S32, 32},
1567 {S32, LocalPtr, S8, 8},
1568 {S32, LocalPtr, S16, 16},
1569 {V2S16, LocalPtr, S32, 32},
1570
1571 {S32, PrivatePtr, S32, 32},
1572 {S32, PrivatePtr, S8, 8},
1573 {S32, PrivatePtr, S16, 16},
1574 {V2S16, PrivatePtr, S32, 32},
1575
1576 {S32, ConstantPtr, S32, GlobalAlign32},
1577 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1578 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1579 {S64, ConstantPtr, S64, GlobalAlign32},
1580 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1581 Actions.legalIf(
1582 [=](const LegalityQuery &Query) -> bool {
1583 return isLoadStoreLegal(ST, Query);
1584 });
1585
1586 // The custom pointers (fat pointers, buffer resources) don't work with load
1587 // and store at this level. Fat pointers should have been lowered to
1588 // intrinsics before the translation to MIR.
1589 Actions.unsupportedIf(
1590 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1591
1592 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1593 // ptrtoint. This is needed to account for the fact that we can't have i128
1594 // as a register class for SelectionDAG reasons.
1595 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1596 return hasBufferRsrcWorkaround(Query.Types[0]);
1597 });
1598
1599 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1600 // 64-bits.
1601 //
1602 // TODO: Should generalize bitcast action into coerce, which will also cover
1603 // inserting addrspacecasts.
1604 Actions.customIf(typeIs(1, Constant32Ptr));
1605
1606 // Turn any illegal element vectors into something easier to deal
1607 // with. These will ultimately produce 32-bit scalar shifts to extract the
1608 // parts anyway.
1609 //
1610 // For odd 16-bit element vectors, prefer to split those into pieces with
1611 // 16-bit vector parts.
1612 Actions.bitcastIf(
1613 [=](const LegalityQuery &Query) -> bool {
1614 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1615 Query.MMODescrs[0].MemoryTy);
1616 }, bitcastToRegisterType(0));
1617
1618 if (!IsStore) {
1619 // Widen suitably aligned loads by loading extra bytes. The standard
1620 // legalization actions can't properly express widening memory operands.
1621 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1622 return shouldWidenLoad(ST, Query, G_LOAD);
1623 });
1624 }
1625
1626 // FIXME: load/store narrowing should be moved to lower action
1627 Actions
1628 .narrowScalarIf(
1629 [=](const LegalityQuery &Query) -> bool {
1630 return !Query.Types[0].isVector() &&
1631 needToSplitMemOp(Query, Op == G_LOAD);
1632 },
1633 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1634 const LLT DstTy = Query.Types[0];
1635 const LLT PtrTy = Query.Types[1];
1636
1637 const unsigned DstSize = DstTy.getSizeInBits();
1638 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1639
1640 // Split extloads.
1641 if (DstSize > MemSize)
1642 return std::pair(0, LLT::scalar(MemSize));
1643
1644 unsigned MaxSize = maxSizeForAddrSpace(
1645 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1646 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1647 if (MemSize > MaxSize)
1648 return std::pair(0, LLT::scalar(MaxSize));
1649
1650 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1651 return std::pair(0, LLT::scalar(Align));
1652 })
1653 .fewerElementsIf(
1654 [=](const LegalityQuery &Query) -> bool {
1655 return Query.Types[0].isVector() &&
1656 needToSplitMemOp(Query, Op == G_LOAD);
1657 },
1658 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1659 const LLT DstTy = Query.Types[0];
1660 const LLT PtrTy = Query.Types[1];
1661
1662 LLT EltTy = DstTy.getElementType();
1663 unsigned MaxSize = maxSizeForAddrSpace(
1664 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1665 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1666
1667 // FIXME: Handle widened to power of 2 results better. This ends
1668 // up scalarizing.
1669 // FIXME: 3 element stores scalarized on SI
1670
1671 // Split if it's too large for the address space.
1672 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1673 if (MemSize > MaxSize) {
1674 unsigned NumElts = DstTy.getNumElements();
1675 unsigned EltSize = EltTy.getSizeInBits();
1676
1677 if (MaxSize % EltSize == 0) {
1678 return std::pair(
1679 0, LLT::scalarOrVector(
1680 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1681 }
1682
1683 unsigned NumPieces = MemSize / MaxSize;
1684
1685 // FIXME: Refine when odd breakdowns handled
1686 // The scalars will need to be re-legalized.
1687 if (NumPieces == 1 || NumPieces >= NumElts ||
1688 NumElts % NumPieces != 0)
1689 return std::pair(0, EltTy);
1690
1691 return std::pair(0,
1692 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1693 }
1694
1695 // FIXME: We could probably handle weird extending loads better.
1696 if (DstTy.getSizeInBits() > MemSize)
1697 return std::pair(0, EltTy);
1698
1699 unsigned EltSize = EltTy.getSizeInBits();
1700 unsigned DstSize = DstTy.getSizeInBits();
1701 if (!isPowerOf2_32(DstSize)) {
1702 // We're probably decomposing an odd sized store. Try to split
1703 // to the widest type. TODO: Account for alignment. As-is it
1704 // should be OK, since the new parts will be further legalized.
1705 unsigned FloorSize = llvm::bit_floor(DstSize);
1706 return std::pair(
1707 0, LLT::scalarOrVector(
1708 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1709 }
1710
1711 // May need relegalization for the scalars.
1712 return std::pair(0, EltTy);
1713 })
1714 .minScalar(0, S32)
1715 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1717 .widenScalarToNextPow2(0)
1718 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1719 .lower();
1720 }
1721
1722 // FIXME: Unaligned accesses not lowered.
1723 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1724 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1725 {S32, GlobalPtr, S16, 2 * 8},
1726 {S32, LocalPtr, S8, 8},
1727 {S32, LocalPtr, S16, 16},
1728 {S32, PrivatePtr, S8, 8},
1729 {S32, PrivatePtr, S16, 16},
1730 {S32, ConstantPtr, S8, 8},
1731 {S32, ConstantPtr, S16, 2 * 8}})
1732 .legalIf(
1733 [=](const LegalityQuery &Query) -> bool {
1734 return isLoadStoreLegal(ST, Query);
1735 });
1736
1737 if (ST.hasFlatAddressSpace()) {
1738 ExtLoads.legalForTypesWithMemDesc(
1739 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1740 }
1741
1742 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1743 // 64-bits.
1744 //
1745 // TODO: Should generalize bitcast action into coerce, which will also cover
1746 // inserting addrspacecasts.
1747 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1748
1749 ExtLoads.clampScalar(0, S32, S32)
1750 .widenScalarToNextPow2(0)
1751 .lower();
1752
1753 auto &Atomics = getActionDefinitionsBuilder(
1754 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1755 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1756 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1757 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1758 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1759 {S64, GlobalPtr}, {S64, LocalPtr},
1760 {S32, RegionPtr}, {S64, RegionPtr}});
1761 if (ST.hasFlatAddressSpace()) {
1762 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1763 }
1764
1765 auto &Atomics32 =
1766 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1767 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1768 if (ST.hasFlatAddressSpace()) {
1769 Atomics32.legalFor({{S32, FlatPtr}});
1770 }
1771
1772 // TODO: v2bf16 operations, and fat buffer pointer support.
1773 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1774 if (ST.hasLDSFPAtomicAddF32()) {
1775 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1776 if (ST.hasLdsAtomicAddF64())
1777 Atomic.legalFor({{S64, LocalPtr}});
1778 if (ST.hasAtomicDsPkAdd16Insts())
1779 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1780 }
1781 if (ST.hasAtomicFaddInsts())
1782 Atomic.legalFor({{S32, GlobalPtr}});
1783 if (ST.hasFlatAtomicFaddF32Inst())
1784 Atomic.legalFor({{S32, FlatPtr}});
1785
1786 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1787 // These are legal with some caveats, and should have undergone expansion in
1788 // the IR in most situations
1789 // TODO: Move atomic expansion into legalizer
1790 Atomic.legalFor({
1791 {S32, GlobalPtr},
1792 {S64, GlobalPtr},
1793 {S64, FlatPtr}
1794 });
1795 }
1796
1797 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1798 ST.hasAtomicBufferGlobalPkAddF16Insts())
1799 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1800 if (ST.hasAtomicGlobalPkAddBF16Inst())
1801 Atomic.legalFor({{V2BF16, GlobalPtr}});
1802 if (ST.hasAtomicFlatPkAdd16Insts())
1803 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1804
1805
1806 // Most of the legalization work here is done by AtomicExpand. We could
1807 // probably use a simpler legality rule that just assumes anything is OK.
1808 auto &AtomicFMinFMax =
1809 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1810 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1811
1812 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1813 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1814 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1815 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1816 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1817 AtomicFMinFMax.legalFor({F32, FlatPtr});
1818 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1819 AtomicFMinFMax.legalFor({F64, FlatPtr});
1820
1821 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1822 // demarshalling.
1823 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1824 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1825 {S32, FlatPtr}, {S64, FlatPtr}})
1826 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1827 {S32, RegionPtr}, {S64, RegionPtr}});
1828 // TODO: Pointer types, any 32-bit or 64-bit vector
1829
1830 // Condition should be s32 for scalar, s1 for vector.
1831 getActionDefinitionsBuilder(G_SELECT)
1832 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1833 LocalPtr, FlatPtr, PrivatePtr,
1834 LLT::fixed_vector(2, LocalPtr),
1835 LLT::fixed_vector(2, PrivatePtr)},
1836 {S1, S32})
1837 .clampScalar(0, S16, S64)
1838 .scalarize(1)
1839 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1840 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1841 .clampMaxNumElements(0, S32, 2)
1842 .clampMaxNumElements(0, LocalPtr, 2)
1843 .clampMaxNumElements(0, PrivatePtr, 2)
1844 .scalarize(0)
1845 .widenScalarToNextPow2(0)
1846 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1847
1848 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1849 // be more flexible with the shift amount type.
1850 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1851 .legalFor({{S32, S32}, {S64, S32}});
1852 if (ST.has16BitInsts()) {
1853 if (ST.hasVOP3PInsts()) {
1854 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1855 .clampMaxNumElements(0, S16, 2);
1856 } else
1857 Shifts.legalFor({{S16, S16}});
1858
1859 // TODO: Support 16-bit shift amounts for all types
1860 Shifts.widenScalarIf(
1861 [=](const LegalityQuery &Query) {
1862 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1863 // 32-bit amount.
1864 const LLT ValTy = Query.Types[0];
1865 const LLT AmountTy = Query.Types[1];
1866 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1867 AmountTy.getSizeInBits() < 16;
1868 }, changeTo(1, S16));
1869 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1870 Shifts.clampScalar(1, S32, S32);
1871 Shifts.widenScalarToNextPow2(0, 16);
1872 Shifts.clampScalar(0, S16, S64);
1873
1874 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1875 .minScalar(0, S16)
1876 .scalarize(0)
1877 .lower();
1878 } else {
1879 // Make sure we legalize the shift amount type first, as the general
1880 // expansion for the shifted type will produce much worse code if it hasn't
1881 // been truncated already.
1882 Shifts.clampScalar(1, S32, S32);
1883 Shifts.widenScalarToNextPow2(0, 32);
1884 Shifts.clampScalar(0, S32, S64);
1885
1886 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1887 .minScalar(0, S32)
1888 .scalarize(0)
1889 .lower();
1890 }
1891 Shifts.scalarize(0);
1892
1893 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1894 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1895 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1896 unsigned IdxTypeIdx = 2;
1897
1898 getActionDefinitionsBuilder(Op)
1899 .customIf([=](const LegalityQuery &Query) {
1900 const LLT EltTy = Query.Types[EltTypeIdx];
1901 const LLT VecTy = Query.Types[VecTypeIdx];
1902 const LLT IdxTy = Query.Types[IdxTypeIdx];
1903 const unsigned EltSize = EltTy.getSizeInBits();
1904 const bool isLegalVecType =
1905 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1906 // Address space 8 pointers are 128-bit wide values, but the logic
1907 // below will try to bitcast them to 2N x s64, which will fail.
1908 // Therefore, as an intermediate step, ptrtoint the vector and scalar
1909 // arguments (and inttoptr the extraction result) so that we produce a
1910 // vector operation that can be handled by the logic below.
1912 if (EltTy.isPointer() && EltSize > 64)
1913 return true;
1914 return (EltSize == 32 || EltSize == 64) &&
1915 VecTy.getSizeInBits() % 32 == 0 &&
1916 VecTy.getSizeInBits() <= MaxRegisterSize &&
1917 IdxTy.getSizeInBits() == 32 &&
1918 isLegalVecType;
1919 })
1920 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1921 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1922 bitcastToVectorElement32(VecTypeIdx))
1923 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1924 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1925 scalarOrEltWiderThan(VecTypeIdx, 64)),
1926 [=](const LegalityQuery &Query) {
1927 // For > 64-bit element types, try to turn this into a
1928 // 64-bit element vector since we may be able to do better
1929 // indexing if this is scalar. If not, fall back to 32.
1930 const LLT EltTy = Query.Types[EltTypeIdx];
1931 const LLT VecTy = Query.Types[VecTypeIdx];
1932 const unsigned DstEltSize = EltTy.getSizeInBits();
1933 const unsigned VecSize = VecTy.getSizeInBits();
1934
1935 const unsigned TargetEltSize =
1936 DstEltSize % 64 == 0 ? 64 : 32;
1937 return std::pair(VecTypeIdx,
1938 LLT::fixed_vector(VecSize / TargetEltSize,
1939 TargetEltSize));
1940 })
1941 .clampScalar(EltTypeIdx, S32, S64)
1942 .clampScalar(VecTypeIdx, S32, S64)
1943 .clampScalar(IdxTypeIdx, S32, S32)
1944 .clampMaxNumElements(VecTypeIdx, S32, 32)
1945 // TODO: Clamp elements for 64-bit vectors?
1946 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1947 moreElementsToNextExistingRegClass(VecTypeIdx))
1948 // It should only be necessary with variable indexes.
1949 // As a last resort, lower to the stack
1950 .lower();
1951 }
1952
1953 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1954 .unsupportedIf([=](const LegalityQuery &Query) {
1955 const LLT &EltTy = Query.Types[1].getElementType();
1956 return Query.Types[0] != EltTy;
1957 });
1958
1959 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1960 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1961 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1962 getActionDefinitionsBuilder(Op)
1963 .widenScalarIf(
1964 [=](const LegalityQuery &Query) {
1965 const LLT BigTy = Query.Types[BigTyIdx];
1966 return (BigTy.getScalarSizeInBits() < 16);
1967 },
1968 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1969 .widenScalarIf(
1970 [=](const LegalityQuery &Query) {
1971 const LLT LitTy = Query.Types[LitTyIdx];
1972 return (LitTy.getScalarSizeInBits() < 16);
1973 },
1974 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1975 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1976 .widenScalarToNextPow2(BigTyIdx, 32)
1977 .customIf([=](const LegalityQuery &Query) {
1978 // Generic lower operates on the full-width value, producing
1979 // shift+trunc/mask sequences. For simple cases where extract/insert
1980 // values are 32-bit aligned, we can instead unmerge/merge and work on
1981 // the 32-bit components. However, we can't check the offset here, so the
1982 // custom lowering function will have to call generic lowering if the
1983 // offset is not 32-bit aligned.
1984 const LLT BigTy = Query.Types[BigTyIdx];
1985 const LLT LitTy = Query.Types[LitTyIdx];
1986 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
1987 LitTy.getSizeInBits() % 32 == 0;
1988 })
1989 .lower();
1990 }
1991
1992 auto &BuildVector =
1993 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1994 .legalForCartesianProduct(AllS32Vectors, {S32})
1995 .legalForCartesianProduct(AllS64Vectors, {S64})
1996 .clampNumElements(0, V16S32, V32S32)
1997 .clampNumElements(0, V2S64, V16S64)
1998 .fewerElementsIf(isWideVec16(0), changeElementCountTo(0, V2S16))
1999 .moreElementsIf(isIllegalRegisterType(ST, 0),
2000 moreElementsToNextExistingRegClass(0));
2001
2002 if (ST.hasScalarPackInsts()) {
2003 BuildVector
2004 // FIXME: Should probably widen s1 vectors straight to s32
2005 .minScalarOrElt(0, S16)
2006 .minScalar(1, S16);
2007
2008 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2009 .legalFor({V2S16, S32})
2010 .lower();
2011 } else {
2012 BuildVector.customFor({V2S16, S16});
2013 BuildVector.minScalarOrElt(0, S32);
2014
2015 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2016 .customFor({V2S16, S32})
2017 .lower();
2018 }
2019
2020 BuildVector.legalIf(isRegisterType(ST, 0));
2021
2022 // FIXME: Clamp maximum size
2023 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2024 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2025 .clampMaxNumElements(0, S32, 32)
2026 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2027 .clampMaxNumElements(0, S16, 64);
2028
2029 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2030
2031 // Merge/Unmerge
2032 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2033 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2034 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2035
2036 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2037 const LLT Ty = Query.Types[TypeIdx];
2038 if (Ty.isVector()) {
2039 const LLT &EltTy = Ty.getElementType();
2040 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2041 return true;
2042 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
2043 return true;
2044 }
2045 return false;
2046 };
2047
2048 auto &Builder =
2049 getActionDefinitionsBuilder(Op)
2050 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2051 .lowerFor({{S16, V2S16}})
2052 .lowerIf([=](const LegalityQuery &Query) {
2053 const LLT BigTy = Query.Types[BigTyIdx];
2054 return BigTy.getSizeInBits() == 32;
2055 })
2056 // Try to widen to s16 first for small types.
2057 // TODO: Only do this on targets with legal s16 shifts
2058 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2059 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2060 .moreElementsIf(isSmallOddVector(BigTyIdx),
2061 oneMoreElement(BigTyIdx))
2062 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
2063 elementTypeIs(1, S16)),
2064 changeTo(1, V2S16))
2065 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2066 // not worth considering the multiples of 64 since 2*192 and 2*384
2067 // are not valid.
2068 .clampScalar(LitTyIdx, S32, S512)
2069 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2070 // Break up vectors with weird elements into scalars
2071 .fewerElementsIf(
2072 [=](const LegalityQuery &Query) {
2073 return notValidElt(Query, LitTyIdx);
2074 },
2075 scalarize(0))
2076 .fewerElementsIf(
2077 [=](const LegalityQuery &Query) {
2078 return notValidElt(Query, BigTyIdx);
2079 },
2080 scalarize(1))
2081 .clampScalar(BigTyIdx, S32, MaxScalar);
2082
2083 if (Op == G_MERGE_VALUES) {
2084 Builder.widenScalarIf(
2085 // TODO: Use 16-bit shifts if legal for 8-bit values?
2086 [=](const LegalityQuery &Query) {
2087 const LLT Ty = Query.Types[LitTyIdx];
2088 return Ty.getSizeInBits() < 32;
2089 },
2090 changeTo(LitTyIdx, S32));
2091 }
2092
2093 Builder.widenScalarIf(
2094 [=](const LegalityQuery &Query) {
2095 const LLT Ty = Query.Types[BigTyIdx];
2096 return Ty.getSizeInBits() % 16 != 0;
2097 },
2098 [=](const LegalityQuery &Query) {
2099 // Pick the next power of 2, or a multiple of 64 over 128,
2100 // whichever is smaller.
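// For example, a 96-bit type is rounded up to 128 bits (next power of 2),
// while a 272-bit type becomes 320 bits (the next multiple of 64) rather
// than all the way to 512.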
2101 const LLT &Ty = Query.Types[BigTyIdx];
2102 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2103 if (NewSizeInBits >= 256) {
2104 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2105 if (RoundedTo < NewSizeInBits)
2106 NewSizeInBits = RoundedTo;
2107 }
2108 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2109 })
2110 // Any vectors left are the wrong size. Scalarize them.
2111 .scalarize(0)
2112 .scalarize(1);
2113 }
2114
2115 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2116 // RegBankSelect.
2117 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2118 .legalFor({{S32}, {S64}})
2119 .clampScalar(0, S32, S64);
2120
2121 if (ST.hasVOP3PInsts()) {
2122 SextInReg.lowerFor({{V2S16}})
2123 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2124 // get more vector shift opportunities, since we'll get those when
2125 // expanded.
2126 .clampMaxNumElementsStrict(0, S16, 2);
2127 } else if (ST.has16BitInsts()) {
2128 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2129 } else {
2130 // Prefer to promote to s32 before lowering if we don't have 16-bit
2131 // shifts. This avoids a lot of intermediate truncate and extend operations.
2132 SextInReg.lowerFor({{S32}, {S64}});
2133 }
2134
2135 SextInReg
2136 .scalarize(0)
2137 .clampScalar(0, S32, S64)
2138 .lower();
2139
2140 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2141 .scalarize(0)
2142 .lower();
2143
2144 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2145 FSHRActionDefs.legalFor({{S32, S32}})
2146 .clampMaxNumElementsStrict(0, S16, 2);
2147 if (ST.hasVOP3PInsts())
2148 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2149 FSHRActionDefs.scalarize(0).lower();
2150
2151 if (ST.hasVOP3PInsts()) {
2152 getActionDefinitionsBuilder(G_FSHL)
2153 .lowerFor({{V2S16, V2S16}})
2154 .clampMaxNumElementsStrict(0, S16, 2)
2155 .scalarize(0)
2156 .lower();
2157 } else {
2158 getActionDefinitionsBuilder(G_FSHL)
2159 .scalarize(0)
2160 .lower();
2161 }
2162
2163 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2164 .legalFor({S64});
2165
2166 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2167
2168 getActionDefinitionsBuilder(G_FENCE)
2169 .alwaysLegal();
2170
2171 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2172 .scalarize(0)
2173 .minScalar(0, S32)
2174 .lower();
2175
2176 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2177 .legalFor({{S32, S32}, {S64, S32}})
2178 .clampScalar(1, S32, S32)
2179 .clampScalar(0, S32, S64)
2180 .widenScalarToNextPow2(0)
2181 .scalarize(0);
2182
2183 getActionDefinitionsBuilder(
2184 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2185 G_FCOPYSIGN,
2186
2187 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2188 G_READ_REGISTER, G_WRITE_REGISTER,
2189
2190 G_SADDO, G_SSUBO})
2191 .lower();
2192
2193 if (ST.hasIEEEMinimumMaximumInsts()) {
2194 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2195 .legalFor(FPTypesPK16)
2196 .clampMaxNumElements(0, S16, 2)
2197 .scalarize(0);
2198 } else if (ST.hasVOP3PInsts()) {
2199 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2200 .lowerFor({V2S16})
2201 .clampMaxNumElementsStrict(0, S16, 2)
2202 .scalarize(0)
2203 .lower();
2204 } else {
2205 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2206 .scalarize(0)
2207 .clampScalar(0, S32, S64)
2208 .lower();
2209 }
2210
2211 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2212 .lower();
2213
2214 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2215
2216 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2217 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2218 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2219 .unsupported();
2220
2222
2223 getActionDefinitionsBuilder(
2224 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2225 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2226 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2227 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2228 .legalFor(AllVectors)
2229 .scalarize(1)
2230 .lower();
2231
2232 getLegacyLegalizerInfo().computeTables();
2233 verify(*ST.getInstrInfo());
2234}
2235
2236 bool AMDGPULegalizerInfo::legalizeCustom(
2237 LegalizerHelper &Helper, MachineInstr &MI,
2238 LostDebugLocObserver &LocObserver) const {
2239 MachineIRBuilder &B = Helper.MIRBuilder;
2240 MachineRegisterInfo &MRI = *B.getMRI();
2241
2242 switch (MI.getOpcode()) {
2243 case TargetOpcode::G_ADDRSPACE_CAST:
2244 return legalizeAddrSpaceCast(MI, MRI, B);
2245 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2246 return legalizeFroundeven(MI, MRI, B);
2247 case TargetOpcode::G_FCEIL:
2248 return legalizeFceil(MI, MRI, B);
2249 case TargetOpcode::G_FREM:
2250 return legalizeFrem(MI, MRI, B);
2251 case TargetOpcode::G_INTRINSIC_TRUNC:
2252 return legalizeIntrinsicTrunc(MI, MRI, B);
2253 case TargetOpcode::G_SITOFP:
2254 return legalizeITOFP(MI, MRI, B, true);
2255 case TargetOpcode::G_UITOFP:
2256 return legalizeITOFP(MI, MRI, B, false);
2257 case TargetOpcode::G_FPTOSI:
2258 return legalizeFPTOI(MI, MRI, B, true);
2259 case TargetOpcode::G_FPTOUI:
2260 return legalizeFPTOI(MI, MRI, B, false);
2261 case TargetOpcode::G_FMINNUM:
2262 case TargetOpcode::G_FMAXNUM:
2263 case TargetOpcode::G_FMINIMUMNUM:
2264 case TargetOpcode::G_FMAXIMUMNUM:
2265 return legalizeMinNumMaxNum(Helper, MI);
2266 case TargetOpcode::G_EXTRACT:
2267 return legalizeExtract(Helper, MI);
2268 case TargetOpcode::G_INSERT:
2269 return legalizeInsert(Helper, MI);
2270 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2271 return legalizeExtractVectorElt(MI, MRI, B);
2272 case TargetOpcode::G_INSERT_VECTOR_ELT:
2273 return legalizeInsertVectorElt(MI, MRI, B);
2274 case TargetOpcode::G_FSIN:
2275 case TargetOpcode::G_FCOS:
2276 return legalizeSinCos(MI, MRI, B);
2277 case TargetOpcode::G_GLOBAL_VALUE:
2278 return legalizeGlobalValue(MI, MRI, B);
2279 case TargetOpcode::G_LOAD:
2280 case TargetOpcode::G_SEXTLOAD:
2281 case TargetOpcode::G_ZEXTLOAD:
2282 return legalizeLoad(Helper, MI);
2283 case TargetOpcode::G_STORE:
2284 return legalizeStore(Helper, MI);
2285 case TargetOpcode::G_FMAD:
2286 return legalizeFMad(MI, MRI, B);
2287 case TargetOpcode::G_FDIV:
2288 return legalizeFDIV(MI, MRI, B);
2289 case TargetOpcode::G_FFREXP:
2290 return legalizeFFREXP(MI, MRI, B);
2291 case TargetOpcode::G_FSQRT:
2292 return legalizeFSQRT(MI, MRI, B);
2293 case TargetOpcode::G_UDIV:
2294 case TargetOpcode::G_UREM:
2295 case TargetOpcode::G_UDIVREM:
2296 return legalizeUnsignedDIV_REM(MI, MRI, B);
2297 case TargetOpcode::G_SDIV:
2298 case TargetOpcode::G_SREM:
2299 case TargetOpcode::G_SDIVREM:
2300 return legalizeSignedDIV_REM(MI, MRI, B);
2301 case TargetOpcode::G_ATOMIC_CMPXCHG:
2302 return legalizeAtomicCmpXChg(MI, MRI, B);
2303 case TargetOpcode::G_FLOG2:
2304 return legalizeFlog2(MI, B);
2305 case TargetOpcode::G_FLOG:
2306 case TargetOpcode::G_FLOG10:
2307 return legalizeFlogCommon(MI, B);
2308 case TargetOpcode::G_FEXP2:
2309 return legalizeFExp2(MI, B);
2310 case TargetOpcode::G_FEXP:
2311 case TargetOpcode::G_FEXP10:
2312 return legalizeFExp(MI, B);
2313 case TargetOpcode::G_FPOW:
2314 return legalizeFPow(MI, B);
2315 case TargetOpcode::G_FFLOOR:
2316 return legalizeFFloor(MI, MRI, B);
2317 case TargetOpcode::G_BUILD_VECTOR:
2318 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2319 return legalizeBuildVector(MI, MRI, B);
2320 case TargetOpcode::G_MUL:
2321 return legalizeMul(Helper, MI);
2322 case TargetOpcode::G_CTLZ:
2323 case TargetOpcode::G_CTTZ:
2324 return legalizeCTLZ_CTTZ(MI, MRI, B);
2325 case TargetOpcode::G_CTLS:
2326 return legalizeCTLS(MI, MRI, B);
2327 case TargetOpcode::G_CTLZ_ZERO_POISON:
2328 return legalizeCTLZ_ZERO_POISON(MI, MRI, B);
2329 case TargetOpcode::G_STACKSAVE:
2330 return legalizeStackSave(MI, B);
2331 case TargetOpcode::G_GET_FPENV:
2332 return legalizeGetFPEnv(MI, MRI, B);
2333 case TargetOpcode::G_SET_FPENV:
2334 return legalizeSetFPEnv(MI, MRI, B);
2335 case TargetOpcode::G_TRAP:
2336 return legalizeTrap(MI, MRI, B);
2337 case TargetOpcode::G_DEBUGTRAP:
2338 return legalizeDebugTrap(MI, MRI, B);
2339 default:
2340 return false;
2341 }
2342
2343 llvm_unreachable("expected switch to return");
2344}
2345
2346 Register AMDGPULegalizerInfo::getSegmentAperture(
2347 unsigned AS,
2348 MachineRegisterInfo &MRI,
2349 MachineIRBuilder &B) const {
2350 MachineFunction &MF = B.getMF();
2351 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2352 const LLT S32 = LLT::scalar(32);
2353 const LLT S64 = LLT::scalar(64);
2354
2355 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2356
2357 if (ST.hasApertureRegs()) {
2358 // Note: this register is somewhat broken. When used as a 32-bit operand,
2359 // it only returns zeroes. The real value is in the upper 32 bits.
2360 // Thus, we must extract the high 32 bits.
2361 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2362 ? AMDGPU::SRC_SHARED_BASE
2363 : AMDGPU::SRC_PRIVATE_BASE;
2364 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2365 !ST.hasGloballyAddressableScratch()) &&
2366 "Cannot use src_private_base with globally addressable scratch!");
2368 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2369 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2370 return B.buildUnmerge(S32, Dst).getReg(1);
2371 }
2372
2375 // For code object version 5, private_base and shared_base are passed through
2376 // implicit kernargs.
2380
2385 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2386
2387 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2389
2390 if (!loadInputValue(KernargPtrReg, B,
2392 return Register();
2393
2395 PtrInfo.getWithOffset(Offset),
2399
2400 // Pointer address
2401 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2402 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2403 // Load address
2404 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2405 }
2406
2407 Register QueuePtr = MRI.createGenericVirtualRegister(
2408 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2409
2410 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2411 return Register();
2412
2413 // TODO: Use custom PseudoSourceValue
2415
2416 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2417 // private_segment_aperture_base_hi.
2418 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2419
2420 MachineMemOperand *MMO = MF.getMachineMemOperand(
2421 PtrInfo,
2422 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2423 MachineMemOperand::MOInvariant,
2424 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2425
2426 B.buildObjectPtrOffset(
2427 LoadAddr, QueuePtr,
2428 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2429 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2430}
2431
2432/// Return true if the value is a known valid address, such that a null check is
2433 /// not necessary.
2434 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2435 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2436 MachineInstr *Def = MRI.getVRegDef(Val);
2437 switch (Def->getOpcode()) {
2438 case AMDGPU::G_FRAME_INDEX:
2439 case AMDGPU::G_GLOBAL_VALUE:
2440 case AMDGPU::G_BLOCK_ADDR:
2441 return true;
2442 case AMDGPU::G_CONSTANT: {
2443 const ConstantInt *CI = Def->getOperand(1).getCImm();
2444 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2445 }
2446 default:
2447 return false;
2448 }
2449
2450 return false;
2451}
2452
2453 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2454 MachineInstr &MI, MachineRegisterInfo &MRI,
2455 MachineIRBuilder &B) const {
2456 MachineFunction &MF = B.getMF();
2457
2458 // MI can either be a G_ADDRSPACE_CAST or a
2459 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2460 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2461 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2462 Intrinsic::amdgcn_addrspacecast_nonnull));
2463
2464 const LLT S32 = LLT::scalar(32);
2465 Register Dst = MI.getOperand(0).getReg();
2466 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2467 : MI.getOperand(1).getReg();
2468 LLT DstTy = MRI.getType(Dst);
2469 LLT SrcTy = MRI.getType(Src);
2470 unsigned DestAS = DstTy.getAddressSpace();
2471 unsigned SrcAS = SrcTy.getAddressSpace();
2472
2473 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2474 // vector element.
2475 assert(!DstTy.isVector());
2476
2477 const AMDGPUTargetMachine &TM
2478 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2479
2480 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2481 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2482 return true;
2483 }
2484
2485 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2486 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2487 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2488 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2489 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2490 ST.hasGloballyAddressableScratch()) {
2491 // flat -> private with globally addressable scratch: subtract
2492 // src_flat_scratch_base_lo.
2493 const LLT S32 = LLT::scalar(32);
2494 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2495 Register FlatScratchBaseLo =
2496 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2497 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2498 .getReg(0);
2499 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2500 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2501 return B.buildIntToPtr(Dst, Sub).getReg(0);
2502 }
2503
2504 // Extract low 32-bits of the pointer.
2505 return B.buildExtract(Dst, Src, 0).getReg(0);
2506 };
2507
2508 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2509 // G_ADDRSPACE_CAST we need to guess.
2510 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2511 castFlatToLocalOrPrivate(Dst);
2512 MI.eraseFromParent();
2513 return true;
2514 }
2515
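// The null value of the destination segment is not necessarily 0, so a
// possibly-null flat pointer must be compared against flat null and mapped
// to the segment's own null value.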
2516 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2517
2518 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2519 auto FlatNull = B.buildConstant(SrcTy, 0);
2520
2521 // Extract low 32-bits of the pointer.
2522 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2523
2524 auto CmpRes =
2525 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2526 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2527
2528 MI.eraseFromParent();
2529 return true;
2530 }
2531
2532 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2533 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2534 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2535 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2536 // Coerce the type of the low half of the result so we can use
2537 // merge_values.
2538 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2539
2540 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2541 ST.hasGloballyAddressableScratch()) {
2542 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2543 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
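// With a wavefront size of 2^N lanes, the shift amount used below on the
// high 32-bit word is 57 - 32 - N, which places the thread ID at bit
// (57 - N) of the full 64-bit address, matching the layouts above.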
2544 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2545 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2546 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2547 .addUse(AllOnes)
2548 .addUse(ThreadID)
2549 .getReg(0);
2550 if (ST.isWave64()) {
2551 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2552 .addUse(AllOnes)
2553 .addUse(ThreadID)
2554 .getReg(0);
2555 }
2556 Register ShAmt =
2557 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2558 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2559 Register CvtPtr =
2560 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2561 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2562 // 64-bit hi:lo value.
2563 Register FlatScratchBase =
2564 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2565 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2566 .getReg(0);
2567 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2568 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2569 }
2570
2571 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2572 if (!ApertureReg.isValid())
2573 return false;
2574
2575 // TODO: Should we allow mismatched types but matching sizes in merges to
2576 // avoid the ptrtoint?
2577 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2578 };
2579
2580 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2581 // G_ADDRSPACE_CAST we need to guess.
2582 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2583 castLocalOrPrivateToFlat(Dst);
2584 MI.eraseFromParent();
2585 return true;
2586 }
2587
2588 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2589
2590 auto SegmentNull =
2591 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2592 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2593
2594 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2595 SegmentNull.getReg(0));
2596
2597 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2598
2599 MI.eraseFromParent();
2600 return true;
2601 }
2602
2603 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2604 SrcTy.getSizeInBits() == 64) {
2605 // Truncate.
2606 B.buildExtract(Dst, Src, 0);
2607 MI.eraseFromParent();
2608 return true;
2609 }
2610
2611 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2612 DstTy.getSizeInBits() == 64) {
2613 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2614 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2615 auto PtrLo = B.buildPtrToInt(S32, Src);
2616 if (AddrHiVal == 0) {
2617 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2618 B.buildIntToPtr(Dst, Zext);
2619 } else {
2620 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2621 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2622 }
2623
2624 MI.eraseFromParent();
2625 return true;
2626 }
2627
2628 // Invalid casts are poison.
2629 // TODO: Should return poison
2630 B.buildUndef(Dst);
2631 MI.eraseFromParent();
2632 return true;
2633}
2634
2635 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2636 MachineRegisterInfo &MRI,
2637 MachineIRBuilder &B) const {
2638 Register Src = MI.getOperand(1).getReg();
2639 LLT Ty = MRI.getType(Src);
2640 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2641
2642 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2643 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2644
2645 auto C1 = B.buildFConstant(Ty, C1Val);
2646 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2647
2648 // TODO: Should this propagate fast-math-flags?
2649 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2650 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2651
2652 auto C2 = B.buildFConstant(Ty, C2Val);
2653 auto Fabs = B.buildFAbs(Ty, Src);
2654
2655 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2656 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2657 MI.eraseFromParent();
2658 return true;
2659}
2660
2661 bool AMDGPULegalizerInfo::legalizeFceil(MachineInstr &MI,
2662 MachineRegisterInfo &MRI,
2663 MachineIRBuilder &B) const {
2664
2665 const LLT S1 = LLT::scalar(1);
2666 const LLT S64 = LLT::scalar(64);
2667
2668 Register Src = MI.getOperand(1).getReg();
2669 assert(MRI.getType(Src) == S64);
2670
2671 // result = trunc(src)
2672 // if (src > 0.0 && src != result)
2673 // result += 1.0
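// For example, ceil(2.3): trunc = 2.0, and since 2.3 > 0.0 and 2.3 != 2.0,
// the result becomes 3.0. For ceil(-2.3): trunc = -2.0 and the condition
// fails, so the truncated value is already the ceiling.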
2674
2675 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2676
2677 const auto Zero = B.buildFConstant(S64, 0.0);
2678 const auto One = B.buildFConstant(S64, 1.0);
2679 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2680 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2681 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2682 auto Add = B.buildSelect(S64, And, One, Zero);
2683
2684 // TODO: Should this propagate fast-math-flags?
2685 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2686 MI.eraseFromParent();
2687 return true;
2688}
2689
2690 bool AMDGPULegalizerInfo::legalizeFrem(MachineInstr &MI,
2691 MachineRegisterInfo &MRI,
2692 MachineIRBuilder &B) const {
2693 Register DstReg = MI.getOperand(0).getReg();
2694 Register Src0Reg = MI.getOperand(1).getReg();
2695 Register Src1Reg = MI.getOperand(2).getReg();
2696 auto Flags = MI.getFlags();
2697 LLT Ty = MRI.getType(DstReg);
2698
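// frem(x, y) is computed below as fma(-trunc(x / y), y, x),
// i.e. x - trunc(x / y) * y.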
2699 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2700 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2701 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2702 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2703 MI.eraseFromParent();
2704 return true;
2705}
2706
2707 static MachineInstrBuilder extractF64Exponent(Register Hi,
2708 MachineIRBuilder &B) {
2709 const unsigned FractBits = 52;
2710 const unsigned ExpBits = 11;
2711 LLT S32 = LLT::scalar(32);
2712
2713 auto Const0 = B.buildConstant(S32, FractBits - 32);
2714 auto Const1 = B.buildConstant(S32, ExpBits);
2715
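// The ubfe intrinsic extracts the 11-bit biased exponent field, which
// starts at bit 20 of the high word (bit 52 of the f64); subtracting the
// bias of 1023 below yields the unbiased exponent.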
2716 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2717 .addUse(Hi)
2718 .addUse(Const0.getReg(0))
2719 .addUse(Const1.getReg(0));
2720
2721 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2722}
2723
2724 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(MachineInstr &MI,
2725 MachineRegisterInfo &MRI,
2726 MachineIRBuilder &B) const {
2727 const LLT S1 = LLT::scalar(1);
2728 const LLT S32 = LLT::scalar(32);
2729 const LLT S64 = LLT::scalar(64);
2730
2731 Register Src = MI.getOperand(1).getReg();
2732 assert(MRI.getType(Src) == S64);
2733
2734 // TODO: Should this use extract since the low half is unused?
2735 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2736 Register Hi = Unmerge.getReg(1);
2737
2738 // Extract the upper half, since this is where we will find the sign and
2739 // exponent.
2740 auto Exp = extractF64Exponent(Hi, B);
2741
2742 const unsigned FractBits = 52;
2743
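// The truncation is done by clearing the fraction bits that lie below the
// binary point: if the exponent is negative the magnitude is less than 1
// and the result is a signed zero, and if the exponent is greater than 51
// there are no fraction bits left, so the input is already an integer.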
2744 // Extract the sign bit.
2745 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2746 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2747
2748 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2749
2750 const auto Zero32 = B.buildConstant(S32, 0);
2751
2752 // Extend back to 64-bits.
2753 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2754
2755 auto Shr = B.buildAShr(S64, FractMask, Exp);
2756 auto Not = B.buildNot(S64, Shr);
2757 auto Tmp0 = B.buildAnd(S64, Src, Not);
2758 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2759
2760 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2761 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2762
2763 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2764 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2765 MI.eraseFromParent();
2766 return true;
2767}
2768
2769 bool AMDGPULegalizerInfo::legalizeITOFP(MachineInstr &MI,
2770 MachineRegisterInfo &MRI,
2771 MachineIRBuilder &B, bool Signed) const {
2772
2773 Register Dst = MI.getOperand(0).getReg();
2774 Register Src = MI.getOperand(1).getReg();
2775
2776 const LLT S64 = LLT::scalar(64);
2777 const LLT S32 = LLT::scalar(32);
2778
2779 assert(MRI.getType(Src) == S64);
2780
2781 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2782 auto ThirtyTwo = B.buildConstant(S32, 32);
2783
2784 if (MRI.getType(Dst) == S64) {
2785 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2786 : B.buildUITOFP(S64, Unmerge.getReg(1));
2787
2788 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2789 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2790
2791 // TODO: Should this propagate fast-math-flags?
2792 B.buildFAdd(Dst, LdExp, CvtLo);
2793 MI.eraseFromParent();
2794 return true;
2795 }
2796
2797 assert(MRI.getType(Dst) == S32);
2798
2799 auto One = B.buildConstant(S32, 1);
2800
2801 MachineInstrBuilder ShAmt;
2802 if (Signed) {
2803 auto ThirtyOne = B.buildConstant(S32, 31);
2804 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2805 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2806 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2807 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2808 .addUse(Unmerge.getReg(1));
2809 auto LS2 = B.buildSub(S32, LS, One);
2810 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2811 } else
2812 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2813 auto Norm = B.buildShl(S64, Src, ShAmt);
2814 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2815 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2816 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2817 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2818 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2819 B.buildFLdexp(Dst, FVal, Scale);
2820 MI.eraseFromParent();
2821 return true;
2822}
2823
2824// TODO: Copied from DAG implementation. Verify logic and document how this
2825 // actually works.
2826 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2827 MachineRegisterInfo &MRI,
2828 MachineIRBuilder &B,
2829 bool Signed) const {
2830
2831 Register Dst = MI.getOperand(0).getReg();
2832 Register Src = MI.getOperand(1).getReg();
2833
2834 const LLT S64 = LLT::scalar(64);
2835 const LLT S32 = LLT::scalar(32);
2836
2837 const LLT SrcLT = MRI.getType(Src);
2838 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2839
2840 unsigned Flags = MI.getFlags();
2841
2842 // The basic idea of converting a floating point number into a pair of 32-bit
2843 // integers is illustrated as follows:
2844 //
2845 // tf := trunc(val);
2846 // hif := floor(tf * 2^-32);
2847 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2848 // hi := fptoi(hif);
2849 // lo := fptoi(lof);
2850 //
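// Worked example: val = 0x123456789 => tf = val, hif = floor(tf * 2^-32) = 1,
// lof = tf - hif * 2^32 = 0x23456789, so hi = 1 and lo = 0x23456789.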
2851 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2852 MachineInstrBuilder Sign;
2853 if (Signed && SrcLT == S32) {
2854 // However, a 32-bit floating point number has only 23 bits mantissa and
2855 // it's not enough to hold all the significant bits of `lof` if val is
2856 // negative. To avoid the loss of precision, we need to take the absolute
2857 // value after truncating and flip the result back based on the original
2858 // signedness.
2859 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2860 Trunc = B.buildFAbs(S32, Trunc, Flags);
2861 }
2862 MachineInstrBuilder K0, K1;
2863 if (SrcLT == S64) {
2864 K0 = B.buildFConstant(
2865 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2866 K1 = B.buildFConstant(
2867 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2868 } else {
2869 K0 = B.buildFConstant(
2870 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2871 K1 = B.buildFConstant(
2872 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2873 }
2874
2875 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2876 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2877 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2878
2879 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2880 : B.buildFPTOUI(S32, FloorMul);
2881 auto Lo = B.buildFPTOUI(S32, Fma);
2882
2883 if (Signed && SrcLT == S32) {
2884 // Flip the result based on the signedness, which is either all 0s or 1s.
2885 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2886 // r := xor({lo, hi}, sign) - sign;
2887 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2888 Sign);
2889 } else
2890 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2891 MI.eraseFromParent();
2892
2893 return true;
2894}
2895
2896 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2897 MachineInstr &MI) const {
2898 MachineFunction &MF = Helper.MIRBuilder.getMF();
2899 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2900
2901 // With ieee_mode disabled, the instructions have the correct behavior.
2902 if (!MFI->getMode().IEEE)
2903 return true;
2904
2905 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2906}
2907
2908 bool AMDGPULegalizerInfo::legalizeExtract(LegalizerHelper &Helper,
2909 MachineInstr &MI) const {
2910 MachineIRBuilder &B = Helper.MIRBuilder;
2911 MachineRegisterInfo &MRI = *B.getMRI();
2912 Register DstReg = MI.getOperand(0).getReg();
2913 Register SrcReg = MI.getOperand(1).getReg();
2914 uint64_t Offset = MI.getOperand(2).getImm();
2915
2916 // Fall back to generic lowering for offset 0 (trivial trunc) and
2917 // non-32-bit-aligned cases which require shift+trunc sequences
2918 // that generic code handles correctly.
2919 if (Offset == 0 || Offset % 32 != 0)
2920 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2921
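// At this point the offset is a non-zero multiple of 32, so the source can
// be split into 32-bit pieces and the destination rebuilt from the pieces
// starting at Offset / 32; e.g. extracting s64 at offset 32 from s128 takes
// pieces 1 and 2 of the four-way unmerge.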
2922 const LLT DstTy = MRI.getType(DstReg);
2923 unsigned StartIdx = Offset / 32;
2924 unsigned DstCount = DstTy.getSizeInBits() / 32;
2925 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2926
2927 if (DstCount == 1) {
2928 if (DstTy.isPointer())
2929 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2930 else
2931 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2932 } else {
2933 SmallVector<Register, 8> MergeVec;
2934 for (unsigned I = 0; I < DstCount; ++I)
2935 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
2936 B.buildMergeLikeInstr(DstReg, MergeVec);
2937 }
2938
2939 MI.eraseFromParent();
2940 return true;
2941}
2942
2943 bool AMDGPULegalizerInfo::legalizeInsert(LegalizerHelper &Helper,
2944 MachineInstr &MI) const {
2945 MachineIRBuilder &B = Helper.MIRBuilder;
2946 MachineRegisterInfo &MRI = *B.getMRI();
2947 Register DstReg = MI.getOperand(0).getReg();
2948 Register SrcReg = MI.getOperand(1).getReg();
2949 Register InsertSrc = MI.getOperand(2).getReg();
2950 uint64_t Offset = MI.getOperand(3).getImm();
2951
2952 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
2953 const LLT InsertTy = MRI.getType(InsertSrc);
2954 unsigned InsertSize = InsertTy.getSizeInBits();
2955
2956 // Fall back to generic lowering for non-32-bit-aligned cases which
2957 // require shift+mask sequences that generic code handles correctly.
2958 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2959 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
2960
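// Everything is 32-bit aligned here, so the destination is rebuilt from
// 32-bit pieces: source pieces before the insert point, then the pieces of
// the inserted value, then the remaining source pieces.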
2961 const LLT S32 = LLT::scalar(32);
2962 unsigned DstCount = DstSize / 32;
2963 unsigned InsertCount = InsertSize / 32;
2964 unsigned StartIdx = Offset / 32;
2965
2966 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
2967
2968 SmallVector<Register, 8> MergeVec;
2969 for (unsigned I = 0; I < StartIdx; ++I)
2970 MergeVec.push_back(SrcUnmerge.getReg(I));
2971
2972 if (InsertCount == 1) {
2973 // Merge-like instructions require same source types. Convert pointer
2974 // to scalar when inserting a pointer value into a scalar.
2975 if (InsertTy.isPointer())
2976 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
2977 MergeVec.push_back(InsertSrc);
2978 } else {
2979 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
2980 for (unsigned I = 0; I < InsertCount; ++I)
2981 MergeVec.push_back(InsertUnmerge.getReg(I));
2982 }
2983
2984 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
2985 MergeVec.push_back(SrcUnmerge.getReg(I));
2986
2987 B.buildMergeLikeInstr(DstReg, MergeVec);
2988
2989 MI.eraseFromParent();
2990 return true;
2991}
2992
2993 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2994 MachineInstr &MI, MachineRegisterInfo &MRI,
2995 MachineIRBuilder &B) const {
2996 // TODO: Should move some of this into LegalizerHelper.
2997
2998 // TODO: Promote dynamic indexing of s16 to s32
2999
3000 Register Dst = MI.getOperand(0).getReg();
3001 Register Vec = MI.getOperand(1).getReg();
3002
3003 LLT VecTy = MRI.getType(Vec);
3004 LLT EltTy = VecTy.getElementType();
3005 assert(EltTy == MRI.getType(Dst));
3006
3007 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3008 // but we can't go directly to that logic because you can't bitcast a vector
3009 // of pointers to a vector of integers. Therefore, introduce an intermediate
3010 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3011 // drive the legalization forward.
3012 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3013 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3014 LLT IntVecTy = VecTy.changeElementType(IntTy);
3015
3016 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3017 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3018 B.buildIntToPtr(Dst, IntElt);
3019
3020 MI.eraseFromParent();
3021 return true;
3022 }
3023
3024 // FIXME: Artifact combiner probably should have replaced the truncated
3025 // constant before this, so we shouldn't need
3026 // getIConstantVRegValWithLookThrough.
3027 std::optional<ValueAndVReg> MaybeIdxVal =
3028 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3029 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3030 return true;
3031 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3032
3033 if (IdxVal < VecTy.getNumElements()) {
3034 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3035 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3036 } else {
3037 B.buildUndef(Dst);
3038 }
3039
3040 MI.eraseFromParent();
3041 return true;
3042}
3043
3044 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
3045 MachineInstr &MI, MachineRegisterInfo &MRI,
3046 MachineIRBuilder &B) const {
3047 // TODO: Should move some of this into LegalizerHelper.
3048
3049 // TODO: Promote dynamic indexing of s16 to s32
3050
3051 Register Dst = MI.getOperand(0).getReg();
3052 Register Vec = MI.getOperand(1).getReg();
3053 Register Ins = MI.getOperand(2).getReg();
3054
3055 LLT VecTy = MRI.getType(Vec);
3056 LLT EltTy = VecTy.getElementType();
3057 assert(EltTy == MRI.getType(Ins));
3058
3059 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3060 // but we can't go directly to that logic because you can't bitcast a vector
3061 // of pointers to a vector of integers. Therefore, make the pointer vector
3062 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3063 // new value, and then inttoptr the result vector back. This will then allow
3064 // the rest of legalization to take over.
3065 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3066 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3067 LLT IntVecTy = VecTy.changeElementType(IntTy);
3068
3069 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3070 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3071 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3072 MI.getOperand(3));
3073 B.buildIntToPtr(Dst, IntVecDest);
3074 MI.eraseFromParent();
3075 return true;
3076 }
3077
3078 // FIXME: Artifact combiner probably should have replaced the truncated
3079 // constant before this, so we shouldn't need
3080 // getIConstantVRegValWithLookThrough.
3081 std::optional<ValueAndVReg> MaybeIdxVal =
3082 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3083 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3084 return true;
3085
3086 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3087
3088 unsigned NumElts = VecTy.getNumElements();
3089 if (IdxVal < NumElts) {
3090 SmallVector<Register, 8> SrcRegs;
3091 for (unsigned i = 0; i < NumElts; ++i)
3092 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3093 B.buildUnmerge(SrcRegs, Vec);
3094
3095 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3096 B.buildMergeLikeInstr(Dst, SrcRegs);
3097 } else {
3098 B.buildUndef(Dst);
3099 }
3100
3101 MI.eraseFromParent();
3102 return true;
3103}
3104
3105 bool AMDGPULegalizerInfo::legalizeSinCos(MachineInstr &MI,
3106 MachineRegisterInfo &MRI,
3107 MachineIRBuilder &B) const {
3108
3109 Register DstReg = MI.getOperand(0).getReg();
3110 Register SrcReg = MI.getOperand(1).getReg();
3111 LLT Ty = MRI.getType(DstReg);
3112 unsigned Flags = MI.getFlags();
3113
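// The hardware sin/cos intrinsics take an argument scaled by 1/(2*pi); on
// subtargets with a reduced valid input range the scaled value is first
// wrapped into [0, 1) with the fract intrinsic.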
3114 Register TrigVal;
3115 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3116 if (ST.hasTrigReducedRange()) {
3117 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3118 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3119 .addUse(MulVal.getReg(0))
3120 .setMIFlags(Flags)
3121 .getReg(0);
3122 } else
3123 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3124
3125 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3126 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3127 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3128 .addUse(TrigVal)
3129 .setMIFlags(Flags);
3130 MI.eraseFromParent();
3131 return true;
3132}
3133
3134 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
3135 MachineIRBuilder &B,
3136 const GlobalValue *GV,
3137 int64_t Offset,
3138 unsigned GAFlags) const {
3139 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3140 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3141 // to the following code sequence:
3142 //
3143 // For constant address space:
3144 // s_getpc_b64 s[0:1]
3145 // s_add_u32 s0, s0, $symbol
3146 // s_addc_u32 s1, s1, 0
3147 //
3148 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3149 // a fixup or relocation is emitted to replace $symbol with a literal
3150 // constant, which is a pc-relative offset from the encoding of the $symbol
3151 // operand to the global variable.
3152 //
3153 // For global address space:
3154 // s_getpc_b64 s[0:1]
3155 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3156 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3157 //
3158 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3159 // fixups or relocations are emitted to replace $symbol@*@lo and
3160 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3161 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3162 // operand to the global variable.
3163
3164 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3165
3166 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3167 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3168
3169 if (ST.has64BitLiterals()) {
3170 assert(GAFlags != SIInstrInfo::MO_NONE);
3171
3172 MachineInstrBuilder MIB =
3173 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3174 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3175 } else {
3176 MachineInstrBuilder MIB =
3177 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3178
3179 MIB.addGlobalAddress(GV, Offset, GAFlags);
3180 if (GAFlags == SIInstrInfo::MO_NONE)
3181 MIB.addImm(0);
3182 else
3183 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3184 }
3185
3186 if (!B.getMRI()->getRegClassOrNull(PCReg))
3187 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3188
3189 if (PtrTy.getSizeInBits() == 32)
3190 B.buildExtract(DstReg, PCReg, 0);
3191 return true;
3192}
3193
3194 // Emit an ABS32_LO / ABS32_HI relocation stub.
3195 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
3196 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3197 MachineRegisterInfo &MRI) const {
3198 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3199
3200 if (RequiresHighHalf && ST.has64BitLiterals()) {
3201 if (!MRI.getRegClassOrNull(DstReg))
3202 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3203 B.buildInstr(AMDGPU::S_MOV_B64)
3204 .addDef(DstReg)
3205 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3206 return;
3207 }
3208
3209 LLT S32 = LLT::scalar(32);
3210
3211 // Use the destination register directly only when we store just the lower
3212 // address part and no register class has been set on it.
3213 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3214 ? DstReg
3215 : MRI.createGenericVirtualRegister(S32);
3216
3217 if (!MRI.getRegClassOrNull(AddrLo))
3218 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3219
3220 // Write the lower half.
3221 B.buildInstr(AMDGPU::S_MOV_B32)
3222 .addDef(AddrLo)
3223 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3224
3225 // If required, write the upper half as well.
3226 if (RequiresHighHalf) {
3227 assert(PtrTy.getSizeInBits() == 64 &&
3228 "Must provide a 64-bit pointer type!");
3229
3230 Register AddrHi = MRI.createGenericVirtualRegister(S32);
3231 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3232
3233 B.buildInstr(AMDGPU::S_MOV_B32)
3234 .addDef(AddrHi)
3235 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3236
3237 // Use the destination directly, if and only if we don't have a register
3238 // class being set.
3239 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3240 ? DstReg
3241 : MRI.createGenericVirtualRegister(LLT::scalar(64));
3242
3243 if (!MRI.getRegClassOrNull(AddrDst))
3244 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3245
3246 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3247
3248 // If we created a new register for the destination, cast the result into
3249 // the final output.
3250 if (AddrDst != DstReg)
3251 B.buildCast(DstReg, AddrDst);
3252 } else if (AddrLo != DstReg) {
3253 // If we created a new register for the destination, cast the result into
3254 // the final output.
3255 B.buildCast(DstReg, AddrLo);
3256 }
3257}
3258
3259 bool AMDGPULegalizerInfo::legalizeGlobalValue(
3260 MachineInstr &MI, MachineRegisterInfo &MRI,
3261 MachineIRBuilder &B) const {
3262 Register DstReg = MI.getOperand(0).getReg();
3263 LLT Ty = MRI.getType(DstReg);
3264 unsigned AS = Ty.getAddressSpace();
3265
3266 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3267 MachineFunction &MF = B.getMF();
3268 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3269
3270 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
3271 if (!MFI->isModuleEntryFunction() &&
3272 GV->getName() != "llvm.amdgcn.module.lds" &&
3274 const Function &Fn = MF.getFunction();
3275 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
3276 Fn, "local memory global used by non-kernel function",
3277 MI.getDebugLoc(), DS_Warning));
3278
3279 // We currently don't have a way to correctly allocate LDS objects that
3280 // aren't directly associated with a kernel. We do force inlining of
3281 // functions that use local objects. However, if these dead functions are
3282 // not eliminated, we don't want a compile time error. Just emit a warning
3283 // and a trap, since there should be no callable path here.
3284 B.buildTrap();
3285 B.buildUndef(DstReg);
3286 MI.eraseFromParent();
3287 return true;
3288 }
3289
3290 // TODO: We could emit code to handle the initialization somewhere.
3291 // We ignore the initializer for now and legalize it to allow selection.
3292 // The initializer will be rejected during assembly emission anyway.
3293 const SITargetLowering *TLI = ST.getTargetLowering();
3294 if (!TLI->shouldUseLDSConstAddress(GV)) {
3295 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3296 return true; // Leave in place;
3297 }
3298
3299 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3300 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3301 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3302 // zero-sized type in other languages to declare the dynamic shared
3303 // memory whose size is not known at compile time. They will be
3304 // allocated by the runtime and placed directly after the statically
3305 // allocated ones. They all share the same offset.
3306 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3307 // Adjust alignment for that dynamic shared memory array.
3308 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3309 LLT S32 = LLT::scalar(32);
3310 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3311 B.buildIntToPtr(DstReg, Sz);
3312 MI.eraseFromParent();
3313 return true;
3314 }
3315 }
3316
3317 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3318 MI.eraseFromParent();
3319 return true;
3320 }
3321
3322 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3323 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3324 MI.eraseFromParent();
3325 return true;
3326 }
3327
3328 const SITargetLowering *TLI = ST.getTargetLowering();
3329
3330 if (TLI->shouldEmitFixup(GV)) {
3331 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3332 MI.eraseFromParent();
3333 return true;
3334 }
3335
3336 if (TLI->shouldEmitPCReloc(GV)) {
3337 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3338 MI.eraseFromParent();
3339 return true;
3340 }
3341
3342 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3343 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3344
3345 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3346 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3347 MachinePointerInfo::getGOT(MF),
3348 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3349 MachineMemOperand::MOInvariant,
3350 LoadTy, Align(8));
3351
3352 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3353
3354 if (Ty.getSizeInBits() == 32) {
3355 // Truncate if this is a 32-bit constant address.
3356 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3357 B.buildExtract(DstReg, Load, 0);
3358 } else
3359 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3360
3361 MI.eraseFromParent();
3362 return true;
3363}
3364
3366 if (Ty.isVector())
3367 return Ty.changeElementCount(
3368 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3369 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3370}
3371
3372 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3373 MachineInstr &MI) const {
3374 MachineIRBuilder &B = Helper.MIRBuilder;
3375 MachineRegisterInfo &MRI = *B.getMRI();
3376 GISelChangeObserver &Observer = Helper.Observer;
3377
3378 Register PtrReg = MI.getOperand(1).getReg();
3379 LLT PtrTy = MRI.getType(PtrReg);
3380 unsigned AddrSpace = PtrTy.getAddressSpace();
3381
3382 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3383 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3384 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3385 Observer.changingInstr(MI);
3386 MI.getOperand(1).setReg(Cast.getReg(0));
3387 Observer.changedInstr(MI);
3388 return true;
3389 }
3390
3391 if (MI.getOpcode() != AMDGPU::G_LOAD)
3392 return false;
3393
3394 Register ValReg = MI.getOperand(0).getReg();
3395 LLT ValTy = MRI.getType(ValReg);
3396
3397 if (hasBufferRsrcWorkaround(ValTy)) {
3398 Observer.changingInstr(MI);
3399 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3400 Observer.changedInstr(MI);
3401 return true;
3402 }
3403
3404 MachineMemOperand *MMO = *MI.memoperands_begin();
3405 const unsigned ValSize = ValTy.getSizeInBits();
3406 const LLT MemTy = MMO->getMemoryType();
3407 const Align MemAlign = MMO->getAlign();
3408 const unsigned MemSize = MemTy.getSizeInBits();
3409 const uint64_t AlignInBits = 8 * MemAlign.value();
3410
3411 // Widen non-power-of-2 loads to the alignment if needed
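// For example, a <3 x s32> load (96 bits) with sufficient alignment is
// widened to a 128-bit load and the extra element is dropped again below.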
3412 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3413 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3414
3415 // This was already the correct extending load result type, so just adjust
3416 // the memory type.
3417 if (WideMemSize == ValSize) {
3418 MachineFunction &MF = B.getMF();
3419
3420 MachineMemOperand *WideMMO =
3421 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3422 Observer.changingInstr(MI);
3423 MI.setMemRefs(MF, {WideMMO});
3424 Observer.changedInstr(MI);
3425 return true;
3426 }
3427
3428 // Don't bother handling edge case that should probably never be produced.
3429 if (ValSize > WideMemSize)
3430 return false;
3431
3432 LLT WideTy = widenToNextPowerOf2(ValTy);
3433
3434 Register WideLoad;
3435 if (!WideTy.isVector()) {
3436 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3437 B.buildTrunc(ValReg, WideLoad).getReg(0);
3438 } else {
3439 // Extract the subvector.
3440
3441 if (isRegisterType(ST, ValTy)) {
3442 // If this is a case where G_EXTRACT is legal, use it.
3443 // (e.g. <3 x s32> -> <4 x s32>)
3444 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3445 B.buildExtract(ValReg, WideLoad, 0);
3446 } else {
3447 // For cases where the widened type isn't a nice register value, unmerge
3448 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3449 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3450 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3451 }
3452 }
3453
3454 MI.eraseFromParent();
3455 return true;
3456 }
3457
3458 return false;
3459}
3460
3461 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3462 MachineInstr &MI) const {
3463 MachineIRBuilder &B = Helper.MIRBuilder;
3464 MachineRegisterInfo &MRI = *B.getMRI();
3465 GISelChangeObserver &Observer = Helper.Observer;
3466
3467 Register DataReg = MI.getOperand(0).getReg();
3468 LLT DataTy = MRI.getType(DataReg);
3469
3470 if (hasBufferRsrcWorkaround(DataTy)) {
3471 Observer.changingInstr(MI);
3472 castBufferRsrcArgToV4I32(MI, B, 0);
3473 Observer.changedInstr(MI);
3474 return true;
3475 }
3476 return false;
3477}
3478
3479 bool AMDGPULegalizerInfo::legalizeFMad(MachineInstr &MI,
3480 MachineRegisterInfo &MRI,
3481 MachineIRBuilder &B) const {
3482 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3483 assert(Ty.isScalar());
3484
3485 MachineFunction &MF = B.getMF();
3486 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3487
3488 // TODO: Always legal with future ftz flag.
3489 // TODO: Type is expected to be LLT::float32()/LLT::float16()
3490 // FIXME: Do we need just output?
3491 if (Ty == LLT::scalar(32) &&
3492 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3493 return true;
3494 if (Ty == LLT::scalar(16) &&
3495 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3496 return true;
3497
3498 MachineIRBuilder HelperBuilder(MI);
3499 GISelObserverWrapper DummyObserver;
3500 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3501 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3502}
3503
3504 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3505 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3506 Register DstReg = MI.getOperand(0).getReg();
3507 Register PtrReg = MI.getOperand(1).getReg();
3508 Register CmpVal = MI.getOperand(2).getReg();
3509 Register NewVal = MI.getOperand(3).getReg();
3510
3512 "this should not have been custom lowered");
3513
3514 LLT ValTy = MRI.getType(CmpVal);
3515 LLT VecTy = LLT::fixed_vector(2, ValTy);
3516
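// The target cmpswap instruction expects the new value and the compare
// value packed into adjacent registers, so build the pair as a two-element
// vector and let instruction selection demarshal the result.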
3517 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3518
3519 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3520 .addDef(DstReg)
3521 .addUse(PtrReg)
3522 .addUse(PackedVal)
3523 .setMemRefs(MI.memoperands());
3524
3525 MI.eraseFromParent();
3526 return true;
3527}
3528
3529 /// Return true if it's known that \p Src can never be an f32 denormal value.
3530 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3531 Register Src) {
3532 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3533 switch (DefMI->getOpcode()) {
3534 case TargetOpcode::G_INTRINSIC: {
3535 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3536 case Intrinsic::amdgcn_frexp_mant:
3537 case Intrinsic::amdgcn_log:
3538 case Intrinsic::amdgcn_log_clamp:
3539 case Intrinsic::amdgcn_exp2:
3540 case Intrinsic::amdgcn_sqrt:
3541 return true;
3542 default:
3543 break;
3544 }
3545
3546 break;
3547 }
3548 case TargetOpcode::G_FSQRT:
3549 return true;
3550 case TargetOpcode::G_FFREXP: {
3551 if (DefMI->getOperand(0).getReg() == Src)
3552 return true;
3553 break;
3554 }
3555 case TargetOpcode::G_FPEXT: {
3556 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3557 }
3558 default:
3559 return false;
3560 }
3561
3562 return false;
3563}
3564
3565static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3566 return Flags & MachineInstr::FmAfn;
3567}
3568
3570 unsigned Flags) {
3571 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3574}
3575
3576std::pair<Register, Register>
3578 unsigned Flags) const {
3579 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3580 return {};
3581
3582 const LLT F32 = LLT::scalar(32);
3583 auto SmallestNormal = B.buildFConstant(
3585 auto IsLtSmallestNormal =
3586 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3587
3588 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3589 auto One = B.buildFConstant(F32, 1.0);
3590 auto ScaleFactor =
3591 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3592 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3593
3594 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3595}
3596
3598 MachineIRBuilder &B) const {
3599 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3600 // If we have to handle denormals, scale up the input and adjust the result.
3601
3602 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3603 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
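// Since log2(x * 2^32) == log2(x) + 32, subtracting 32 from the result of
// the scaled input recovers log2 of the original (denormal) input.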
3604
3605 Register Dst = MI.getOperand(0).getReg();
3606 Register Src = MI.getOperand(1).getReg();
3607 LLT Ty = B.getMRI()->getType(Dst);
3608 unsigned Flags = MI.getFlags();
3609
3610 if (Ty == LLT::scalar(16)) {
3611 const LLT F32 = LLT::scalar(32);
3612 // Nothing in half is a denormal when promoted to f32.
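// (The f32 exponent range covers every f16 value, including f16 denormals,
// so no scaling workaround is needed on this path.)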
3613 auto Ext = B.buildFPExt(F32, Src, Flags);
3614 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3615 .addUse(Ext.getReg(0))
3616 .setMIFlags(Flags);
3617 B.buildFPTrunc(Dst, Log2, Flags);
3618 MI.eraseFromParent();
3619 return true;
3620 }
3621
3622 assert(Ty == LLT::scalar(32));
3623
3624 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3625 if (!ScaledInput) {
3626 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3627 .addUse(Src)
3628 .setMIFlags(Flags);
3629 MI.eraseFromParent();
3630 return true;
3631 }
3632
3633 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3634 .addUse(ScaledInput)
3635 .setMIFlags(Flags);
3636
3637 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3638 auto Zero = B.buildFConstant(Ty, 0.0);
3639 auto ResultOffset =
3640 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3641 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3642
3643 MI.eraseFromParent();
3644 return true;
3645}
3646
3648 Register Z, unsigned Flags) {
3649 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3650 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3651}
3652
3654 MachineIRBuilder &B) const {
3655 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3656 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3657
3658 MachineRegisterInfo &MRI = *B.getMRI();
3659 Register Dst = MI.getOperand(0).getReg();
3660 Register X = MI.getOperand(1).getReg();
3661 unsigned Flags = MI.getFlags();
3662 const LLT Ty = MRI.getType(X);
3663
3664 const LLT F32 = LLT::scalar(32);
3665 const LLT F16 = LLT::scalar(16);
3666
3667 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3668 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3669 // depending on !fpmath metadata.
3670 bool PromoteToF32 =
3671 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3672 if (PromoteToF32) {
3674 auto PromoteSrc = B.buildFPExt(F32, X);
3675 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3676 B.buildFPTrunc(Dst, LogVal);
3677 } else {
3678 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3679 }
3680
3681 MI.eraseFromParent();
3682 return true;
3683 }
3684
3685 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3686 if (ScaledInput)
3687 X = ScaledInput;
3688
3689 auto Y =
3690 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3691
3692 Register R;
3693 if (ST.hasFastFMAF32()) {
3694 // c+cc are ln(2)/ln(10) to more than 49 bits
3695 const float c_log10 = 0x1.344134p-2f;
3696 const float cc_log10 = 0x1.09f79ep-26f;
3697
3698 // c + cc is ln(2) to more than 49 bits
3699 const float c_log = 0x1.62e42ep-1f;
3700 const float cc_log = 0x1.efa39ep-25f;
3701
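// The product Y * (c + cc) is formed in extended precision below: R = Y*c,
// fma(Y, c, -R) recovers the rounding error of that multiply, fma(Y, cc, err)
// folds in the low-order constant, and the final add applies the correction.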
3702 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3703 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3704 // This adds correction terms for which contraction may lead to an increase
3705 // in the error of the approximation, so disable it.
3706 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3707 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3708 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3709 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3710 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3711 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3712 } else {
3713 // ch+ct is ln(2)/ln(10) to more than 36 bits
3714 const float ch_log10 = 0x1.344000p-2f;
3715 const float ct_log10 = 0x1.3509f6p-18f;
3716
3717 // ch + ct is ln(2) to more than 36 bits
3718 const float ch_log = 0x1.62e000p-1f;
3719 const float ct_log = 0x1.0bfbe8p-15f;
3720
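// Without fast FMA, Y is split into a high part (low 12 bits of its bit
// pattern cleared by the 0xfffff000 mask) and a low part YT = Y - YH, so the
// dominant product YH*CH is exact and the smaller cross terms are accumulated
// starting from the smallest one.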
3721 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3722 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3723
3724 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3725 auto YH = B.buildAnd(Ty, Y, MaskConst);
3726 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3727 // This adds correction terms for which contraction may lead to an increase
3728 // in the error of the approximation, so disable it.
3729 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3730 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3731
3732 Register Mad0 =
3733 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3734 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3735 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3736 }
3737
3738 const bool IsFiniteOnly =
3740
3741 if (!IsFiniteOnly) {
3742 // Expand isfinite(x) => fabs(x) < inf
3743 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3744 auto Fabs = B.buildFAbs(Ty, Y);
3745 auto IsFinite =
3746 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3747 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3748 }
3749
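// If the input was pre-scaled by 2^32, subtract the matching offset from the
// result: 0x1.344136p+3f is approximately 32*log10(2) and 0x1.62e430p+4f is
// approximately 32*ln(2).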
3750 if (ScaledInput) {
3751 auto Zero = B.buildFConstant(Ty, 0.0);
3752 auto ShiftK =
3753 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3754 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3755 B.buildFSub(Dst, R, Shift, Flags);
3756 } else {
3757 B.buildCopy(Dst, R);
3758 }
3759
3760 MI.eraseFromParent();
3761 return true;
3762}
3763
3765 Register Src, bool IsLog10,
3766 unsigned Flags) const {
3767 const double Log2BaseInverted =
3769
3770 LLT Ty = B.getMRI()->getType(Dst);
3771
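// Change of base: log_b(x) == log2(x) * log_b(2). Log2BaseInverted holds
// log_b(2), and -32.0 * Log2BaseInverted below compensates for an input that
// had to be pre-scaled by 2^32.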
3772 if (Ty == LLT::scalar(32)) {
3773 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3774 if (ScaledInput) {
3775 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3776 .addUse(Src)
3777 .setMIFlags(Flags);
3778 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3779 auto Zero = B.buildFConstant(Ty, 0.0);
3780 auto ResultOffset =
3781 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3782 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3783
3784 if (ST.hasFastFMAF32())
3785 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3786 else {
3787 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3788 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3789 }
3790
3791 return true;
3792 }
3793 }
3794
3795 auto Log2Operand = Ty == LLT::scalar(16)
3796 ? B.buildFLog2(Ty, Src, Flags)
3797 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3798 .addUse(Src)
3799 .setMIFlags(Flags);
3800 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3801 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3802 return true;
3803}
3804
3806 MachineIRBuilder &B) const {
3807 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3808 // If we have to handle denormals, scale up the input and adjust the result.
3809
3810 Register Dst = MI.getOperand(0).getReg();
3811 Register Src = MI.getOperand(1).getReg();
3812 unsigned Flags = MI.getFlags();
3813 LLT Ty = B.getMRI()->getType(Dst);
3814 const LLT F16 = LLT::scalar(16);
3815 const LLT F32 = LLT::scalar(32);
3816 const LLT F64 = LLT::scalar(64);
3817
3818 if (Ty == F64)
3819 return legalizeFEXPF64(MI, B);
3820
3821 if (Ty == F16) {
3822 // Nothing in half is a denormal when promoted to f32.
3823 auto Ext = B.buildFPExt(F32, Src, Flags);
3824 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3825 .addUse(Ext.getReg(0))
3826 .setMIFlags(Flags);
3827 B.buildFPTrunc(Dst, Log2, Flags);
3828 MI.eraseFromParent();
3829 return true;
3830 }
3831
3832 assert(Ty == F32);
3833
3834 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3835 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3836 .addUse(Src)
3837 .setMIFlags(Flags);
3838 MI.eraseFromParent();
3839 return true;
3840 }
3841
3842 // bool needs_scaling = x < -0x1.f80000p+6f;
3843 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
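// Since exp2(x + 64) * 2^-64 == exp2(x), adding 64 keeps the hardware exp2
// input out of the range that would produce a denormal result, and the final
// multiply by 0x1.0p-64f undoes the shift.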
3844
3845 // -nextafter(128.0, -1)
3846 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3847 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3848 RangeCheckConst, Flags);
3849
3850 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3851 auto Zero = B.buildFConstant(Ty, 0.0);
3852 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3853 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3854
3855 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3856 .addUse(AddInput.getReg(0))
3857 .setMIFlags(Flags);
3858
3859 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3860 auto One = B.buildFConstant(Ty, 1.0);
3861 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3862 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3863 MI.eraseFromParent();
3864 return true;
3865}
3866
3868 const SrcOp &Src, unsigned Flags) {
3869 LLT Ty = Dst.getLLTTy(*B.getMRI());
3870
3871 if (Ty == LLT::scalar(32)) {
3872 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3873 .addUse(Src.getReg())
3874 .setMIFlags(Flags);
3875 }
3876 return B.buildFExp2(Dst, Src, Flags);
3877}
3878
3880 Register Dst, Register X,
3881 unsigned Flags,
3882 bool IsExp10) const {
3883 LLT Ty = B.getMRI()->getType(X);
3884
3885 // exp(x) -> exp2(M_LOG2E_F * x);
3886 // exp10(x) -> exp2(log2(10) * x);
3887 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3888 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3889 buildExp(B, Dst, Mul, Flags);
3890 return true;
3891}
3892
3894 Register X, unsigned Flags) const {
3895 LLT Ty = B.getMRI()->getType(Dst);
3896 LLT F32 = LLT::scalar(32);
3897
3898 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3899 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3900 }
3901
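// For inputs negative enough that exp(x) would fall below the smallest
// normal f32, evaluate exp(x + 64) instead and multiply the result by
// 0x1.969d48p-93f, which is approximately exp(-64), to compensate.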
3902 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3903 auto NeedsScaling =
3904 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3905 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3906 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3907 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3908
3909 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3910 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3911
3912 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3913 .addUse(ExpInput.getReg(0))
3914 .setMIFlags(Flags);
3915
3916 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3917 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3918 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3919 return true;
3920}
3921
3923 Register Dst, Register X,
3924 unsigned Flags) const {
3925 LLT Ty = B.getMRI()->getType(Dst);
3926 LLT F32 = LLT::scalar(32);
3927
3928 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3929 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
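// K0 + K1 = 0x1.a92000p+1f + 0x1.4f0978p-11f is approximately log2(10), so
// exp2(x*K0) * exp2(x*K1) == exp2(x * log2(10)) == exp10(x); splitting the
// constant carries more bits of log2(10) than a single f32 constant could.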
3930 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3931 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3932
3933 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3934 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3935 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3936 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3937 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3938 return true;
3939 }
3940
3941 // bool s = x < -0x1.2f7030p+5f;
3942 // x += s ? 0x1.0p+5f : 0.0f;
3943 // exp10 = exp2(x * 0x1.a92000p+1f) *
3944 // exp2(x * 0x1.4f0978p-11f) *
3945 // (s ? 0x1.9f623ep-107f : 1.0f);
3946
3947 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3948 auto NeedsScaling =
3949 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3950
3951 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3952 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3953 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3954
3955 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3956 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3957
3958 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3959 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3960 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3961 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3962
3963 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3964 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
3965 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3966
3967 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3968 return true;
3969}
3970
3971// This expansion gives a result slightly better than 1ulp.
3973 MachineIRBuilder &B) const {
3974
3975 Register X = MI.getOperand(1).getReg();
3976 LLT S64 = LLT::scalar(64);
3977 LLT S32 = LLT::scalar(32);
3978 LLT S1 = LLT::scalar(1);
3979
3980 // TODO: Check if reassoc is safe. There is an output change in exp2 and
3981 // exp10, which slightly increases ulp.
3982 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
3983
3984 Register Dn, F, T;
3985
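// All three opcodes follow the same structure: reduce the argument so that
// the result equals 2^Dn * e^T with Dn an integer and |T| small, evaluate
// e^T with the polynomial below, then recombine with ldexp(P, Dn).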
3986 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
3987 // Dn = rint(X)
3988 Dn = B.buildFRint(S64, X, Flags).getReg(0);
3989 // F = X - Dn
3990 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
3991 // T = F*C1 + F*C2
3992 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
3993 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
3994 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
3995 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
3996
3997 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
3998 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
3999 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4000 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4001
4002 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4003 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
4004 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
4005 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4006 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4007
4008 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
4009 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
4010 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
4011 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
4012
4013 } else { // G_FEXP
4014 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
4015 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4016 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4017
4018 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4019 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4020 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4021 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4022 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4023 }
4024
4025 // Polynomial chain for P
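// The coefficients are approximately the Taylor coefficients of e^T in
// Horner form, 1/11! down to 1/2!, with the final two fma-by-one steps
// supplying the T and constant terms.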
4026 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
4027 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
4028 Flags);
4029 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
4030 Flags);
4031 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
4032 Flags);
4033 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
4034 Flags);
4035 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
4036 Flags);
4037 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
4038 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
4039 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
4040 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
4041
4042 auto One = B.buildFConstant(S64, 1.0);
4043 P = B.buildFMA(S64, T, P, One, Flags);
4044 P = B.buildFMA(S64, T, P, One, Flags);
4045
4046 // Z = FLDEXP(P, (int)Dn)
4047 auto DnInt = B.buildFPTOSI(S32, Dn);
4048 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
4049
4050 if (!(Flags & MachineInstr::FmNoInfs)) {
4051 // Overflow guard: if X <= 1024.0 then Z else +inf
4052 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
4053 B.buildFConstant(S64, APFloat(1024.0)));
4054 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
4055 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
4056 }
4057
4058 // Underflow guard: if X >= -1075.0 then Z else 0.0
4059 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
4060 B.buildFConstant(S64, APFloat(-1075.0)));
4061 auto Zero = B.buildFConstant(S64, APFloat(0.0));
4062 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4063
4064 MI.eraseFromParent();
4065 return true;
4066}
4067
4069 MachineIRBuilder &B) const {
4070 Register Dst = MI.getOperand(0).getReg();
4071 Register X = MI.getOperand(1).getReg();
4072 const unsigned Flags = MI.getFlags();
4073 MachineFunction &MF = B.getMF();
4074 MachineRegisterInfo &MRI = *B.getMRI();
4075 LLT Ty = MRI.getType(Dst);
4076
4077 const LLT F64 = LLT::scalar(64);
4078
4079 if (Ty == F64)
4080 return legalizeFEXPF64(MI, B);
4081
4082 const LLT F16 = LLT::scalar(16);
4083 const LLT F32 = LLT::scalar(32);
4084 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4085
4086 if (Ty == F16) {
4087 // v_exp_f16 (fmul x, log2e)
4088 if (allowApproxFunc(MF, Flags)) {
4089 // TODO: Does this really require fast?
4090 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4091 : legalizeFExpUnsafe(B, Dst, X, Flags);
4092 MI.eraseFromParent();
4093 return true;
4094 }
4095
4096 // Nothing in half is a denormal when promoted to f32.
4097 //
4098 // exp(f16 x) ->
4099 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4100 //
4101 // exp10(f16 x) ->
4102 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4103 auto Ext = B.buildFPExt(F32, X, Flags);
4105 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4106 B.buildFPTrunc(Dst, Lowered, Flags);
4107 MI.eraseFromParent();
4108 return true;
4109 }
4110
4111 assert(Ty == F32);
4112
4113 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4114 // library behavior. Also, is known-not-daz source sufficient?
4115 if (allowApproxFunc(MF, Flags)) {
4116 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4117 : legalizeFExpUnsafe(B, Dst, X, Flags);
4118 MI.eraseFromParent();
4119 return true;
4120 }
4121
4122 // Algorithm:
4123 //
4124 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4125 //
4126 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4127 // n = 64*m + j, 0 <= j < 64
4128 //
4129 // e^x = 2^((64*m + j + f)/64)
4130 // = (2^m) * (2^(j/64)) * 2^(f/64)
4131 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4132 //
4133 // f = x*(64/ln(2)) - n
4134 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4135 //
4136 // e^x = (2^m) * (2^(j/64)) * e^r
4137 //
4138 // (2^(j/64)) is precomputed
4139 //
4140 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4141 // e^r = 1 + q
4142 //
4143 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4144 //
4145 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
4146 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
4147 Register PH, PL;
4148
4149 if (ST.hasFastFMAF32()) {
4150 const float c_exp = numbers::log2ef;
4151 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4152 const float c_exp10 = 0x1.a934f0p+1f;
4153 const float cc_exp10 = 0x1.2f346ep-24f;
4154
4155 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4156 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4157 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4158 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4159
4160 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4161 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4162 } else {
4163 const float ch_exp = 0x1.714000p+0f;
4164 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4165
4166 const float ch_exp10 = 0x1.a92000p+1f;
4167 const float cl_exp10 = 0x1.4f0978p-11f;
4168
4169 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4170 auto XH = B.buildAnd(Ty, X, MaskConst);
4171 auto XL = B.buildFSub(Ty, X, XH, Flags);
4172
4173 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4174 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4175
4176 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4177 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4178
4179 Register Mad0 =
4180 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4181 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4182 }
4183
4184 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4185
4186 // It is unsafe to contract this fsub into the PH multiply.
4187 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4188 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4189 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4190
4191 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4192 .addUse(A.getReg(0))
4193 .setMIFlags(Flags);
4194 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4195
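// Clamp out-of-range inputs: below roughly ln/log10 of the smallest f32
// denormal the result is forced to zero, and above roughly ln/log10 of
// FLT_MAX it is forced to +inf unless no-infs is set.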
4196 auto UnderflowCheckConst =
4197 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4198 auto Zero = B.buildFConstant(Ty, 0.0);
4199 auto Underflow =
4200 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4201
4202 R = B.buildSelect(Ty, Underflow, Zero, R);
4203
4204 if (!(Flags & MachineInstr::FmNoInfs)) {
4205 auto OverflowCheckConst =
4206 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4207
4208 auto Overflow =
4209 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4210 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4211 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4212 }
4213
4214 B.buildCopy(Dst, R);
4215 MI.eraseFromParent();
4216 return true;
4217}
4218
4220 MachineIRBuilder &B) const {
4221 Register Dst = MI.getOperand(0).getReg();
4222 Register Src0 = MI.getOperand(1).getReg();
4223 Register Src1 = MI.getOperand(2).getReg();
4224 unsigned Flags = MI.getFlags();
4225 LLT Ty = B.getMRI()->getType(Dst);
4226 const LLT F16 = LLT::scalar(16); // TODO: Expected LLT::float16()
4227 const LLT F32 = LLT::scalar(32); // TODO: Expected LLT::float32()
4228
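// pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply treats
// 0 * anything as 0, which keeps cases like pow(x, 0) well defined even when
// log2(x) is not finite.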
4229 if (Ty == F32) {
4230 auto Log = B.buildFLog2(F32, Src0, Flags);
4231 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4232 .addUse(Log.getReg(0))
4233 .addUse(Src1)
4234 .setMIFlags(Flags);
4235 B.buildFExp2(Dst, Mul, Flags);
4236 } else if (Ty == F16) {
4237 // There's no f16 fmul_legacy, so we need to convert for it.
4238 auto Log = B.buildFLog2(F16, Src0, Flags);
4239 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4240 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4241 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4242 .addUse(Ext0.getReg(0))
4243 .addUse(Ext1.getReg(0))
4244 .setMIFlags(Flags);
4245 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4246 } else
4247 return false;
4248
4249 MI.eraseFromParent();
4250 return true;
4251}
4252
4253// Find a source register, ignoring any possible source modifiers.
4255 Register ModSrc = OrigSrc;
4256 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4257 ModSrc = SrcFNeg->getOperand(1).getReg();
4258 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4259 ModSrc = SrcFAbs->getOperand(1).getReg();
4260 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4261 ModSrc = SrcFAbs->getOperand(1).getReg();
4262 return ModSrc;
4263}
4264
4267 MachineIRBuilder &B) const {
4268
4269 const LLT S1 = LLT::scalar(1);
4270 const LLT F64 = LLT::scalar(64); // TODO: Expected float64
4271 Register Dst = MI.getOperand(0).getReg();
4272 Register OrigSrc = MI.getOperand(1).getReg();
4273 unsigned Flags = MI.getFlags();
4274 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4275 "this should not have been custom lowered");
4276
4277 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4278 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4279 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4280 // V_FRACT bug is:
4281 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4282 //
4283 // Convert floor(x) to (x - fract(x))
4284
4285 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4286 .addUse(OrigSrc)
4287 .setMIFlags(Flags);
4288
4289 // Give source modifier matching some assistance before obscuring a foldable
4290 // pattern.
4291
4292 // TODO: We can avoid the neg on the fract? The input sign to fract
4293 // shouldn't matter?
4294 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4295
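// 0x3fefffffffffffff is the largest double below 1.0 (1.0 - 2^-53), the
// clamp value quoted in the workaround comment above.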
4296 auto Const =
4297 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4298
4300
4301 // We don't need to concern ourselves with the snan handling difference, so
4302 // use the one which will directly select.
4303 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4304 if (MFI->getMode().IEEE)
4305 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4306 else
4307 B.buildFMinNum(Min, Fract, Const, Flags);
4308
4309 Register CorrectedFract = Min;
4310 if (!MI.getFlag(MachineInstr::FmNoNans)) {
4311 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
4312 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4313 }
4314
4315 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4316 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4317
4318 MI.eraseFromParent();
4319 return true;
4320}
4321
4322// Turn an illegal packed v2s16 build vector into bit operations.
4323// TODO: This should probably be a bitcast action in LegalizerHelper.
4326 Register Dst = MI.getOperand(0).getReg();
4327 const LLT S32 = LLT::scalar(32);
4328 const LLT S16 = LLT::scalar(16);
4329 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4330
4331 Register Src0 = MI.getOperand(1).getReg();
4332 Register Src1 = MI.getOperand(2).getReg();
4333
4334 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4335 assert(MRI.getType(Src0) == S32);
4336 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4337 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4338 }
4339
4340 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4341 B.buildBitcast(Dst, Merge);
4342
4343 MI.eraseFromParent();
4344 return true;
4345}
4346
4347// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4348//
4349 // Source and accumulation registers must all be 32 bits wide.
4350//
4351// TODO: When the multiply is uniform, we should produce a code sequence
4352// that is better suited to instruction selection on the SALU. Instead of
4353// the outer loop going over parts of the result, the outer loop should go
4354// over parts of one of the factors. This should result in instruction
4355// selection that makes full use of S_ADDC_U32 instructions.
4358 ArrayRef<Register> Src0,
4359 ArrayRef<Register> Src1,
4360 bool UsePartialMad64_32,
4361 bool SeparateOddAlignedProducts) const {
4362 // Use (possibly empty) vectors of S1 registers to represent the set of
4363 // carries from one pair of positions to the next.
4364 using Carry = SmallVector<Register, 2>;
4365
4366 MachineIRBuilder &B = Helper.MIRBuilder;
4367 GISelValueTracking &VT = *Helper.getValueTracking();
4368
4369 const LLT S1 = LLT::scalar(1);
4370 const LLT S32 = LLT::scalar(32);
4371 const LLT S64 = LLT::scalar(64);
4372
4373 Register Zero32;
4374 Register Zero64;
4375
4376 auto getZero32 = [&]() -> Register {
4377 if (!Zero32)
4378 Zero32 = B.buildConstant(S32, 0).getReg(0);
4379 return Zero32;
4380 };
4381 auto getZero64 = [&]() -> Register {
4382 if (!Zero64)
4383 Zero64 = B.buildConstant(S64, 0).getReg(0);
4384 return Zero64;
4385 };
4386
4387 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4388 for (unsigned i = 0; i < Src0.size(); ++i) {
4389 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4390 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4391 }
4392
4393 // Merge the given carries into the 32-bit LocalAccum, which is modified
4394 // in-place.
4395 //
4396 // Returns the carry-out, which is a single S1 register or null.
4397 auto mergeCarry =
4398 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4399 if (CarryIn.empty())
4400 return Register();
4401
4402 bool HaveCarryOut = true;
4403 Register CarryAccum;
4404 if (CarryIn.size() == 1) {
4405 if (!LocalAccum) {
4406 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4407 return Register();
4408 }
4409
4410 CarryAccum = getZero32();
4411 } else {
4412 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4413 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4414 CarryAccum =
4415 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4416 .getReg(0);
4417 }
4418
4419 if (!LocalAccum) {
4420 LocalAccum = getZero32();
4421 HaveCarryOut = false;
4422 }
4423 }
4424
4425 auto Add =
4426 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4427 LocalAccum = Add.getReg(0);
4428 return HaveCarryOut ? Add.getReg(1) : Register();
4429 };
4430
4431 // Build a multiply-add chain to compute
4432 //
4433 // LocalAccum + (partial products at DstIndex)
4434 // + (opportunistic subset of CarryIn)
4435 //
4436 // LocalAccum is an array of one or two 32-bit registers that are updated
4437 // in-place. The incoming registers may be null.
4438 //
4439 // In some edge cases, carry-ins can be consumed "for free". In that case,
4440 // the consumed carry bits are removed from CarryIn in-place.
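// The partial products feeding destination part DstIndex are all
// Src0[j0] * Src1[j1] with j0 + j1 == DstIndex, i.e. one column of the
// schoolbook multiplication.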
4441 auto buildMadChain =
4442 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4443 -> Carry {
4444 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4445 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4446
4447 Carry CarryOut;
4448 unsigned j0 = 0;
4449
4450 // Use plain 32-bit multiplication for the most significant part of the
4451 // result by default.
4452 if (LocalAccum.size() == 1 &&
4453 (!UsePartialMad64_32 || !CarryIn.empty())) {
4454 do {
4455 // Skip multiplication if one of the operands is 0
4456 unsigned j1 = DstIndex - j0;
4457 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4458 ++j0;
4459 continue;
4460 }
4461 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4462 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4463 LocalAccum[0] = Mul.getReg(0);
4464 } else {
4465 if (CarryIn.empty()) {
4466 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4467 } else {
4468 LocalAccum[0] =
4469 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4470 .getReg(0);
4471 CarryIn.pop_back();
4472 }
4473 }
4474 ++j0;
4475 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4476 }
4477
4478 // Build full 64-bit multiplies.
4479 if (j0 <= DstIndex) {
4480 bool HaveSmallAccum = false;
4481 Register Tmp;
4482
4483 if (LocalAccum[0]) {
4484 if (LocalAccum.size() == 1) {
4485 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4486 HaveSmallAccum = true;
4487 } else if (LocalAccum[1]) {
4488 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4489 HaveSmallAccum = false;
4490 } else {
4491 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4492 HaveSmallAccum = true;
4493 }
4494 } else {
4495 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4496 Tmp = getZero64();
4497 HaveSmallAccum = true;
4498 }
4499
4500 do {
4501 unsigned j1 = DstIndex - j0;
4502 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4503 ++j0;
4504 continue;
4505 }
4506 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4507 {Src0[j0], Src1[j1], Tmp});
4508 Tmp = Mad.getReg(0);
4509 if (!HaveSmallAccum)
4510 CarryOut.push_back(Mad.getReg(1));
4511 HaveSmallAccum = false;
4512
4513 ++j0;
4514 } while (j0 <= DstIndex);
4515
4516 auto Unmerge = B.buildUnmerge(S32, Tmp);
4517 LocalAccum[0] = Unmerge.getReg(0);
4518 if (LocalAccum.size() > 1)
4519 LocalAccum[1] = Unmerge.getReg(1);
4520 }
4521
4522 return CarryOut;
4523 };
4524
4525 // Outer multiply loop, iterating over destination parts from least
4526 // significant to most significant parts.
4527 //
4528 // The columns of the following diagram correspond to the destination parts
4529 // affected by one iteration of the outer loop (ignoring boundary
4530 // conditions).
4531 //
4532 // Dest index relative to 2 * i: 1 0 -1
4533 // ------
4534 // Carries from previous iteration: e o
4535 // Even-aligned partial product sum: E E .
4536 // Odd-aligned partial product sum: O O
4537 //
4538 // 'o' is OddCarry, 'e' is EvenCarry.
4539 // EE and OO are computed from partial products via buildMadChain and use
4540 // accumulation where possible and appropriate.
4541 //
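// Concretely (boundary cases aside), iteration i accumulates the
// even-aligned column sums for destination parts 2*i and 2*i+1 (products
// with j0 + j1 == 2*i) and, for i > 0, the odd-aligned column sums for
// parts 2*i-1 and 2*i (products with j0 + j1 == 2*i-1).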
4542 Register SeparateOddCarry;
4543 Carry EvenCarry;
4544 Carry OddCarry;
4545
4546 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4547 Carry OddCarryIn = std::move(OddCarry);
4548 Carry EvenCarryIn = std::move(EvenCarry);
4549 OddCarry.clear();
4550 EvenCarry.clear();
4551
4552 // Partial products at offset 2 * i.
4553 if (2 * i < Accum.size()) {
4554 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4555 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4556 }
4557
4558 // Partial products at offset 2 * i - 1.
4559 if (i > 0) {
4560 if (!SeparateOddAlignedProducts) {
4561 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4562 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4563 } else {
4564 bool IsHighest = 2 * i >= Accum.size();
4565 Register SeparateOddOut[2];
4566 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4567 .take_front(IsHighest ? 1 : 2);
4568 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4569
4571
4572 if (i == 1) {
4573 if (!IsHighest)
4574 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4575 else
4576 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4577 } else {
4578 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4579 SeparateOddCarry);
4580 }
4581 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4582
4583 if (!IsHighest) {
4584 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4585 Lo->getOperand(1).getReg());
4586 Accum[2 * i] = Hi.getReg(0);
4587 SeparateOddCarry = Hi.getReg(1);
4588 }
4589 }
4590 }
4591
4592 // Add in the carries from the previous iteration
4593 if (i > 0) {
4594 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4595 EvenCarryIn.push_back(CarryOut);
4596
4597 if (2 * i < Accum.size()) {
4598 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4599 OddCarry.push_back(CarryOut);
4600 }
4601 }
4602 }
4603}
4604
4605// Custom narrowing of wide multiplies using wide multiply-add instructions.
4606//
4607// TODO: If the multiply is followed by an addition, we should attempt to
4608// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4610 MachineInstr &MI) const {
4611 assert(ST.hasMad64_32());
4612 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4613
4614 MachineIRBuilder &B = Helper.MIRBuilder;
4615 MachineRegisterInfo &MRI = *B.getMRI();
4616
4617 Register DstReg = MI.getOperand(0).getReg();
4618 Register Src0 = MI.getOperand(1).getReg();
4619 Register Src1 = MI.getOperand(2).getReg();
4620
4621 LLT Ty = MRI.getType(DstReg);
4622 assert(Ty.isScalar());
4623
4624 unsigned Size = Ty.getSizeInBits();
4625 if (ST.hasVectorMulU64() && Size == 64)
4626 return true;
4627
4628 unsigned NumParts = Size / 32;
4629 assert((Size % 32) == 0);
4630 assert(NumParts >= 2);
4631
4632 // Whether to use MAD_64_32 for partial products whose high half is
4633 // discarded. This avoids some ADD instructions but risks false dependency
4634 // stalls on some subtargets in some cases.
4635 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4636
4637 // Whether to compute odd-aligned partial products separately. This is
4638 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4639 // in an even-aligned VGPR.
4640 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4641
4642 LLT S32 = LLT::scalar(32);
4643 SmallVector<Register, 2> Src0Parts, Src1Parts;
4644 for (unsigned i = 0; i < NumParts; ++i) {
4647 }
4648 B.buildUnmerge(Src0Parts, Src0);
4649 B.buildUnmerge(Src1Parts, Src1);
4650
4651 SmallVector<Register, 2> AccumRegs(NumParts);
4652 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4653 SeparateOddAlignedProducts);
4654
4655 B.buildMergeLikeInstr(DstReg, AccumRegs);
4656 MI.eraseFromParent();
4657 return true;
4658}
4659
4660// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4661// ctlz/cttz_zero_poison. This allows us to fix up the result for the zero input
4662// case with a single min instruction instead of a compare+select.
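// FFBH/FFBL return -1 (all ones) for a zero input, so the unsigned min with
// the source bit width yields exactly the ctlz/cttz value defined for zero.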
4665 MachineIRBuilder &B) const {
4666 Register Dst = MI.getOperand(0).getReg();
4667 Register Src = MI.getOperand(1).getReg();
4668 LLT DstTy = MRI.getType(Dst);
4669 LLT SrcTy = MRI.getType(Src);
4670
4671 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4672 ? AMDGPU::G_AMDGPU_FFBH_U32
4673 : AMDGPU::G_AMDGPU_FFBL_B32;
4674 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4675 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4676
4677 MI.eraseFromParent();
4678 return true;
4679}
4680
4683 MachineIRBuilder &B) const {
4684 Register Dst = MI.getOperand(0).getReg();
4685 Register Src = MI.getOperand(1).getReg();
4686 LLT SrcTy = MRI.getType(Src);
4687 TypeSize NumBits = SrcTy.getSizeInBits();
4688
4689 assert(NumBits < 32u);
4690
4691 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4692 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4693 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4694 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4695 B.buildTrunc(Dst, Ctlz);
4696 MI.eraseFromParent();
4697 return true;
4698}
4699
4702 MachineIRBuilder &B) const {
4703 Register Dst = MI.getOperand(0).getReg();
4704 Register Src = MI.getOperand(1).getReg();
4705 LLT SrcTy = MRI.getType(Src);
4706 const LLT S32 = LLT::scalar(32);
4707 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4708 unsigned BitWidth = SrcTy.getSizeInBits();
4709
4710 auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
4711 auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth));
4712 B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
4713 MI.eraseFromParent();
4714 return true;
4715}
4716
4717// Check that this is a G_XOR x, -1
4718static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4719 if (MI.getOpcode() != TargetOpcode::G_XOR)
4720 return false;
4721 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4722 return ConstVal == -1;
4723}
4724
4725 // Return the use branch instruction, or null if the usage is invalid.
4726static MachineInstr *
4728 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4729 Register CondDef = MI.getOperand(0).getReg();
4730 if (!MRI.hasOneNonDBGUse(CondDef))
4731 return nullptr;
4732
4733 MachineBasicBlock *Parent = MI.getParent();
4734 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4735
4736 if (isNot(MRI, *UseMI)) {
4737 Register NegatedCond = UseMI->getOperand(0).getReg();
4738 if (!MRI.hasOneNonDBGUse(NegatedCond))
4739 return nullptr;
4740
4741 // We're deleting the def of this value, so we need to remove it.
4742 eraseInstr(*UseMI, MRI);
4743
4744 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4745 Negated = true;
4746 }
4747
4748 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4749 return nullptr;
4750
4751 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4752 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4753 if (Next == Parent->end()) {
4754 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4755 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4756 return nullptr;
4757 UncondBrTarget = &*NextMBB;
4758 } else {
4759 if (Next->getOpcode() != AMDGPU::G_BR)
4760 return nullptr;
4761 Br = &*Next;
4762 UncondBrTarget = Br->getOperand(0).getMBB();
4763 }
4764
4765 return UseMI;
4766}
4767
4770 const ArgDescriptor *Arg,
4771 const TargetRegisterClass *ArgRC,
4772 LLT ArgTy) const {
4773 MCRegister SrcReg = Arg->getRegister();
4774 assert(SrcReg.isPhysical() && "Physical register expected");
4775 assert(DstReg.isVirtual() && "Virtual register expected");
4776
4777 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4778 *ArgRC, B.getDebugLoc(), ArgTy);
4779 if (Arg->isMasked()) {
4780 // TODO: Should we try to emit this once in the entry block?
4781 const LLT S32 = LLT::scalar(32);
4782 const unsigned Mask = Arg->getMask();
4783 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
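// E.g. for a mask of 0xFFFF0000 the packed value is shifted right by 16 and
// then ANDed with 0xFFFF to extract the field.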
4784
4785 Register AndMaskSrc = LiveIn;
4786
4787 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4788 // 0.
4789 if (Shift != 0) {
4790 auto ShiftAmt = B.buildConstant(S32, Shift);
4791 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4792 }
4793
4794 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4795 } else {
4796 B.buildCopy(DstReg, LiveIn);
4797 }
4798}
4799
4804 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4805 Register DstReg = MI.getOperand(0).getReg();
4806 if (!ST.hasClusters()) {
4807 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4808 return false;
4809 MI.eraseFromParent();
4810 return true;
4811 }
4812
4813 // Clusters are supported. Return the global position in the grid. If clusters
4814 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
4815
4816 // WorkGroupIdXYZ = ClusterId == 0 ?
4817 // ClusterIdXYZ :
4818 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4819 MachineRegisterInfo &MRI = *B.getMRI();
4820 const LLT S32 = LLT::scalar(32);
4821 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4822 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4823 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4824 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4825 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4826 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4827 return false;
4828
4829 auto One = B.buildConstant(S32, 1);
4830 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4831 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4832 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4833
4834 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4835
4836 switch (MFI->getClusterDims().getKind()) {
4839 B.buildCopy(DstReg, GlobalIdXYZ);
4840 MI.eraseFromParent();
4841 return true;
4842 }
4844 B.buildCopy(DstReg, ClusterIdXYZ);
4845 MI.eraseFromParent();
4846 return true;
4847 }
4849 using namespace AMDGPU::Hwreg;
4850 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4851 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4852 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4853 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4854 .addDef(ClusterId)
4855 .addImm(ClusterIdField);
4856 auto Zero = B.buildConstant(S32, 0);
4857 auto NoClusters =
4858 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4859 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4860 MI.eraseFromParent();
4861 return true;
4862 }
4863 }
4864
4865 llvm_unreachable("nothing should reach here");
4866}
4867
4869 Register DstReg, MachineIRBuilder &B,
4871 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4872 const ArgDescriptor *Arg = nullptr;
4873 const TargetRegisterClass *ArgRC;
4874 LLT ArgTy;
4875
4876 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4877 const ArgDescriptor WorkGroupIDX =
4878 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4879 // If GridZ is not programmed in an entry function then the hardware will set
4880 // it to all zeros, so there is no need to mask the GridY value in the low
4881 // order bits.
4882 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4883 AMDGPU::TTMP7,
4884 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4885 const ArgDescriptor WorkGroupIDZ =
4886 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4887 const ArgDescriptor ClusterWorkGroupIDX =
4888 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4889 const ArgDescriptor ClusterWorkGroupIDY =
4890 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4891 const ArgDescriptor ClusterWorkGroupIDZ =
4892 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4893 const ArgDescriptor ClusterWorkGroupMaxIDX =
4894 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4895 const ArgDescriptor ClusterWorkGroupMaxIDY =
4896 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4897 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4898 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4899 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4900 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
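// The masks above describe how TTMP6 packs the cluster information into
// 4-bit fields: cluster workgroup id x/y/z in bits [3:0], [7:4] and [11:8],
// the corresponding max ids in bits [15:12], [19:16] and [23:20], and the
// max flat id in bits [27:24].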
4901
4902 auto LoadConstant = [&](unsigned N) {
4903 B.buildConstant(DstReg, N);
4904 return true;
4905 };
4906
4907 if (ST.hasArchitectedSGPRs() &&
4909 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4910 bool HasFixedDims = ClusterDims.isFixedDims();
4911
4912 switch (ArgType) {
4914 Arg = &WorkGroupIDX;
4915 ArgRC = &AMDGPU::SReg_32RegClass;
4916 ArgTy = LLT::scalar(32);
4917 break;
4919 Arg = &WorkGroupIDY;
4920 ArgRC = &AMDGPU::SReg_32RegClass;
4921 ArgTy = LLT::scalar(32);
4922 break;
4924 Arg = &WorkGroupIDZ;
4925 ArgRC = &AMDGPU::SReg_32RegClass;
4926 ArgTy = LLT::scalar(32);
4927 break;
4929 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4930 return LoadConstant(0);
4931 Arg = &ClusterWorkGroupIDX;
4932 ArgRC = &AMDGPU::SReg_32RegClass;
4933 ArgTy = LLT::scalar(32);
4934 break;
4936 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4937 return LoadConstant(0);
4938 Arg = &ClusterWorkGroupIDY;
4939 ArgRC = &AMDGPU::SReg_32RegClass;
4940 ArgTy = LLT::scalar(32);
4941 break;
4943 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4944 return LoadConstant(0);
4945 Arg = &ClusterWorkGroupIDZ;
4946 ArgRC = &AMDGPU::SReg_32RegClass;
4947 ArgTy = LLT::scalar(32);
4948 break;
4950 if (HasFixedDims)
4951 return LoadConstant(ClusterDims.getDims()[0] - 1);
4952 Arg = &ClusterWorkGroupMaxIDX;
4953 ArgRC = &AMDGPU::SReg_32RegClass;
4954 ArgTy = LLT::scalar(32);
4955 break;
4957 if (HasFixedDims)
4958 return LoadConstant(ClusterDims.getDims()[1] - 1);
4959 Arg = &ClusterWorkGroupMaxIDY;
4960 ArgRC = &AMDGPU::SReg_32RegClass;
4961 ArgTy = LLT::scalar(32);
4962 break;
4964 if (HasFixedDims)
4965 return LoadConstant(ClusterDims.getDims()[2] - 1);
4966 Arg = &ClusterWorkGroupMaxIDZ;
4967 ArgRC = &AMDGPU::SReg_32RegClass;
4968 ArgTy = LLT::scalar(32);
4969 break;
4971 Arg = &ClusterWorkGroupMaxFlatID;
4972 ArgRC = &AMDGPU::SReg_32RegClass;
4973 ArgTy = LLT::scalar(32);
4974 break;
4975 default:
4976 break;
4977 }
4978 }
4979
4980 if (!Arg)
4981 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4982
4983 if (!Arg) {
4985 // The intrinsic may appear when we have a 0 sized kernarg segment, in
4986 // which case the pointer argument may be missing and we use null.
4987 return LoadConstant(0);
4988 }
4989
4990 // It's undefined behavior if a function marked with the amdgpu-no-*
4991 // attributes uses the corresponding intrinsic.
4992 B.buildUndef(DstReg);
4993 return true;
4994 }
4995
4996 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4997 return false; // TODO: Handle these
4998 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4999 return true;
5000}
5001
5005 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
5006 return false;
5007
5008 MI.eraseFromParent();
5009 return true;
5010}
5011
5013 int64_t C) {
5014 B.buildConstant(MI.getOperand(0).getReg(), C);
5015 MI.eraseFromParent();
5016 return true;
5017}
5018
5021 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
5022 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
5023 if (MaxID == 0)
5024 return replaceWithConstant(B, MI, 0);
5025
5026 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5027 const ArgDescriptor *Arg;
5028 const TargetRegisterClass *ArgRC;
5029 LLT ArgTy;
5030 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5031
5032 Register DstReg = MI.getOperand(0).getReg();
5033 if (!Arg) {
5034 // It's undefined behavior if a function marked with the amdgpu-no-*
5035 // attributes uses the corresponding intrinsic.
5036 B.buildUndef(DstReg);
5037 MI.eraseFromParent();
5038 return true;
5039 }
5040
5041 if (Arg->isMasked()) {
5042 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5043 // masking operations anyway.
5044 //
5045 // TODO: We could assert the top bit is 0 for the source copy.
5046 if (!loadInputValue(DstReg, B, ArgType))
5047 return false;
5048 } else {
5050 if (!loadInputValue(TmpReg, B, ArgType))
5051 return false;
5052 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5053 }
5054
5055 MI.eraseFromParent();
5056 return true;
5057}
5058
5061 // This isn't really a constant pool but close enough.
5064 return PtrInfo;
5065}
5066
5068 int64_t Offset) const {
5070 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5071
5072 // TODO: If we passed in the base kernel offset we could have a better
5073 // alignment than 4, but we don't really need it.
5074 if (!loadInputValue(KernArgReg, B,
5076 llvm_unreachable("failed to find kernarg segment ptr");
5077
5078 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5079 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5080}
5081
5082/// Legalize a value that's loaded from kernel arguments. This is only used by
5083/// legacy intrinsics.
5087 Align Alignment) const {
5088 Register DstReg = MI.getOperand(0).getReg();
5089
5090 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5091 "unexpected kernarg parameter type");
5092
5095 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
5098 MI.eraseFromParent();
5099 return true;
5100}
5101
5104 MachineIRBuilder &B) const {
5105 Register Dst = MI.getOperand(0).getReg();
5106 LLT DstTy = MRI.getType(Dst);
5107 LLT S16 = LLT::scalar(16);
5108 LLT S32 = LLT::scalar(32);
5109 LLT S64 = LLT::scalar(64);
5110
5111 if (DstTy == S16)
5112 return legalizeFDIV16(MI, MRI, B);
5113 if (DstTy == S32)
5114 return legalizeFDIV32(MI, MRI, B);
5115 if (DstTy == S64)
5116 return legalizeFDIV64(MI, MRI, B);
5117
5118 return false;
5119}
5120
5122 Register DstDivReg,
5123 Register DstRemReg,
5124 Register X,
5125 Register Y) const {
5126 const LLT S1 = LLT::scalar(1);
5127 const LLT S32 = LLT::scalar(32);
5128
5129 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5130 // algorithm used here.
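// In short: Z starts as a float-based estimate of 2^32 / Y, is refined by
// one Newton-Raphson step, the quotient estimate is Q = umulh(X, Z) with
// remainder R = X - Q*Y, and the two conditional corrections below (each
// adding 1 to Q and subtracting Y from R) account for the estimate being
// slightly too small.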
5131
5132 // Initial estimate of inv(y).
5133 auto FloatY = B.buildUITOFP(S32, Y);
5134 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
5135 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5136 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5137 auto Z = B.buildFPTOUI(S32, ScaledY);
5138
5139 // One round of UNR.
5140 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5141 auto NegYZ = B.buildMul(S32, NegY, Z);
5142 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5143
5144 // Quotient/remainder estimate.
5145 auto Q = B.buildUMulH(S32, X, Z);
5146 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5147
5148 // First quotient/remainder refinement.
5149 auto One = B.buildConstant(S32, 1);
5150 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5151 if (DstDivReg)
5152 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5153 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5154
5155 // Second quotient/remainder refinement.
5156 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5157 if (DstDivReg)
5158 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5159
5160 if (DstRemReg)
5161 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5162}
5163
5164// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5165//
5166// Return lo, hi of result
5167//
5168// %cvt.lo = G_UITOFP Val.lo
5169// %cvt.hi = G_UITOFP Val.hi
5170// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5171// %rcp = G_AMDGPU_RCP_IFLAG %mad
5172// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5173// %mul2 = G_FMUL %mul1, 2**(-32)
5174// %trunc = G_INTRINSIC_TRUNC %mul2
5175// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5176// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5177static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5178 Register Val) {
5179 const LLT S32 = LLT::scalar(32);
5180 auto Unmerge = B.buildUnmerge(S32, Val);
5181
5182 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5183 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5184
5185 auto Mad = B.buildFMAD(
5186 S32, CvtHi, // 2**32
5187 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5188
5189 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5190 auto Mul1 = B.buildFMul(
5191 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5192
5193 // 2**(-32)
5194 auto Mul2 = B.buildFMul(
5195 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5196 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5197
5198 // -(2**32)
5199 auto Mad2 = B.buildFMAD(
5200 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5201 Mul1);
5202
5203 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5204 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5205
5206 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5207}
5208
5210 Register DstDivReg,
5211 Register DstRemReg,
5212 Register Numer,
5213 Register Denom) const {
5214 const LLT S32 = LLT::scalar(32);
5215 const LLT S64 = LLT::scalar(64);
5216 const LLT S1 = LLT::scalar(1);
5217 Register RcpLo, RcpHi;
5218
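// 64-bit analogue of the 32-bit expansion above: start from a fixed-point
// reciprocal estimate of the denominator (emitReciprocalU64), refine it with
// two Newton-Raphson style steps (the Add1/Add2 sequences), form the quotient
// estimate MulHi3 = umulh(Numer, Add2), and apply up to two conditional
// corrections (the C3 and C6 selects) to the quotient and remainder.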
5219 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5220
5221 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5222
5223 auto Zero64 = B.buildConstant(S64, 0);
5224 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5225
5226 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5227 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5228
5229 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5230 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5231 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5232
5233 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5234 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5235 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5236
5237 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5238 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5239 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5240 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5241 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5242
5243 auto Zero32 = B.buildConstant(S32, 0);
5244 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5245 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5246 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5247
5248 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5249 Register NumerLo = UnmergeNumer.getReg(0);
5250 Register NumerHi = UnmergeNumer.getReg(1);
5251
5252 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5253 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5254 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5255 Register Mul3_Lo = UnmergeMul3.getReg(0);
5256 Register Mul3_Hi = UnmergeMul3.getReg(1);
5257 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5258 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5259 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5260 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5261
5262 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5263 Register DenomLo = UnmergeDenom.getReg(0);
5264 Register DenomHi = UnmergeDenom.getReg(1);
5265
5266 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5267 auto C1 = B.buildSExt(S32, CmpHi);
5268
5269 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5270 auto C2 = B.buildSExt(S32, CmpLo);
5271
5272 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5273 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5274
5275 // TODO: Here and below, portions of the code can be enclosed in if/endif.
5276 // Currently control flow is unconditional and we have 4 selects after the
5277 // potential endif to substitute PHIs.
5278
5279 // if C3 != 0 ...
5280 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5281 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5282 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5283 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5284
5285 auto One64 = B.buildConstant(S64, 1);
5286 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5287
5288 auto C4 =
5289 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5290 auto C5 =
5291 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5292 auto C6 = B.buildSelect(
5293 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5294
5295 // if (C6 != 0)
5296 auto Add4 = B.buildAdd(S64, Add3, One64);
5297 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5298
5299 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5300 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5301 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5302
5303 // endif C6
5304 // endif C3
5305
5306 if (DstDivReg) {
5307 auto Sel1 = B.buildSelect(
5308 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5309 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5310 Sel1, MulHi3);
5311 }
5312
5313 if (DstRemReg) {
5314 auto Sel2 = B.buildSelect(
5315 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5316 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5317 Sel2, Sub1);
5318 }
5319}
5320
5321bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
5322 MachineRegisterInfo &MRI,
5323 MachineIRBuilder &B) const {
5324 Register DstDivReg, DstRemReg;
5325 switch (MI.getOpcode()) {
5326 default:
5327 llvm_unreachable("Unexpected opcode!");
5328 case AMDGPU::G_UDIV: {
5329 DstDivReg = MI.getOperand(0).getReg();
5330 break;
5331 }
5332 case AMDGPU::G_UREM: {
5333 DstRemReg = MI.getOperand(0).getReg();
5334 break;
5335 }
5336 case AMDGPU::G_UDIVREM: {
5337 DstDivReg = MI.getOperand(0).getReg();
5338 DstRemReg = MI.getOperand(1).getReg();
5339 break;
5340 }
5341 }
5342
5343 const LLT S64 = LLT::scalar(64);
5344 const LLT S32 = LLT::scalar(32);
5345 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5346 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5347 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5348 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5349
5350 if (Ty == S32)
5351 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5352 else if (Ty == S64)
5353 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5354 else
5355 return false;
5356
5357 MI.eraseFromParent();
5358 return true;
5359}
5360
5361bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
5362 MachineRegisterInfo &MRI,
5363 MachineIRBuilder &B) const {
5364 const LLT S64 = LLT::scalar(64);
5365 const LLT S32 = LLT::scalar(32);
5366
5367 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5368 if (Ty != S32 && Ty != S64)
5369 return false;
5370
5371 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5372 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5373 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5374
5375 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5376 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5377 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5378
5379 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5380 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5381
5382 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5383 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5384
5385 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5386 switch (MI.getOpcode()) {
5387 default:
5388 llvm_unreachable("Unexpected opcode!");
5389 case AMDGPU::G_SDIV: {
5390 DstDivReg = MI.getOperand(0).getReg();
5391 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5392 break;
5393 }
5394 case AMDGPU::G_SREM: {
5395 DstRemReg = MI.getOperand(0).getReg();
5396 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5397 break;
5398 }
5399 case AMDGPU::G_SDIVREM: {
5400 DstDivReg = MI.getOperand(0).getReg();
5401 DstRemReg = MI.getOperand(1).getReg();
5402 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5403 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5404 break;
5405 }
5406 }
5407
5408 if (Ty == S32)
5409 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5410 else
5411 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5412
5413 if (DstDivReg) {
5414 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5415 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5416 B.buildSub(DstDivReg, SignXor, Sign);
5417 }
5418
5419 if (DstRemReg) {
5420 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5421 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5422 B.buildSub(DstRemReg, SignXor, Sign);
5423 }
5424
5425 MI.eraseFromParent();
5426 return true;
5427}
5428
5429bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
5430 MachineRegisterInfo &MRI,
5431 MachineIRBuilder &B) const {
5432 Register Res = MI.getOperand(0).getReg();
5433 Register LHS = MI.getOperand(1).getReg();
5434 Register RHS = MI.getOperand(2).getReg();
5435 uint16_t Flags = MI.getFlags();
5436 LLT ResTy = MRI.getType(Res);
5437
5438 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5439
5440 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5441 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5442 return false;
5443
5444 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5445 // the CI documentation have a worst-case error of 1 ulp.
5446 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5447 // use it as long as we aren't trying to use denormals.
5448 //
5449 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
5450
5451 // 1 / x -> RCP(x)
5452 if (CLHS->isExactlyValue(1.0)) {
5453 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5454 .addUse(RHS)
5455 .setMIFlags(Flags);
5456
5457 MI.eraseFromParent();
5458 return true;
5459 }
5460
5461 // -1 / x -> RCP( FNEG(x) )
5462 if (CLHS->isExactlyValue(-1.0)) {
5463 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5464 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5465 .addUse(FNeg.getReg(0))
5466 .setMIFlags(Flags);
5467
5468 MI.eraseFromParent();
5469 return true;
5470 }
5471 }
5472
5473 // For f16 require afn or arcp.
5474 // For f32 require afn.
5475 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5476 !MI.getFlag(MachineInstr::FmArcp)))
5477 return false;
5478
5479 // x / y -> x * (1.0 / y)
5480 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5481 .addUse(RHS)
5482 .setMIFlags(Flags);
5483 B.buildFMul(Res, LHS, RCP, Flags);
5484
5485 MI.eraseFromParent();
5486 return true;
5487}
5488
5489bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5490 MachineRegisterInfo &MRI,
5491 MachineIRBuilder &B) const {
5492 Register Res = MI.getOperand(0).getReg();
5493 Register X = MI.getOperand(1).getReg();
5494 Register Y = MI.getOperand(2).getReg();
5495 uint16_t Flags = MI.getFlags();
5496 LLT ResTy = MRI.getType(Res);
5497
5498 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5499
5500 if (!AllowInaccurateRcp)
5501 return false;
5502
5503 const ConstantFP *CLHS = getConstantFPVRegVal(X, MRI);
5504 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
5505
5506 // Pull out the negation so it folds for free into the source modifiers.
5507 if (IsNegRcp)
5508 X = B.buildFConstant(ResTy, 1.0).getReg(0);
5509
5510 Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
5511 auto One = B.buildFConstant(ResTy, 1.0);
5512
5513 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5514 .addUse(Y)
5515 .setMIFlags(Flags);
5516 if (IsNegRcp)
5517 R = B.buildFNeg(ResTy, R);
5518
5519 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5520 R = B.buildFMA(ResTy, Tmp0, R, R);
5521
5522 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5523 R = B.buildFMA(ResTy, Tmp1, R, R);
5524
5525 // Skip the last 2 correction terms for reciprocal.
5526 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0))) {
5527 B.buildCopy(Res, R);
5528 MI.eraseFromParent();
5529 return true;
5530 }
5531
5532 auto Ret = B.buildFMul(ResTy, X, R);
5533 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5534
5535 B.buildFMA(Res, Tmp2, R, Ret);
5536 MI.eraseFromParent();
5537 return true;
5538}
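// In short, the expansion above computes r = rcp(y) and refines it with two
// Newton-Raphson style steps of the form r = fma(fma(-y, r, 1.0), r, r). For
// 1.0/y and -1.0/y the refined (pre-negated) reciprocal is copied to the
// result directly; otherwise q = x * r is corrected once more via
// q = fma(fma(-y, q, x), r, q).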
5539
5540bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
5541 MachineRegisterInfo &MRI,
5542 MachineIRBuilder &B) const {
5543 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5544 return true;
5545
5546 Register Res = MI.getOperand(0).getReg();
5547 Register LHS = MI.getOperand(1).getReg();
5548 Register RHS = MI.getOperand(2).getReg();
5549
5550 uint16_t Flags = MI.getFlags();
5551
5552 LLT S16 = LLT::scalar(16);
5553 LLT S32 = LLT::scalar(32);
5554
5555 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5556 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5557 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5558 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5559 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5560 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5561 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5562 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5563 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5564 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5565 // q16.u = opx(V_CVT_F16_F32, q32.u);
5566 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5567
5568 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5569 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5570 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5571 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5572 .addUse(RHSExt.getReg(0))
5573 .setMIFlags(Flags);
5574 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5575 MachineInstrBuilder Err;
5576 if (ST.hasMadMacF32Insts()) {
5577 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5578 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5579 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5580 } else {
5581 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5582 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5583 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5584 }
5585 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5586 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5587 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5588 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5589 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5590 .addUse(RDst.getReg(0))
5591 .addUse(RHS)
5592 .addUse(LHS)
5593 .setMIFlags(Flags);
5594
5595 MI.eraseFromParent();
5596 return true;
5597}
5598
5599static constexpr unsigned SPDenormModeBitField =
5601
5602// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5603// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5604static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5605 const GCNSubtarget &ST,
5606 SIModeRegisterDefaults Mode) {
5607 // Set SP denorm mode to this value.
5608 unsigned SPDenormMode =
5609 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5610
5611 if (ST.hasDenormModeInst()) {
5612 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5613 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5614
5615 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5616 B.buildInstr(AMDGPU::S_DENORM_MODE)
5617 .addImm(NewDenormModeValue);
5618
5619 } else {
5620 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5621 .addImm(SPDenormMode)
5622 .addImm(SPDenormModeBitField);
5623 }
5624}
5625
5626bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5627 MachineRegisterInfo &MRI,
5628 MachineIRBuilder &B) const {
5629 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5630 return true;
5631
5632 Register Res = MI.getOperand(0).getReg();
5633 Register LHS = MI.getOperand(1).getReg();
5634 Register RHS = MI.getOperand(2).getReg();
5635 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5636 SIModeRegisterDefaults Mode = MFI->getMode();
5637
5638 uint16_t Flags = MI.getFlags();
5639
5640 LLT S32 = LLT::scalar(32);
5641 LLT S1 = LLT::scalar(1);
5642
5643 auto One = B.buildFConstant(S32, 1.0f);
5644
5645 auto DenominatorScaled =
5646 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5647 .addUse(LHS)
5648 .addUse(RHS)
5649 .addImm(0)
5650 .setMIFlags(Flags);
5651 auto NumeratorScaled =
5652 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5653 .addUse(LHS)
5654 .addUse(RHS)
5655 .addImm(1)
5656 .setMIFlags(Flags);
5657
5658 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5659 .addUse(DenominatorScaled.getReg(0))
5660 .setMIFlags(Flags);
5661 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5662
5663 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5664 const bool HasDynamicDenormals =
5665 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5666 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5667
5668 Register SavedSPDenormMode;
5669 if (!PreservesDenormals) {
5670 if (HasDynamicDenormals) {
5671 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5672 B.buildInstr(AMDGPU::S_GETREG_B32)
5673 .addDef(SavedSPDenormMode)
5674 .addImm(SPDenormModeBitField);
5675 }
5676 toggleSPDenormMode(true, B, ST, Mode);
5677 }
5678
5679 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5680 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5681 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5682 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5683 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5684 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5685
5686 if (!PreservesDenormals) {
5687 if (HasDynamicDenormals) {
5688 assert(SavedSPDenormMode);
5689 B.buildInstr(AMDGPU::S_SETREG_B32)
5690 .addReg(SavedSPDenormMode)
5691 .addImm(SPDenormModeBitField);
5692 } else
5693 toggleSPDenormMode(false, B, ST, Mode);
5694 }
5695
5696 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5697 .addUse(Fma4.getReg(0))
5698 .addUse(Fma1.getReg(0))
5699 .addUse(Fma3.getReg(0))
5700 .addUse(NumeratorScaled.getReg(1))
5701 .setMIFlags(Flags);
5702
5703 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5704 .addUse(Fmas.getReg(0))
5705 .addUse(RHS)
5706 .addUse(LHS)
5707 .setMIFlags(Flags);
5708
5709 MI.eraseFromParent();
5710 return true;
5711}
5712
5713bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5714 MachineRegisterInfo &MRI,
5715 MachineIRBuilder &B) const {
5716 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5717 return true;
5718
5719 Register Res = MI.getOperand(0).getReg();
5720 Register LHS = MI.getOperand(1).getReg();
5721 Register RHS = MI.getOperand(2).getReg();
5722
5723 uint16_t Flags = MI.getFlags();
5724
5725 LLT S64 = LLT::scalar(64);
5726 LLT S1 = LLT::scalar(1);
5727
5728 auto One = B.buildFConstant(S64, 1.0);
5729
5730 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5731 .addUse(LHS)
5732 .addUse(RHS)
5733 .addImm(0)
5734 .setMIFlags(Flags);
5735
5736 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5737
5738 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5739 .addUse(DivScale0.getReg(0))
5740 .setMIFlags(Flags);
5741
5742 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5743 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5744 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5745
5746 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5747 .addUse(LHS)
5748 .addUse(RHS)
5749 .addImm(1)
5750 .setMIFlags(Flags);
5751
5752 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5753 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5754 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5755
5756 Register Scale;
5757 if (!ST.hasUsableDivScaleConditionOutput()) {
5758 // Workaround a hardware bug on SI where the condition output from div_scale
5759 // is not usable.
5760
5761 LLT S32 = LLT::scalar(32);
5762
5763 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5764 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5765 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5766 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5767
5768 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5769 Scale1Unmerge.getReg(1));
5770 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5771 Scale0Unmerge.getReg(1));
5772 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5773 } else {
5774 Scale = DivScale1.getReg(1);
5775 }
5776
5777 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5778 .addUse(Fma4.getReg(0))
5779 .addUse(Fma3.getReg(0))
5780 .addUse(Mul.getReg(0))
5781 .addUse(Scale)
5782 .setMIFlags(Flags);
5783
5784 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5785 .addUse(Fmas.getReg(0))
5786 .addUse(RHS)
5787 .addUse(LHS)
5788 .setMIFlags(Flags);
5789
5790 MI.eraseFromParent();
5791 return true;
5792}
5793
5794bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5795 MachineRegisterInfo &MRI,
5796 MachineIRBuilder &B) const {
5797 Register Res0 = MI.getOperand(0).getReg();
5798 Register Res1 = MI.getOperand(1).getReg();
5799 Register Val = MI.getOperand(2).getReg();
5800 uint16_t Flags = MI.getFlags();
5801
5802 LLT Ty = MRI.getType(Res0);
5803 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5804
5805 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5806 .addUse(Val)
5807 .setMIFlags(Flags);
5808 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5809 .addUse(Val)
5810 .setMIFlags(Flags);
5811
5812 if (ST.hasFractBug()) {
5813 auto Fabs = B.buildFAbs(Ty, Val);
5814 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5815 auto IsFinite =
5816 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5817 auto Zero = B.buildConstant(InstrExpTy, 0);
5818 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5819 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5820 }
5821
5822 B.buildCopy(Res0, Mant);
5823 B.buildSExtOrTrunc(Res1, Exp);
5824
5825 MI.eraseFromParent();
5826 return true;
5827}
5828
5829bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5830 MachineRegisterInfo &MRI,
5831 MachineIRBuilder &B) const {
5832 Register Res = MI.getOperand(0).getReg();
5833 Register LHS = MI.getOperand(2).getReg();
5834 Register RHS = MI.getOperand(3).getReg();
5835 uint16_t Flags = MI.getFlags();
5836
5837 LLT S32 = LLT::scalar(32);
5838 LLT S1 = LLT::scalar(1);
5839
5840 auto Abs = B.buildFAbs(S32, RHS, Flags);
5841 const APFloat C0Val(1.0f);
5842
5843 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5844 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5845 auto C2 = B.buildFConstant(S32, 1.0f);
5846
5847 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5848 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5849
5850 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5851
5852 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5853 .addUse(Mul0.getReg(0))
5854 .setMIFlags(Flags);
5855
5856 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5857
5858 B.buildFMul(Res, Sel, Mul1, Flags);
5859
5860 MI.eraseFromParent();
5861 return true;
5862}
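// Worked through, the sequence above computes
//   s = |rhs| > 2**96 ? 2**(-32) : 1.0
//   res = s * (lhs * rcp(rhs * s))
// which is algebraically lhs / rhs; the conditional pre-scale keeps the
// intermediate rcp and product in range when the denominator is very large.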
5863
5864bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5865 MachineRegisterInfo &MRI,
5866 MachineIRBuilder &B) const {
5867 // Bypass the correct expansion that a standard promotion through G_FSQRT
5868 // would get. The f32 op is accurate enough for the f16 case.
5869 unsigned Flags = MI.getFlags();
5870 assert(!ST.has16BitInsts());
5871 const LLT F32 = LLT::scalar(32);
5872 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5873 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5874 .addUse(Ext.getReg(0))
5875 .setMIFlags(Flags);
5876 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5877 MI.eraseFromParent();
5878 return true;
5879}
5880
5881bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5882 MachineRegisterInfo &MRI,
5883 MachineIRBuilder &B) const {
5884 MachineFunction &MF = B.getMF();
5885 Register Dst = MI.getOperand(0).getReg();
5886 Register X = MI.getOperand(1).getReg();
5887 const unsigned Flags = MI.getFlags();
5888 const LLT S1 = LLT::scalar(1);
5889 const LLT F32 = LLT::scalar(32);
5890 const LLT I32 = LLT::scalar(32);
5891
5892 if (allowApproxFunc(MF, Flags)) {
5893 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5894 .addUse(X)
5895 .setMIFlags(Flags);
5896 MI.eraseFromParent();
5897 return true;
5898 }
5899
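 // If x is below 2**(-96) it is scaled up by 2**32 before taking the square
 // root; since sqrt halves the exponent, the result is scaled back down by
 // 2**(-16) (ScaleDownFactor below), e.g. sqrt(x * 2**32) * 2**(-16) == sqrt(x).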
5900 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5901 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5902 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5903 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5904 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5905 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5905
5907 if (needsDenormHandlingF32(MF, X, Flags)) {
5908 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5909 .addUse(SqrtX.getReg(0))
5910 .setMIFlags(Flags);
5911
5912 auto NegOne = B.buildConstant(I32, -1);
5913 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5914
5915 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5916 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5917
5918 auto PosOne = B.buildConstant(I32, 1);
5919 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5920
5921 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5922 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5923
5924 auto Zero = B.buildFConstant(F32, 0.0f);
5925 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5926
5927 SqrtS =
5928 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5929
5930 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5931 SqrtS =
5932 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5933 } else {
5934 auto SqrtR =
5935 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5936 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5937
5938 auto Half = B.buildFConstant(F32, 0.5f);
5939 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5940 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5941 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5942 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5943 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5944 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5945 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5946 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5947 }
5948
5949 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5950
5951 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5952
5953 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5954
5955 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5956 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5957
5958 MI.eraseFromParent();
5959 return true;
5960}
5961
5962bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5963 MachineRegisterInfo &MRI,
5964 MachineIRBuilder &B) const {
5965 // For double type, the SQRT and RSQ instructions don't have required
5966 // precision, we apply Goldschmidt's algorithm to improve the result:
5967 //
5968 // y0 = rsq(x)
5969 // g0 = x * y0
5970 // h0 = 0.5 * y0
5971 //
5972 // r0 = 0.5 - h0 * g0
5973 // g1 = g0 * r0 + g0
5974 // h1 = h0 * r0 + h0
5975 //
5976 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5977 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5978 // h2 = h1 * r1 + h1
5979 //
5980 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5981 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5982 //
5983 // sqrt(x) = g3
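 //
 // The rewrites on the right-hand side use h ~= 0.5/sqrt(x) and g ~= sqrt(x),
 // so x*h ~= 0.5*g and hence g*(0.5 - h*g) ~= (x - g*g)*h; computing the
 // correction from d = x - g*g lets the later iterations reuse x directly.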
5984
5985 const LLT S1 = LLT::scalar(1);
5986 const LLT S32 = LLT::scalar(32);
5987 const LLT F64 = LLT::scalar(64);
5988
5989 Register Dst = MI.getOperand(0).getReg();
5990 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5991
5992 Register X = MI.getOperand(1).getReg();
5993 unsigned Flags = MI.getFlags();
5994
5995 Register SqrtX = X;
5996 Register Scaling, ZeroInt;
5997 if (!MI.getFlag(MachineInstr::FmAfn)) {
5998 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5999
6000 ZeroInt = B.buildConstant(S32, 0).getReg(0);
6001 Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0);
6002
6003 // Scale up input if it is too small.
6004 auto ScaleUpFactor = B.buildConstant(S32, 256);
6005 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
6006 SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
6007 }
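 // As in the f32 path, ldexp(x, 256) pre-scales a tiny input and the result is
 // rescaled with ldexp(..., -128) further down, since the square root halves
 // the exponent: sqrt(x * 2**256) * 2**(-128) == sqrt(x).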
6008
6009 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
6010
6011 auto Half = B.buildFConstant(F64, 0.5);
6012 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
6013 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
6014
6015 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
6016 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
6017
6018 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
6019 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
6020
6021 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
6022 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6023
6024 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6025
6026 Register SqrtRet = SqrtS2.getReg(0);
6027 if (!MI.getFlag(MachineInstr::FmAfn)) {
6028 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6029 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6030 auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6031
6032 // Scale down the result.
6033 auto ScaleDownFactor = B.buildConstant(S32, -128);
6034 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6035 SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6036 }
6037
6038 Register IsZeroOrInf;
6039 if (MI.getFlag(MachineInstr::FmNoInfs)) {
6040 auto ZeroFP = B.buildFConstant(F64, 0.0);
6041 IsZeroOrInf = B.buildFCmp(FCmpInst::FCMP_OEQ, S1, SqrtX, ZeroFP).getReg(0);
6042 } else {
6043 IsZeroOrInf = B.buildIsFPClass(S1, SqrtX, fcZero | fcPosInf).getReg(0);
6044 }
6045
6046 // TODO: Check for DAZ and expand to subnormals
6047
6048 // If x is +INF, +0, or -0, use its original value
6049 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6050
6051 MI.eraseFromParent();
6052 return true;
6053}
6054
6055bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
6056 MachineRegisterInfo &MRI,
6057 MachineIRBuilder &B) const {
6058 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
6059 if (Ty == LLT::scalar(32))
6060 return legalizeFSQRTF32(MI, MRI, B);
6061 if (Ty == LLT::scalar(64))
6062 return legalizeFSQRTF64(MI, MRI, B);
6063 if (Ty == LLT::scalar(16))
6064 return legalizeFSQRTF16(MI, MRI, B);
6065 return false;
6066}
6067
6068// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6069// FIXME: Why do we handle this one but not other removed instructions?
6070//
6071// Reciprocal square root. The clamp prevents infinite results, clamping
6072// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6073// +-max_float.
6074bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
6075 MachineRegisterInfo &MRI,
6076 MachineIRBuilder &B) const {
6077 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6078 return true;
6079
6080 Register Dst = MI.getOperand(0).getReg();
6081 Register Src = MI.getOperand(2).getReg();
6082 auto Flags = MI.getFlags();
6083
6084 LLT Ty = MRI.getType(Dst);
6085
6086 const fltSemantics *FltSemantics;
6087 if (Ty == LLT::scalar(32))
6088 FltSemantics = &APFloat::IEEEsingle();
6089 else if (Ty == LLT::scalar(64))
6090 FltSemantics = &APFloat::IEEEdouble();
6091 else
6092 return false;
6093
6094 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6095 .addUse(Src)
6096 .setMIFlags(Flags);
6097
6098 // We don't need to concern ourselves with the snan handling difference, since
6099 // the rsq is quieted (or not) either way, so use the form which will directly select.
6100 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6101 const bool UseIEEE = MFI->getMode().IEEE;
6102
6103 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
6104 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6105 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6106
6107 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
6108
6109 if (UseIEEE)
6110 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6111 else
6112 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6113 MI.eraseFromParent();
6114 return true;
6115}
6116
6117// TODO: Fix pointer type handling
6118bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
6119 MachineInstr &MI,
6120 Intrinsic::ID IID) const {
6121
6122 MachineIRBuilder &B = Helper.MIRBuilder;
6123 MachineRegisterInfo &MRI = *B.getMRI();
6124
6125 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6126 IID == Intrinsic::amdgcn_permlanex16;
6127 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6128 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6129
6130 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6131 Register Src2, LLT VT) -> Register {
6132 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6133 switch (IID) {
6134 case Intrinsic::amdgcn_readfirstlane:
6135 case Intrinsic::amdgcn_permlane64:
6136 return LaneOp.getReg(0);
6137 case Intrinsic::amdgcn_readlane:
6138 case Intrinsic::amdgcn_set_inactive:
6139 case Intrinsic::amdgcn_set_inactive_chain_arg:
6140 return LaneOp.addUse(Src1).getReg(0);
6141 case Intrinsic::amdgcn_writelane:
6142 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6143 case Intrinsic::amdgcn_permlane16:
6144 case Intrinsic::amdgcn_permlanex16: {
6145 Register Src3 = MI.getOperand(5).getReg();
6146 int64_t Src4 = MI.getOperand(6).getImm();
6147 int64_t Src5 = MI.getOperand(7).getImm();
6148 return LaneOp.addUse(Src1)
6149 .addUse(Src2)
6150 .addUse(Src3)
6151 .addImm(Src4)
6152 .addImm(Src5)
6153 .getReg(0);
6154 }
6155 case Intrinsic::amdgcn_mov_dpp8:
6156 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6157 case Intrinsic::amdgcn_update_dpp:
6158 return LaneOp.addUse(Src1)
6159 .addImm(MI.getOperand(4).getImm())
6160 .addImm(MI.getOperand(5).getImm())
6161 .addImm(MI.getOperand(6).getImm())
6162 .addImm(MI.getOperand(7).getImm())
6163 .getReg(0);
6164 default:
6165 llvm_unreachable("unhandled lane op");
6166 }
6167 };
6168
6169 Register DstReg = MI.getOperand(0).getReg();
6170 Register Src0 = MI.getOperand(2).getReg();
6171 Register Src1, Src2;
6172 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6173 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6174 Src1 = MI.getOperand(3).getReg();
6175 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
6176 Src2 = MI.getOperand(4).getReg();
6177 }
6178 }
6179
6180 LLT Ty = MRI.getType(DstReg);
6181 unsigned Size = Ty.getSizeInBits();
6182
6183 unsigned SplitSize = 32;
6184 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6185 ST.hasDPALU_DPP() &&
6186 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6187 SplitSize = 64;
6188
6189 if (Size == SplitSize) {
6190 // Already legal
6191 return true;
6192 }
6193
6194 if (Size < 32) {
6195 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6196
6197 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6198 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6199
6200 if (IID == Intrinsic::amdgcn_writelane)
6201 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6202
6203 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6204 B.buildTrunc(DstReg, LaneOpDst);
6205 MI.eraseFromParent();
6206 return true;
6207 }
6208
6209 if (Size % SplitSize != 0)
6210 return false;
6211
6212 LLT PartialResTy = LLT::scalar(SplitSize);
6213 bool NeedsBitcast = false;
6214 if (Ty.isVector()) {
6215 LLT EltTy = Ty.getElementType();
6216 unsigned EltSize = EltTy.getSizeInBits();
6217 if (EltSize == SplitSize) {
6218 PartialResTy = EltTy;
6219 } else if (EltSize == 16 || EltSize == 32) {
6220 unsigned NElem = SplitSize / EltSize;
6221 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6222 } else {
6223 // Handle all other cases via S32/S64 pieces
6224 NeedsBitcast = true;
6225 }
6226 }
6227
6228 SmallVector<Register, 4> PartialRes;
6229 unsigned NumParts = Size / SplitSize;
6230 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6231 MachineInstrBuilder Src1Parts, Src2Parts;
6232
6233 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6234 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6235
6236 if (IID == Intrinsic::amdgcn_writelane)
6237 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6238
6239 for (unsigned i = 0; i < NumParts; ++i) {
6240 Src0 = Src0Parts.getReg(i);
6241
6242 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6243 Src1 = Src1Parts.getReg(i);
6244
6245 if (IID == Intrinsic::amdgcn_writelane)
6246 Src2 = Src2Parts.getReg(i);
6247
6248 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6249 }
6250
6251 if (NeedsBitcast)
6252 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6253 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6254 else
6255 B.buildMergeLikeInstr(DstReg, PartialRes);
6256
6257 MI.eraseFromParent();
6258 return true;
6259}
6260
6261bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
6262 MachineRegisterInfo &MRI,
6263 MachineIRBuilder &B) const {
6264 uint64_t Offset =
6265 ST.getTargetLowering()->getImplicitParameterOffset(
6266 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
6267 LLT DstTy = MRI.getType(DstReg);
6268 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6269
6270 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6271 if (!loadInputValue(KernargPtrReg, B,
6272 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6273 return false;
6274
6275 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6276 B.buildConstant(IdxTy, Offset).getReg(0));
6277 return true;
6278}
6279
6280/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6281/// bits of the pointer and replace them with the stride argument, then
6282/// merge_values everything together. In the common case of a raw buffer (the
6283/// stride component is 0), we can just AND off the upper half.
6284bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
6285 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6286 Register Result = MI.getOperand(0).getReg();
6287 Register Pointer = MI.getOperand(2).getReg();
6288 Register Stride = MI.getOperand(3).getReg();
6289 Register NumRecords = MI.getOperand(4).getReg();
6290 Register Flags = MI.getOperand(5).getReg();
6291
6292 LLT S32 = LLT::scalar(32);
6293 LLT S64 = LLT::scalar(64);
6294
6295 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6296
6297 auto ExtStride = B.buildAnyExt(S32, Stride);
6298
6299 if (ST.has45BitNumRecordsBufferResource()) {
6300 Register Zero = B.buildConstant(S32, 0).getReg(0);
6301 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6302 // num_records.
6303 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6304 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6305 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6306 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6307 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6308
6309 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6310 // 6-bit zero (omitted), 16-bit stride and scale, and 4-bit flags.
6311 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6312 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6313 auto ExtShiftedStride =
6314 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6315 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6316 auto ExtShiftedFlags =
6317 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6318 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6319 Register HighHalf =
6320 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6321 B.buildMergeValues(Result, {LowHalf, HighHalf});
6322 } else {
6323 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6324 auto Unmerge = B.buildUnmerge(S32, Pointer);
6325 auto LowHalf = Unmerge.getReg(0);
6326 auto HighHalf = Unmerge.getReg(1);
6327
6328 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6329 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6330 auto ShiftConst = B.buildConstant(S32, 16);
6331 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6332 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6333 Register NewHighHalfReg = NewHighHalf.getReg(0);
6334 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6335 }
6336
6337 MI.eraseFromParent();
6338 return true;
6339}
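// For subtargets without 45-bit num_records (the else branch above), the
// resulting descriptor is the usual four dwords, roughly:
//   word0 = base[31:0]
//   word1 = (base[63:32] & 0xffff) | (stride << 16)
//   word2 = num_records
//   word3 = flags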
6340
6341bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
6342 MachineRegisterInfo &MRI,
6343 MachineIRBuilder &B) const {
6344 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6345 if (!MFI->isEntryFunction()) {
6346 return legalizePreloadedArgIntrin(MI, MRI, B,
6347 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6348 }
6349
6350 Register DstReg = MI.getOperand(0).getReg();
6351 if (!getImplicitArgPtr(DstReg, MRI, B))
6352 return false;
6353
6354 MI.eraseFromParent();
6355 return true;
6356}
6357
6358bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
6359 MachineRegisterInfo &MRI,
6360 MachineIRBuilder &B) const {
6361 Function &F = B.getMF().getFunction();
6362 std::optional<uint32_t> KnownSize =
6363 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
6364 if (KnownSize.has_value())
6365 B.buildConstant(DstReg, *KnownSize);
6366 return false;
6367}
6368
6369bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
6370 MachineRegisterInfo &MRI,
6371 MachineIRBuilder &B) const {
6372
6373 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6374 if (!MFI->isEntryFunction()) {
6375 return legalizePreloadedArgIntrin(MI, MRI, B,
6376 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6377 }
6378
6379 Register DstReg = MI.getOperand(0).getReg();
6380 if (!getLDSKernelId(DstReg, MRI, B))
6381 return false;
6382
6383 MI.eraseFromParent();
6384 return true;
6385}
6386
6387bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
6388 MachineRegisterInfo &MRI,
6389 MachineIRBuilder &B,
6390 unsigned AddrSpace) const {
6391 const LLT S32 = LLT::scalar(32);
6392 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6393 Register Hi32 = Unmerge.getReg(1);
6394
6395 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6396 ST.hasGloballyAddressableScratch()) {
6397 Register FlatScratchBaseHi =
6398 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6399 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6400 .getReg(0);
6401 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6402 // Test bits 63..58 against the aperture address.
6403 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6404 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6405 B.buildConstant(S32, 1u << 26));
6406 } else {
6407 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6408 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6409 }
6410 MI.eraseFromParent();
6411 return true;
6412}
6413
6414// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6415// offset (the offset that is included in bounds checking and swizzling, to be
6416// split between the instruction's voffset and immoffset fields) and soffset
6417// (the offset that is excluded from bounds checking and swizzling, to go in
6418// the instruction's soffset field). This function takes the first kind of
6419// offset and figures out how to split it between voffset and immoffset.
6420std::pair<Register, unsigned>
6421AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
6422 Register OrigOffset) const {
6423 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6424 Register BaseReg;
6425 unsigned ImmOffset;
6426 const LLT S32 = LLT::scalar(32);
6427 MachineRegisterInfo &MRI = *B.getMRI();
6428
6429 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6430 // being added, so we can only safely match a 32-bit addition with no unsigned
6431 // overflow.
6432 bool CheckNUW = ST.hasGFX1250Insts();
6433 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6434 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6435
6436 // If BaseReg is a pointer, convert it to int.
6437 if (MRI.getType(BaseReg).isPointer())
6438 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6439
6440 // If the immediate value is too big for the immoffset field, put only bits
6441 // that would normally fit in the immoffset field. The remaining value that
6442 // is copied/added for the voffset field is a large power of 2, and it
6443 // stands more chance of being CSEd with the copy/add for another similar
6444 // load/store.
6445 // However, do not do that rounding down if that is a negative
6446 // number, as it appears to be illegal to have a negative offset in the
6447 // vgpr, even if adding the immediate offset makes it positive.
6448 unsigned Overflow = ImmOffset & ~MaxImm;
6449 ImmOffset -= Overflow;
6450 if ((int32_t)Overflow < 0) {
6451 Overflow += ImmOffset;
6452 ImmOffset = 0;
6453 }
6454
6455 if (Overflow != 0) {
6456 if (!BaseReg) {
6457 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6458 } else {
6459 auto OverflowVal = B.buildConstant(S32, Overflow);
6460 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6461 }
6462 }
6463
6464 if (!BaseReg)
6465 BaseReg = B.buildConstant(S32, 0).getReg(0);
6466
6467 return std::pair(BaseReg, ImmOffset);
6468}
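// Worked example (assuming a 4095 maximum immediate offset, as on most
// subtargets): a constant offset of 4100 is split into ImmOffset = 4 plus a
// voffset of 4096, while an offset of 4095 stays entirely in the immediate.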
6469
6470/// Handle register layout difference for f16 images for some subtargets.
6473 Register Reg,
6474 bool ImageStore) const {
6475 const LLT S16 = LLT::scalar(16);
6476 const LLT S32 = LLT::scalar(32);
6477 LLT StoreVT = MRI.getType(Reg);
6478 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6479
6480 if (ST.hasUnpackedD16VMem()) {
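 // e.g. on unpacked-D16 subtargets a <4 x s16> store value becomes a
 // <4 x s32> build_vector with each element any-extended to 32 bits.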
6481 auto Unmerge = B.buildUnmerge(S16, Reg);
6482
6483 SmallVector<Register, 4> WideRegs;
6484 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6485 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6486
6487 int NumElts = StoreVT.getNumElements();
6488
6489 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6490 .getReg(0);
6491 }
6492
6493 if (ImageStore && ST.hasImageStoreD16Bug()) {
6494 if (StoreVT.getNumElements() == 2) {
6495 SmallVector<Register, 4> PackedRegs;
6496 Reg = B.buildBitcast(S32, Reg).getReg(0);
6497 PackedRegs.push_back(Reg);
6498 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6499 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6500 .getReg(0);
6501 }
6502
6503 if (StoreVT.getNumElements() == 3) {
6504 SmallVector<Register, 4> PackedRegs;
6505 auto Unmerge = B.buildUnmerge(S16, Reg);
6506 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6507 PackedRegs.push_back(Unmerge.getReg(I));
6508 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6509 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6510 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6511 }
6512
6513 if (StoreVT.getNumElements() == 4) {
6514 SmallVector<Register, 4> PackedRegs;
6515 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6516 auto Unmerge = B.buildUnmerge(S32, Reg);
6517 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6518 PackedRegs.push_back(Unmerge.getReg(I));
6519 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6520 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6521 .getReg(0);
6522 }
6523
6524 llvm_unreachable("invalid data type");
6525 }
6526
6527 if (StoreVT == LLT::fixed_vector(3, S16)) {
6528 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6529 .getReg(0);
6530 }
6531 return Reg;
6532}
6533
6534Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6535 Register VData, LLT MemTy,
6536 bool IsFormat) const {
6537 MachineRegisterInfo *MRI = B.getMRI();
6538 LLT Ty = MRI->getType(VData);
6539
6540 const LLT S16 = LLT::scalar(16);
6541
6542 // Fixup buffer resources themselves needing to be v4i128.
6543 if (hasBufferRsrcWorkaround(Ty))
6544 return castBufferRsrcToV4I32(VData, B);
6545
6546 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6547 Ty = getBitcastRegisterType(Ty);
6548 VData = B.buildBitcast(Ty, VData).getReg(0);
6549 }
6550 // Fixup illegal register types for i8 stores.
6551 if (Ty == LLT::scalar(8) || Ty == S16) {
6552 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6553 return AnyExt;
6554 }
6555
6556 if (Ty.isVector()) {
6557 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6558 if (IsFormat)
6559 return handleD16VData(B, *MRI, VData);
6560 }
6561 }
6562
6563 return VData;
6564}
6565
6567 LegalizerHelper &Helper,
6568 bool IsTyped,
6569 bool IsFormat) const {
6570 MachineIRBuilder &B = Helper.MIRBuilder;
6571 MachineRegisterInfo &MRI = *B.getMRI();
6572
6573 Register VData = MI.getOperand(1).getReg();
6574 LLT Ty = MRI.getType(VData);
6575 LLT EltTy = Ty.getScalarType();
6576 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6577 const LLT S32 = LLT::scalar(32);
6578
6579 MachineMemOperand *MMO = *MI.memoperands_begin();
6580 const int MemSize = MMO->getSize().getValue();
6581 LLT MemTy = MMO->getMemoryType();
6582
6583 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6584
6586 Register RSrc = MI.getOperand(2).getReg();
6587
6588 unsigned ImmOffset;
6589
6590 // The typed intrinsics add an immediate after the registers.
6591 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6592
6593 // The struct intrinsic variants add one additional operand over raw.
6594 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6595 Register VIndex;
6596 int OpOffset = 0;
6597 if (HasVIndex) {
6598 VIndex = MI.getOperand(3).getReg();
6599 OpOffset = 1;
6600 } else {
6601 VIndex = B.buildConstant(S32, 0).getReg(0);
6602 }
6603
6604 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6605 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6606
6607 unsigned Format = 0;
6608 if (IsTyped) {
6609 Format = MI.getOperand(5 + OpOffset).getImm();
6610 ++OpOffset;
6611 }
6612
6613 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6614
6615 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6616
6617 unsigned Opc;
6618 if (IsTyped) {
6619 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6620 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6621 } else if (IsFormat) {
6622 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6623 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6624 } else {
6625 switch (MemSize) {
6626 case 1:
6627 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6628 break;
6629 case 2:
6630 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6631 break;
6632 default:
6633 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6634 break;
6635 }
6636 }
6637
6638 auto MIB = B.buildInstr(Opc)
6639 .addUse(VData) // vdata
6640 .addUse(RSrc) // rsrc
6641 .addUse(VIndex) // vindex
6642 .addUse(VOffset) // voffset
6643 .addUse(SOffset) // soffset
6644 .addImm(ImmOffset); // offset(imm)
6645
6646 if (IsTyped)
6647 MIB.addImm(Format);
6648
6649 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6650 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6651 .addMemOperand(MMO);
6652
6653 MI.eraseFromParent();
6654 return true;
6655}
6656
6657static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6658 Register VIndex, Register VOffset, Register SOffset,
6659 unsigned ImmOffset, unsigned Format,
6660 unsigned AuxiliaryData, MachineMemOperand *MMO,
6661 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6662 auto MIB = B.buildInstr(Opc)
6663 .addDef(LoadDstReg) // vdata
6664 .addUse(RSrc) // rsrc
6665 .addUse(VIndex) // vindex
6666 .addUse(VOffset) // voffset
6667 .addUse(SOffset) // soffset
6668 .addImm(ImmOffset); // offset(imm)
6669
6670 if (IsTyped)
6671 MIB.addImm(Format);
6672
6673 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6674 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6675 .addMemOperand(MMO);
6676}
6677
6679 LegalizerHelper &Helper,
6680 bool IsFormat,
6681 bool IsTyped) const {
6682 MachineIRBuilder &B = Helper.MIRBuilder;
6683 MachineRegisterInfo &MRI = *B.getMRI();
6684 GISelChangeObserver &Observer = Helper.Observer;
6685
6686 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6687 MachineMemOperand *MMO = *MI.memoperands_begin();
6688 const LLT MemTy = MMO->getMemoryType();
6689 const LLT S32 = LLT::scalar(32);
6690
6691 Register Dst = MI.getOperand(0).getReg();
6692
6693 Register StatusDst;
6694 int OpOffset = 0;
6695 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6696 bool IsTFE = MI.getNumExplicitDefs() == 2;
6697 if (IsTFE) {
6698 StatusDst = MI.getOperand(1).getReg();
6699 ++OpOffset;
6700 }
6701
6702 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6703 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6704
6705 // The typed intrinsics add an immediate after the registers.
6706 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6707
6708 // The struct intrinsic variants add one additional operand over raw.
6709 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6710 Register VIndex;
6711 if (HasVIndex) {
6712 VIndex = MI.getOperand(3 + OpOffset).getReg();
6713 ++OpOffset;
6714 } else {
6715 VIndex = B.buildConstant(S32, 0).getReg(0);
6716 }
6717
6718 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6719 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6720
6721 unsigned Format = 0;
6722 if (IsTyped) {
6723 Format = MI.getOperand(5 + OpOffset).getImm();
6724 ++OpOffset;
6725 }
6726
6727 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6728 unsigned ImmOffset;
6729
6730 LLT Ty = MRI.getType(Dst);
6731 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6732 // logic doesn't have to handle that case.
6733 if (hasBufferRsrcWorkaround(Ty)) {
6734 Observer.changingInstr(MI);
6735 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6736 Observer.changedInstr(MI);
6737 Dst = MI.getOperand(0).getReg();
6738 B.setInsertPt(B.getMBB(), MI);
6739 }
6740 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6741 Ty = getBitcastRegisterType(Ty);
6742 Observer.changingInstr(MI);
6743 Helper.bitcastDst(MI, Ty, 0);
6744 Observer.changedInstr(MI);
6745 Dst = MI.getOperand(0).getReg();
6746 B.setInsertPt(B.getMBB(), MI);
6747 }
6748
6749 LLT EltTy = Ty.getScalarType();
6750 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6751 const bool Unpacked = ST.hasUnpackedD16VMem();
6752
6753 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6754
6755 unsigned Opc;
6756
6757 // TODO: Support TFE for typed and narrow loads.
6758 if (IsTyped) {
6759 if (IsTFE)
6760 return false;
6761 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6762 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6763 } else if (IsFormat) {
6764 if (IsD16) {
6765 if (IsTFE)
6766 return false;
6767 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6768 } else {
6769 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6770 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6771 }
6772 } else {
6773 switch (MemTy.getSizeInBits()) {
6774 case 8:
6775 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6776 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6777 break;
6778 case 16:
6779 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6780 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6781 break;
6782 default:
6783 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6784 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6785 break;
6786 }
6787 }
6788
6789 if (IsTFE) {
6790 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6791 unsigned NumLoadDWords = NumValueDWords + 1;
6792 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6793 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6794 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6795 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6796 if (MemTy.getSizeInBits() < 32) {
6797 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6798 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6799 B.buildTrunc(Dst, ExtDst);
6800 } else if (NumValueDWords == 1) {
6801 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6802 } else {
6803 SmallVector<Register, 5> LoadElts;
6804 for (unsigned I = 0; I != NumValueDWords; ++I)
6805 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6806 LoadElts.push_back(StatusDst);
6807 B.buildUnmerge(LoadElts, LoadDstReg);
6808 LoadElts.truncate(NumValueDWords);
6809 B.buildMergeLikeInstr(Dst, LoadElts);
6810 }
6811 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6812 (IsD16 && !Ty.isVector())) {
6813 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6814 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6815 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6816 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6817 B.buildTrunc(Dst, LoadDstReg);
6818 } else if (Unpacked && IsD16 && Ty.isVector()) {
6819 LLT UnpackedTy = Ty.changeElementSize(32);
6820 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6821 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6822 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6823 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6824 // FIXME: G_TRUNC should work, but legalization currently fails
6825 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6826 SmallVector<Register, 4> Repack;
6827 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6828 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6829 B.buildMergeLikeInstr(Dst, Repack);
6830 } else {
6831 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6832 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6833 }
6834
6835 MI.eraseFromParent();
6836 return true;
6837}
6838
6839static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6840 switch (IntrID) {
6841 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6842 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6843 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6844 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6845 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6846 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6847 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6848 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6849 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6850 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6851 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6852 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6853 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6854 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6855 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6856 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6857 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6858 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6859 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6860 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6861 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6862 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6863 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6864 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6865 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6866 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6867 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6868 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6869 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6870 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6871 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6872 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6873 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6874 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6875 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6876 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6877 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6878 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6880 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6881 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6883 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6886 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6888 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6891 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6893 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6896 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6898 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6901 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6903 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6906 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6908 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6911 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6913 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6916 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6926 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6928 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6931 default:
6932 llvm_unreachable("unhandled atomic opcode");
6933 }
6934}
6935
6936bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6937                                               MachineIRBuilder &B,
6938                                               Intrinsic::ID IID) const {
6939 const bool IsCmpSwap =
6940 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6941 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6942 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6943 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6944
6945 Register Dst = MI.getOperand(0).getReg();
6946  // Since we don't have 128-bit atomics, we don't need to handle the case of
6947  // p8 arguments to the atomic itself.
6948 Register VData = MI.getOperand(2).getReg();
6949
6950 Register CmpVal;
6951 int OpOffset = 0;
6952
6953 if (IsCmpSwap) {
6954 CmpVal = MI.getOperand(3).getReg();
6955 ++OpOffset;
6956 }
6957
6958 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6959 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
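  // With a vindex operand present, the cmpswap form carries one extra data
  // operand (the compare value), hence 9 operands instead of 8.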
6960 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6961
6962 // The struct intrinsic variants add one additional operand over raw.
6963 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6964 Register VIndex;
6965 if (HasVIndex) {
6966 VIndex = MI.getOperand(4 + OpOffset).getReg();
6967 ++OpOffset;
6968 } else {
6969 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6970 }
6971
6972 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6973 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6974 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6975
6976 MachineMemOperand *MMO = *MI.memoperands_begin();
6977
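  // Split the buffer offset into an immediate offset that fits the
  // instruction encoding and a remaining register offset.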
6978 unsigned ImmOffset;
6979 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6980
6981 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6982 .addDef(Dst)
6983 .addUse(VData); // vdata
6984
6985 if (IsCmpSwap)
6986 MIB.addReg(CmpVal);
6987
6988 MIB.addUse(RSrc) // rsrc
6989 .addUse(VIndex) // vindex
6990 .addUse(VOffset) // voffset
6991 .addUse(SOffset) // soffset
6992 .addImm(ImmOffset) // offset(imm)
6993 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6994 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6995 .addMemOperand(MMO);
6996
6997 MI.eraseFromParent();
6998 return true;
6999}
7000
7001/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
7002/// vector with s16 typed elements.
7003static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
7004                                      SmallVectorImpl<Register> &PackedAddrs,
7005                                      unsigned ArgOffset,
7006                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
7007                                      bool IsA16, bool IsG16) {
7008 const LLT S16 = LLT::scalar(16);
7009 const LLT V2S16 = LLT::fixed_vector(2, 16);
7010 auto EndIdx = Intr->VAddrEnd;
7011
7012 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
7013 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7014 if (!SrcOp.isReg())
7015 continue; // _L to _LZ may have eliminated this.
7016
7017 Register AddrReg = SrcOp.getReg();
7018
7019 if ((I < Intr->GradientStart) ||
7020 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
7021 (I >= Intr->CoordStart && !IsA16)) {
7022 if ((I < Intr->GradientStart) && IsA16 &&
7023 (B.getMRI()->getType(AddrReg) == S16)) {
7024 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7025        // Special handling of bias when A16 is on. Bias is of type half but
7026        // occupies a full 32 bits.
7027 PackedAddrs.push_back(
7028 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7029 .getReg(0));
7030 } else {
7031 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7032 "Bias needs to be converted to 16 bit in A16 mode");
7033 // Handle any gradient or coordinate operands that should not be packed
7034 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7035 PackedAddrs.push_back(AddrReg);
7036 }
7037 } else {
7038 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7039 // derivatives dx/dh and dx/dv are packed with undef.
7040 if (((I + 1) >= EndIdx) ||
7041 ((Intr->NumGradients / 2) % 2 == 1 &&
7042 (I == static_cast<unsigned>(Intr->GradientStart +
7043 (Intr->NumGradients / 2) - 1) ||
7044 I == static_cast<unsigned>(Intr->GradientStart +
7045 Intr->NumGradients - 1))) ||
7046 // Check for _L to _LZ optimization
7047 !MI.getOperand(ArgOffset + I + 1).isReg()) {
7048 PackedAddrs.push_back(
7049 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7050 .getReg(0));
7051 } else {
7052 PackedAddrs.push_back(
7053 B.buildBuildVector(
7054 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7055 .getReg(0));
7056 ++I;
7057 }
7058 }
7059 }
7060}
7061
7062/// Convert from separate vaddr components to a single vector address register,
7063/// and replace the remaining operands with $noreg.
7064static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
7065                                     int DimIdx, int NumVAddrs) {
7066 const LLT S32 = LLT::scalar(32);
7067 (void)S32;
7068 SmallVector<Register, 8> AddrRegs;
7069 for (int I = 0; I != NumVAddrs; ++I) {
7070 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7071 if (SrcOp.isReg()) {
7072 AddrRegs.push_back(SrcOp.getReg());
7073 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7074 }
7075 }
7076
7077 int NumAddrRegs = AddrRegs.size();
7078 if (NumAddrRegs != 1) {
7079 auto VAddr =
7080 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7081 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7082 }
7083
7084 for (int I = 1; I != NumVAddrs; ++I) {
7085 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7086 if (SrcOp.isReg())
7087 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7088 }
7089}
7090
7091/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7092///
7093/// Depending on the subtarget, load/store with 16-bit element data need to be
7094/// rewritten to use the low half of 32-bit registers, or directly use a packed
7095/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7096/// registers.
7097///
7098/// We don't want to directly select image instructions just yet, but we also
7099/// want to expose all register repacking to the legalizer/combiners. We also
7100/// don't want a selected instruction entering RegBankSelect. In order to avoid
7101/// defining a multitude of intermediate image instructions, directly hack on
7102/// the intrinsic's arguments. In cases like a16 addresses, this requires
7103/// padding now-unnecessary arguments with $noreg.
7104bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
7105    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
7106    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7107
7108 const MachineFunction &MF = *MI.getMF();
7109 const unsigned NumDefs = MI.getNumExplicitDefs();
7110 const unsigned ArgOffset = NumDefs + 1;
7111 bool IsTFE = NumDefs == 2;
7112 // We are only processing the operands of d16 image operations on subtargets
7113 // that use the unpacked register layout, or need to repack the TFE result.
7114
7115 // TODO: Do we need to guard against already legalized intrinsics?
7116  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7117      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
7118
7119 MachineRegisterInfo *MRI = B.getMRI();
7120 const LLT S32 = LLT::scalar(32);
7121 const LLT S16 = LLT::scalar(16);
7122 const LLT V2S16 = LLT::fixed_vector(2, 16);
7123
7124 unsigned DMask = 0;
7125 Register VData;
7126 LLT Ty;
7127
7128 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7129 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7130 Ty = MRI->getType(VData);
7131 }
7132
7133 const bool IsAtomicPacked16Bit =
7134 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7135 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7136
7137 // Check for 16 bit addresses and pack if true.
7138 LLT GradTy =
7139 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7140 LLT AddrTy =
7141 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
7142 const bool IsG16 =
7143 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7144 const bool IsA16 = AddrTy == S16;
7145 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7146
7147 int DMaskLanes = 0;
7148 if (!BaseOpcode->Atomic) {
7149 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7150 if (BaseOpcode->Gather4) {
7151 DMaskLanes = 4;
7152 } else if (DMask != 0) {
7153 DMaskLanes = llvm::popcount(DMask);
7154 } else if (!IsTFE && !BaseOpcode->Store) {
7155 // If dmask is 0, this is a no-op load. This can be eliminated.
7156 B.buildUndef(MI.getOperand(0));
7157 MI.eraseFromParent();
7158 return true;
7159 }
7160 }
7161
7162 Observer.changingInstr(MI);
7163 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7164
7165 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7166 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7167 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7168 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7169 unsigned NewOpcode = LoadOpcode;
7170 if (BaseOpcode->Store)
7171 NewOpcode = StoreOpcode;
7172 else if (BaseOpcode->NoReturn)
7173 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7174
7175 // Track that we legalized this
7176 MI.setDesc(B.getTII().get(NewOpcode));
7177
7178  // Expecting to get an error flag since TFE is on and dmask is 0. Force
7179  // dmask to be at least 1, otherwise the instruction will fail.
7180 if (IsTFE && DMask == 0) {
7181 DMask = 0x1;
7182 DMaskLanes = 1;
7183 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7184 }
7185
7186 if (BaseOpcode->Atomic) {
7187 Register VData0 = MI.getOperand(2).getReg();
7188 LLT Ty = MRI->getType(VData0);
7189
7190 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7191 if (Ty.isVector() && !IsAtomicPacked16Bit)
7192 return false;
7193
7194 if (BaseOpcode->AtomicX2) {
7195 Register VData1 = MI.getOperand(3).getReg();
7196 // The two values are packed in one register.
7197 LLT PackedTy = LLT::fixed_vector(2, Ty);
7198 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7199 MI.getOperand(2).setReg(Concat.getReg(0));
7200 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7201 }
7202 }
7203
7204 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7205
7206 // Rewrite the addressing register layout before doing anything else.
7207 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7208    // 16-bit gradients are supported, but they are tied to the A16 control,
7209    // so both gradients and addresses must be 16 bit.
7210 return false;
7211 }
7212
7213 if (IsA16 && !ST.hasA16()) {
7214 // A16 not supported
7215 return false;
7216 }
7217
7218 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7219 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7220
7221 if (IsA16 || IsG16) {
7222 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7223 // instructions expect VGPR_32
7224 SmallVector<Register, 4> PackedRegs;
7225
7226 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7227
7228 // See also below in the non-a16 branch
7229 const bool UseNSA = ST.hasNSAEncoding() &&
7230 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7231 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7232 const bool UsePartialNSA =
7233 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7234
7235 if (UsePartialNSA) {
7236 // Pack registers that would go over NSAMaxSize into last VAddr register
7237 LLT PackedAddrTy =
7238 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7239 auto Concat = B.buildConcatVectors(
7240 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7241 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7242 PackedRegs.resize(NSAMaxSize);
7243 } else if (!UseNSA && PackedRegs.size() > 1) {
7244 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7245 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7246 PackedRegs[0] = Concat.getReg(0);
7247 PackedRegs.resize(1);
7248 }
7249
7250 const unsigned NumPacked = PackedRegs.size();
7251 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7252 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7253 if (!SrcOp.isReg()) {
7254 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7255 continue;
7256 }
7257
7258 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7259
7260 if (I - Intr->VAddrStart < NumPacked)
7261 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7262 else
7263 SrcOp.setReg(AMDGPU::NoRegister);
7264 }
7265 } else {
7266 // If the register allocator cannot place the address registers contiguously
7267 // without introducing moves, then using the non-sequential address encoding
7268 // is always preferable, since it saves VALU instructions and is usually a
7269 // wash in terms of code size or even better.
7270 //
7271 // However, we currently have no way of hinting to the register allocator
7272 // that MIMG addresses should be placed contiguously when it is possible to
7273 // do so, so force non-NSA for the common 2-address case as a heuristic.
7274 //
7275 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7276 // allocation when possible.
7277 //
7278 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7279 // set of the remaining addresses.
7280 const bool UseNSA = ST.hasNSAEncoding() &&
7281 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7282 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7283 const bool UsePartialNSA =
7284 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7285
7286 if (UsePartialNSA) {
7287      convertImageAddrToPacked(B, MI,
7288                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7289 Intr->NumVAddrs - NSAMaxSize + 1);
7290 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7291 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7292 Intr->NumVAddrs);
7293 }
7294 }
7295
7296 int Flags = 0;
7297 if (IsA16)
7298 Flags |= 1;
7299 if (IsG16)
7300 Flags |= 2;
7301 MI.addOperand(MachineOperand::CreateImm(Flags));
7302
7303 if (BaseOpcode->NoReturn) { // No TFE for stores?
7304 // TODO: Handle dmask trim
7305 if (!Ty.isVector() || !IsD16)
7306 return true;
7307
7308 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7309 if (RepackedReg != VData) {
7310 MI.getOperand(1).setReg(RepackedReg);
7311 }
7312
7313 return true;
7314 }
7315
7316 Register DstReg = MI.getOperand(0).getReg();
7317 const LLT EltTy = Ty.getScalarType();
7318 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7319
7320 // Confirm that the return type is large enough for the dmask specified
7321 if (NumElts < DMaskLanes)
7322 return false;
7323
7324 if (NumElts > 4 || DMaskLanes > 4)
7325 return false;
7326
7327  // Image atomic instructions use DMask to specify how many bits of
7328  // input/output data there will be: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
7329  // DMaskLanes has the default value '0' for image atomics.
7330  // We must make sure that atomic variants (especially packed ones) are not
7331  // truncated from v2s16 or v4s16 to s16.
7332  //
7333  // changeElementCount will be needed for image loads, where Ty is always scalar.
7334 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7335 const LLT AdjustedTy =
7336 DMaskLanes == 0
7337 ? Ty
7338 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7339
7340  // The raw dword-aligned data component of the load. The only legal cases
7341  // where this matters should be when using the packed D16 format, for
7342  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
7343 LLT RoundedTy;
7344
7345 // S32 vector to cover all data, plus TFE result element.
7346 LLT TFETy;
7347
7348 // Register type to use for each loaded component. Will be S32 or V2S16.
7349 LLT RegTy;
7350
7351 if (IsD16 && ST.hasUnpackedD16VMem()) {
7352 RoundedTy =
7353 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7354 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7355 RegTy = S32;
7356 } else {
7357 unsigned EltSize = EltTy.getSizeInBits();
7358 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7359 unsigned RoundedSize = 32 * RoundedElts;
7360 RoundedTy = LLT::scalarOrVector(
7361 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7362 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7363 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7364 }
7365
7366 // The return type does not need adjustment.
7367 // TODO: Should we change s16 case to s32 or <2 x s16>?
7368 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7369 return true;
7370
7371 Register Dst1Reg;
7372
7373 // Insert after the instruction.
7374 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7375
7376 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7377 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7378 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7379 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7380
7381 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7382
7383 MI.getOperand(0).setReg(NewResultReg);
7384
7385 // In the IR, TFE is supposed to be used with a 2 element struct return
7386 // type. The instruction really returns these two values in one contiguous
7387 // register, with one additional dword beyond the loaded data. Rewrite the
7388 // return type to use a single register result.
7389
7390 if (IsTFE) {
7391 Dst1Reg = MI.getOperand(1).getReg();
7392 if (MRI->getType(Dst1Reg) != S32)
7393 return false;
7394
7395 // TODO: Make sure the TFE operand bit is set.
7396 MI.removeOperand(1);
7397
7398 // Handle the easy case that requires no repack instructions.
7399 if (Ty == S32) {
7400 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7401 return true;
7402 }
7403 }
7404
7405 // Now figure out how to copy the new result register back into the old
7406 // result.
7407 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7408
7409 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7410
7411 if (ResultNumRegs == 1) {
7412 assert(!IsTFE);
7413 ResultRegs[0] = NewResultReg;
7414 } else {
7415 // We have to repack into a new vector of some kind.
7416 for (int I = 0; I != NumDataRegs; ++I)
7417 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7418 B.buildUnmerge(ResultRegs, NewResultReg);
7419
7420 // Drop the final TFE element to get the data part. The TFE result is
7421 // directly written to the right place already.
7422 if (IsTFE)
7423 ResultRegs.resize(NumDataRegs);
7424 }
7425
7426 // For an s16 scalar result, we form an s32 result with a truncate regardless
7427 // of packed vs. unpacked.
7428 if (IsD16 && !Ty.isVector()) {
7429 B.buildTrunc(DstReg, ResultRegs[0]);
7430 return true;
7431 }
7432
7433 // Avoid a build/concat_vector of 1 entry.
7434 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7435 B.buildBitcast(DstReg, ResultRegs[0]);
7436 return true;
7437 }
7438
7439 assert(Ty.isVector());
7440
7441 if (IsD16) {
7442 // For packed D16 results with TFE enabled, all the data components are
7443 // S32. Cast back to the expected type.
7444 //
7445    // TODO: We don't really need to load s32 elements. We would only need one
7446    // cast for the TFE result if a multiple of v2s16 was used.
7447 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7448 for (Register &Reg : ResultRegs)
7449 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7450 } else if (ST.hasUnpackedD16VMem()) {
7451 for (Register &Reg : ResultRegs)
7452 Reg = B.buildTrunc(S16, Reg).getReg(0);
7453 }
7454 }
7455
7456 auto padWithUndef = [&](LLT Ty, int NumElts) {
7457 if (NumElts == 0)
7458 return;
7459 Register Undef = B.buildUndef(Ty).getReg(0);
7460 for (int I = 0; I != NumElts; ++I)
7461 ResultRegs.push_back(Undef);
7462 };
7463
7464 // Pad out any elements eliminated due to the dmask.
7465 LLT ResTy = MRI->getType(ResultRegs[0]);
7466 if (!ResTy.isVector()) {
7467 padWithUndef(ResTy, NumElts - ResultRegs.size());
7468 B.buildBuildVector(DstReg, ResultRegs);
7469 return true;
7470 }
7471
7472 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
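  // Number of 32-bit registers needed to cover the original result type.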
7473 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7474
7475 // Deal with the one annoying legal case.
7476 const LLT V3S16 = LLT::fixed_vector(3, 16);
7477 if (Ty == V3S16) {
7478 if (IsTFE) {
7479 if (ResultRegs.size() == 1) {
7480 NewResultReg = ResultRegs[0];
7481 } else if (ResultRegs.size() == 2) {
7482 LLT V4S16 = LLT::fixed_vector(4, 16);
7483 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7484 } else {
7485 return false;
7486 }
7487 }
7488
7489 if (MRI->getType(DstReg).getNumElements() <
7490 MRI->getType(NewResultReg).getNumElements()) {
7491 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7492 } else {
7493 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7494 }
7495 return true;
7496 }
7497
7498 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7499 B.buildConcatVectors(DstReg, ResultRegs);
7500 return true;
7501}
7502
7503bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7504                                              MachineInstr &MI) const {
7505 MachineIRBuilder &B = Helper.MIRBuilder;
7506 GISelChangeObserver &Observer = Helper.Observer;
7507
7508 Register OrigDst = MI.getOperand(0).getReg();
7509 Register Dst;
7510 LLT Ty = B.getMRI()->getType(OrigDst);
7511 unsigned Size = Ty.getSizeInBits();
7512 MachineFunction &MF = B.getMF();
7513 unsigned Opc = 0;
7514 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7515 assert(Size == 8 || Size == 16);
7516 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7517 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7518    // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
7519    // destination register.
7520 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7521 } else {
7522 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7523 Dst = OrigDst;
7524 }
7525
7526 Observer.changingInstr(MI);
7527
7528 // Handle needing to s.buffer.load() a p8 value.
7529 if (hasBufferRsrcWorkaround(Ty)) {
7530 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7531 B.setInsertPt(B.getMBB(), MI);
7532 }
7533  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
7534    Ty = getBitcastRegisterType(Ty);
7535 Helper.bitcastDst(MI, Ty, 0);
7536 B.setInsertPt(B.getMBB(), MI);
7537 }
7538
7539 // FIXME: We don't really need this intermediate instruction. The intrinsic
7540 // should be fixed to have a memory operand. Since it's readnone, we're not
7541 // allowed to add one.
7542 MI.setDesc(B.getTII().get(Opc));
7543 MI.removeOperand(1); // Remove intrinsic ID
7544
7545 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
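  // Round the result size in bits up to a whole number of bytes for the
  // memory operand.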
7546 const unsigned MemSize = (Size + 7) / 8;
7547 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7553 MemSize, MemAlign);
7554 MI.addMemOperand(MF, MMO);
7555 if (Dst != OrigDst) {
7556 MI.getOperand(0).setReg(Dst);
7557 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7558 B.buildTrunc(OrigDst, Dst);
7559 }
7560
7561 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7562 // always be legal. We may need to restore this to a 96-bit result if it turns
7563 // out this needs to be converted to a vector load during RegBankSelect.
7564 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7565 if (Ty.isVector())
7566      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
7567    else
7568 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7569 }
7570
7571 Observer.changedInstr(MI);
7572 return true;
7573}
7574
7576 MachineInstr &MI) const {
7577 MachineIRBuilder &B = Helper.MIRBuilder;
7578 GISelChangeObserver &Observer = Helper.Observer;
7579 Observer.changingInstr(MI);
7580 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7581 MI.removeOperand(0); // Remove intrinsic ID
7582  castBufferRsrcArgToV4I32(MI, B, 0);
7583  Observer.changedInstr(MI);
7584 return true;
7585}
7586
7587// TODO: Move to selection
7588bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7589                                       MachineRegisterInfo &MRI,
7590                                       MachineIRBuilder &B) const {
7591 if (!ST.hasTrapHandler() ||
7592 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7593 return legalizeTrapEndpgm(MI, MRI, B);
7594
7595 return ST.supportsGetDoorbellID() ?
7596    legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7597}
7598
7599bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7600    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7601  const DebugLoc &DL = MI.getDebugLoc();
7602 MachineBasicBlock &BB = B.getMBB();
7603 MachineFunction *MF = BB.getParent();
7604
7605 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7606 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7607 .addImm(0);
7608 MI.eraseFromParent();
7609 return true;
7610 }
7611
7612 // We need a block split to make the real endpgm a terminator. We also don't
7613 // want to break phis in successor blocks, so we can't just delete to the
7614 // end of the block.
7615 BB.splitAt(MI, false /*UpdateLiveIns*/);
7616  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7617  MF->push_back(TrapBB);
7618 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7619 .addImm(0);
7620 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7621 .addMBB(TrapBB);
7622
7623 BB.addSuccessor(TrapBB);
7624 MI.eraseFromParent();
7625 return true;
7626}
7627
7628bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
7629    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7630  MachineFunction &MF = B.getMF();
7631 const LLT S64 = LLT::scalar(64);
7632
7633 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7634 // For code object version 5, queue_ptr is passed through implicit kernarg.
7640 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7641
7642 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7644
7645 if (!loadInputValue(KernargPtrReg, B,
7647 return false;
7648
7649 // TODO: can we be smarter about machine pointer info?
7652 PtrInfo.getWithOffset(Offset),
7656
7657 // Pointer address
7660 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7661 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7662 // Load address
7663 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7664 B.buildCopy(SGPR01, Temp);
7665 B.buildInstr(AMDGPU::S_TRAP)
7666 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7667 .addReg(SGPR01, RegState::Implicit);
7668 MI.eraseFromParent();
7669 return true;
7670 }
7671
7672 // Pass queue pointer to trap handler as input, and insert trap instruction
7673 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7674 Register LiveIn =
7677 return false;
7678
7679 B.buildCopy(SGPR01, LiveIn);
7680 B.buildInstr(AMDGPU::S_TRAP)
7681 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7682 .addReg(SGPR01, RegState::Implicit);
7683
7684 MI.eraseFromParent();
7685 return true;
7686}
7687
7688bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7689                                          MachineRegisterInfo &MRI,
7690                                          MachineIRBuilder &B) const {
7691 // We need to simulate the 's_trap 2' instruction on targets that run in
7692 // PRIV=1 (where it is treated as a nop).
7693 if (ST.hasPrivEnabledTrap2NopBug()) {
7694 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7695 MI.getDebugLoc());
7696 MI.eraseFromParent();
7697 return true;
7698 }
7699
7700 B.buildInstr(AMDGPU::S_TRAP)
7701 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7702 MI.eraseFromParent();
7703 return true;
7704}
7705
7706bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7707                                            MachineRegisterInfo &MRI,
7708                                            MachineIRBuilder &B) const {
7709  // If this is the non-HSA path or the trap handler is disabled, report a
7710  // warning accordingly.
7711 if (!ST.hasTrapHandler() ||
7712 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7713 Function &Fn = B.getMF().getFunction();
7715 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7716 } else {
7717 // Insert debug-trap instruction
7718 B.buildInstr(AMDGPU::S_TRAP)
7719 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7720 }
7721
7722 MI.eraseFromParent();
7723 return true;
7724}
7725
7726bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(
7727    MachineInstr &MI, MachineIRBuilder &B) const {
7728 MachineRegisterInfo &MRI = *B.getMRI();
7729 const LLT S16 = LLT::scalar(16);
7730 const LLT S32 = LLT::scalar(32);
7731 const LLT V2S16 = LLT::fixed_vector(2, 16);
7732 const LLT V3S32 = LLT::fixed_vector(3, 32);
7733
7734 Register DstReg = MI.getOperand(0).getReg();
7735 Register NodePtr = MI.getOperand(2).getReg();
7736 Register RayExtent = MI.getOperand(3).getReg();
7737 Register RayOrigin = MI.getOperand(4).getReg();
7738 Register RayDir = MI.getOperand(5).getReg();
7739 Register RayInvDir = MI.getOperand(6).getReg();
7740 Register TDescr = MI.getOperand(7).getReg();
7741
7742 if (!ST.hasGFX10_AEncoding()) {
7743 Function &Fn = B.getMF().getFunction();
7745 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7746 return false;
7747 }
7748
7749 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7750 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7751 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7752 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7753 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7754 const unsigned NumVDataDwords = 4;
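  // Address dwords: node pointer (1 or 2) + ray extent (1) + ray origin (3) +
  // ray dir and inverse dir (6 dwords unpacked, or 3 dwords of packed halves
  // when a16).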
7755 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7756 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7757 const bool UseNSA =
7758 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7759
7760 const unsigned BaseOpcodes[2][2] = {
7761 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7762 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7763 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7764 int Opcode;
7765 if (UseNSA) {
7766 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7767 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7768 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7769 : AMDGPU::MIMGEncGfx10NSA,
7770 NumVDataDwords, NumVAddrDwords);
7771 } else {
7772 assert(!IsGFX12Plus);
7773 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7774 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7775 : AMDGPU::MIMGEncGfx10Default,
7776 NumVDataDwords, NumVAddrDwords);
7777 }
7778 assert(Opcode != -1);
7779
7781 if (UseNSA && IsGFX11Plus) {
7782 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7783 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7784 auto Merged = B.buildMergeLikeInstr(
7785 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7786 Ops.push_back(Merged.getReg(0));
7787 };
7788
7789 Ops.push_back(NodePtr);
7790 Ops.push_back(RayExtent);
7791 packLanes(RayOrigin);
7792
7793 if (IsA16) {
7794 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7795 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7796 auto MergedDir = B.buildMergeLikeInstr(
7797 V3S32,
7798 {B.buildBitcast(
7799 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7800 UnmergeRayDir.getReg(0)}))
7801 .getReg(0),
7802 B.buildBitcast(
7803 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7804 UnmergeRayDir.getReg(1)}))
7805 .getReg(0),
7806 B.buildBitcast(
7807 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7808 UnmergeRayDir.getReg(2)}))
7809 .getReg(0)});
7810 Ops.push_back(MergedDir.getReg(0));
7811 } else {
7812 packLanes(RayDir);
7813 packLanes(RayInvDir);
7814 }
7815 } else {
7816 if (Is64) {
7817 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7818 Ops.push_back(Unmerge.getReg(0));
7819 Ops.push_back(Unmerge.getReg(1));
7820 } else {
7821 Ops.push_back(NodePtr);
7822 }
7823 Ops.push_back(RayExtent);
7824
7825 auto packLanes = [&Ops, &S32, &B](Register Src) {
7826 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7827 Ops.push_back(Unmerge.getReg(0));
7828 Ops.push_back(Unmerge.getReg(1));
7829 Ops.push_back(Unmerge.getReg(2));
7830 };
7831
7832 packLanes(RayOrigin);
7833 if (IsA16) {
7834 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7835 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7839 B.buildMergeLikeInstr(R1,
7840 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7841 B.buildMergeLikeInstr(
7842 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7843 B.buildMergeLikeInstr(
7844 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7845 Ops.push_back(R1);
7846 Ops.push_back(R2);
7847 Ops.push_back(R3);
7848 } else {
7849 packLanes(RayDir);
7850 packLanes(RayInvDir);
7851 }
7852 }
7853
7854 if (!UseNSA) {
7855 // Build a single vector containing all the operands so far prepared.
7856 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7857 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7858 Ops.clear();
7859 Ops.push_back(MergedOps);
7860 }
7861
7862 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7863 .addDef(DstReg)
7864 .addImm(Opcode);
7865
7866 for (Register R : Ops) {
7867 MIB.addUse(R);
7868 }
7869
7870 MIB.addUse(TDescr)
7871 .addImm(IsA16 ? 1 : 0)
7872 .cloneMemRefs(MI);
7873
7874 MI.eraseFromParent();
7875 return true;
7876}
7877
7879 MachineInstr &MI, MachineIRBuilder &B) const {
7880 const LLT S32 = LLT::scalar(32);
7881 const LLT V2S32 = LLT::fixed_vector(2, 32);
7882
7883 Register DstReg = MI.getOperand(0).getReg();
7884 Register DstOrigin = MI.getOperand(1).getReg();
7885 Register DstDir = MI.getOperand(2).getReg();
7886 Register NodePtr = MI.getOperand(4).getReg();
7887 Register RayExtent = MI.getOperand(5).getReg();
7888 Register InstanceMask = MI.getOperand(6).getReg();
7889 Register RayOrigin = MI.getOperand(7).getReg();
7890 Register RayDir = MI.getOperand(8).getReg();
7891 Register Offsets = MI.getOperand(9).getReg();
7892 Register TDescr = MI.getOperand(10).getReg();
7893
7894 if (!ST.hasBVHDualAndBVH8Insts()) {
7895 Function &Fn = B.getMF().getFunction();
7897 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7898 return false;
7899 }
7900
7901 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7902 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7903 const unsigned NumVDataDwords = 10;
7904 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7905 int Opcode = AMDGPU::getMIMGOpcode(
7906 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7907 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7908 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7909 assert(Opcode != -1);
7910
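  // Pack the ray extent and the any-extended instance mask into a single
  // <2 x s32> operand.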
7911 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7912 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7913
7914 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7915 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7916 .addDef(DstReg)
7917 .addDef(DstOrigin)
7918 .addDef(DstDir)
7919 .addImm(Opcode)
7920 .addUse(NodePtr)
7921 .addUse(RayExtentInstanceMaskVec.getReg(0))
7922 .addUse(RayOrigin)
7923 .addUse(RayDir)
7924 .addUse(Offsets)
7925 .addUse(TDescr)
7926 .cloneMemRefs(MI);
7927
7928 MI.eraseFromParent();
7929 return true;
7930}
7931
7932bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7933                                            MachineIRBuilder &B) const {
7934 const SITargetLowering *TLI = ST.getTargetLowering();
7935  Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7936  Register DstReg = MI.getOperand(0).getReg();
7937 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7938 MI.eraseFromParent();
7939 return true;
7940}
7941
7942bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7943                                         MachineIRBuilder &B) const {
7944 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7945 if (!ST.hasArchitectedSGPRs())
7946 return false;
7947 LLT S32 = LLT::scalar(32);
7948 Register DstReg = MI.getOperand(0).getReg();
7949 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7950 auto LSB = B.buildConstant(S32, 25);
7951 auto Width = B.buildConstant(S32, 5);
7952 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7953 MI.eraseFromParent();
7954 return true;
7955}
7956
7959 AMDGPU::Hwreg::Id HwReg,
7960 unsigned LowBit,
7961 unsigned Width) const {
7962 MachineRegisterInfo &MRI = *B.getMRI();
7963 Register DstReg = MI.getOperand(0).getReg();
7964 if (!MRI.getRegClassOrNull(DstReg))
7965 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7966 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7967 .addDef(DstReg)
7968 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7969 MI.eraseFromParent();
7970 return true;
7971}
7972
7973static constexpr unsigned FPEnvModeBitField =
7975
7976static constexpr unsigned FPEnvTrapBitField =
7978
7979bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7980                                           MachineRegisterInfo &MRI,
7981                                           MachineIRBuilder &B) const {
7982 Register Src = MI.getOperand(0).getReg();
7983 if (MRI.getType(Src) != S64)
7984 return false;
7985
7986 auto ModeReg =
7987 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7988 /*HasSideEffects=*/true, /*isConvergent=*/false)
7989 .addImm(FPEnvModeBitField);
7990 auto TrapReg =
7991 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7992 /*HasSideEffects=*/true, /*isConvergent=*/false)
7993 .addImm(FPEnvTrapBitField);
7994 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7995 MI.eraseFromParent();
7996 return true;
7997}
7998
7999bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
8000                                           MachineRegisterInfo &MRI,
8001                                           MachineIRBuilder &B) const {
8002 Register Src = MI.getOperand(0).getReg();
8003 if (MRI.getType(Src) != S64)
8004 return false;
8005
8006 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
8007 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8008 /*HasSideEffects=*/true, /*isConvergent=*/false)
8009 .addImm(static_cast<int16_t>(FPEnvModeBitField))
8010 .addReg(Unmerge.getReg(0));
8011 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8012 /*HasSideEffects=*/true, /*isConvergent=*/false)
8013 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
8014 .addReg(Unmerge.getReg(1));
8015 MI.eraseFromParent();
8016 return true;
8017}
8018
8019bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
8020                                            MachineInstr &MI) const {
8021 MachineIRBuilder &B = Helper.MIRBuilder;
8022 MachineRegisterInfo &MRI = *B.getMRI();
8023
8024 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
8025 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
8026 switch (IntrID) {
8027 case Intrinsic::sponentry:
8028 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8029 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8030 // that we can remove this cast.
8031 const LLT S32 = LLT::scalar(32);
8032    Register TmpReg = MRI.createGenericVirtualRegister(S32);
8033    B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8034
8035 Register DstReg = MI.getOperand(0).getReg();
8036 B.buildIntToPtr(DstReg, TmpReg);
8037 MI.eraseFromParent();
8038 } else {
8039 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8040 1, 0, /*IsImmutable=*/false);
8041 B.buildFrameIndex(MI.getOperand(0), FI);
8042 MI.eraseFromParent();
8043 }
8044 return true;
8045 case Intrinsic::amdgcn_if:
8046 case Intrinsic::amdgcn_else: {
8047 MachineInstr *Br = nullptr;
8048 MachineBasicBlock *UncondBrTarget = nullptr;
8049 bool Negated = false;
8050 if (MachineInstr *BrCond =
8051 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8052 const SIRegisterInfo *TRI
8053 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8054
8055 Register Def = MI.getOperand(1).getReg();
8056 Register Use = MI.getOperand(3).getReg();
8057
8058 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8059
8060 if (Negated)
8061 std::swap(CondBrTarget, UncondBrTarget);
8062
8063 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8064 if (IntrID == Intrinsic::amdgcn_if) {
8065 B.buildInstr(AMDGPU::SI_IF)
8066 .addDef(Def)
8067 .addUse(Use)
8068 .addMBB(UncondBrTarget);
8069 } else {
8070 B.buildInstr(AMDGPU::SI_ELSE)
8071 .addDef(Def)
8072 .addUse(Use)
8073 .addMBB(UncondBrTarget);
8074 }
8075
8076 if (Br) {
8077 Br->getOperand(0).setMBB(CondBrTarget);
8078 } else {
8079 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8080 // since we're swapping branch targets it needs to be reinserted.
8081 // FIXME: IRTranslator should probably not do this
8082 B.buildBr(*CondBrTarget);
8083 }
8084
8085 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8086 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8087 MI.eraseFromParent();
8088 BrCond->eraseFromParent();
8089 return true;
8090 }
8091
8092 return false;
8093 }
8094 case Intrinsic::amdgcn_loop: {
8095 MachineInstr *Br = nullptr;
8096 MachineBasicBlock *UncondBrTarget = nullptr;
8097 bool Negated = false;
8098 if (MachineInstr *BrCond =
8099 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8100 const SIRegisterInfo *TRI
8101 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8102
8103 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8104 Register Reg = MI.getOperand(2).getReg();
8105
8106 if (Negated)
8107 std::swap(CondBrTarget, UncondBrTarget);
8108
8109 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8110 B.buildInstr(AMDGPU::SI_LOOP)
8111 .addUse(Reg)
8112 .addMBB(UncondBrTarget);
8113
8114 if (Br)
8115 Br->getOperand(0).setMBB(CondBrTarget);
8116 else
8117 B.buildBr(*CondBrTarget);
8118
8119 MI.eraseFromParent();
8120 BrCond->eraseFromParent();
8121 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8122 return true;
8123 }
8124
8125 return false;
8126 }
8127 case Intrinsic::amdgcn_addrspacecast_nonnull:
8128 return legalizeAddrSpaceCast(MI, MRI, B);
8129 case Intrinsic::amdgcn_make_buffer_rsrc:
8130 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8131 case Intrinsic::amdgcn_kernarg_segment_ptr:
8132 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8133 // This only makes sense to call in a kernel, so just lower to null.
8134 B.buildConstant(MI.getOperand(0).getReg(), 0);
8135 MI.eraseFromParent();
8136 return true;
8137 }
8138
8141 case Intrinsic::amdgcn_implicitarg_ptr:
8142 return legalizeImplicitArgPtr(MI, MRI, B);
8143 case Intrinsic::amdgcn_workitem_id_x:
8144 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8146 case Intrinsic::amdgcn_workitem_id_y:
8147 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8149 case Intrinsic::amdgcn_workitem_id_z:
8150 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8152 case Intrinsic::amdgcn_workgroup_id_x:
8153 return legalizeWorkGroupId(
8157 case Intrinsic::amdgcn_workgroup_id_y:
8158 return legalizeWorkGroupId(
8162 case Intrinsic::amdgcn_workgroup_id_z:
8163 return legalizeWorkGroupId(
8167 case Intrinsic::amdgcn_cluster_id_x:
8168 return ST.hasClusters() &&
8171 case Intrinsic::amdgcn_cluster_id_y:
8172 return ST.hasClusters() &&
8175 case Intrinsic::amdgcn_cluster_id_z:
8176 return ST.hasClusters() &&
8179 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8180 return ST.hasClusters() &&
8183 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8184 return ST.hasClusters() &&
8187 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8188 return ST.hasClusters() &&
8191 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8192 return ST.hasClusters() &&
8194 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8195 return ST.hasClusters() &&
8198 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8199 return ST.hasClusters() &&
8202 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8203 return ST.hasClusters() &&
8206 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8207 return ST.hasClusters() &&
8209 MI, MRI, B,
8211 case Intrinsic::amdgcn_wave_id:
8212 return legalizeWaveID(MI, B);
8213 case Intrinsic::amdgcn_lds_kernel_id:
8214 return legalizePreloadedArgIntrin(MI, MRI, B,
8216 case Intrinsic::amdgcn_dispatch_ptr:
8217 return legalizePreloadedArgIntrin(MI, MRI, B,
8219 case Intrinsic::amdgcn_queue_ptr:
8220 return legalizePreloadedArgIntrin(MI, MRI, B,
8222 case Intrinsic::amdgcn_implicit_buffer_ptr:
8225 case Intrinsic::amdgcn_dispatch_id:
8226 return legalizePreloadedArgIntrin(MI, MRI, B,
8228 case Intrinsic::r600_read_ngroups_x:
8229 // TODO: Emit error for hsa
8232 case Intrinsic::r600_read_ngroups_y:
8235 case Intrinsic::r600_read_ngroups_z:
8238 case Intrinsic::r600_read_local_size_x:
8239 // TODO: Could insert G_ASSERT_ZEXT from s16
8241 case Intrinsic::r600_read_local_size_y:
8242 // TODO: Could insert G_ASSERT_ZEXT from s16
8244 // TODO: Could insert G_ASSERT_ZEXT from s16
8245 case Intrinsic::r600_read_local_size_z:
8248 case Intrinsic::amdgcn_fdiv_fast:
8249 return legalizeFDIVFastIntrin(MI, MRI, B);
8250 case Intrinsic::amdgcn_is_shared:
8252 case Intrinsic::amdgcn_is_private:
8254 case Intrinsic::amdgcn_wavefrontsize: {
8255 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8256 MI.eraseFromParent();
8257 return true;
8258 }
8259 case Intrinsic::amdgcn_s_buffer_load:
8260 return legalizeSBufferLoad(Helper, MI);
8261 case Intrinsic::amdgcn_raw_buffer_store:
8262 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8263 case Intrinsic::amdgcn_struct_buffer_store:
8264 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8265 return legalizeBufferStore(MI, Helper, false, false);
8266 case Intrinsic::amdgcn_raw_buffer_store_format:
8267 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8268 case Intrinsic::amdgcn_struct_buffer_store_format:
8269 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8270 return legalizeBufferStore(MI, Helper, false, true);
8271 case Intrinsic::amdgcn_raw_tbuffer_store:
8272 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8273 case Intrinsic::amdgcn_struct_tbuffer_store:
8274 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8275 return legalizeBufferStore(MI, Helper, true, true);
8276 case Intrinsic::amdgcn_raw_buffer_load:
8277 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8278 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8279 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8280 case Intrinsic::amdgcn_struct_buffer_load:
8281 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8282 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8283 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8284 return legalizeBufferLoad(MI, Helper, false, false);
8285 case Intrinsic::amdgcn_raw_buffer_load_format:
8286 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8287 case Intrinsic::amdgcn_struct_buffer_load_format:
8288 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8289 return legalizeBufferLoad(MI, Helper, true, false);
8290 case Intrinsic::amdgcn_raw_tbuffer_load:
8291 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8292 case Intrinsic::amdgcn_struct_tbuffer_load:
8293 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8294 return legalizeBufferLoad(MI, Helper, true, true);
8295 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8296 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8297 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8298 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8299 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8300 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8301 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8302 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8303 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8304 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8305 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8306 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8307 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8308 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8309 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8310 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8311 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8312 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8313 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8314 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8315 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8316 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8317 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8318 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8319 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8320 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8321 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8322 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8323 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8324 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8325 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8326 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8327 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8328 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8329 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8330 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8331 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8332 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8333 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8334 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8335 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8336 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8337 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8338 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8339 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8340 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8341 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8342 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8343 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8344 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8345 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8346 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8347 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8348 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8349 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8350 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8351 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8352 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8353 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8354 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8355 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8356 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8357 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8358 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8359 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8360 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8361 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8362 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8363 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8364 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8365 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8366 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8367 return legalizeBufferAtomic(MI, B, IntrID);
8368 case Intrinsic::amdgcn_rsq_clamp:
8369 return legalizeRsqClampIntrinsic(MI, MRI, B);
8370 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8372 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8373 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8375 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8376 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8377 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8378 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8379 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8380 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8381 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8382 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8383 Register Index = MI.getOperand(5).getReg();
8384 LLT S64 = LLT::scalar(64);
8385 LLT IndexArgTy = MRI.getType(Index);
8386 if (IndexArgTy != S64) {
8387 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8388 : B.buildAnyExt(S64, Index);
8389 MI.getOperand(5).setReg(NewIndex.getReg(0));
8390 }
8391 return true;
8392 }
8393 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8394 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8395 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8396 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8397 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8398 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8399 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8400 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8401 Register Index = MI.getOperand(5).getReg();
8402 LLT S32 = LLT::scalar(32);
8403 if (MRI.getType(Index) != S32)
8404 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8405 return true;
8406 }
8407 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8408 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8409 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8410 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8411 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8412 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8413 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8414 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8415 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8416 Register Index = MI.getOperand(7).getReg();
8417 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8418 ? LLT::scalar(64)
8419 : LLT::scalar(32);
8420 LLT IndexArgTy = MRI.getType(Index);
8421 if (IndexArgTy != IdxTy) {
8422 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8423 : B.buildAnyExt(IdxTy, Index);
8424 MI.getOperand(7).setReg(NewIndex.getReg(0));
8425 }
8426 return true;
8427 }
8428
8429 case Intrinsic::amdgcn_fmed3: {
8430 GISelChangeObserver &Observer = Helper.Observer;
8431
8432    // FIXME: This is to work around the inability of tablegen match combiners
8433    // to match intrinsics in patterns.
8434 Observer.changingInstr(MI);
8435 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8436 MI.removeOperand(1);
8437 Observer.changedInstr(MI);
8438 return true;
8439 }
8440 case Intrinsic::amdgcn_readlane:
8441 case Intrinsic::amdgcn_writelane:
8442 case Intrinsic::amdgcn_readfirstlane:
8443 case Intrinsic::amdgcn_permlane16:
8444 case Intrinsic::amdgcn_permlanex16:
8445 case Intrinsic::amdgcn_permlane64:
8446 case Intrinsic::amdgcn_set_inactive:
8447 case Intrinsic::amdgcn_set_inactive_chain_arg:
8448 case Intrinsic::amdgcn_mov_dpp8:
8449 case Intrinsic::amdgcn_update_dpp:
8450 return legalizeLaneOp(Helper, MI, IntrID);
8451 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8452 return legalizeSBufferPrefetch(Helper, MI);
8453 case Intrinsic::amdgcn_dead: {
8454 // TODO: Use poison instead of undef
8455 for (const MachineOperand &Def : MI.defs())
8456 B.buildUndef(Def);
8457 MI.eraseFromParent();
8458 return true;
8459 }
8460 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8461 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8462 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8463 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8464 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8465 MI.eraseFromParent();
8466 return true;
8467 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8468 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8469 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8470 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8471 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8472 MI.eraseFromParent();
8473 return true;
8474 case Intrinsic::amdgcn_flat_load_monitor_b32:
8475 case Intrinsic::amdgcn_flat_load_monitor_b64:
8476 case Intrinsic::amdgcn_flat_load_monitor_b128:
8477 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8478 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8479 .add(MI.getOperand(0))
8480 .add(MI.getOperand(2))
8481 .addMemOperand(*MI.memoperands_begin());
8482 MI.eraseFromParent();
8483 return true;
8484 case Intrinsic::amdgcn_global_load_monitor_b32:
8485 case Intrinsic::amdgcn_global_load_monitor_b64:
8486 case Intrinsic::amdgcn_global_load_monitor_b128:
8487 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8488 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8489 .add(MI.getOperand(0))
8490 .add(MI.getOperand(2))
8491 .addMemOperand(*MI.memoperands_begin());
8492 MI.eraseFromParent();
8493 return true;
8494 default: {
8495 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8497 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8498 return true;
8499 }
8500 }
8501
8502 return true;
8503}
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:77
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1273
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1197
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:383
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:858
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:557
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:1980
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition Utils.cpp:653
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:461
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition Utils.cpp:314
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1678
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:78
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.