LLVM 23.0.0git
AMDGPULegalizerInfo.cpp
Go to the documentation of this file.
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(false),
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 ElementCount::getFixed(NewNumElts), EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
170 };
171}
172
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
240}
241
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up to the maximum register size, and
266// multiples of v2s16.
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
// Shorthand names for scalar LLTs; the numeric suffix is the bit width passed
// to LLT::scalar(). F32 and F64 are currently defined identically to S32 and
// S64 (see the TODOs on those lines).
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::scalar(32); // TODO: Expected float32
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::scalar(64); // TODO: Expected float64
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
312
// Shorthand names for fixed vectors of 8-bit and 16-bit elements, named
// V<element count>S<element bit width>.
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
322// TODO: Expected LLT::fixed_vector(2, LLT::float16())
324constexpr LLT V2BF16 = V2F16; // FIXME
325
// Shorthand names for fixed vectors of 32-bit elements, named
// V<element count>S<element bit width>.
326constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
327constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
328constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
329constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
330constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
331constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
332constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
333constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
334constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
335constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
336constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
337constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
338constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
339
// Fixed vectors of 64-bit elements.
340constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
341constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
342constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
343constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
344constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
345constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
346constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
347constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
348
// Fixed vectors of 128-bit elements.
349constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
350constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
351
352constexpr std::initializer_list<LLT> AllScalarTypes = {
354
355constexpr std::initializer_list<LLT> AllS16Vectors{
357
358constexpr std::initializer_list<LLT> AllS32Vectors = {
361
362constexpr std::initializer_list<LLT> AllS64Vectors = {
364
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
375
378 (ST.useRealTrue16Insts() && Ty == S16) ||
380}
381
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32-bits, we need to reduce to a 32-bit type.
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32-bits and mem location is a power of 2
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
438
439static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
440 const LegalityQuery &Query) {
441 const LLT Ty = Query.Types[0];
442
443 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
444 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
445
446 unsigned RegSize = Ty.getSizeInBits();
447 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
448 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
449 unsigned AS = Query.Types[1].getAddressSpace();
450
451 // All of these need to be custom lowered to cast the pointer operand.
453 return false;
454
455 // Do not handle extending vector loads.
456 if (Ty.isVector() && MemSize != RegSize)
457 return false;
458
459 // TODO: We should be able to widen loads if the alignment is high enough, but
460 // we also need to modify the memory access size.
461#if 0
462 // Accept widening loads based on alignment.
463 if (IsLoad && MemSize < Size)
464 MemSize = std::max(MemSize, Align);
465#endif
466
467 // Only 1-byte and 2-byte to 32-bit extloads are valid.
468 if (MemSize != RegSize && RegSize != 32)
469 return false;
470
471 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
472 Query.MMODescrs[0].Ordering !=
474 return false;
475
476 switch (MemSize) {
477 case 8:
478 case 16:
479 case 32:
480 case 64:
481 case 128:
482 break;
483 case 96:
484 if (!ST.hasDwordx3LoadStores())
485 return false;
486 break;
487 case 256:
488 case 512:
489 // These may contextually need to be broken down.
490 break;
491 default:
492 return false;
493 }
494
495 assert(RegSize >= MemSize);
496
497 if (AlignBits < MemSize) {
498 const SITargetLowering *TLI = ST.getTargetLowering();
499 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
500 Align(AlignBits / 8)))
501 return false;
502 }
503
504 return true;
505}
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510// arguments. Therefore, we convert resource pointers - or vectors of them
511// to integer values here.
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
523// workaround this. Eventually it should ignore the type for loads and only care
524// about the size. Return true in cases where we will workaround this for now by
525// bitcasting.
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(Ty.getElementType());
567}
568
569/// Return true if we should legalize a load by widening an odd sized memory
570/// access up to the alignment. Note this is the case where the memory access
571/// itself changes, not the size of the result register.
572static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
573 uint64_t AlignInBits, unsigned AddrSpace,
574 unsigned Opcode) {
575 unsigned SizeInBits = MemoryTy.getSizeInBits();
576 // We don't want to widen cases that are naturally legal.
577 if (isPowerOf2_32(SizeInBits))
578 return false;
579
580 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
581 // end up widening these for a scalar load during RegBankSelect, if we don't
582 // have 96-bit scalar loads.
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
584 return false;
585
586 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
587 return false;
588
589 // A load is known dereferenceable up to the alignment, so it's legal to widen
590 // to it.
591 //
592 // TODO: Could check dereferenceable for less aligned cases.
593 unsigned RoundedSize = NextPowerOf2(SizeInBits);
594 if (AlignInBits < RoundedSize)
595 return false;
596
597 // Do not widen if it would introduce a slow unaligned load.
598 const SITargetLowering *TLI = ST.getTargetLowering();
599 unsigned Fast = 0;
601 RoundedSize, AddrSpace, Align(AlignInBits / 8),
603 Fast;
604}
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
612 Query.MMODescrs[0].AlignInBits,
613 Query.Types[1].getAddressSpace(), Opcode);
614}
615
616/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
617/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
618/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
620 MachineRegisterInfo &MRI, unsigned Idx) {
621 MachineOperand &MO = MI.getOperand(Idx);
622
623 const LLT PointerTy = MRI.getType(MO.getReg());
624
625 // Paranoidly prevent us from doing this multiple times.
627 return PointerTy;
628
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 if (!PointerTy.isVector()) {
632 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
633 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
634 const LLT S32 = LLT::scalar(32);
635
636 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
639 for (unsigned I = 0; I < NumParts; ++I)
640 VectorElems[I] =
641 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
643 MO.setReg(VectorReg);
644 return VectorTy;
645 }
646 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
647 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
648 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
650 MO.setReg(BitcastReg);
651
652 return VectorTy;
653}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Unmerged.getReg(I));
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
674 }
675 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
677}
678
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
686 return;
688}
689
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const std::initializer_list<LLT> FPTypesPK16_64 = {S32, S64, S16, V2S16,
736 V2S64};
737
738 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
739
741
742 // s1 for VCC branches, s32 for SCC branches.
744
745 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
746 // elements for v3s16
749 .legalFor(AllS32Vectors)
751 .legalFor(AddrSpaces64)
752 .legalFor(AddrSpaces32)
753 .legalFor(AddrSpaces128)
754 .legalIf(isPointer(0))
755 .clampScalar(0, S16, S256)
757 .clampMaxNumElements(0, S32, 16)
759 .scalarize(0);
760
761 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
762 // Full set of gfx9 features.
763 if (ST.hasPackedU64Ops()) {
764 getActionDefinitionsBuilder({G_ADD, G_SUB})
765 .legalFor({S64, S32, S16, V2S16, V2S64})
766 .clampMaxNumElementsStrict(0, S16, 2)
768 .scalarize(0)
769 .minScalar(0, S16)
771 .maxScalar(0, S32);
772 } else if (ST.hasScalarAddSub64()) {
773 getActionDefinitionsBuilder({G_ADD, G_SUB})
774 .legalFor({S64, S32, S16, V2S16})
775 .clampMaxNumElementsStrict(0, S16, 2)
776 .scalarize(0)
777 .minScalar(0, S16)
779 .maxScalar(0, S32);
780 } else {
781 getActionDefinitionsBuilder({G_ADD, G_SUB})
782 .legalFor({S32, S16, V2S16})
783 .clampMaxNumElementsStrict(0, S16, 2)
784 .scalarize(0)
785 .minScalar(0, S16)
787 .maxScalar(0, S32);
788 }
789
790 if (ST.hasScalarSMulU64()) {
792 .legalFor({S64, S32, S16, V2S16})
793 .clampMaxNumElementsStrict(0, S16, 2)
794 .scalarize(0)
795 .minScalar(0, S16)
797 .custom();
798 } else {
800 .legalFor({S32, S16, V2S16})
801 .clampMaxNumElementsStrict(0, S16, 2)
802 .scalarize(0)
803 .minScalar(0, S16)
805 .custom();
806 }
807 assert(ST.hasMad64_32());
808
809 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
810 .legalFor({S32, S16, V2S16}) // Clamp modifier
811 .minScalarOrElt(0, S16)
813 .scalarize(0)
815 .lower();
816 } else if (ST.has16BitInsts()) {
817 getActionDefinitionsBuilder({G_ADD, G_SUB})
818 .legalFor({S32, S16})
819 .minScalar(0, S16)
821 .maxScalar(0, S32)
822 .scalarize(0);
823
825 .legalFor({S32, S16})
826 .scalarize(0)
827 .minScalar(0, S16)
829 .custom();
830 assert(ST.hasMad64_32());
831
832 // Technically the saturating operations require clamp bit support, but this
833 // was introduced at the same time as 16-bit operations.
834 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
835 .legalFor({S32, S16}) // Clamp modifier
836 .minScalar(0, S16)
837 .scalarize(0)
839 .lower();
840
841 // We're just lowering this, but it helps get a better result to try to
842 // coerce to the desired type first.
843 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
844 .minScalar(0, S16)
845 .scalarize(0)
846 .lower();
847 } else {
848 getActionDefinitionsBuilder({G_ADD, G_SUB})
849 .legalFor({S32})
850 .widenScalarToNextMultipleOf(0, 32)
851 .clampScalar(0, S32, S32)
852 .scalarize(0);
853
854 auto &Mul = getActionDefinitionsBuilder(G_MUL)
855 .legalFor({S32})
856 .scalarize(0)
857 .minScalar(0, S32)
859
860 if (ST.hasMad64_32())
861 Mul.custom();
862 else
863 Mul.maxScalar(0, S32);
864
865 if (ST.hasIntClamp()) {
866 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
867 .legalFor({S32}) // Clamp modifier.
868 .scalarize(0)
870 .lower();
871 } else {
872 // Clamp bit support was added in VI, along with 16-bit operations.
873 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
874 .minScalar(0, S32)
875 .scalarize(0)
876 .lower();
877 }
878
879 // FIXME: DAG expansion gets better results. The widening uses the smaller
880 // range values and goes for the min/max lowering directly.
881 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
882 .minScalar(0, S32)
883 .scalarize(0)
884 .lower();
885 }
886
888 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
889 .customFor({S32, S64})
890 .clampScalar(0, S32, S64)
892 .scalarize(0);
893
894 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
895 .legalFor({S32})
896 .maxScalar(0, S32);
897
898 if (ST.hasVOP3PInsts()) {
899 Mulh
900 .clampMaxNumElements(0, S8, 2)
901 .lowerFor({V2S8});
902 }
903
904 Mulh
905 .scalarize(0)
906 .lower();
907
908 // Report legal for any types we can handle anywhere. For the cases only legal
909 // on the SALU, RegBankSelect will be able to re-legalize.
910 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
911 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
912 .clampScalar(0, S32, S64)
918 .scalarize(0);
919
921 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
922 .legalFor({{S32, S1}, {S32, S32}})
923 .clampScalar(0, S32, S32)
924 .scalarize(0);
925
927 // Don't worry about the size constraint.
929 .lower();
930
932 .legalFor({S1, S32, S64, S16, GlobalPtr,
933 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
934 .legalIf(isPointer(0))
935 .clampScalar(0, S32, S64)
937
938 getActionDefinitionsBuilder(G_FCONSTANT)
939 .legalFor({S32, S64, S16})
940 .clampScalar(0, S16, S64);
941
942 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
943 .legalIf(isRegisterClassType(ST, 0))
944 // s1 and s16 are special cases because they have legal operations on
945 // them, but don't really occupy registers in the normal way.
946 .legalFor({S1, S16})
947 .clampNumElements(0, V16S32, V32S32)
951 .clampMaxNumElements(0, S32, 16);
952
953 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
954
955 // If the amount is divergent, we have to do a wave reduction to get the
956 // maximum value, so this is expanded during RegBankSelect.
957 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
958 .legalFor({{PrivatePtr, S32}});
959
960 getActionDefinitionsBuilder(G_STACKSAVE)
961 .customFor({PrivatePtr});
962 getActionDefinitionsBuilder(G_STACKRESTORE)
963 .legalFor({PrivatePtr});
964
965 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
966
967 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
968 .customIf(typeIsNot(0, PrivatePtr));
969
970 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
971
972 auto &FPOpActions = getActionDefinitionsBuilder(
973 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
974 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
975 .legalFor({S32, S64});
976 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
977 .customFor({S32, S64});
978 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
979 .customFor({S32, S64});
980
981 if (ST.has16BitInsts()) {
982 if (ST.hasVOP3PInsts())
983 FPOpActions.legalFor({S16, V2S16});
984 else
985 FPOpActions.legalFor({S16});
986
987 TrigActions.customFor({S16});
988 FDIVActions.customFor({S16});
989 }
990
991 if (ST.hasPackedFP32Ops()) {
992 FPOpActions.legalFor({V2S32});
993 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
994 }
995
996 if (ST.hasPackedFP64Ops()) {
997 FPOpActions.legalFor({V2S64});
998 FPOpActions.clampMaxNumElementsStrict(0, S64, 2);
999 }
1000
1001 if (ST.hasPackedFP64Ops()) {
1002 FPOpActions.legalFor({V2S64});
1003 FPOpActions.clampMaxNumElementsStrict(0, S64, 2);
1004 }
1005
1006 auto &MinNumMaxNumIeee =
1007 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
1008
1009 if (ST.hasVOP3PInsts()) {
1010 MinNumMaxNumIeee.legalFor(FPTypesPK16)
1011 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1012 .clampMaxNumElements(0, S16, 2)
1013 .clampScalar(0, S16, S64)
1014 .scalarize(0);
1015 } else if (ST.has16BitInsts()) {
1016 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
1017 } else {
1018 MinNumMaxNumIeee.legalFor(FPTypesBase)
1019 .clampScalar(0, S32, S64)
1020 .scalarize(0);
1021 }
1022
1023 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1024 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1025
1026 if (ST.hasPackedFP64Ops()) {
1027 MinNumMaxNum.customFor(FPTypesPK16_64)
1028 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1029 .clampMaxNumElements(0, S16, 2)
1030 .clampMaxNumElements(0, S64, 2)
1031 .clampScalar(0, S16, S64)
1032 .scalarize(0);
1033 } else if (ST.hasVOP3PInsts()) {
1034 MinNumMaxNum.customFor(FPTypesPK16)
1035 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1036 .clampMaxNumElements(0, S16, 2)
1037 .clampScalar(0, S16, S64)
1038 .scalarize(0);
1039 } else if (ST.has16BitInsts()) {
1040 MinNumMaxNum.customFor(FPTypes16)
1041 .clampScalar(0, S16, S64)
1042 .scalarize(0);
1043 } else {
1044 MinNumMaxNum.customFor(FPTypesBase)
1045 .clampScalar(0, S32, S64)
1046 .scalarize(0);
1047 }
1048
1049 if (ST.hasVOP3PInsts())
1050 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1051
1052 FPOpActions
1053 .scalarize(0)
1054 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1055
1056 TrigActions
1057 .scalarize(0)
1058 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1059
1060 FDIVActions
1061 .scalarize(0)
1062 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1063
1064 auto &FNegAbs = getActionDefinitionsBuilder({G_FNEG, G_FABS});
1065 FNegAbs.legalFor(FPTypesPK16)
1066 .legalFor(ST.hasPackedFP32Ops(), {V2S32})
1068 if (ST.hasPackedFP32Ops())
1069 FNegAbs.clampMaxNumElementsStrict(0, S32, 2);
1070 FNegAbs.scalarize(0).clampScalar(0, S16, S64);
1071
1072 if (ST.has16BitInsts()) {
1074 .legalFor({S16})
1075 .customFor({S32, S64})
1076 .scalarize(0)
1077 .unsupported();
1079 .legalFor({S32, S64, S16})
1080 .scalarize(0)
1081 .clampScalar(0, S16, S64);
1082
1083 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1084 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1085 .scalarize(0)
1086 .maxScalarIf(typeIs(0, S16), 1, S16)
1087 .clampScalar(1, S32, S32)
1088 .lower();
1089
1091 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1092 .scalarize(0)
1093 .lower();
1094
1096 .lowerFor({S16, S32, S64})
1097 .scalarize(0)
1098 .lower();
1099 } else {
1101 .customFor({S32, S64, S16})
1102 .scalarize(0)
1103 .unsupported();
1104
1105
1106 if (ST.hasFractBug()) {
1108 .customFor({S64})
1109 .legalFor({S32, S64})
1110 .scalarize(0)
1111 .clampScalar(0, S32, S64);
1112 } else {
1114 .legalFor({S32, S64})
1115 .scalarize(0)
1116 .clampScalar(0, S32, S64);
1117 }
1118
1119 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1120 .legalFor({{S32, S32}, {S64, S32}})
1121 .scalarize(0)
1122 .clampScalar(0, S32, S64)
1123 .clampScalar(1, S32, S32)
1124 .lower();
1125
1127 .customFor({{S32, S32}, {S64, S32}})
1128 .scalarize(0)
1129 .minScalar(0, S32)
1130 .clampScalar(1, S32, S32)
1131 .lower();
1132
1134 .lowerFor({S32, S64})
1135 .scalarize(0)
1136 .lower();
1137 }
1138
1139 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1140 if (ST.hasCvtPkF16F32Inst()) {
1141 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1142 .clampMaxNumElements(0, S16, 2);
1143 } else {
1144 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1145 }
1146 FPTruncActions.scalarize(0).lower();
1147
1149 .legalFor({{S64, S32}, {S32, S16}})
1150 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1151 .scalarize(0);
1152
1153 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1154 if (ST.has16BitInsts()) {
1155 FSubActions
1156 // Use actual fsub instruction
1157 .legalFor({S32, S16})
1158 // Must use fadd + fneg
1159 .lowerFor({S64, V2S16});
1160 } else {
1161 FSubActions
1162 // Use actual fsub instruction
1163 .legalFor({S32})
1164 // Must use fadd + fneg
1165 .lowerFor({S64, S16, V2S16});
1166 }
1167
1168 if (ST.hasPackedFP32Ops())
1169 FSubActions.lowerFor({V2S32}).clampMaxNumElements(0, S32, 2);
1170
1171 FSubActions
1172 .clampMaxNumElements(0, S16, 2)
1173 .scalarize(0)
1174 .clampScalar(0, S32, S64);
1175
1176 // Whether this is legal depends on the floating point mode for the function.
1177 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1178 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1179 FMad.customFor({S32, S16});
1180 else if (ST.hasMadMacF32Insts())
1181 FMad.customFor({S32});
1182 else if (ST.hasMadF16())
1183 FMad.customFor({S16});
1184 FMad.scalarize(0)
1185 .lower();
1186
1187 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1188 if (ST.has16BitInsts()) {
1189 FRem.customFor({S16, S32, S64});
1190 } else {
1191 FRem.minScalar(0, S32)
1192 .customFor({S32, S64});
1193 }
1194 FRem.scalarize(0);
1195
1196 // TODO: Do we need to clamp maximum bitwidth?
1198 .legalIf(isScalar(0))
1199 .legalFor({{V2S16, V2S32}})
1200 .clampMaxNumElements(0, S16, 2)
1201 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1202 // situations (like an invalid implicit use), we don't want to infinite loop
1203 // in the legalizer.
1205 .alwaysLegal();
1206
1207 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1208 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1209 {S32, S1}, {S64, S1}, {S16, S1}})
1210 .scalarize(0)
1211 .clampScalar(0, S32, S64)
1212 .widenScalarToNextPow2(1, 32);
1213
1214 // TODO: Split s1->s64 during regbankselect for VALU.
1215 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1216 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1217 .lowerIf(typeIs(1, S1))
1218 .customFor({{S32, S64}, {S64, S64}});
1219 if (ST.has16BitInsts())
1220 IToFP.legalFor({{S16, S16}});
1221 IToFP.clampScalar(1, S32, S64)
1222 .minScalar(0, S32)
1223 .scalarize(0)
1225
1226 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1227 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1228 .customFor({{S64, S32}, {S64, S64}})
1229 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1230 if (ST.has16BitInsts())
1231 FPToI.legalFor({{S16, S16}});
1232 else
1233 FPToI.minScalar(1, S32);
1234
1235 FPToI.minScalar(0, S32)
1236 .widenScalarToNextPow2(0, 32)
1237 .scalarize(0)
1238 .lower();
1239
1240 // clang-format off
1241 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1242 .legalFor({{S32, S32}, {S32, S64}})
1243 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1244 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1245
1246 // If available, widen width <16 to i16, instead of i32 so v_cvt_i16/u16_f16 can be used.
1247 if (ST.has16BitInsts())
1248 FPToISat.minScalarIf(typeIs(1, S16), 0, S16);
1249
1250 FPToISat.minScalar(1, S32);
1251 FPToISat.minScalar(0, S32)
1252 .widenScalarToNextPow2(0, 32)
1253 .scalarize(0)
1254 .lower();
1255 // clang-format on
1256
1257 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1258 .clampScalar(0, S16, S64)
1259 .scalarize(0)
1260 .lower();
1261
1262 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1263 .legalFor({S16, S32})
1264 .scalarize(0)
1265 .lower();
1266
1267 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1268 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1269 .scalarize(0)
1270 .lower();
1271
1272 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1273 .clampScalar(0, S16, S64)
1274 .scalarize(0)
1275 .lower();
1276
1277 if (ST.has16BitInsts()) {
1278 getActionDefinitionsBuilder(
1279 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1280 .legalFor({S16, S32, S64})
1281 .clampScalar(0, S16, S64)
1282 .scalarize(0);
1283 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1284 getActionDefinitionsBuilder(
1285 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1286 .legalFor({S32, S64})
1287 .clampScalar(0, S32, S64)
1288 .scalarize(0);
1289 } else {
1290 getActionDefinitionsBuilder(
1291 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1292 .legalFor({S32})
1293 .customFor({S64})
1294 .clampScalar(0, S32, S64)
1295 .scalarize(0);
1296 }
1297
1298 getActionDefinitionsBuilder(G_PTR_ADD)
1299 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1300 .legalIf(all(isPointer(0), sameSize(0, 1)))
1301 .scalarize(0)
1302 .scalarSameSizeAs(1, 0);
1303
1304 getActionDefinitionsBuilder(G_PTRMASK)
1305 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1306 .scalarSameSizeAs(1, 0)
1307 .scalarize(0);
1308
1309 auto &CmpBuilder =
1310 getActionDefinitionsBuilder(G_ICMP)
1311 // The compare output type differs based on the register bank of the output,
1312 // so make both s1 and s32 legal.
1313 //
1314 // Scalar compares producing output in scc will be promoted to s32, as that
1315 // is the allocatable register type that will be needed for the copy from
1316 // scc. This will be promoted during RegBankSelect, and we assume something
1317 // before that won't try to use s32 result types.
1318 //
1319 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1320 // bank.
1322 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1323 .legalForCartesianProduct(
1324 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1325 if (ST.has16BitInsts()) {
1326 CmpBuilder.legalFor({{S1, S16}});
1327 }
1328
1329 CmpBuilder
1331 .clampScalar(1, S32, S64)
1332 .scalarize(0)
1333 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1334
1335 auto &FCmpBuilder =
1336 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1337 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1338
1339 if (ST.hasSALUFloatInsts())
1340 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1341
1342 FCmpBuilder
1344 .clampScalar(1, S32, S64)
1345 .scalarize(0);
1346
1347 // FIXME: fpow has a selection pattern that should move to custom lowering.
1348 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1349 if (ST.has16BitInsts())
1350 ExpOps.customFor({{S32}, {S16}});
1351 else
1352 ExpOps.customFor({S32});
1353 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1354 .scalarize(0);
1355
1356 getActionDefinitionsBuilder(G_FPOWI)
1357 .clampScalar(0, MinScalarFPTy, S32)
1358 .lower();
1359
1360 getActionDefinitionsBuilder(G_FLOG2)
1361 .legalFor(ST.has16BitInsts(), {S16})
1362 .customFor({S32, S16})
1363 .scalarize(0)
1364 .lower();
1365
1366 getActionDefinitionsBuilder(G_FEXP2)
1367 .legalFor(ST.has16BitInsts(), {S16})
1368 .customFor({S32, S64, S16})
1369 .scalarize(0)
1370 .lower();
1371
1372 auto &LogOps =
1373 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1374 LogOps.customFor({S32, S16, S64});
1375 LogOps.clampScalar(0, MinScalarFPTy, S32)
1376 .scalarize(0);
1377
1378 // The 64-bit versions produce 32-bit results, but only on the SALU.
1379 getActionDefinitionsBuilder(G_CTPOP)
1380 .legalFor({{S32, S32}, {S32, S64}})
1381 .clampScalar(0, S32, S32)
1382 .widenScalarToNextPow2(1, 32)
1383 .clampScalar(1, S32, S64)
1384 .scalarize(0)
1385 .widenScalarToNextPow2(0, 32);
1386
1387 // If no 16 bit instr is available, lower into different instructions.
1388 if (ST.has16BitInsts())
1389 getActionDefinitionsBuilder(G_IS_FPCLASS)
1390 .legalForCartesianProduct({S1}, FPTypes16)
1391 .widenScalarToNextPow2(1)
1392 .scalarize(0)
1393 .lower();
1394 else
1395 getActionDefinitionsBuilder(G_IS_FPCLASS)
1396 .legalForCartesianProduct({S1}, FPTypesBase)
1397 .lowerFor({S1, S16})
1398 .widenScalarToNextPow2(1)
1399 .scalarize(0)
1400 .lower();
1401
1402 // The hardware instructions return a different result on 0 than the generic
1403 // instructions expect. The hardware produces -1, but these produce the
1404 // bitwidth.
1405 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1406 .scalarize(0)
1407 .clampScalar(0, S32, S32)
1408 .clampScalar(1, S32, S64)
1409 .widenScalarToNextPow2(0, 32)
1410 .widenScalarToNextPow2(1, 32)
1411 .custom();
1412
1413 // The 64-bit versions produce 32-bit results, but only on the SALU.
1414 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1415 .legalFor({{S32, S32}, {S32, S64}})
1416 .customIf(scalarNarrowerThan(1, 32))
1417 .clampScalar(0, S32, S32)
1418 .clampScalar(1, S32, S64)
1419 .scalarize(0)
1420 .widenScalarToNextPow2(0, 32)
1421 .widenScalarToNextPow2(1, 32);
1422
1423 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1424 .legalFor({{S32, S32}, {S32, S64}})
1425 .clampScalar(0, S32, S32)
1426 .clampScalar(1, S32, S64)
1427 .scalarize(0)
1428 .widenScalarToNextPow2(0, 32)
1429 .widenScalarToNextPow2(1, 32);
1430
1431 getActionDefinitionsBuilder(G_CTLS)
1432 .customFor({{S32, S32}})
1433 .scalarize(0)
1434 .clampScalar(0, S32, S32)
1435 .clampScalar(1, S32, S32);
1436
1437 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1438 // RegBankSelect.
1439 getActionDefinitionsBuilder(G_BITREVERSE)
1440 .legalFor({S32, S64})
1441 .clampScalar(0, S32, S64)
1442 .scalarize(0)
1443 .widenScalarToNextPow2(0);
1444
1445 if (ST.has16BitInsts()) {
1446 getActionDefinitionsBuilder(G_BSWAP)
1447 .legalFor({S16, S32, V2S16})
1448 .clampMaxNumElementsStrict(0, S16, 2)
1449 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1450 // narrowScalar limitation.
1451 .widenScalarToNextPow2(0)
1452 .clampScalar(0, S16, S32)
1453 .scalarize(0);
1454
1455 if (ST.hasVOP3PInsts()) {
1456 getActionDefinitionsBuilder(G_ABS)
1457 .legalFor({S32, S16, V2S16})
1458 .clampMaxNumElements(0, S16, 2)
1459 .minScalar(0, S16)
1460 .widenScalarToNextPow2(0)
1461 .scalarize(0)
1462 .lower();
1463 if (ST.hasMinMaxI64Insts()) {
1464 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1465 .legalFor({S32, S16, S64, V2S16})
1466 .clampMaxNumElements(0, S16, 2)
1467 .minScalar(0, S16)
1468 .widenScalarToNextPow2(0)
1469 .scalarize(0)
1470 .lower();
1471 } else {
1472 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1473 .legalFor({S32, S16, V2S16})
1474 .clampMaxNumElements(0, S16, 2)
1475 .minScalar(0, S16)
1476 .widenScalarToNextPow2(0)
1477 .scalarize(0)
1478 .lower();
1479 }
1480 } else {
1481 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1482 .legalFor({S32, S16})
1483 .widenScalarToNextPow2(0)
1484 .minScalar(0, S16)
1485 .scalarize(0)
1486 .lower();
1487 }
1488 } else {
1489 // TODO: Should have same legality without v_perm_b32
1490 getActionDefinitionsBuilder(G_BSWAP)
1491 .legalFor({S32})
1492 .lowerIf(scalarNarrowerThan(0, 32))
1493 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1494 // narrowScalar limitation.
1495 .widenScalarToNextPow2(0)
1496 .maxScalar(0, S32)
1497 .scalarize(0)
1498 .lower();
1499
1500 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1501 .legalFor({S32})
1502 .minScalar(0, S32)
1503 .widenScalarToNextPow2(0)
1504 .scalarize(0)
1505 .lower();
1506 }
1507
1508 getActionDefinitionsBuilder(G_INTTOPTR)
1509 // List the common cases
1510 .legalForCartesianProduct(AddrSpaces64, {S64})
1511 .legalForCartesianProduct(AddrSpaces32, {S32})
1512 .scalarize(0)
1513 // Accept any address space as long as the size matches
1514 .legalIf(sameSize(0, 1))
1515 .widenScalarIf(smallerThan(1, 0),
1516 [](const LegalityQuery &Query) {
1517 return std::pair(
1518 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1519 })
1520 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1521 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1522 });
1523
1524 getActionDefinitionsBuilder(G_PTRTOINT)
1525 // List the common cases
1526 .legalForCartesianProduct(AddrSpaces64, {S64})
1527 .legalForCartesianProduct(AddrSpaces32, {S32})
1528 .scalarize(0)
1529 // Accept any address space as long as the size matches
1530 .legalIf(sameSize(0, 1))
1531 .widenScalarIf(smallerThan(0, 1),
1532 [](const LegalityQuery &Query) {
1533 return std::pair(
1534 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1535 })
1536 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1537 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1538 });
1539
1540 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1541 .scalarize(0)
1542 .custom();
1543
1544 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1545 bool IsLoad) -> bool {
1546 const LLT DstTy = Query.Types[0];
1547
1548 // Split vector extloads.
1549 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1550
1551 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1552 return true;
1553
1554 const LLT PtrTy = Query.Types[1];
1555 unsigned AS = PtrTy.getAddressSpace();
1556 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1557 Query.MMODescrs[0].Ordering !=
1559 return true;
1560
1561 // Catch weird sized loads that don't evenly divide into the access sizes
1562 // TODO: May be able to widen depending on alignment etc.
1563 unsigned NumRegs = (MemSize + 31) / 32;
1564 if (NumRegs == 3) {
1565 if (!ST.hasDwordx3LoadStores())
1566 return true;
1567 } else {
1568 // If the alignment allows, these should have been widened.
1569 if (!isPowerOf2_32(NumRegs))
1570 return true;
1571 }
1572
1573 return false;
1574 };
1575
1576 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1577 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1578 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1579
1580 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1581 // LDS
1582 // TODO: Unsupported flat for SI.
1583
1584 for (unsigned Op : {G_LOAD, G_STORE}) {
1585 const bool IsStore = Op == G_STORE;
1586
1587 auto &Actions = getActionDefinitionsBuilder(Op);
1588 // Explicitly list some common cases.
1589 // TODO: Does this help compile time at all?
1590 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1591 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1592 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1593 {S64, GlobalPtr, S64, GlobalAlign32},
1594 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1595 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1596 {S32, GlobalPtr, S8, GlobalAlign8},
1597 {S32, GlobalPtr, S16, GlobalAlign16},
1598
1599 {S32, LocalPtr, S32, 32},
1600 {S64, LocalPtr, S64, 32},
1601 {V2S32, LocalPtr, V2S32, 32},
1602 {S32, LocalPtr, S8, 8},
1603 {S32, LocalPtr, S16, 16},
1604 {V2S16, LocalPtr, S32, 32},
1605
1606 {S32, PrivatePtr, S32, 32},
1607 {S32, PrivatePtr, S8, 8},
1608 {S32, PrivatePtr, S16, 16},
1609 {V2S16, PrivatePtr, S32, 32},
1610
1611 {S32, ConstantPtr, S32, GlobalAlign32},
1612 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1613 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1614 {S64, ConstantPtr, S64, GlobalAlign32},
1615 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1616
1617 Actions.legalForTypesWithMemDesc(ST.useRealTrue16Insts(), /* Pred */
1618 {{S16, GlobalPtr, S8, GlobalAlign8},
1619 {S16, GlobalPtr, S16, GlobalAlign16},
1620 {S16, LocalPtr, S8, 8},
1621 {S16, LocalPtr, S16, 16},
1622 {S16, PrivatePtr, S8, 8},
1623 {S16, PrivatePtr, S16, 16}});
1624
1625 Actions.legalIf(
1626 [=](const LegalityQuery &Query) -> bool {
1627 return isLoadStoreLegal(ST, Query);
1628 });
1629
1630 // The custom pointers (fat pointers, buffer resources) don't work with load
1631 // and store at this level. Fat pointers should have been lowered to
1632 // intrinsics before the translation to MIR.
1633 Actions.unsupportedIf(
1634 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1635
1636 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1637 // ptrtoint. This is needed to account for the fact that we can't have i128
1638 // as a register class for SelectionDAG reasons.
1639 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1640 return hasBufferRsrcWorkaround(Query.Types[0]);
1641 });
1642
1643 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1644 // 64-bits.
1645 //
1646 // TODO: Should generalize bitcast action into coerce, which will also cover
1647 // inserting addrspacecasts.
1648 Actions.customIf(typeIs(1, Constant32Ptr));
1649
1650 // Turn any illegal element vectors into something easier to deal
1651 // with. These will ultimately produce 32-bit scalar shifts to extract the
1652 // parts anyway.
1653 //
1654 // For odd 16-bit element vectors, prefer to split those into pieces with
1655 // 16-bit vector parts.
1656 Actions.bitcastIf(
1657 [=](const LegalityQuery &Query) -> bool {
1658 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1659 Query.MMODescrs[0].MemoryTy);
1660 }, bitcastToRegisterType(0));
1661
1662 if (!IsStore) {
1663 // Widen suitably aligned loads by loading extra bytes. The standard
1664 // legalization actions can't properly express widening memory operands.
1665 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1666 return shouldWidenLoad(ST, Query, G_LOAD);
1667 });
1668 }
1669
1670 // FIXME: load/store narrowing should be moved to lower action
1671 Actions
1672 .narrowScalarIf(
1673 [=](const LegalityQuery &Query) -> bool {
1674 return !Query.Types[0].isVector() &&
1675 needToSplitMemOp(Query, Op == G_LOAD);
1676 },
1677 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1678 const LLT DstTy = Query.Types[0];
1679 const LLT PtrTy = Query.Types[1];
1680
1681 const unsigned DstSize = DstTy.getSizeInBits();
1682 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1683
1684 // Split extloads.
1685 if (DstSize > MemSize)
1686 return std::pair(0, LLT::scalar(MemSize));
1687
1688 unsigned MaxSize = maxSizeForAddrSpace(
1689 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1690 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1691 if (MemSize > MaxSize)
1692 return std::pair(0, LLT::scalar(MaxSize));
1693
1694 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1695 return std::pair(0, LLT::scalar(Align));
1696 })
1697 .fewerElementsIf(
1698 [=](const LegalityQuery &Query) -> bool {
1699 return Query.Types[0].isVector() &&
1700 needToSplitMemOp(Query, Op == G_LOAD);
1701 },
1702 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1703 const LLT DstTy = Query.Types[0];
1704 const LLT PtrTy = Query.Types[1];
1705
1706 LLT EltTy = DstTy.getElementType();
1707 unsigned MaxSize = maxSizeForAddrSpace(
1708 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1709 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1710
1711 // FIXME: Handle widened to power of 2 results better. This ends
1712 // up scalarizing.
1713 // FIXME: 3 element stores scalarized on SI
1714
1715 // Split if it's too large for the address space.
1716 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1717 if (MemSize > MaxSize) {
1718 unsigned NumElts = DstTy.getNumElements();
1719 unsigned EltSize = EltTy.getSizeInBits();
1720
1721 if (MaxSize % EltSize == 0) {
1722 return std::pair(
1724 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1725 }
1726
1727 unsigned NumPieces = MemSize / MaxSize;
1728
1729 // FIXME: Refine when odd breakdowns handled
1730 // The scalars will need to be re-legalized.
1731 if (NumPieces == 1 || NumPieces >= NumElts ||
1732 NumElts % NumPieces != 0)
1733 return std::pair(0, EltTy);
1734
1735 return std::pair(0,
1736 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1737 }
1738
1739 // FIXME: We could probably handle weird extending loads better.
1740 if (DstTy.getSizeInBits() > MemSize)
1741 return std::pair(0, EltTy);
1742
1743 unsigned EltSize = EltTy.getSizeInBits();
1744 unsigned DstSize = DstTy.getSizeInBits();
1745 if (!isPowerOf2_32(DstSize)) {
1746 // We're probably decomposing an odd sized store. Try to split
1747 // to the widest type. TODO: Account for alignment. As-is it
1748 // should be OK, since the new parts will be further legalized.
1749 unsigned FloorSize = llvm::bit_floor(DstSize);
1750 return std::pair(
1752 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1753 }
1754
1755 // May need relegalization for the scalars.
1756 return std::pair(0, EltTy);
1757 })
1758 .minScalar(0, S32)
1759 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1761 .widenScalarToNextPow2(0)
1762 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1763 .lower();
1764 }
1765
1766 // FIXME: Unaligned accesses not lowered.
1767 auto &ExtLoads =
1768 getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1769 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1770 {S32, GlobalPtr, S16, 2 * 8},
1771 {S32, LocalPtr, S8, 8},
1772 {S32, LocalPtr, S16, 16},
1773 {S32, PrivatePtr, S8, 8},
1774 {S32, PrivatePtr, S16, 16},
1775 {S32, ConstantPtr, S8, 8},
1776 {S32, ConstantPtr, S16, 2 * 8}})
1777 .legalForTypesWithMemDesc(ST.useRealTrue16Insts(),
1778 {{S16, GlobalPtr, S8, GlobalAlign8},
1779 {S16, LocalPtr, S8, GlobalAlign8},
1780 {S16, PrivatePtr, S8, GlobalAlign8},
1781 {S16, ConstantPtr, S8, GlobalAlign8}})
1782 .legalIf([=](const LegalityQuery &Query) -> bool {
1783 return isLoadStoreLegal(ST, Query);
1784 });
1785
1786 if (ST.hasFlatAddressSpace()) {
1787 ExtLoads.legalForTypesWithMemDesc(
1788 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1789
1790 ExtLoads.legalForTypesWithMemDesc(ST.useRealTrue16Insts(),
1791 {{S16, FlatPtr, S8, GlobalAlign8}});
1792 }
1793
1794 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1795 // 64-bits.
1796 //
1797 // TODO: Should generalize bitcast action into coerce, which will also cover
1798 // inserting addrspacecasts.
1799 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1800
1801 ExtLoads.narrowScalarIf(
1802 [](const LegalityQuery &Query) {
1803 LLT MemTy = Query.MMODescrs[0].MemoryTy;
1804 return MemTy.isAnyScalar() && MemTy.getSizeInBits() > 32 &&
1805 Query.Types[0].getSizeInBits() > MemTy.getSizeInBits();
1806 }, // For large MemSize, narrowscalar to MemSize (load MemSize + ext)
1808 ExtLoads.clampScalar(0, S32, S32)
1809 .widenScalarToNextPow2(0)
1810 .lower();
1811
1812 auto &Atomics = getActionDefinitionsBuilder(
1813 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1814 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1815 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1816 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1817 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1818 {S64, GlobalPtr}, {S64, LocalPtr},
1819 {S32, RegionPtr}, {S64, RegionPtr}});
1820 if (ST.hasFlatAddressSpace()) {
1821 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1822 }
1823
1824 auto &Atomics32 =
1825 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1826 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1827 if (ST.hasFlatAddressSpace()) {
1828 Atomics32.legalFor({{S32, FlatPtr}});
1829 }
1830
1831 // TODO: v2bf16 operations, and fat buffer pointer support.
1832 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1833 if (ST.hasLDSFPAtomicAddF32()) {
1834 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1835 if (ST.hasLdsAtomicAddF64())
1836 Atomic.legalFor({{S64, LocalPtr}});
1837 if (ST.hasAtomicDsPkAdd16Insts())
1838 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1839 }
1840 if (ST.hasAtomicFaddInsts())
1841 Atomic.legalFor({{S32, GlobalPtr}});
1842 if (ST.hasFlatAtomicFaddF32Inst())
1843 Atomic.legalFor({{S32, FlatPtr}});
1844
1845 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1846 // These are legal with some caveats, and should have undergone expansion in
1847 // the IR in most situations
1848 // TODO: Move atomic expansion into legalizer
1849 Atomic.legalFor({
1850 {S32, GlobalPtr},
1851 {S64, GlobalPtr},
1852 {S64, FlatPtr}
1853 });
1854 }
1855
1856 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1857 ST.hasAtomicBufferGlobalPkAddF16Insts())
1858 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1859 if (ST.hasAtomicGlobalPkAddBF16Inst())
1860 Atomic.legalFor({{V2BF16, GlobalPtr}});
1861 if (ST.hasAtomicFlatPkAdd16Insts())
1862 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1863
1864
1865 // Most of the legalization work here is done by AtomicExpand. We could
1866 // probably use a simpler legality rule that just assumes anything is OK.
1867 auto &AtomicFMinFMax =
1868 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1869 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1870
1871 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1872 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1873 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1874 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1875 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1876 AtomicFMinFMax.legalFor({F32, FlatPtr});
1877 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1878 AtomicFMinFMax.legalFor({F64, FlatPtr});
1879
1880 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1881 // demarshalling
1882 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1883 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1884 {S32, FlatPtr}, {S64, FlatPtr}})
1885 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1886 {S32, RegionPtr}, {S64, RegionPtr}});
1887 // TODO: Pointer types, any 32-bit or 64-bit vector
1888
1889 // Condition should be s32 for scalar, s1 for vector.
1890 getActionDefinitionsBuilder(G_SELECT)
1891 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1892 LocalPtr, FlatPtr, PrivatePtr,
1893 LLT::fixed_vector(2, LocalPtr),
1894 LLT::fixed_vector(2, PrivatePtr)},
1895 {S1, S32})
1896 .clampScalar(0, S16, S64)
1897 .scalarize(1)
1898 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1899 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1900 .clampMaxNumElements(0, S32, 2)
1901 .clampMaxNumElements(0, LocalPtr, 2)
1902 .clampMaxNumElements(0, PrivatePtr, 2)
1903 .scalarize(0)
1904 .widenScalarToNextPow2(0)
1905 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1906
1907 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1908 // be more flexible with the shift amount type.
1909 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1910 .legalFor({{S32, S32}, {S64, S32}});
1911 if (ST.has16BitInsts()) {
1912 if (ST.hasVOP3PInsts()) {
1913 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1914 .clampMaxNumElements(0, S16, 2);
1915 } else
1916 Shifts.legalFor({{S16, S16}});
1917
1918 // TODO: Support 16-bit shift amounts for all types
1919 Shifts.widenScalarIf(
1920 [=](const LegalityQuery &Query) {
1921 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1922 // 32-bit amount.
1923 const LLT ValTy = Query.Types[0];
1924 const LLT AmountTy = Query.Types[1];
1925 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1926 AmountTy.getSizeInBits() < 16;
1927 }, changeTo(1, S16));
1928 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1929 Shifts.clampScalar(1, S32, S32);
1930 Shifts.widenScalarToNextPow2(0, 16);
1931 Shifts.clampScalar(0, S16, S64);
1932
1933 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1934 .minScalar(0, S16)
1935 .scalarize(0)
1936 .lower();
1937 } else {
1938 // Make sure we legalize the shift amount type first, as the general
1939 // expansion for the shifted type will produce much worse code if it hasn't
1940 // been truncated already.
1941 Shifts.clampScalar(1, S32, S32);
1942 Shifts.widenScalarToNextPow2(0, 32);
1943 Shifts.clampScalar(0, S32, S64);
1944
1945 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1946 .minScalar(0, S32)
1947 .scalarize(0)
1948 .lower();
1949 }
1950 Shifts.scalarize(0);
1951
1952 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1953 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1954 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1955 unsigned IdxTypeIdx = 2;
1956
1957 getActionDefinitionsBuilder(Op)
1958 .customIf([=](const LegalityQuery &Query) {
1959 const LLT EltTy = Query.Types[EltTypeIdx];
1960 const LLT VecTy = Query.Types[VecTypeIdx];
1961 const LLT IdxTy = Query.Types[IdxTypeIdx];
1962 const unsigned EltSize = EltTy.getSizeInBits();
1963 const bool isLegalVecType =
1965 // Address space 8 pointers are 128-bit wide values, but the logic
1966 // below will try to bitcast them to 2N x s64, which will fail.
1967 // Therefore, as an intermediate step, wrap extracts/insertions by
1968 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1969 // extraction result) in order to produce a vector operation that can
1970 // be handled by the logic below.
1971 if (EltTy.isPointer() && EltSize > 64)
1972 return true;
1973 return (EltSize == 32 || EltSize == 64) &&
1974 VecTy.getSizeInBits() % 32 == 0 &&
1975 VecTy.getSizeInBits() <= MaxRegisterSize &&
1976 IdxTy.getSizeInBits() == 32 &&
1977 isLegalVecType;
1978 })
1979 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1980 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1981 bitcastToVectorElement32(VecTypeIdx))
1982 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1983 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1984 scalarOrEltWiderThan(VecTypeIdx, 64)),
1985 [=](const LegalityQuery &Query) {
1986 // For > 64-bit element types, try to turn this into a
1987 // 64-bit element vector since we may be able to do better
1988 // indexing if this is scalar. If not, fall back to 32.
1989 const LLT EltTy = Query.Types[EltTypeIdx];
1990 const LLT VecTy = Query.Types[VecTypeIdx];
1991 const unsigned DstEltSize = EltTy.getSizeInBits();
1992 const unsigned VecSize = VecTy.getSizeInBits();
1993
1994 const unsigned TargetEltSize =
1995 DstEltSize % 64 == 0 ? 64 : 32;
1996 return std::pair(VecTypeIdx,
1997 LLT::fixed_vector(VecSize / TargetEltSize,
1998 TargetEltSize));
1999 })
2000 .clampScalar(EltTypeIdx, S32, S64)
2001 .clampScalar(VecTypeIdx, S32, S64)
2002 .clampScalar(IdxTypeIdx, S32, S32)
2003 .clampMaxNumElements(VecTypeIdx, S32, 32)
2004 // TODO: Clamp elements for 64-bit vectors?
2005 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
2007 // It should only be necessary with variable indexes.
2008 // As a last resort, lower to the stack
2009 .lower();
2010 }
2011
2012 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
2013 .unsupportedIf([=](const LegalityQuery &Query) {
2014 const LLT &EltTy = Query.Types[1].getElementType();
2015 return Query.Types[0] != EltTy;
2016 });
2017
2018 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
2019 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
2020 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
2021 getActionDefinitionsBuilder(Op)
2022 .widenScalarIf(
2023 [=](const LegalityQuery &Query) {
2024 const LLT BigTy = Query.Types[BigTyIdx];
2025 return (BigTy.getScalarSizeInBits() < 16);
2026 },
2028 .widenScalarIf(
2029 [=](const LegalityQuery &Query) {
2030 const LLT LitTy = Query.Types[LitTyIdx];
2031 return (LitTy.getScalarSizeInBits() < 16);
2032 },
2034 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
2035 .widenScalarToNextPow2(BigTyIdx, 32)
2036 .customIf([=](const LegalityQuery &Query) {
2037 // Generic lower operates on the full-width value, producing
2038 // shift+trunc/mask sequences. For simple cases where extract/insert
2039 // values are 32-bit aligned, we can instead unmerge/merge and work on
2040 // the 32-bit components. However, we can't check the offset here so
2041 // custom lower function will have to call generic lowering if offset
2042 // is not 32-bit aligned.
2043 const LLT BigTy = Query.Types[BigTyIdx];
2044 const LLT LitTy = Query.Types[LitTyIdx];
2045 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
2046 LitTy.getSizeInBits() % 32 == 0;
2047 })
2048 .lower();
2049 }
2050
2051 auto &BuildVector =
2052 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2053 .legalForCartesianProduct(AllS32Vectors, {S32})
2054 .legalForCartesianProduct(AllS64Vectors, {S64})
2055 .clampNumElements(0, V16S32, V32S32)
2056 .clampNumElements(0, V2S64, V16S64)
2057 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
2058 .moreElementsIf(isIllegalRegisterType(ST, 0),
2060
2061 if (ST.hasScalarPackInsts()) {
2062 BuildVector
2063 // FIXME: Should probably widen s1 vectors straight to s32
2064 .minScalarOrElt(0, S16)
2065 .minScalar(1, S16);
2066
2067 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2068 .legalFor({V2S16, S32})
2069 .lower();
2070 } else {
2071 BuildVector.customFor({V2S16, S16});
2072 BuildVector.minScalarOrElt(0, S32);
2073
2074 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2075 .customFor({V2S16, S32})
2076 .lower();
2077 }
2078
2079 BuildVector.legalIf(isRegisterType(ST, 0));
2080
2081 // FIXME: Clamp maximum size
2082 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2083 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2084 .clampMaxNumElements(0, S32, 32)
2085 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2086 .clampMaxNumElements(0, S16, 64);
2087
2088 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2089
2090 // Merge/Unmerge
2091 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2092 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2093 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2094
2095 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2096 const LLT Ty = Query.Types[TypeIdx];
2097 if (Ty.isVector()) {
2098 const LLT &EltTy = Ty.getElementType();
2099 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2100 return true;
2102 return true;
2103 }
2104 return false;
2105 };
2106
2107 auto &Builder =
2108 getActionDefinitionsBuilder(Op)
2109 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2110 .lowerFor({{S16, V2S16}})
2111 .lowerIf([=](const LegalityQuery &Query) {
2112 const LLT BigTy = Query.Types[BigTyIdx];
2113 return BigTy.getSizeInBits() == 32;
2114 })
2115 // Try to widen to s16 first for small types.
2116 // TODO: Only do this on targets with legal s16 shifts
2117 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2118 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2119 .moreElementsIf(isSmallOddVector(BigTyIdx),
2120 oneMoreElement(BigTyIdx))
2121 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
2122 elementTypeIs(1, S16)),
2123 changeTo(1, V2S16))
2124 // Clamp the little scalar to s32-s512 and make it a power of 2. It's
2125 // not worth considering the multiples of 64 since 2*192 and 2*384
2126 // are not valid.
2127 .clampScalar(LitTyIdx, S32, S512)
2128 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2129 // Break up vectors with weird elements into scalars
2130 .fewerElementsIf(
2131 [=](const LegalityQuery &Query) {
2132 return notValidElt(Query, LitTyIdx);
2133 },
2134 scalarize(0))
2135 .fewerElementsIf(
2136 [=](const LegalityQuery &Query) {
2137 return notValidElt(Query, BigTyIdx);
2138 },
2139 scalarize(1))
2140 .clampScalar(BigTyIdx, S32, MaxScalar);
2141
2142 if (Op == G_MERGE_VALUES) {
2143 Builder.widenScalarIf(
2144 // TODO: Use 16-bit shifts if legal for 8-bit values?
2145 [=](const LegalityQuery &Query) {
2146 const LLT Ty = Query.Types[LitTyIdx];
2147 return Ty.getSizeInBits() < 32;
2148 },
2149 changeTo(LitTyIdx, S32));
2150 }
2151
2152 Builder.widenScalarIf(
2153 [=](const LegalityQuery &Query) {
2154 const LLT Ty = Query.Types[BigTyIdx];
2155 return Ty.getSizeInBits() % 16 != 0;
2156 },
2157 [=](const LegalityQuery &Query) {
2158 // Pick the next power of 2, or a multiple of 64 over 128.
2159 // Whichever is smaller.
2160 const LLT &Ty = Query.Types[BigTyIdx];
2161 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2162 if (NewSizeInBits >= 256) {
2163 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2164 if (RoundedTo < NewSizeInBits)
2165 NewSizeInBits = RoundedTo;
2166 }
2167 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2168 })
2169 // Any vectors left are the wrong size. Scalarize them.
2170 .scalarize(0)
2171 .scalarize(1);
2172 }
2173
2174 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2175 // RegBankSelect.
2176 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2177 .legalFor({{S32}, {S64}})
2178 .clampScalar(0, S32, S64);
2179
2180 if (ST.hasVOP3PInsts()) {
2181 SextInReg.lowerFor({{V2S16}})
2182 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2183 // get more vector shift opportunities, since we'll get those when
2184 // expanded.
2185 .clampMaxNumElementsStrict(0, S16, 2);
2186 } else if (ST.has16BitInsts()) {
2187 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2188 } else {
2189 // Prefer to promote to s32 before lowering if we don't have 16-bit
2190 // shifts. This avoids a lot of intermediate truncate and extend operations.
2191 SextInReg.lowerFor({{S32}, {S64}});
2192 }
2193
2194 SextInReg
2195 .scalarize(0)
2196 .clampScalar(0, S32, S64)
2197 .lower();
2198
2199 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2200 .scalarize(0)
2201 .lower();
2202
2203 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2204 FSHRActionDefs.legalFor({{S32, S32}})
2205 .clampMaxNumElementsStrict(0, S16, 2);
2206 if (ST.hasVOP3PInsts())
2207 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2208 FSHRActionDefs.scalarize(0).lower();
2209
2210 if (ST.hasVOP3PInsts()) {
2211 getActionDefinitionsBuilder(G_FSHL)
2212 .lowerFor({{V2S16, V2S16}})
2213 .clampMaxNumElementsStrict(0, S16, 2)
2214 .scalarize(0)
2215 .lower();
2216 } else {
2217 getActionDefinitionsBuilder(G_FSHL)
2218 .scalarize(0)
2219 .lower();
2220 }
2221
2222 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2223 .legalFor({S64});
2224
2225 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2226
2227 getActionDefinitionsBuilder(G_FENCE)
2228 .alwaysLegal();
2229
2230 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2231 .scalarize(0)
2232 .minScalar(0, S32)
2233 .lower();
2234
2235 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2236 .legalFor({{S32, S32}, {S64, S32}})
2237 .clampScalar(1, S32, S32)
2238 .clampScalar(0, S32, S64)
2239 .widenScalarToNextPow2(0)
2240 .scalarize(0);
2241
2242 getActionDefinitionsBuilder(
2243 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2244 G_FCOPYSIGN,
2245
2246 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2247 G_READ_REGISTER, G_WRITE_REGISTER,
2248
2249 G_SADDO, G_SSUBO})
2250 .lower();
2251
2252 if (ST.hasIEEEMinimumMaximumInsts()) {
2253 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2254 .legalFor(FPTypesPK16)
2255 .clampMaxNumElements(0, S16, 2)
2256 .scalarize(0);
2257 } else if (ST.hasVOP3PInsts()) {
2258 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2259 .lowerFor({V2S16})
2260 .clampMaxNumElementsStrict(0, S16, 2)
2261 .scalarize(0)
2262 .lower();
2263 } else {
2264 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2265 .scalarize(0)
2266 .clampScalar(0, S32, S64)
2267 .lower();
2268 }
2269
2270 getActionDefinitionsBuilder(
2271 {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET, G_MEMSET_INLINE})
2272 .lower();
2273
2274 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2275
2276 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2277 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2278 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2279 .unsupported();
2280
2281 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2282
2283 getActionDefinitionsBuilder(
2284 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2285 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2286 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2287 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2288 .legalFor(AllVectors)
2289 .scalarize(1)
2290 .lower();
2291
2292 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2293 G_INTRINSIC_CONVERGENT,
2294 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2295 .alwaysLegal();
2296
2297 getLegacyLegalizerInfo().computeTables();
2298 verify(*ST.getInstrInfo());
2299}
2300
2303 LostDebugLocObserver &LocObserver) const {
 // Custom-legalization dispatcher: switches on MI's generic opcode, forwards
 // to the matching legalize* routine and returns that routine's result.
 // Opcodes with no case below reach `default` and return false.
 // NOTE(review): the leading lines of this signature are missing from this
 // listing; LocObserver is not referenced in the visible body.
2304 MachineIRBuilder &B = Helper.MIRBuilder;
2305 MachineRegisterInfo &MRI = *B.getMRI();
2306
2307 switch (MI.getOpcode()) {
2308 case TargetOpcode::G_ADDRSPACE_CAST:
2309 return legalizeAddrSpaceCast(MI, MRI, B);
2310 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2311 return legalizeFroundeven(MI, MRI, B);
2312 case TargetOpcode::G_FCEIL:
2313 return legalizeFceil(MI, MRI, B);
2314 case TargetOpcode::G_FREM:
2315 return legalizeFrem(MI, MRI, B);
2316 case TargetOpcode::G_INTRINSIC_TRUNC:
2317 return legalizeIntrinsicTrunc(MI, MRI, B);
 // The signed and unsigned conversions share one routine each; the trailing
 // bool is passed through as the routine's last argument.
2318 case TargetOpcode::G_SITOFP:
2319 return legalizeITOFP(MI, MRI, B, true);
2320 case TargetOpcode::G_UITOFP:
2321 return legalizeITOFP(MI, MRI, B, false);
2322 case TargetOpcode::G_FPTOSI:
2323 return legalizeFPTOI(MI, MRI, B, true);
2324 case TargetOpcode::G_FPTOUI:
2325 return legalizeFPTOI(MI, MRI, B, false);
2326 case TargetOpcode::G_FMINNUM:
2327 case TargetOpcode::G_FMAXNUM:
2328 case TargetOpcode::G_FMINIMUMNUM:
2329 case TargetOpcode::G_FMAXIMUMNUM:
2330 return legalizeMinNumMaxNum(Helper, MI);
2331 case TargetOpcode::G_EXTRACT:
2332 return legalizeExtract(Helper, MI);
2333 case TargetOpcode::G_INSERT:
2334 return legalizeInsert(Helper, MI);
2335 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2336 return legalizeExtractVectorElt(MI, MRI, B);
2337 case TargetOpcode::G_INSERT_VECTOR_ELT:
2338 return legalizeInsertVectorElt(MI, MRI, B);
2339 case TargetOpcode::G_FSIN:
2340 case TargetOpcode::G_FCOS:
2341 return legalizeSinCos(MI, MRI, B);
2342 case TargetOpcode::G_GLOBAL_VALUE:
2343 return legalizeGlobalValue(MI, MRI, B);
 // All three load flavours go through the same load routine.
2344 case TargetOpcode::G_LOAD:
2345 case TargetOpcode::G_SEXTLOAD:
2346 case TargetOpcode::G_ZEXTLOAD:
2347 return legalizeLoad(Helper, MI);
2348 case TargetOpcode::G_STORE:
2349 return legalizeStore(Helper, MI);
2350 case TargetOpcode::G_FMAD:
2351 return legalizeFMad(MI, MRI, B);
2352 case TargetOpcode::G_FDIV:
2353 return legalizeFDIV(MI, MRI, B);
2354 case TargetOpcode::G_FFREXP:
2355 return legalizeFFREXP(MI, MRI, B);
2356 case TargetOpcode::G_FSQRT:
2357 return legalizeFSQRT(MI, MRI, B);
 // Division, remainder and the combined div/rem opcode are grouped per
 // signedness.
2358 case TargetOpcode::G_UDIV:
2359 case TargetOpcode::G_UREM:
2360 case TargetOpcode::G_UDIVREM:
2361 return legalizeUnsignedDIV_REM(MI, MRI, B);
2362 case TargetOpcode::G_SDIV:
2363 case TargetOpcode::G_SREM:
2364 case TargetOpcode::G_SDIVREM:
2365 return legalizeSignedDIV_REM(MI, MRI, B);
2366 case TargetOpcode::G_ATOMIC_CMPXCHG:
2367 return legalizeAtomicCmpXChg(MI, MRI, B);
2368 case TargetOpcode::G_FLOG2:
2369 return legalizeFlog2(MI, B);
2370 case TargetOpcode::G_FLOG:
2371 case TargetOpcode::G_FLOG10:
2372 return legalizeFlogCommon(MI, B);
2373 case TargetOpcode::G_FEXP2:
2374 return legalizeFExp2(MI, B);
2375 case TargetOpcode::G_FEXP:
2376 case TargetOpcode::G_FEXP10:
2377 return legalizeFExp(MI, B);
2378 case TargetOpcode::G_FPOW:
2379 return legalizeFPow(MI, B);
2380 case TargetOpcode::G_FFLOOR:
2381 return legalizeFFloor(MI, MRI, B);
2382 case TargetOpcode::G_BUILD_VECTOR:
2383 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2384 return legalizeBuildVector(MI, MRI, B);
2385 case TargetOpcode::G_MUL:
2386 return legalizeMul(Helper, MI);
2387 case TargetOpcode::G_CTLZ:
2388 case TargetOpcode::G_CTTZ:
2389 return legalizeCTLZ_CTTZ(MI, MRI, B);
2390 case TargetOpcode::G_CTLS:
2391 return legalizeCTLS(MI, MRI, B);
2392 case TargetOpcode::G_CTLZ_ZERO_POISON:
2393 return legalizeCTLZ_ZERO_POISON(MI, MRI, B);
2394 case TargetOpcode::G_STACKSAVE:
2395 return legalizeStackSave(MI, B);
2396 case TargetOpcode::G_GET_FPENV:
2397 return legalizeGetFPEnv(MI, MRI, B);
2398 case TargetOpcode::G_SET_FPENV:
2399 return legalizeSetFPEnv(MI, MRI, B);
2400 case TargetOpcode::G_TRAP:
2401 return legalizeTrap(MI, MRI, B);
2402 case TargetOpcode::G_DEBUGTRAP:
2403 return legalizeDebugTrap(MI, MRI, B);
2404 default:
 // No custom handling is implemented for this opcode.
2405 return false;
2406 }
2407
 // Every switch path above returns, so control is not expected to get here.
2408 llvm_unreachable("expected switch to return");
2409}
2410
2412 unsigned AS,
2414 MachineIRBuilder &B) const {
2415 MachineFunction &MF = B.getMF();
2416 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2417 const LLT S32 = LLT::scalar(32);
2418 const LLT S64 = LLT::scalar(64);
2419
2421
2422 if (ST.hasApertureRegs()) {
2423 // Note: this register is somewhat broken. When used as a 32-bit operand,
2424 // it only returns zeroes. The real value is in the upper 32 bits.
2425 // Thus, we must extract the high 32 bits.
2426 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2427 ? AMDGPU::SRC_SHARED_BASE
2428 : AMDGPU::SRC_PRIVATE_BASE;
2429 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2430 !ST.hasGloballyAddressableScratch()) &&
2431 "Cannot use src_private_base with globally addressable scratch!");
2433 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2434 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2435 return B.buildUnmerge(S32, Dst).getReg(1);
2436 }
2437
2440 // For code object version 5, private_base and shared_base are passed through
2441 // implicit kernargs.
2445
2450 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2451
2452 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2454
2455 if (!loadInputValue(KernargPtrReg, B,
2457 return Register();
2458
2460 PtrInfo.getWithOffset(Offset),
2464
2465 // Pointer address
2466 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2467 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2468 // Load address
2469 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2470 }
2471
2474
2476 return Register();
2477
2478 // TODO: Use custom PseudoSourceValue
2480
2481 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2482 // private_segment_aperture_base_hi.
2483 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2484
2486 PtrInfo,
2489 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2490
2491 B.buildObjectPtrOffset(
2492 LoadAddr, QueuePtr,
2493 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2494 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2495}
2496
2497/// Return true if the value is a known valid address, such that a null check is
2498/// not necessary.
2500 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
 // Classifies Val purely by the opcode of its defining instruction.
 // NOTE(review): Def is dereferenced without a null check, so this assumes
 // MRI.getVRegDef(Val) always finds a definition; TM is not used in the
 // visible body.
2501 MachineInstr *Def = MRI.getVRegDef(Val);
2502 switch (Def->getOpcode()) {
2503 case AMDGPU::G_FRAME_INDEX:
2504 case AMDGPU::G_GLOBAL_VALUE:
2505 case AMDGPU::G_BLOCK_ADDR:
 // Values defined by these opcodes are reported as known non-null.
2506 return true;
2507 case AMDGPU::G_CONSTANT: {
 // A constant is non-null iff it differs from the null value of AddrSpace.
2508 const ConstantInt *CI = Def->getOperand(1).getCImm();
2509 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2510 }
2511 default:
 // Any other definition is conservatively reported as possibly null.
2512 return false;
2513 }
2514
 // Not reached: every switch path above returns.
2515 return false;
2516}
2517
2520 MachineIRBuilder &B) const {
 // Expands an address space cast into explicit pointer operations. Handled in
 // order: no-op casts, flat -> local/private, local/private -> flat, 64-bit
 // source -> 32-bit constant, 32-bit constant -> 64-bit destination; any other
 // combination is replaced by undef. Every exit of the function itself
 // returns true.
 // NOTE(review): the leading lines of this signature are missing from this
 // listing, and `ST` used inside the lambdas below is not declared in the
 // visible body - presumably a class member; confirm.
2521 MachineFunction &MF = B.getMF();
2522
2523 // MI can either be a G_ADDRSPACE_CAST or a
2524 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2525 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2526 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2527 Intrinsic::amdgcn_addrspacecast_nonnull));
2528
2529 const LLT S32 = LLT::scalar(32);
2530 Register Dst = MI.getOperand(0).getReg();
 // The intrinsic form carries its pointer in operand 2, the plain cast in
 // operand 1.
2531 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2532 : MI.getOperand(1).getReg();
2533 LLT DstTy = MRI.getType(Dst);
2534 LLT SrcTy = MRI.getType(Src);
2535 unsigned DestAS = DstTy.getAddressSpace();
2536 unsigned SrcAS = SrcTy.getAddressSpace();
2537
2538 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2539 // vector element.
2540 assert(!DstTy.isVector());
2541
2542 const AMDGPUTargetMachine &TM
2543 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2544
2545 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
 // No-op cast: MI is kept and only its opcode is rewritten to G_BITCAST.
2546 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2547 return true;
2548 }
2549
2550 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2551 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2552 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
 // Emits the unguarded flat -> segment conversion into Dst and returns the
 // resulting register.
2553 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2554 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2555 ST.hasGloballyAddressableScratch()) {
2556 // flat -> private with globally addressable scratch: subtract
2557 // src_flat_scratch_base_lo.
 // NOTE(review): this redeclares S32, shadowing the function-level S32.
2558 const LLT S32 = LLT::scalar(32);
2559 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2560 Register FlatScratchBaseLo =
2561 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2562 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2563 .getReg(0);
2564 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2565 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2566 return B.buildIntToPtr(Dst, Sub).getReg(0);
2567 }
2568
2569 // Extract low 32-bits of the pointer.
2570 return B.buildExtract(Dst, Src, 0).getReg(0);
2571 };
2572
2573 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2574 // G_ADDRSPACE_CAST we need to guess.
2575 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2576 castFlatToLocalOrPrivate(Dst);
2577 MI.eraseFromParent();
2578 return true;
2579 }
2580
 // Possibly-null source: the conversion is guarded by a select so that a
 // flat pointer equal to constant 0 maps to the destination segment's null
 // value.
2581 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2582
2583 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2584 auto FlatNull = B.buildConstant(SrcTy, 0);
2585
2586 // Extract low 32-bits of the pointer.
2587 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2588
2589 auto CmpRes =
2590 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2591 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2592
2593 MI.eraseFromParent();
2594 return true;
2595 }
2596
2597 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2598 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2599 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
 // Emits the unguarded segment -> flat conversion into Dst and returns the
 // resulting register.
2600 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2601 // Coerce the type of the low half of the result so we can use
2602 // merge_values.
2603 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2604
2605 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2606 ST.hasGloballyAddressableScratch()) {
2607 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2608 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
 // ThreadID starts as constant 0 and is fed through mbcnt_lo (and
 // mbcnt_hi on wave64) together with an all-ones mask.
2609 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2610 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2611 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2612 .addUse(AllOnes)
2613 .addUse(ThreadID)
2614 .getReg(0);
2615 if (ST.isWave64()) {
2616 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2617 .addUse(AllOnes)
2618 .addUse(ThreadID)
2619 .getReg(0);
2620 }
 // The shift is applied within the high dword: 57 - 32 - log2(wave size)
 // is 20 for a 32-lane wave and 19 for a 64-lane wave, i.e. bit 52 / 51
 // of the 64-bit address as in the formulas above.
2621 Register ShAmt =
2622 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2623 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2624 Register CvtPtr =
2625 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2626 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2627 // 64-bit hi:lo value.
2628 Register FlatScratchBase =
2629 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2630 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2631 .getReg(0);
2632 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2633 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2634 }
2635
2636 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
 // NOTE(review): this lambda is declared `-> Register`, so `return false`
 // produces a Register value (presumably the invalid/zero register) rather
 // than failing the enclosing function - confirm this is intended.
2637 if (!ApertureReg.isValid())
2638 return false;
2639
2640 // TODO: Should we allow mismatched types but matching sizes in merges to
2641 // avoid the ptrtoint?
2642 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2643 };
2644
2645 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2646 // G_ADDRSPACE_CAST we need to guess.
2647 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
 // NOTE(review): the lambda's result is not inspected here, so a missing
 // aperture register would not be reported as a failure.
2648 castLocalOrPrivateToFlat(Dst);
2649 MI.eraseFromParent();
2650 return true;
2651 }
2652
 // NOTE(review): BuildPtr is likewise used below without a validity check.
2653 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2654
 // Possibly-null source: a segment pointer equal to its address space's null
 // value is mapped to the flat null value via the select below.
2655 auto SegmentNull =
2656 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2657 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2658
2659 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2660 SegmentNull.getReg(0));
2661
2662 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2663
2664 MI.eraseFromParent();
2665 return true;
2666 }
2667
2668 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2669 SrcTy.getSizeInBits() == 64) {
2670 // Truncate.
2671 B.buildExtract(Dst, Src, 0);
2672 MI.eraseFromParent();
2673 return true;
2674 }
2675
2676 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2677 DstTy.getSizeInBits() == 64) {
 // NOTE(review): `Info` is not declared in the visible lines; a line appears
 // to be missing from this listing just above.
2679 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2680 auto PtrLo = B.buildPtrToInt(S32, Src);
 // With zero high bits a plain zero-extension is enough; otherwise the high
 // dword is materialised as a constant and merged with the low dword.
2681 if (AddrHiVal == 0) {
2682 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2683 B.buildIntToPtr(Dst, Zext);
2684 } else {
2685 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2686 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2687 }
2688
2689 MI.eraseFromParent();
2690 return true;
2691 }
2692
2693 // Invalid casts are poison.
2694 // TODO: Should return poison
 // Until then the result is emitted as undef.
2695 B.buildUndef(Dst);
2696 MI.eraseFromParent();
2697 return true;
2698}
2699
2702 MachineIRBuilder &B) const {
 // Rounding expansion for a 64-bit scalar source (asserted below): adds and
 // then subtracts 2^52 carrying Src's sign, and keeps Src itself whenever
 // |Src| is greater than 0x1.fffffffffffffp+51. Always returns true.
 // NOTE(review): presumably the round-to-even lowering, relying on the
 // FAdd/FSub pair rounding to nearest-even so that the 2^52 bias discards the
 // fraction bits - confirm.
2703 Register Src = MI.getOperand(1).getReg();
2704 LLT Ty = MRI.getType(Src);
2705 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2706
 // C1Val is 2^52; C2Val is the threshold above which Src is passed through.
2707 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2708 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2709
2710 auto C1 = B.buildFConstant(Ty, C1Val);
 // CopySign = 2^52 with the sign of Src.
2711 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2712
2713 // TODO: Should this propagate fast-math-flags?
 // Tmp2 = (Src + CopySign) - CopySign.
2714 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2715 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2716
2717 auto C2 = B.buildFConstant(Ty, C2Val);
2718 auto Fabs = B.buildFAbs(Ty, Src);
2719
 // result = (|Src| > C2) ? Src : Tmp2, using an ordered compare.
2720 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2721 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2722 MI.eraseFromParent();
2723 return true;
2724}
2725
2728 MachineIRBuilder &B) const {
 // Expands a ceiling operation on an s64 source (asserted below) as
 // trunc(src) plus a 0.0 / 1.0 correction term. Always returns true.
2729
2730 const LLT S1 = LLT::scalar(1);
2731 const LLT S64 = LLT::scalar(64);
2732
2733 Register Src = MI.getOperand(1).getReg();
2734 assert(MRI.getType(Src) == S64);
2735
2736 // result = trunc(src)
2737 // if (src > 0.0 && src != result)
2738 // result += 1.0
2739
2740 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2741
2742 const auto Zero = B.buildFConstant(S64, 0.0);
2743 const auto One = B.buildFConstant(S64, 1.0);
 // NOTE: despite its name, Lt0 holds the ordered `Src > 0.0` result, which is
 // what the pseudocode above requires.
2744 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2745 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2746 auto And = B.buildAnd(S1, Lt0, NeTrunc);
 // Add is 1.0 when both conditions hold and 0.0 otherwise.
2747 auto Add = B.buildSelect(S64, And, One, Zero);
2748
2749 // TODO: Should this propagate fast-math-flags?
2750 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2751 MI.eraseFromParent();
2752 return true;
2753}
2754
2757 MachineIRBuilder &B) const {
 // Expands the remainder as fma(-trunc(Src0 / Src1), Src1, Src0), i.e.
 // Src0 - trunc(Src0 / Src1) * Src1, in the destination's type. MI's flags are
 // forwarded to every emitted instruction. Always returns true.
2758 Register DstReg = MI.getOperand(0).getReg();
2759 Register Src0Reg = MI.getOperand(1).getReg();
2760 Register Src1Reg = MI.getOperand(2).getReg();
2761 auto Flags = MI.getFlags();
2762 LLT Ty = MRI.getType(DstReg);
2763
2764 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2765 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2766 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2767 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2768 MI.eraseFromParent();
2769 return true;
2770}
2771
2774 const unsigned FractBits = 52;
 // NOTE(review): the signature of this helper is missing from this listing;
 // the body uses `Hi` (a 32-bit register) and the builder `B`.
 // Builds: ubfe(Hi, 20, 11) - 1023, returned as an s32 value. 20 is
 // FractBits - 32, i.e. the position of the exponent field inside the high
 // dword of an f64, and 11 is the width of that field; subtracting 1023
 // removes the f64 exponent bias.
2775 const unsigned ExpBits = 11;
2776 LLT S32 = LLT::scalar(32);
2777
2778 auto Const0 = B.buildConstant(S32, FractBits - 32);
2779 auto Const1 = B.buildConstant(S32, ExpBits);
2780
 // Operands are presumably (value, bit offset, field width) - confirm against
 // the amdgcn_ubfe intrinsic definition.
2781 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2782 .addUse(Hi)
2783 .addUse(Const0.getReg(0))
2784 .addUse(Const1.getReg(0));
2785
2786 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2787}
2788
2791 MachineIRBuilder &B) const {
 // Integer-only truncation of an s64 floating-point value (asserted below):
 // the fraction bits that lie below the binary point for the value's exponent
 // are masked off. Exponents below 0 yield only the sign bit, exponents above
 // 51 yield Src unchanged. Always returns true.
2792 const LLT S1 = LLT::scalar(1);
2793 const LLT S32 = LLT::scalar(32);
2794 const LLT S64 = LLT::scalar(64);
2795
2796 Register Src = MI.getOperand(1).getReg();
2797 assert(MRI.getType(Src) == S64);
2798
2799 // TODO: Should this use extract since the low half is unused?
2800 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2801 Register Hi = Unmerge.getReg(1);
2802
2803 // Extract the upper half, since this is where we will find the sign and
2804 // exponent.
2805 auto Exp = extractF64Exponent(Hi, B);
2806
2807 const unsigned FractBits = 52;
2808
2809 // Extract the sign bit.
2810 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2811 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2812
 // FractMask has the low 52 bits set.
2813 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2814
2815 const auto Zero32 = B.buildConstant(S32, 0);
2816
2817 // Extend back to 64-bits.
 // The low dword is zero and the high dword holds only the sign bit.
2818 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2819
 // Shr keeps the fraction bits below the binary point; Tmp0 clears exactly
 // those bits in Src. Tmp0 is only selected when 0 <= Exp <= 51.
2820 auto Shr = B.buildAShr(S64, FractMask, Exp);
2821 auto Not = B.buildNot(S64, Shr);
2822 auto Tmp0 = B.buildAnd(S64, Src, Not);
2823 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2824
2825 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2826 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2827
 // Exp < 0 -> sign bit only; Exp > 51 -> Src unchanged; otherwise Tmp0.
2828 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2829 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2830 MI.eraseFromParent();
2831 return true;
2832}
2833
2836 MachineIRBuilder &B, bool Signed) const {
 // Converts a 64-bit integer source (asserted below) to an s64 or s32
 // floating-point result by working on its two 32-bit halves. `Signed` selects
 // signed versus unsigned treatment of the source. Always returns true.
2837
2838 Register Dst = MI.getOperand(0).getReg();
2839 Register Src = MI.getOperand(1).getReg();
2840
2841 const LLT S64 = LLT::scalar(64);
2842 const LLT S32 = LLT::scalar(32);
2843
2844 assert(MRI.getType(Src) == S64);
2845
 // Unmerge.getReg(0) is the low dword, Unmerge.getReg(1) the high dword.
2846 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2847 auto ThirtyTwo = B.buildConstant(S32, 32);
2848
2849 if (MRI.getType(Dst) == S64) {
 // s64 result: ldexp(cvt(hi), 32) + uitofp(lo). Only the high half is
 // converted as signed; the low half is always converted as unsigned.
2850 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2851 : B.buildUITOFP(S64, Unmerge.getReg(1));
2852
2853 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2854 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2855
2856 // TODO: Should this propagate fast-math-flags?
2857 B.buildFAdd(Dst, LdExp, CvtLo);
2858 MI.eraseFromParent();
2859 return true;
2860 }
2861
2862 assert(MRI.getType(Dst) == S32);
2863
2864 auto One = B.buildConstant(S32, 1);
2865
 // s32 result: shift the 64-bit value left by ShAmt, convert the high dword
 // and rescale with ldexp by (32 - ShAmt).
2866 MachineInstrBuilder ShAmt;
2867 if (Signed) {
 // ShAmt = umin(sffbh(hi) - 1, 32 + ((lo ^ hi) >> 31)), where the shift is
 // arithmetic, so the last term is 0 or -1.
2868 auto ThirtyOne = B.buildConstant(S32, 31);
2869 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2870 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2871 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2872 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2873 .addUse(Unmerge.getReg(1));
2874 auto LS2 = B.buildSub(S32, LS, One);
2875 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2876 } else
 // Unsigned: shift out the leading zeros of the high dword.
2877 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2878 auto Norm = B.buildShl(S64, Src, ShAmt);
2879 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
 // Adjust is 1 iff the shifted low dword is non-zero; OR-ing it into the high
 // dword presumably acts as a sticky bit so the discarded low bits still
 // influence rounding - confirm.
2880 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2881 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2882 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2883 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2884 B.buildFLdexp(Dst, FVal, Scale);
2885 MI.eraseFromParent();
2886 return true;
2887}
2888
2889// TODO: Copied from DAG implementation. Verify logic and document how this
2890// actually works.
2894 bool Signed) const {
 // Converts an s32 or s64 floating-point source to a 64-bit integer result
 // (both asserted below) by splitting the truncated value into a high and a
 // low 32-bit part. `Signed` selects the signed conversion. Always returns
 // true.
2895
2896 Register Dst = MI.getOperand(0).getReg();
2897 Register Src = MI.getOperand(1).getReg();
2898
2899 const LLT S64 = LLT::scalar(64);
2900 const LLT S32 = LLT::scalar(32);
2901
2902 const LLT SrcLT = MRI.getType(Src);
2903 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2904
2905 unsigned Flags = MI.getFlags();
2906
2907 // The basic idea of converting a floating point number into a pair of 32-bit
2908 // integers is illustrated as follows:
2909 //
2910 // tf := trunc(val);
2911 // hif := floor(tf * 2^-32);
2912 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2913 // hi := fptoi(hif);
2914 // lo := fptoi(lof);
2915 //
2916 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
 // NOTE(review): `Sign` is assigned below but its declaration is not visible;
 // a line appears to be missing from this listing here.
2918 if (Signed && SrcLT == S32) {
2919 // However, a 32-bit floating point number has only 23 bits mantissa and
2920 // it's not enough to hold all the significant bits of `lof` if val is
2921 // negative. To avoid the loss of precision, We need to take the absolute
2922 // value after truncating and flip the result back based on the original
2923 // signedness.
 // Sign is Src's bit pattern shifted right arithmetically by 31: all zeros
 // or all ones depending on the sign bit.
2924 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2925 Trunc = B.buildFAbs(S32, Trunc, Flags);
2926 }
 // K0 = 2^-32 and K1 = -2^32, built from raw bit patterns in the source's
 // floating-point format.
2927 MachineInstrBuilder K0, K1;
2928 if (SrcLT == S64) {
2929 K0 = B.buildFConstant(
2930 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2931 K1 = B.buildFConstant(
2932 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2933 } else {
2934 K0 = B.buildFConstant(
2935 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2936 K1 = B.buildFConstant(
2937 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2938 }
2939
 // FloorMul is `hif` and Fma is `lof` from the pseudocode above:
 // lof = fma(hif, -2^32, tf).
2940 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2941 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2942 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2943
 // The high part is converted as signed only for signed s64 sources; signed
 // s32 sources were made non-negative above and get their sign reapplied
 // below. The low part is always converted as unsigned.
2944 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2945 : B.buildFPTOUI(S32, FloorMul);
2946 auto Lo = B.buildFPTOUI(S32, Fma);
2947
2948 if (Signed && SrcLT == S32) {
2949 // Flip the result based on the signedness, which is either all 0s or 1s.
2950 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2951 // r := xor({lo, hi}, sign) - sign;
2952 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2953 Sign);
2954 } else
2955 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2956 MI.eraseFromParent();
2957
2958 return true;
2959}
2960
2962 MachineInstr &MI) const {
// Custom legalization hook that accepts \p MI unchanged (returns true) when
// the function's mode has IEEE disabled.
// NOTE(review): the definition of `MFI` (listing line 2964) and the statement
// handling the IEEE-enabled case (listing line 2970) are not visible in this
// excerpt.
2963 MachineFunction &MF = Helper.MIRBuilder.getMF();
2965
2966 // With ieee_mode disabled, the instructions have the correct behavior.
2967 if (!MFI->getMode().IEEE)
2968 return true;
2969
2971}
2972
2974 MachineInstr &MI) const {
// Custom lowering of an extract of operand 0 from operand 1 at the immediate
// bit offset in operand 2. Offsets that are zero or not a multiple of 32 are
// delegated to LegalizerHelper::lowerExtract. Otherwise the source is
// unmerged into s32 pieces and the destination is rebuilt from the pieces
// starting at Offset / 32. Returns true on success.
2975 MachineIRBuilder &B = Helper.MIRBuilder;
2976 MachineRegisterInfo &MRI = *B.getMRI();
2977 Register DstReg = MI.getOperand(0).getReg();
2978 Register SrcReg = MI.getOperand(1).getReg();
2979 uint64_t Offset = MI.getOperand(2).getImm();
2980
2981 // Fall back to generic lowering for offset 0 (trivial trunc) and
2982 // non-32-bit-aligned cases which require shift+trunc sequences
2983 // that generic code handles correctly.
2984 if (Offset == 0 || Offset % 32 != 0)
2985 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2986
// NOTE(review): the destination size is not checked for being a multiple of
// 32 here; presumably the legality rules guarantee it - confirm.
2987 const LLT DstTy = MRI.getType(DstReg);
2988 unsigned StartIdx = Offset / 32;
2989 unsigned DstCount = DstTy.getSizeInBits() / 32;
2990 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2991
2992 if (DstCount == 1) {
// A single 32-bit piece: pointers need an explicit inttoptr, scalars can
// simply reuse the unmerged register in place of the destination.
2993 if (DstTy.isPointer())
2994 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2995 else
2996 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2997 } else {
// Several 32-bit pieces: merge the consecutive pieces into the destination.
2998 SmallVector<Register, 8> MergeVec;
2999 for (unsigned I = 0; I < DstCount; ++I)
3000 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
3001 B.buildMergeLikeInstr(DstReg, MergeVec);
3002 }
3003
3004 MI.eraseFromParent();
3005 return true;
3006}
3007
3009 MachineInstr &MI) const {
// Custom lowering of an insert of operand 2 into operand 1 at the immediate
// bit offset in operand 3, producing operand 0. When the offset, destination
// size and inserted size are all multiples of 32, the source is unmerged into
// s32 pieces, the affected pieces are replaced with the pieces of the inserted
// value, and everything is merged back. All other cases are delegated to
// LegalizerHelper::lowerInsert. Returns true on success.
3010 MachineIRBuilder &B = Helper.MIRBuilder;
3011 MachineRegisterInfo &MRI = *B.getMRI();
3012 Register DstReg = MI.getOperand(0).getReg();
3013 Register SrcReg = MI.getOperand(1).getReg();
3014 Register InsertSrc = MI.getOperand(2).getReg();
3015 uint64_t Offset = MI.getOperand(3).getImm();
3016
3017 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3018 const LLT InsertTy = MRI.getType(InsertSrc);
3019 unsigned InsertSize = InsertTy.getSizeInBits();
3020
3021 // Fall back to generic lowering for non-32-bit-aligned cases which
3022 // require shift+mask sequences that generic code handles correctly.
3023 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
3024 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
3025
3026 const LLT S32 = LLT::scalar(32);
3027 unsigned DstCount = DstSize / 32;
3028 unsigned InsertCount = InsertSize / 32;
3029 unsigned StartIdx = Offset / 32;
3030
3031 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
3032
// Pieces of the original value that precede the insertion point.
3033 SmallVector<Register, 8> MergeVec;
3034 for (unsigned I = 0; I < StartIdx; ++I)
3035 MergeVec.push_back(SrcUnmerge.getReg(I));
3036
3037 if (InsertCount == 1) {
3038 // Merge-like instructions require same source types. Convert pointer
3039 // to scalar when inserting a pointer value into a scalar.
3040 if (InsertTy.isPointer())
3041 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
3042 MergeVec.push_back(InsertSrc);
3043 } else {
3044 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
3045 for (unsigned I = 0; I < InsertCount; ++I)
3046 MergeVec.push_back(InsertUnmerge.getReg(I));
3047 }
3048
// Pieces of the original value that follow the inserted range.
3049 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
3050 MergeVec.push_back(SrcUnmerge.getReg(I));
3051
3052 B.buildMergeLikeInstr(DstReg, MergeVec);
3053
3054 MI.eraseFromParent();
3055 return true;
3056}
3057
3060 MachineIRBuilder &B) const {
// Custom lowering of a vector element extract (operand 0 = element, operand 1
// = vector, operand 2 = index). Vectors of pointers wider than 64 bits are
// rewritten to operate on an integer vector. A constant in-range index
// becomes an unmerge plus copy, a constant out-of-range index yields undef,
// and a dynamic index leaves the instruction untouched. Returns true.
3061 // TODO: Should move some of this into LegalizerHelper.
3062
3063 // TODO: Promote dynamic indexing of s16 to s32
3064
3065 Register Dst = MI.getOperand(0).getReg();
3066 Register Vec = MI.getOperand(1).getReg();
3067
3068 LLT VecTy = MRI.getType(Vec);
3069 LLT EltTy = VecTy.getElementType();
3070 assert(EltTy == MRI.getType(Dst));
3071
3072 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3073 // but we can't go directly to that logic because you can't bitcast a vector
3074 // of pointers to a vector of integers. Therefore, introduce an intermediate
3075 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3076 // drive the legalization forward.
3077 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3078 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3079 LLT IntVecTy = VecTy.changeElementType(IntTy);
3080
3081 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3082 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3083 B.buildIntToPtr(Dst, IntElt);
3084
3085 MI.eraseFromParent();
3086 return true;
3087 }
3088
3089 // FIXME: Artifact combiner probably should have replaced the truncated
3090 // constant before this, so we shouldn't need
3091 // getIConstantVRegValWithLookThrough.
3092 std::optional<ValueAndVReg> MaybeIdxVal =
3093 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3094 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3095 return true;
3096 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3097
3098 if (IdxVal < VecTy.getNumElements()) {
3099 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3100 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3101 } else {
// Constant index past the end of the vector: the result is undefined.
3102 B.buildUndef(Dst);
3103 }
3104
3105 MI.eraseFromParent();
3106 return true;
3107}
3108
3111 MachineIRBuilder &B) const {
// Custom lowering of a vector element insert (operand 0 = result vector,
// operand 1 = source vector, operand 2 = new element, operand 3 = index).
// Vectors of pointers wider than 64 bits are rewritten to operate on an
// integer vector. A constant in-range index becomes an unmerge, a register
// substitution and a merge; a constant out-of-range index yields undef; a
// dynamic index leaves the instruction untouched. Returns true.
3112 // TODO: Should move some of this into LegalizerHelper.
3113
3114 // TODO: Promote dynamic indexing of s16 to s32
3115
3116 Register Dst = MI.getOperand(0).getReg();
3117 Register Vec = MI.getOperand(1).getReg();
3118 Register Ins = MI.getOperand(2).getReg();
3119
3120 LLT VecTy = MRI.getType(Vec);
3121 LLT EltTy = VecTy.getElementType();
3122 assert(EltTy == MRI.getType(Ins));
3123
3124 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3125 // but we can't go directly to that logic because you can't bitcast a vector
3126 // of pointers to a vector of integers. Therefore, make the pointer vector
3127 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3128 // new value, and then inttoptr the result vector back. This will then allow
3129 // the rest of legalization to take over.
3130 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3131 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3132 LLT IntVecTy = VecTy.changeElementType(IntTy);
3133
3134 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3135 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3136 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3137 MI.getOperand(3));
3138 B.buildIntToPtr(Dst, IntVecDest);
3139 MI.eraseFromParent();
3140 return true;
3141 }
3142
3143 // FIXME: Artifact combiner probably should have replaced the truncated
3144 // constant before this, so we shouldn't need
3145 // getIConstantVRegValWithLookThrough.
3146 std::optional<ValueAndVReg> MaybeIdxVal =
3147 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3148 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3149 return true;
3150
3151 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3152
3153 unsigned NumElts = VecTy.getNumElements();
3154 if (IdxVal < NumElts) {
// NOTE(review): the declaration of `SrcRegs` (listing line 3155) is not
// visible in this excerpt.
// Unmerge the vector into one fresh register per element, swap in the new
// element at the constant index, and merge the registers back together.
3156 for (unsigned i = 0; i < NumElts; ++i)
3157 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3158 B.buildUnmerge(SrcRegs, Vec);
3159
3160 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3161 B.buildMergeLikeInstr(Dst, SrcRegs);
3162 } else {
// Constant index past the end of the vector: the result is undefined.
3163 B.buildUndef(Dst);
3164 }
3165
3166 MI.eraseFromParent();
3167 return true;
3168}
3169
3172 MachineIRBuilder &B) const {
// Custom lowering of G_FSIN (and otherwise cosine) to the amdgcn_sin /
// amdgcn_cos intrinsics. The input is first multiplied by 1/(2*pi); on
// subtargets with a reduced trig input range the product is additionally
// passed through amdgcn_fract. Erases \p MI and returns true.
3173
3174 Register DstReg = MI.getOperand(0).getReg();
3175 Register SrcReg = MI.getOperand(1).getReg();
3176 LLT Ty = MRI.getType(DstReg);
3177 unsigned Flags = MI.getFlags();
3178
3179 Register TrigVal;
3180 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3181 if (ST.hasTrigReducedRange()) {
3182 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3183 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3184 .addUse(MulVal.getReg(0))
3185 .setMIFlags(Flags)
3186 .getReg(0);
3187 } else
3188 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3189
// Any opcode other than G_FSIN is treated as cosine here.
3190 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3191 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3192 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3193 .addUse(TrigVal)
3194 .setMIFlags(Flags);
3195 MI.eraseFromParent();
3196 return true;
3197}
3198
3201 const GlobalValue *GV,
3202 int64_t Offset,
3203 unsigned GAFlags) const {
// Emits a pc-relative address computation for \p GV plus \p Offset. Uses
// SI_PC_ADD_REL_OFFSET64 when the subtarget has 64-bit literals and
// SI_PC_ADD_REL_OFFSET otherwise. For a 32-bit pointer type the address is
// computed into a temporary register and the bits at offset 0 are extracted
// into the destination. Always returns true.
3204 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3205 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3206 // to the following code sequence:
3207 //
3208 // For constant address space:
3209 // s_getpc_b64 s[0:1]
3210 // s_add_u32 s0, s0, $symbol
3211 // s_addc_u32 s1, s1, 0
3212 //
3213 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3214 // a fixup or relocation is emitted to replace $symbol with a literal
3215 // constant, which is a pc-relative offset from the encoding of the $symbol
3216 // operand to the global variable.
3217 //
3218 // For global address space:
3219 // s_getpc_b64 s[0:1]
3220 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3221 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3222 //
3223 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3224 // fixups or relocations are emitted to replace $symbol@*@lo and
3225 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3226 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3227 // operand to the global variable.
3228
// NOTE(review): the definition of `ConstPtrTy` (listing line 3229) is not
// visible in this excerpt.
3230
// Write straight into DstReg unless the requested pointer is 32 bits wide, in
// which case a temporary of type ConstPtrTy receives the result.
3231 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3232 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3233
3234 if (ST.has64BitLiterals()) {
3235 assert(GAFlags != SIInstrInfo::MO_NONE);
3236
// NOTE(review): GAFlags + 2 presumably selects the 64-bit form of the
// relocation flag - confirm against the SIInstrInfo target flag enum.
3238 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3239 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3240 } else {
3242 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3243
// The second operand is the immediate 0 when no relocation flag is given.
// NOTE(review): GAFlags + 1 presumably selects the matching @hi flag -
// confirm against the SIInstrInfo target flag enum.
3244 MIB.addGlobalAddress(GV, Offset, GAFlags);
3245 if (GAFlags == SIInstrInfo::MO_NONE)
3246 MIB.addImm(0);
3247 else
3248 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3249 }
3250
// Only assign a register class if the register does not have one yet.
3251 if (!B.getMRI()->getRegClassOrNull(PCReg))
3252 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3253
3254 if (PtrTy.getSizeInBits() == 32)
3255 B.buildExtract(DstReg, PCReg, 0);
3256 return true;
3257}
3258
3259// Emit an ABS32_LO / ABS32_HI relocation stub.
// Materializes the absolute address of \p GV into \p DstReg. Pointer types
// that are not 32 bits wide use a single S_MOV_B64 with an MO_ABS64 operand
// when the subtarget has 64-bit literals; otherwise the low half (and, when
// required, the high half) is written with S_MOV_B32 and the halves are
// merged.
3261 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3262 MachineRegisterInfo &MRI) const {
3263 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3264
3265 if (RequiresHighHalf && ST.has64BitLiterals()) {
3266 if (!MRI.getRegClassOrNull(DstReg))
3267 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3268 B.buildInstr(AMDGPU::S_MOV_B64)
3269 .addDef(DstReg)
3270 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3271 return;
3272 }
3273
3274 LLT S32 = LLT::scalar(32);
3275
3276 // Use the destination directly, if and only if we store the lower address
3277 // part only and we don't have a register class being set.
// NOTE(review): the alternative operand of this conditional (listing line
// 3280) is not visible in this excerpt.
3278 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3279 ? DstReg
3281
3282 if (!MRI.getRegClassOrNull(AddrLo))
3283 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3284
3285 // Write the lower half.
3286 B.buildInstr(AMDGPU::S_MOV_B32)
3287 .addDef(AddrLo)
3288 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3289
3290 // If required, write the upper half as well.
3291 if (RequiresHighHalf) {
3292 assert(PtrTy.getSizeInBits() == 64 &&
3293 "Must provide a 64-bit pointer type!");
3294
// NOTE(review): the definition of `AddrHi` (listing line 3295) is not visible
// in this excerpt.
3296 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3297
3298 B.buildInstr(AMDGPU::S_MOV_B32)
3299 .addDef(AddrHi)
3300 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3301
3302 // Use the destination directly, if and only if we don't have a register
3303 // class being set.
3304 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3305 ? DstReg
3307
3308 if (!MRI.getRegClassOrNull(AddrDst))
3309 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3310
3311 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3312
3313 // If we created a new register for the destination, cast the result into
3314 // the final output.
3315 if (AddrDst != DstReg)
3316 B.buildCast(DstReg, AddrDst);
3317 } else if (AddrLo != DstReg) {
3318 // If we created a new register for the destination, cast the result into
3319 // the final output.
3320 B.buildCast(DstReg, AddrLo);
3321 }
3322}
3323
3326 MachineIRBuilder &B) const {
// Custom lowering of a global value address (operand 0 = destination,
// operand 1 = the global). Depending on the case visible below this either
// emits a trap plus undef, a constant LDS offset, the static group size, an
// absolute address, a pc-relative address, or a load of the address through
// the GOT. Returns true in every visible path.
// NOTE(review): several lines of this function (the `MFI` definition, the
// address-space condition that opens the first large block, and the GOT
// memory operand setup) are not visible in this excerpt.
3327 Register DstReg = MI.getOperand(0).getReg();
3328 LLT Ty = MRI.getType(DstReg);
3329 unsigned AS = Ty.getAddressSpace();
3330
3331 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3332 MachineFunction &MF = B.getMF();
3334
3336 if (!MFI->isModuleEntryFunction() &&
3337 GV->getName() != "llvm.amdgcn.module.lds" &&
3339 const Function &Fn = MF.getFunction();
3341 Fn, "local memory global used by non-kernel function",
3342 MI.getDebugLoc(), DS_Warning));
3343
3344 // We currently don't have a way to correctly allocate LDS objects that
3345 // aren't directly associated with a kernel. We do force inlining of
3346 // functions that use local objects. However, if these dead functions are
3347 // not eliminated, we don't want a compile time error. Just emit a warning
3348 // and a trap, since there should be no callable path here.
3349 B.buildTrap();
3350 B.buildUndef(DstReg);
3351 MI.eraseFromParent();
3352 return true;
3353 }
3354
3355 // TODO: We could emit code to handle the initialization somewhere.
3356 // We ignore the initializer for now and legalize it to allow selection.
3357 // The initializer will anyway get errored out during assembly emission.
3358 const SITargetLowering *TLI = ST.getTargetLowering();
3359 if (!TLI->shouldUseLDSConstAddress(GV)) {
3360 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3361 return true; // Leave in place;
3362 }
3363
3364 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3365 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3366 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3367 // zero-sized type in other languages to declare the dynamic shared
3368 // memory which size is not known at the compile time. They will be
3369 // allocated by the runtime and placed directly after the static
3370 // allocated ones. They all share the same offset.
3371 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3372 // Adjust alignment for that dynamic shared memory array.
3373 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3374 LLT S32 = LLT::scalar(32);
3375 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3376 B.buildIntToPtr(DstReg, Sz);
3377 MI.eraseFromParent();
3378 return true;
3379 }
3380 }
3381
// Statically allocated LDS global: the address is the constant returned by
// allocateLDSGlobal.
3382 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3383 MI.eraseFromParent();
3384 return true;
3385 }
3386
// AMDPAL and Mesa3D use absolute relocations for the address.
3387 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3388 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3389 MI.eraseFromParent();
3390 return true;
3391 }
3392
3393 const SITargetLowering *TLI = ST.getTargetLowering();
3394
3395 if (TLI->shouldEmitFixup(GV)) {
3396 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3397 MI.eraseFromParent();
3398 return true;
3399 }
3400
3401 if (TLI->shouldEmitPCReloc(GV)) {
3402 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3403 MI.eraseFromParent();
3404 return true;
3405 }
3406
// Remaining case: compute the address of the GOT entry pc-relatively and load
// the global's address from it.
3408 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3409
3410 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3415 LoadTy, Align(8));
3416
3417 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3418
3419 if (Ty.getSizeInBits() == 32) {
3420 // Truncate if this is a 32-bit constant address.
3421 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3422 B.buildExtract(DstReg, Load, 0);
3423 } else
3424 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3425
3426 MI.eraseFromParent();
3427 return true;
3428}
3429
// Widens Ty to the next power of two: a vector keeps its element type and has
// its element count rounded up, anything else becomes a scalar whose bit width
// is rounded up.
3431 if (Ty.isVector())
3432 return Ty.changeElementCount(
3433 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3434 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3435}
3436
3438 MachineInstr &MI) const {
// Custom legalization of loads. Handles three cases: a pointer in the 32-bit
// constant address space is address-space cast first; a result type needing
// the buffer resource workaround is cast in place; and a plain G_LOAD whose
// memory type should be widened is rewritten as a wider load followed by a
// truncate / extract of the requested value. Returns false when no change was
// made.
3439 MachineIRBuilder &B = Helper.MIRBuilder;
3440 MachineRegisterInfo &MRI = *B.getMRI();
3441 GISelChangeObserver &Observer = Helper.Observer;
3442
3443 Register PtrReg = MI.getOperand(1).getReg();
3444 LLT PtrTy = MRI.getType(PtrReg);
3445 unsigned AddrSpace = PtrTy.getAddressSpace();
3446
3447 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
// NOTE(review): the definition of `ConstPtr` (listing line 3448) is not
// visible in this excerpt.
3449 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3450 Observer.changingInstr(MI);
3451 MI.getOperand(1).setReg(Cast.getReg(0));
3452 Observer.changedInstr(MI);
3453 return true;
3454 }
3455
// Everything below applies to plain G_LOAD only.
3456 if (MI.getOpcode() != AMDGPU::G_LOAD)
3457 return false;
3458
3459 Register ValReg = MI.getOperand(0).getReg();
3460 LLT ValTy = MRI.getType(ValReg);
3461
3462 if (hasBufferRsrcWorkaround(ValTy)) {
3463 Observer.changingInstr(MI);
3464 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3465 Observer.changedInstr(MI);
3466 return true;
3467 }
3468
3469 MachineMemOperand *MMO = *MI.memoperands_begin();
3470 const unsigned ValSize = ValTy.getSizeInBits();
3471 const LLT MemTy = MMO->getMemoryType();
3472 const Align MemAlign = MMO->getAlign();
3473 const unsigned MemSize = MemTy.getSizeInBits();
3474 const uint64_t AlignInBits = 8 * MemAlign.value();
3475
3476 // Widen non-power-of-2 loads to the alignment if needed
3477 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3478 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3479
3480 // This was already the correct extending load result type, so just adjust
3481 // the memory type.
3482 if (WideMemSize == ValSize) {
3483 MachineFunction &MF = B.getMF();
3484
3485 MachineMemOperand *WideMMO =
3486 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3487 Observer.changingInstr(MI);
3488 MI.setMemRefs(MF, {WideMMO});
3489 Observer.changedInstr(MI);
3490 return true;
3491 }
3492
3493 // Don't bother handling edge case that should probably never be produced.
3494 if (ValSize > WideMemSize)
3495 return false;
3496
3497 LLT WideTy = widenToNextPowerOf2(ValTy);
3498
3499 Register WideLoad;
3500 if (!WideTy.isVector()) {
// Scalar: load the widened type and truncate back to the requested width.
3501 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3502 B.buildTrunc(ValReg, WideLoad).getReg(0);
3503 } else {
3504 // Extract the subvector.
3505
3506 if (isRegisterType(ST, ValTy)) {
3507 // If this a case where G_EXTRACT is legal, use it.
3508 // (e.g. <3 x s32> -> <4 x s32>)
3509 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3510 B.buildExtract(ValReg, WideLoad, 0);
3511 } else {
3512 // For cases where the widened type isn't a nice register value, unmerge
3513 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3514 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3515 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3516 }
3517 }
3518
3519 MI.eraseFromParent();
3520 return true;
3521 }
3522
3523 return false;
3524}
3525
3527 MachineInstr &MI) const {
// Custom legalization of stores. Only acts when the stored value's type needs
// the buffer resource workaround; returns false otherwise.
3528 MachineIRBuilder &B = Helper.MIRBuilder;
3529 MachineRegisterInfo &MRI = *B.getMRI();
3530 GISelChangeObserver &Observer = Helper.Observer;
3531
3532 Register DataReg = MI.getOperand(0).getReg();
3533 LLT DataTy = MRI.getType(DataReg);
3534
3535 if (hasBufferRsrcWorkaround(DataTy)) {
// NOTE(review): the statement that rewrites the instruction between the
// observer notifications (listing line 3537) is not visible in this excerpt.
3536 Observer.changingInstr(MI);
3538 Observer.changedInstr(MI);
3539 return true;
3540 }
3541 return false;
3542}
3543
3546 MachineIRBuilder &B) const {
// Custom legalization of a scalar fmad-style instruction: for s32 and s16 the
// instruction is kept as-is when an additional condition holds, otherwise it
// is expanded through LegalizerHelper::lowerFMad.
// NOTE(review): the second half of both conditions (listing lines 3557 and
// 3560) and the `MFI` definition (line 3551) are not visible in this excerpt;
// the TODO below suggests they relate to flush-to-zero (denormal) mode -
// confirm.
3547 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3548 assert(Ty.isScalar());
3549
3550 MachineFunction &MF = B.getMF();
3552
3553 // TODO: Always legal with future ftz flag.
3554 // TODO: Type is expected to be LLT::float32()/LLT::float16()
3555 // FIXME: Do we need just output?
3556 if (Ty == LLT::scalar(32) &&
3558 return true;
3559 if (Ty == LLT::scalar(16) &&
3561 return true;
3562
// Use a private builder and a dummy observer for the generic expansion.
3563 MachineIRBuilder HelperBuilder(MI);
3564 GISelObserverWrapper DummyObserver;
3565 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3566 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3567}
3568
// Rewrites an atomic compare-exchange (operand 0 = result, 1 = pointer,
// 2 = compare value, 3 = new value) into G_AMDGPU_ATOMIC_CMPXCHG, which takes
// the new and compare values packed into a single two-element vector. The
// original memory operands are preserved. Erases MI and returns true.
3571 Register DstReg = MI.getOperand(0).getReg();
3572 Register PtrReg = MI.getOperand(1).getReg();
3573 Register CmpVal = MI.getOperand(2).getReg();
3574 Register NewVal = MI.getOperand(3).getReg();
3575
// NOTE(review): the first line of this assertion (listing line 3576) is not
// visible in this excerpt.
3577 "this should not have been custom lowered");
3578
3579 LLT ValTy = MRI.getType(CmpVal);
3580 LLT VecTy = LLT::fixed_vector(2, ValTy);
3581
// Element 0 is the new value, element 1 the compare value.
3582 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3583
3584 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3585 .addDef(DstReg)
3586 .addUse(PtrReg)
3587 .addUse(PackedVal)
3588 .setMemRefs(MI.memoperands());
3589
3590 MI.eraseFromParent();
3591 return true;
3592}
3593
3594/// Return true if it's known that \p Src can never be an f32 denormal value.
/// The decision is based solely on the opcode (or intrinsic ID) of the
/// instruction defining \p Src; anything not recognized yields false.
3596 Register Src) {
3597 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3598 switch (DefMI->getOpcode()) {
3599 case TargetOpcode::G_INTRINSIC: {
// NOTE(review): the inner switch header on the intrinsic ID (listing line
// 3600) is not visible in this excerpt.
3601 case Intrinsic::amdgcn_frexp_mant:
3602 case Intrinsic::amdgcn_log:
3603 case Intrinsic::amdgcn_log_clamp:
3604 case Intrinsic::amdgcn_exp2:
3605 case Intrinsic::amdgcn_sqrt:
3606 return true;
3607 default:
3608 break;
3609 }
3610
3611 break;
3612 }
3613 case TargetOpcode::G_FSQRT:
3614 return true;
3615 case TargetOpcode::G_FFREXP: {
// Only the first result (def operand 0) of G_FFREXP qualifies.
3616 if (DefMI->getOperand(0).getReg() == Src)
3617 return true;
3618 break;
3619 }
3620 case TargetOpcode::G_FPEXT: {
// An extension qualifies only when its source is 16 bits wide.
3621 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3622 }
3623 default:
3624 return false;
3625 }
3626
3627 return false;
3628}
3629
// Returns true when the instruction flags in \p Flags contain the
// approximate-functions (afn) fast-math flag. \p MF is not consulted.
3630static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3631 return Flags & MachineInstr::FmAfn;
3632}
3633
3635 unsigned Flags) {
// Denormal handling is never needed when the defining instruction of Src is
// known not to produce an f32 denormal.
// NOTE(review): the remaining operands of this conjunction (listing lines
// 3637-3638) are not visible in this excerpt.
3636 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3639}
3640
// Prepares the input of an f32 logarithm for denormal handling. Returns an
// empty pair when no handling is needed. Otherwise returns {ScaledInput,
// IsLtSmallestNormal}, where ScaledInput is Src multiplied by 2^32 if Src
// compares (ordered) less than the SmallestNormal constant and by 1.0
// otherwise, and IsLtSmallestNormal is the s1 result of that comparison.
3641std::pair<Register, Register>
3643 unsigned Flags) const {
3644 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3645 return {};
3646
// NOTE(review): the value passed to this buildFConstant call (listing line
// 3649) is not visible in this excerpt.
3647 const LLT F32 = LLT::scalar(32);
3648 auto SmallestNormal = B.buildFConstant(
3650 auto IsLtSmallestNormal =
3651 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3652
3653 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3654 auto One = B.buildFConstant(F32, 1.0);
3655 auto ScaleFactor =
3656 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3657 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3658
3659 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3660}
3661
3663 MachineIRBuilder &B) const {
// Custom lowering of a base-2 logarithm to the amdgcn_log intrinsic. s16
// inputs are extended to f32, evaluated and truncated back. s32 inputs use the
// intrinsic directly unless denormal handling is required, in which case the
// input is pre-scaled and the result corrected. Erases \p MI and returns true.
3664 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3665 // If we have to handle denormals, scale up the input and adjust the result.
3666
3667 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3668 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3669
3670 Register Dst = MI.getOperand(0).getReg();
3671 Register Src = MI.getOperand(1).getReg();
3672 LLT Ty = B.getMRI()->getType(Dst);
3673 unsigned Flags = MI.getFlags();
3674
3675 if (Ty == LLT::scalar(16)) {
3676 const LLT F32 = LLT::scalar(32);
3677 // Nothing in half is a denormal when promoted to f32.
3678 auto Ext = B.buildFPExt(F32, Src, Flags);
3679 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3680 .addUse(Ext.getReg(0))
3681 .setMIFlags(Flags);
3682 B.buildFPTrunc(Dst, Log2, Flags);
3683 MI.eraseFromParent();
3684 return true;
3685 }
3686
3687 assert(Ty == LLT::scalar(32));
3688
// A null ScaledInput means no denormal handling is needed.
3689 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3690 if (!ScaledInput) {
3691 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3692 .addUse(Src)
3693 .setMIFlags(Flags);
3694 MI.eraseFromParent();
3695 return true;
3696 }
3697
3698 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3699 .addUse(ScaledInput)
3700 .setMIFlags(Flags);
3701
// Undo the 2^32 input scaling: log2(x * 2^32) - 32 == log2(x).
3702 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3703 auto Zero = B.buildFConstant(Ty, 0.0);
3704 auto ResultOffset =
3705 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3706 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3707
3708 MI.eraseFromParent();
3709 return true;
3710}
3711
3713 Register Z, unsigned Flags) {
// Builds X * Y + Z as a separate multiply and add (not a fused operation) and
// returns the register holding the sum.
3714 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3715 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3716}
3717
3719 MachineIRBuilder &B) const {
// Custom lowering of G_FLOG and G_FLOG10. f16 inputs and instructions carrying
// the afn flag take the approximate path (legalizeFlogUnsafe), with f16
// promoted to f32 where required. Otherwise the result is computed as
// amdgcn_log(x) times a conversion constant that is split into two parts for
// extra precision, with pre-scaling of the input and a matching correction
// when denormal handling is needed. Erases \p MI and returns true.
3720 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3721 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3722
3723 MachineRegisterInfo &MRI = *B.getMRI();
3724 Register Dst = MI.getOperand(0).getReg();
3725 Register X = MI.getOperand(1).getReg();
3726 unsigned Flags = MI.getFlags();
3727 const LLT Ty = MRI.getType(X);
3728
3729 const LLT F32 = LLT::scalar(32);
3730 const LLT F16 = LLT::scalar(16);
3731
3732 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3733 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3734 // depending on !fpmath metadata.
3735 bool PromoteToF32 =
3736 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3737 if (PromoteToF32) {
// NOTE(review): the definition of `LogVal` (listing line 3738) is not visible
// in this excerpt.
3739 auto PromoteSrc = B.buildFPExt(F32, X);
3740 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3741 B.buildFPTrunc(Dst, LogVal);
3742 } else {
3743 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3744 }
3745
3746 MI.eraseFromParent();
3747 return true;
3748 }
3749
// Use the pre-scaled input when denormal handling is required.
3750 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3751 if (ScaledInput)
3752 X = ScaledInput;
3753
3754 auto Y =
3755 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3756
3757 Register R;
3758 if (ST.hasFastFMAF32()) {
3759 // c+cc are ln(2)/ln(10) to more than 49 bits
3760 const float c_log10 = 0x1.344134p-2f;
3761 const float cc_log10 = 0x1.09f79ep-26f;
3762
3763 // c + cc is ln(2) to more than 49 bits
3764 const float c_log = 0x1.62e42ep-1f;
3765 const float cc_log = 0x1.efa39ep-25f;
3766
3767 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3768 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3769 // This adds correction terms for which contraction may lead to an increase
3770 // in the error of the approximation, so disable it.
3771 auto NewFlags = Flags & ~(MachineInstr::FmContract);
// R = Y*C; FMA0 = fma(Y, C, -R) recovers the rounding error of that product,
// FMA1 adds the Y*CC term, and the sum is folded back into R.
3772 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3773 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3774 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3775 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3776 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3777 } else {
3778 // ch+ct is ln(2)/ln(10) to more than 36 bits
3779 const float ch_log10 = 0x1.344000p-2f;
3780 const float ct_log10 = 0x1.3509f6p-18f;
3781
3782 // ch + ct is ln(2) to more than 36 bits
3783 const float ch_log = 0x1.62e000p-1f;
3784 const float ct_log = 0x1.0bfbe8p-15f;
3785
3786 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3787 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3788
// Split Y into a high part (low 12 bits of the encoding cleared) and the
// remaining tail YT = Y - YH.
3789 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3790 auto YH = B.buildAnd(Ty, Y, MaskConst);
3791 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3792 // This adds correction terms for which contraction may lead to an increase
3793 // in the error of the approximation, so disable it.
3794 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3795 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3796
// R = YH*CH + (YT*CH + (YH*CT + YT*CT)), accumulated smallest term first.
3797 Register Mad0 =
3798 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3799 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3800 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3801 }
3802
// NOTE(review): the initializer of `IsFiniteOnly` (listing line 3804) is not
// visible in this excerpt.
3803 const bool IsFiniteOnly =
3805
3806 if (!IsFiniteOnly) {
3807 // Expand isfinite(x) => fabs(x) < inf
// When the intrinsic result Y is not finite, pass Y through unchanged.
3808 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3809 auto Fabs = B.buildFAbs(Ty, Y);
3810 auto IsFinite =
3811 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3812 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3813 }
3814
3815 if (ScaledInput) {
// Undo the 2^32 input scaling: the constants are approximately 32*log10(2)
// and 32*ln(2) respectively.
3816 auto Zero = B.buildFConstant(Ty, 0.0);
3817 auto ShiftK =
3818 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3819 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3820 B.buildFSub(Dst, R, Shift, Flags);
3821 } else {
3822 B.buildCopy(Dst, R);
3823 }
3824
3825 MI.eraseFromParent();
3826 return true;
3827}
3828
3830 Register Src, bool IsLog10,
3831 unsigned Flags) const {
// Approximate log / log10 lowering: writes log2(Src) * Log2BaseInverted to
// Dst. For f32 values that need denormal handling, the hardware log is taken
// of the pre-scaled input and the compensating offset is folded into the final
// multiply-add. Does not erase any instruction; always returns true.
// NOTE(review): the initializer of `Log2BaseInverted` (listing line 3833) is
// not visible in this excerpt; presumably it selects the log2 -> log10 or
// log2 -> ln conversion factor based on IsLog10 - confirm.
3832 const double Log2BaseInverted =
3834
3835 LLT Ty = B.getMRI()->getType(Dst);
3836
3837 if (Ty == LLT::scalar(32)) {
3838 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3839 if (ScaledInput) {
// The log must be taken of the scaled input: the offset selected below
// (-32 * Log2BaseInverted when scaled) only cancels out if the 2^32 scale
// factor was actually applied to the value fed to the intrinsic.
3840 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3841 .addUse(ScaledInput)
3842 .setMIFlags(Flags);
3843 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3844 auto Zero = B.buildFConstant(Ty, 0.0);
3845 auto ResultOffset =
3846 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3847 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3848
// Dst = LogSrc * Log2Inv + ResultOffset, fused when FMA is fast.
3849 if (ST.hasFastFMAF32())
3850 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3851 else {
3852 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3853 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3854 }
3855
3856 return true;
3857 }
3858 }
3859
// No denormal handling needed: s16 uses the generic G_FLOG2, other types use
// the amdgcn_log intrinsic directly, followed by the base conversion.
3860 auto Log2Operand = Ty == LLT::scalar(16)
3861 ? B.buildFLog2(Ty, Src, Flags)
3862 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3863 .addUse(Src)
3864 .setMIFlags(Flags);
3865 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3866 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3867 return true;
3868}
3869
3871 MachineIRBuilder &B) const {
// Custom lowering of a base-2 exponential. f64 is forwarded to
// legalizeFEXPF64. f16 is extended to f32, evaluated with amdgcn_exp2 and
// truncated back. f32 uses amdgcn_exp2 directly unless denormal handling is
// required, in which case small inputs are offset by 64 before the intrinsic
// and the result is multiplied by 2^-64. Returns true for the f16/f32 paths.
3872 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3873 // If we have to handle denormals, scale up the input and adjust the result.
3874
3875 Register Dst = MI.getOperand(0).getReg();
3876 Register Src = MI.getOperand(1).getReg();
3877 unsigned Flags = MI.getFlags();
3878 LLT Ty = B.getMRI()->getType(Dst);
3879 const LLT F16 = LLT::scalar(16);
3880 const LLT F32 = LLT::scalar(32);
3881 const LLT F64 = LLT::scalar(64);
3882
3883 if (Ty == F64)
3884 return legalizeFEXPF64(MI, B);
3885
3886 if (Ty == F16) {
3887 // Nothing in half is a denormal when promoted to f32.
// Note: despite its name, `Log2` below holds the exp2 intrinsic result.
3888 auto Ext = B.buildFPExt(F32, Src, Flags);
3889 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3890 .addUse(Ext.getReg(0))
3891 .setMIFlags(Flags);
3892 B.buildFPTrunc(Dst, Log2, Flags);
3893 MI.eraseFromParent();
3894 return true;
3895 }
3896
3897 assert(Ty == F32);
3898
3899 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3900 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3901 .addUse(Src)
3902 .setMIFlags(Flags);
3903 MI.eraseFromParent();
3904 return true;
3905 }
3906
3907 // bool needs_scaling = x < -0x1.f80000p+6f;
3908 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3909
3910 // -nextafter(128.0, -1)
3911 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3912 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3913 RangeCheckConst, Flags);
3914
// Add 64 to inputs below the threshold before evaluating exp2.
3915 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3916 auto Zero = B.buildFConstant(Ty, 0.0);
3917 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3918 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3919
3920 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3921 .addUse(AddInput.getReg(0))
3922 .setMIFlags(Flags);
3923
// Compensate for the input offset: exp2(x + 64) * 2^-64 == exp2(x).
3924 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3925 auto One = B.buildFConstant(Ty, 1.0);
3926 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3927 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3928 MI.eraseFromParent();
3929 return true;
3930}
3931
3933 const SrcOp &Src, unsigned Flags) {
// Emits an exp2 of Src into Dst and returns the builder result: the
// amdgcn.exp2 intrinsic when Dst is s32, B.buildFExp2 for any other type.
3934 LLT Ty = Dst.getLLTTy(*B.getMRI());
3935
3936 if (Ty == LLT::scalar(32)) {
3937 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3938 .addUse(Src.getReg())
3939 .setMIFlags(Flags);
3940 }
3941 return B.buildFExp2(Dst, Src, Flags);
3942}
3943
3945 Register Dst, Register X,
3946 unsigned Flags,
3947 bool IsExp10) const {
// Single-multiply expansion with no range or denormal handling: scales X by
// log2(e) (or log2(10) when IsExp10) and feeds the product to buildExp.
// The constant and the multiply use X's type. Always returns true.
3948 LLT Ty = B.getMRI()->getType(X);
3949
3950 // exp(x) -> exp2(M_LOG2E_F * x);
3951 // exp10(x) -> exp2(log2(10) * x);
3952 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3953 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3954 buildExp(B, Dst, Mul, Flags);
3955 return true;
3956}
3957
3959 Register X, unsigned Flags) const {
// Approximate exp(X) into Dst. Non-f32 types, and f32 when
// needsDenormHandlingF32 reports no handling is needed, use
// legalizeFExpUnsafeImpl. Otherwise small inputs are shifted up by 64 before
// the exp2 intrinsic and the result is scaled back down. Always returns true.
3960 LLT Ty = B.getMRI()->getType(Dst);
3961 LLT F32 = LLT::scalar(32);
3962
3963 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3964 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3965 }
3966
// Threshold is roughly -87.34; inputs below it take the scaled path.
3967 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3968 auto NeedsScaling =
3969 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3970 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3971 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3972 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3973
3974 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3975 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3976
3977 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3978 .addUse(ExpInput.getReg(0))
3979 .setMIFlags(Flags);
3980
// 0x1.969d48p-93f is approximately e^-64, undoing the +64 applied to X.
3981 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3982 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3983 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3984 return true;
3985}
3986
3988 Register Dst, Register X,
3989 unsigned Flags) const {
// Approximate exp10(X) into Dst as a product of two exp2 calls, with log2(10)
// split into a high part (K0) and a low part (K1). The f32 path that needs
// denormal handling additionally shifts small inputs up by 32 and rescales
// the result. Always returns true.
3990 LLT Ty = B.getMRI()->getType(Dst);
3991 LLT F32 = LLT::scalar(32);
3992
3993 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3994 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3995 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3996 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3997
3998 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3999 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
4000 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
4001 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
4002 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
4003 return true;
4004 }
4005
4006 // bool s = x < -0x1.2f7030p+5f;
4007 // x += s ? 0x1.0p+5f : 0.0f;
4008 // exp10 = exp2(x * 0x1.a92000p+1f) *
4009 // exp2(x * 0x1.4f0978p-11f) *
4010 // (s ? 0x1.9f623ep-107f : 1.0f);
4011
// NOTE(review): the compare and both selects below are built without Flags,
// unlike the arithmetic around them - confirm that is intentional.
4012 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
4013 auto NeedsScaling =
4014 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
4015
4016 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
4017 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
4018 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
4019
4020 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
4021 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
4022
4023 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
4024 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
4025 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
4026 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
4027
4028 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
// 0x1.9f623ep-107f is approximately 10^-32, undoing the +32 applied to X.
4029 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
4030 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
4031
4032 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4033 return true;
4034}
4035
4036// This expansion gives a result slightly better than 1ulp.
4038 MachineIRBuilder &B) const {
// f64 expansion shared by G_FEXP2, G_FEXP10 and (the fall-through case)
// G_FEXP. Each branch produces an integral value Dn and a reduced argument T;
// a polynomial in T is then evaluated and scaled by 2^Dn with ldexp. The
// final selects clamp the result for out-of-range X. Erases MI and returns
// true.
4039
4040 Register X = MI.getOperand(1).getReg();
4041 LLT S64 = LLT::scalar(64);
4042 LLT S32 = LLT::scalar(32);
4043 LLT S1 = LLT::scalar(1);
4044
4045 // TODO: Check if reassoc is safe. There is an output change in exp2 and
4046 // exp10, which slightly increases ulp.
4047 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
4048
// F is only written by the G_FEXP2 and G_FEXP10 branches.
4049 Register Dn, F, T;
4050
4051 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
4052 // Dn = rint(X)
4053 Dn = B.buildFRint(S64, X, Flags).getReg(0);
4054 // F = X - Dn
4055 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
4056 // T = F*C1 + F*C2
4057 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4058 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4059 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
4060 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
4061
4062 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
// Dn = rint(X * C1)
4063 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
4064 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4065 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4066
// F = (X - Dn*C3) - Dn*C2, built as two chained FMAs on -Dn.
4067 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4068 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
4069 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
4070 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4071 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4072
// T = F*C4 + F*C5
4073 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
4074 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
4075 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
4076 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
4077
4078 } else { // G_FEXP
// Dn = rint(X * C1)
4079 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
4080 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4081 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4082
// T = (X - Dn*C3) - Dn*C2
4083 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4084 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4085 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4086 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4087 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4088 }
4089
4090 // Polynomial chain for P
// Horner evaluation: each step computes P = T * P + next coefficient.
4091 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
4092 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
4093 Flags);
4094 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
4095 Flags);
4096 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
4097 Flags);
4098 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
4099 Flags);
4100 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
4101 Flags);
4102 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
4103 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
4104 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
4105 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
4106
// The last two steps both add 1.0 (the linear and constant terms).
4107 auto One = B.buildFConstant(S64, 1.0);
4108 P = B.buildFMA(S64, T, P, One, Flags);
4109 P = B.buildFMA(S64, T, P, One, Flags);
4110
4111 // Z = FLDEXP(P, (int)Dn)
4112 auto DnInt = B.buildFPTOSI(S32, Dn);
4113 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
4114
// The overflow select is skipped when the instruction carries FmNoInfs.
4115 if (!(Flags & MachineInstr::FmNoInfs)) {
4116 // Overflow guard: if X <= 1024.0 then Z else +inf
4117 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
4118 B.buildFConstant(S64, APFloat(1024.0)));
4119 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
4120 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
4121 }
4122
4123 // Underflow guard: if X >= -1075.0 then Z else 0.0
4124 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
4125 B.buildFConstant(S64, APFloat(-1075.0)));
4126 auto Zero = B.buildFConstant(S64, APFloat(0.0));
4127 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4128
4129 MI.eraseFromParent();
4130 return true;
4131}
4132
4134 MachineIRBuilder &B) const {
// Lowers exp / exp10 (selected by whether the opcode is G_FEXP10) of operand
// 1 into operand 0.
//   * f64: forwarded to legalizeFEXPF64.
//   * f16: the unsafe expansion when approximate functions are allowed,
//     otherwise computed in f32 through legalizeFExpUnsafeImpl and truncated.
//   * f32: the unsafe expansion when approximate functions are allowed,
//     otherwise the extended-precision sequence at the end of the function.
// The f16 and f32 paths erase MI and return true.
4135 Register Dst = MI.getOperand(0).getReg();
4136 Register X = MI.getOperand(1).getReg();
4137 const unsigned Flags = MI.getFlags();
4138 MachineFunction &MF = B.getMF();
4139 MachineRegisterInfo &MRI = *B.getMRI();
4140 LLT Ty = MRI.getType(Dst);
4141
4142 const LLT F64 = LLT::scalar(64);
4143
4144 if (Ty == F64)
4145 return legalizeFEXPF64(MI, B);
4146
4147 const LLT F16 = LLT::scalar(16);
4148 const LLT F32 = LLT::scalar(32);
4149 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4150
4151 if (Ty == F16) {
4152 // v_exp_f16 (fmul x, log2e)
4153 if (allowApproxFunc(MF, Flags)) {
4154 // TODO: Does this really require fast?
4155 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4156 : legalizeFExpUnsafe(B, Dst, X, Flags);
4157 MI.eraseFromParent();
4158 return true;
4159 }
4160
4161 // Nothing in half is a denormal when promoted to f32.
4162 //
4163 // exp(f16 x) ->
4164 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4165 //
4166 // exp10(f16 x) ->
4167 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4168 auto Ext = B.buildFPExt(F32, X, Flags);
// NOTE(review): the declaration of Lowered (listing line 4169) is absent from
// this copy of the file.
4170 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4171 B.buildFPTrunc(Dst, Lowered, Flags);
4172 MI.eraseFromParent();
4173 return true;
4174 }
4175
4176 assert(Ty == F32);
4177
4178 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4179 // library behavior. Also, is known-not-daz source sufficient?
4180 if (allowApproxFunc(MF, Flags)) {
4181 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4182 : legalizeFExpUnsafe(B, Dst, X, Flags);
4183 MI.eraseFromParent();
4184 return true;
4185 }
4186
// NOTE(review): the write-up below describes a table-driven scheme
// (precomputed 2^(j/64) plus a polynomial). The code that follows builds no
// table; it forms x*log2(base) as a high/low pair PH + PL, rounds PH to an
// integer E, and computes ldexp(exp2((PH - E) + PL), E). Confirm whether the
// write-up is still meant to describe this code.
4187 // Algorithm:
4188 //
4189 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4190 //
4191 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4192 // n = 64*m + j, 0 <= j < 64
4193 //
4194 // e^x = 2^((64*m + j + f)/64)
4195 // = (2^m) * (2^(j/64)) * 2^(f/64)
4196 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4197 //
4198 // f = x*(64/ln(2)) - n
4199 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4200 //
4201 // e^x = (2^m) * (2^(j/64)) * e^r
4202 //
4203 // (2^(j/64)) is precomputed
4204 //
4205 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4206 // e^r = 1 + q
4207 //
4208 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4209 //
4210 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
4211 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
// PH / PL: high and low parts of X * log2(base).
4212 Register PH, PL;
4213
4214 if (ST.hasFastFMAF32()) {
// With fast FMA, the rounding error of X*C is recovered as fma(X, C, -PH)
// and folded into PL together with the X*CC correction term.
4215 const float c_exp = numbers::log2ef;
4216 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4217 const float c_exp10 = 0x1.a934f0p+1f;
4218 const float cc_exp10 = 0x1.2f346ep-24f;
4219
4220 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4221 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4222 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4223 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4224
4225 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4226 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4227 } else {
// Without fast FMA, X is split by masking off its low 12 bits (XH) and
// taking the remainder (XL); the partial products are combined via getMad.
4228 const float ch_exp = 0x1.714000p+0f;
4229 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4230
4231 const float ch_exp10 = 0x1.a92000p+1f;
4232 const float cl_exp10 = 0x1.4f0978p-11f;
4233
4234 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4235 auto XH = B.buildAnd(Ty, X, MaskConst);
4236 auto XL = B.buildFSub(Ty, X, XH, Flags);
4237
4238 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4239 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4240
4241 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4242 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4243
4244 Register Mad0 =
4245 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4246 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4247 }
4248
// E is PH rounded to an integer (ties to even); it becomes the ldexp exponent.
4249 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4250
4251 // It is unsafe to contract this fsub into the PH multiply.
4252 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4253 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4254 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4255
4256 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4257 .addUse(A.getReg(0))
4258 .setMIFlags(Flags);
4259 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4260
// Inputs below roughly -44.85 (exp10) / -103.28 (exp) are forced to 0.0.
4261 auto UnderflowCheckConst =
4262 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4263 auto Zero = B.buildFConstant(Ty, 0.0);
4264 auto Underflow =
4265 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4266
4267 R = B.buildSelect(Ty, Underflow, Zero, R);
4268
// Inputs above roughly 38.53 (exp10) / 88.72 (exp) are forced to +inf unless
// the instruction carries FmNoInfs.
4269 if (!(Flags & MachineInstr::FmNoInfs)) {
4270 auto OverflowCheckConst =
4271 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4272
4273 auto Overflow =
4274 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4275 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4276 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4277 }
4278
4279 B.buildCopy(Dst, R);
4280 MI.eraseFromParent();
4281 return true;
4282}
4283
4285 MachineIRBuilder &B) const {
// Lowers pow(Src0, Src1) as exp2(fmul_legacy(log2(Src0), Src1)) for f32 and
// f16 results. Any other result type returns false and leaves MI in place;
// otherwise MI is erased and true is returned.
4286 Register Dst = MI.getOperand(0).getReg();
4287 Register Src0 = MI.getOperand(1).getReg();
4288 Register Src1 = MI.getOperand(2).getReg();
4289 unsigned Flags = MI.getFlags();
4290 LLT Ty = B.getMRI()->getType(Dst);
4291 const LLT F16 = LLT::scalar(16); // TODO: Expected LLT::float16()
4292 const LLT F32 = LLT::scalar(32); // TODO: Expected LLT::float32()
4293
4294 if (Ty == F32) {
4295 auto Log = B.buildFLog2(F32, Src0, Flags);
4296 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4297 .addUse(Log.getReg(0))
4298 .addUse(Src1)
4299 .setMIFlags(Flags);
4300 B.buildFExp2(Dst, Mul, Flags);
4301 } else if (Ty == F16) {
4302 // There's no f16 fmul_legacy, so we need to convert for it.
// log2 and exp2 stay in f16; only the multiply is done in f32.
4303 auto Log = B.buildFLog2(F16, Src0, Flags);
4304 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4305 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4306 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4307 .addUse(Ext0.getReg(0))
4308 .addUse(Ext1.getReg(0))
4309 .setMIFlags(Flags);
4310 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4311 } else
4312 return false;
4313
4314 MI.eraseFromParent();
4315 return true;
4316}
4317
4318// Find a source register, ignoring any possible source modifiers.
// Looks through a defining G_FNEG, then a G_FABS beneath it, or through a
// lone G_FABS, and returns that instruction's input register. Returns
// OrigSrc unchanged when neither is found.
4320 Register ModSrc = OrigSrc;
4321 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4322 ModSrc = SrcFNeg->getOperand(1).getReg();
4323 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4324 ModSrc = SrcFAbs->getOperand(1).getReg();
4325 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4326 ModSrc = SrcFAbs->getOperand(1).getReg();
4327 return ModSrc;
4328}
4329
4332 MachineIRBuilder &B) const {
// Lowers an f64 floor of operand 1 into operand 0 as x - fract(x), using the
// amdgcn.fract intrinsic with the clamp-and-NaN workaround described below.
// Only expected on subtargets with the fract bug (see the assert). Erases MI
// and returns true.
4333
4334 const LLT S1 = LLT::scalar(1);
4335 const LLT F64 = LLT::scalar(64); // TODO: Expected float64
4336 Register Dst = MI.getOperand(0).getReg();
4337 Register OrigSrc = MI.getOperand(1).getReg();
4338 unsigned Flags = MI.getFlags();
4339 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4340 "this should not have been custom lowered");
4341
4342 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4343 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4344 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4345 // V_FRACT bug is:
4346 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4347 //
4348 // Convert floor(x) to (x - fract(x))
4349
4350 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4351 .addUse(OrigSrc)
4352 .setMIFlags(Flags);
4353
4354 // Give source modifier matching some assistance before obscuring a foldable
4355 // pattern.
4356
4357 // TODO: We can avoid the neg on the fract? The input sign to fract
4358 // shouldn't matter?
4359 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4360
// 0x3fefffffffffffff is the largest double below 1.0; it is the clamp value
// from the workaround formula above.
4361 auto Const =
4362 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4363
// NOTE(review): the declaration of Min (listing line 4364) is absent from
// this copy of the file.
4365
4366 // We don't need to concern ourselves with the snan handling difference, so
4367 // use the one which will directly select.
4368 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4369 if (MFI->getMode().IEEE)
4370 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4371 else
4372 B.buildFMinNum(Min, Fract, Const, Flags);
4373
4374 Register CorrectedFract = Min;
4375 if (!MI.getFlag(MachineInstr::FmNoNans)) {
// The formula wants "isnan(x) ? x : min(...)". FCMP_UNO(x, x) is true exactly
// when x is a NaN. The previous FCMP_ORD predicate was true for every
// non-NaN input, which selected x itself as the fract and made the final
// x - fract collapse to x - x.
4376 auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
4377 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4378 }
4379
// floor(x) = x + (-fract(x))
4380 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4381 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4382
4383 MI.eraseFromParent();
4384 return true;
4385}
4386
4387// Turn an illegal packed v2s16 build vector into bit operations.
4388// TODO: This should probably be a bitcast action in LegalizerHelper.
// The two 16-bit sources are merged into one s32 which is then bitcast to the
// <2 x s16> destination. For G_BUILD_VECTOR_TRUNC the sources are first
// truncated to s16. Erases MI and returns true.
4391 Register Dst = MI.getOperand(0).getReg();
4392 const LLT S32 = LLT::scalar(32);
4393 const LLT S16 = LLT::scalar(16);
4394 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4395
4396 Register Src0 = MI.getOperand(1).getReg();
4397 Register Src1 = MI.getOperand(2).getReg();
4398
4399 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4400 assert(MRI.getType(Src0) == S32);
4401 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4402 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4403 }
4404
4405 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4406 B.buildBitcast(Dst, Merge);
4407
4408 MI.eraseFromParent();
4409 return true;
4410}
4411
4412// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4413//
4414// Source and accumulation registers must all be 32-bits.
4415//
4416// TODO: When the multiply is uniform, we should produce a code sequence
4417// that is better suited to instruction selection on the SALU. Instead of
4418// the outer loop going over parts of the result, the outer loop should go
4419// over parts of one of the factors. This should result in instruction
4420// selection that makes full use of S_ADDC_U32 instructions.
//
// Accum (declared on a declarator line that is absent from this copy of the
// file) holds the 32-bit result parts and is updated in place; entries may be
// null on entry. Src0 / Src1 are the 32-bit parts of the two factors.
// UsePartialMad64_32 controls whether the most significant part may use a
// 64-bit MAD whose high half is discarded; SeparateOddAlignedProducts makes
// odd-aligned partial products accumulate into scratch registers that are
// added into Accum afterwards.
4423 ArrayRef<Register> Src0,
4424 ArrayRef<Register> Src1,
4425 bool UsePartialMad64_32,
4426 bool SeparateOddAlignedProducts) const {
4427 // Use (possibly empty) vectors of S1 registers to represent the set of
4428 // carries from one pair of positions to the next.
4429 using Carry = SmallVector<Register, 2>;
4430
4431 MachineIRBuilder &B = Helper.MIRBuilder;
4432 GISelValueTracking &VT = *Helper.getValueTracking();
4433
4434 const LLT S1 = LLT::scalar(1);
4435 const LLT S32 = LLT::scalar(32);
4436 const LLT S64 = LLT::scalar(64);
4437
// Zero constants are materialized lazily, at most once each.
4438 Register Zero32;
4439 Register Zero64;
4440
4441 auto getZero32 = [&]() -> Register {
4442 if (!Zero32)
4443 Zero32 = B.buildConstant(S32, 0).getReg(0);
4444 return Zero32;
4445 };
4446 auto getZero64 = [&]() -> Register {
4447 if (!Zero64)
4448 Zero64 = B.buildConstant(S64, 0).getReg(0);
4449 return Zero64;
4450 };
4451
// Record which source parts are known to be zero so their partial products
// can be skipped. The loop indexes Src1 by Src0's size, so it assumes Src1
// has at least as many parts as Src0.
4452 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4453 for (unsigned i = 0; i < Src0.size(); ++i) {
4454 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4455 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4456 }
4457
4458 // Merge the given carries into the 32-bit LocalAccum, which is modified
4459 // in-place.
4460 //
4461 // Returns the carry-out, which is a single S1 register or null.
4462 auto mergeCarry =
4463 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4464 if (CarryIn.empty())
4465 return Register();
4466
4467 bool HaveCarryOut = true;
4468 Register CarryAccum;
4469 if (CarryIn.size() == 1) {
// A single carry into an empty accumulator is just its zero-extension.
4470 if (!LocalAccum) {
4471 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4472 return Register();
4473 }
4474
4475 CarryAccum = getZero32();
4476 } else {
// Sum all but the last carry into CarryAccum; the last one is consumed by
// the final add-with-carry below.
4477 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4478 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4479 CarryAccum =
4480 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4481 .getReg(0);
4482 }
4483
4484 if (!LocalAccum) {
4485 LocalAccum = getZero32();
4486 HaveCarryOut = false;
4487 }
4488 }
4489
4490 auto Add =
4491 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4492 LocalAccum = Add.getReg(0);
4493 return HaveCarryOut ? Add.getReg(1) : Register();
4494 };
4495
4496 // Build a multiply-add chain to compute
4497 //
4498 // LocalAccum + (partial products at DstIndex)
4499 // + (opportunistic subset of CarryIn)
4500 //
4501 // LocalAccum is an array of one or two 32-bit registers that are updated
4502 // in-place. The incoming registers may be null.
4503 //
4504 // In some edge cases, carry-ins can be consumed "for free". In that case,
4505 // the consumed carry bits are removed from CarryIn in-place.
4506 auto buildMadChain =
4507 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4508 -> Carry {
4509 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4510 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4511
4512 Carry CarryOut;
// j0 indexes Src0; the matching Src1 index is always DstIndex - j0.
4513 unsigned j0 = 0;
4514
4515 // Use plain 32-bit multiplication for the most significant part of the
4516 // result by default.
4517 if (LocalAccum.size() == 1 &&
4518 (!UsePartialMad64_32 || !CarryIn.empty())) {
4519 do {
4520 // Skip multiplication if one of the operands is 0
4521 unsigned j1 = DstIndex - j0;
4522 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4523 ++j0;
4524 continue;
4525 }
4526 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4527 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4528 LocalAccum[0] = Mul.getReg(0);
4529 } else {
4530 if (CarryIn.empty()) {
4531 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4532 } else {
// Fold one pending carry into this add and drop it from CarryIn.
4533 LocalAccum[0] =
4534 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4535 .getReg(0);
4536 CarryIn.pop_back();
4537 }
4538 }
4539 ++j0;
4540 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4541 }
4542
4543 // Build full 64-bit multiplies.
4544 if (j0 <= DstIndex) {
// HaveSmallAccum tracks whether Tmp's upper 32 bits are still empty; while
// it is true the MAD's carry-out is not recorded.
4545 bool HaveSmallAccum = false;
4546 Register Tmp;
4547
4548 if (LocalAccum[0]) {
4549 if (LocalAccum.size() == 1) {
4550 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4551 HaveSmallAccum = true;
4552 } else if (LocalAccum[1]) {
4553 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4554 HaveSmallAccum = false;
4555 } else {
4556 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4557 HaveSmallAccum = true;
4558 }
4559 } else {
4560 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4561 Tmp = getZero64();
4562 HaveSmallAccum = true;
4563 }
4564
4565 do {
4566 unsigned j1 = DstIndex - j0;
4567 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4568 ++j0;
4569 continue;
4570 }
4571 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4572 {Src0[j0], Src1[j1], Tmp});
4573 Tmp = Mad.getReg(0);
4574 if (!HaveSmallAccum)
4575 CarryOut.push_back(Mad.getReg(1));
4576 HaveSmallAccum = false;
4577
4578 ++j0;
4579 } while (j0 <= DstIndex);
4580
// Split the 64-bit running sum back into the 32-bit accumulator slots.
4581 auto Unmerge = B.buildUnmerge(S32, Tmp);
4582 LocalAccum[0] = Unmerge.getReg(0);
4583 if (LocalAccum.size() > 1)
4584 LocalAccum[1] = Unmerge.getReg(1);
4585 }
4586
4587 return CarryOut;
4588 };
4589
4590 // Outer multiply loop, iterating over destination parts from least
4591 // significant to most significant parts.
4592 //
4593 // The columns of the following diagram correspond to the destination parts
4594 // affected by one iteration of the outer loop (ignoring boundary
4595 // conditions).
4596 //
4597 // Dest index relative to 2 * i: 1 0 -1
4598 // ------
4599 // Carries from previous iteration: e o
4600 // Even-aligned partial product sum: E E .
4601 // Odd-aligned partial product sum: O O
4602 //
4603 // 'o' is OddCarry, 'e' is EvenCarry.
4604 // EE and OO are computed from partial products via buildMadChain and use
4605 // accumulation where possible and appropriate.
4606 //
4607 Register SeparateOddCarry;
4608 Carry EvenCarry;
4609 Carry OddCarry;
4610
4611 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
// Take ownership of the carries produced by the previous iteration and reset
// the outgoing sets.
4612 Carry OddCarryIn = std::move(OddCarry);
4613 Carry EvenCarryIn = std::move(EvenCarry);
4614 OddCarry.clear();
4615 EvenCarry.clear();
4616
4617 // Partial products at offset 2 * i.
4618 if (2 * i < Accum.size()) {
4619 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4620 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4621 }
4622
4623 // Partial products at offset 2 * i - 1.
4624 if (i > 0) {
4625 if (!SeparateOddAlignedProducts) {
4626 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4627 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4628 } else {
// Accumulate the odd-aligned products into scratch registers, then add
// them into Accum with an explicit carry chain (SeparateOddCarry).
4629 bool IsHighest = 2 * i >= Accum.size();
4630 Register SeparateOddOut[2];
4631 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4632 .take_front(IsHighest ? 1 : 2);
4633 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4634
// NOTE(review): the declaration of Lo (listing line 4635) is absent from this
// copy of the file.
4636
4637 if (i == 1) {
4638 if (!IsHighest)
4639 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4640 else
4641 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4642 } else {
4643 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4644 SeparateOddCarry);
4645 }
4646 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4647
4648 if (!IsHighest) {
4649 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4650 Lo->getOperand(1).getReg());
4651 Accum[2 * i] = Hi.getReg(0);
4652 SeparateOddCarry = Hi.getReg(1);
4653 }
4654 }
4655 }
4656
4657 // Add in the carries from the previous iteration
4658 if (i > 0) {
4659 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4660 EvenCarryIn.push_back(CarryOut);
4661
4662 if (2 * i < Accum.size()) {
4663 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4664 OddCarry.push_back(CarryOut);
4665 }
4666 }
4667 }
4668}
4669
4670// Custom narrowing of wide multiplies using wide multiply-add instructions.
4671//
4672// TODO: If the multiply is followed by an addition, we should attempt to
4673// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
//
// Splits both scalar operands of a G_MUL into 32-bit parts, multiplies them
// with buildMultiply and merges the parts back into the destination. A 64-bit
// multiply is left untouched when the subtarget reports hasVMulU64Inst().
// Returns true in both cases; MI is erased only when it was expanded.
4675 MachineInstr &MI) const {
4676 assert(ST.hasMad64_32());
4677 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4678
4679 MachineIRBuilder &B = Helper.MIRBuilder;
4680 MachineRegisterInfo &MRI = *B.getMRI();
4681
4682 Register DstReg = MI.getOperand(0).getReg();
4683 Register Src0 = MI.getOperand(1).getReg();
4684 Register Src1 = MI.getOperand(2).getReg();
4685
4686 LLT Ty = MRI.getType(DstReg);
4687 assert(Ty.isScalar());
4688
4689 unsigned Size = Ty.getSizeInBits();
4690 if (ST.hasVMulU64Inst() && Size == 64)
4691 return true;
4692
4693 unsigned NumParts = Size / 32;
4694 assert((Size % 32) == 0);
4695 assert(NumParts >= 2);
4696
4697 // Whether to use MAD_64_32 for partial products whose high half is
4698 // discarded. This avoids some ADD instructions but risks false dependency
4699 // stalls on some subtargets in some cases.
4700 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4701
4702 // Whether to compute odd-aligned partial products separately. This is
4703 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4704 // in an even-aligned VGPR.
4705 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4706
4707 LLT S32 = LLT::scalar(32);
4708 SmallVector<Register, 2> Src0Parts, Src1Parts;
// NOTE(review): the body of this loop (listing lines 4710-4711) is absent
// from this copy of the file; presumably it fills Src0Parts / Src1Parts with
// NumParts fresh s32 registers for the unmerges below - confirm.
4709 for (unsigned i = 0; i < NumParts; ++i) {
4712 }
4713 B.buildUnmerge(Src0Parts, Src0);
4714 B.buildUnmerge(Src1Parts, Src1);
4715
// AccumRegs starts as NumParts null registers; buildMultiply fills them in.
4716 SmallVector<Register, 2> AccumRegs(NumParts);
4717 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4718 SeparateOddAlignedProducts);
4719
4720 B.buildMergeLikeInstr(DstReg, AccumRegs);
4721 MI.eraseFromParent();
4722 return true;
4723}
4724
4725// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4726// ctlz/cttz_zero_poison. This allows us to fix up the result for the zero input
4727// case with a single min instruction instead of a compare+select.
//
// G_CTLZ maps to G_AMDGPU_FFBH_U32 and every other opcode reaching here to
// G_AMDGPU_FFBL_B32. The result is then clamped with an unsigned min against
// the source bit width. Erases MI and returns true.
4730 MachineIRBuilder &B) const {
4731 Register Dst = MI.getOperand(0).getReg();
4732 Register Src = MI.getOperand(1).getReg();
4733 LLT DstTy = MRI.getType(Dst);
4734 LLT SrcTy = MRI.getType(Src);
4735
4736 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4737 ? AMDGPU::G_AMDGPU_FFBH_U32
4738 : AMDGPU::G_AMDGPU_FFBL_B32;
4739 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4740 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4741
4742 MI.eraseFromParent();
4743 return true;
4744}
4745
4748 MachineIRBuilder &B) const {
// Count-leading-zeros lowering for sources narrower than 32 bits (see the
// assert). The source is any-extended to s32 and shifted left by
// 32 - NumBits so its top bit becomes bit 31, then G_AMDGPU_FFBH_U32 is
// applied and the result truncated into Dst. Erases MI and returns true.
4749 Register Dst = MI.getOperand(0).getReg();
4750 Register Src = MI.getOperand(1).getReg();
4751 LLT SrcTy = MRI.getType(Src);
4752 TypeSize NumBits = SrcTy.getSizeInBits();
4753
4754 assert(NumBits < 32u);
4755
// The shift pushes the undefined any-extended high bits out of the register.
4756 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4757 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4758 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4759 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4760 B.buildTrunc(Dst, Ctlz);
4761 MI.eraseFromParent();
4762 return true;
4763}
4764
4767 MachineIRBuilder &B) const {
// s32-only lowering (see the assert) built on the amdgcn.sffbh intrinsic:
// Dst = umin(sffbh(Src), 32) - 1. Erases MI and returns true.
4768 Register Dst = MI.getOperand(0).getReg();
4769 Register Src = MI.getOperand(1).getReg();
4770 LLT SrcTy = MRI.getType(Src);
4771 const LLT S32 = LLT::scalar(32);
4772 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4773 unsigned BitWidth = SrcTy.getSizeInBits();
4774
4775 auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
// Clamp to the bit width before subtracting one.
4776 auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth));
4777 B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
4778 MI.eraseFromParent();
4779 return true;
4780}
4781
4782// Check that this is a G_XOR x, -1
4783static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4784 if (MI.getOpcode() != TargetOpcode::G_XOR)
4785 return false;
4786 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4787 return ConstVal == -1;
4788}
4789
4790// Return the use branch instruction, otherwise null if the usage is invalid.
//
// MI defines a condition in operand 0 that must have exactly one non-debug
// use. That use may be a `G_XOR x, -1` (in which case Negated is set to true
// and the xor is erased) whose own single use is then examined. The final
// user must be a G_BRCOND in MI's block. On success UncondBrTarget is set to
// the target of the G_BR that follows the G_BRCOND (Br is set to that G_BR),
// or to the next block in layout when the G_BRCOND is the last instruction.
//
// NOTE(review): the negating xor is erased before the remaining checks run,
// so a later nullptr return leaves it already removed - confirm callers
// tolerate that.
4791static MachineInstr *
4793 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4794 Register CondDef = MI.getOperand(0).getReg();
4795 if (!MRI.hasOneNonDBGUse(CondDef))
4796 return nullptr;
4797
4798 MachineBasicBlock *Parent = MI.getParent();
4799 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4800
4801 if (isNot(MRI, *UseMI)) {
4802 Register NegatedCond = UseMI->getOperand(0).getReg();
4803 if (!MRI.hasOneNonDBGUse(NegatedCond))
4804 return nullptr;
4805
4806 // We're deleting the def of this value, so we need to remove it.
4807 eraseInstr(*UseMI, MRI);
4808
4809 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4810 Negated = true;
4811 }
4812
4813 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4814 return nullptr;
4815
4816 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4817 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4818 if (Next == Parent->end()) {
// No explicit G_BR: the unconditional target is the fall-through block.
4819 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4820 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4821 return nullptr;
4822 UncondBrTarget = &*NextMBB;
4823 } else {
4824 if (Next->getOpcode() != AMDGPU::G_BR)
4825 return nullptr;
4826 Br = &*Next;
4827 UncondBrTarget = Br->getOperand(0).getMBB();
4828 }
4829
4830 return UseMI;
4831}
4832
4835 const ArgDescriptor *Arg,
4836 const TargetRegisterClass *ArgRC,
4837 LLT ArgTy) const {
// Materializes the function input described by Arg into the virtual register
// DstReg. The physical register is read through getFunctionLiveInPhysReg.
// When Arg is masked, the field is extracted as (LiveIn >> Shift) &
// (Mask >> Shift), where Shift is the number of trailing zero bits in the
// mask; otherwise the live-in is copied.
4838 MCRegister SrcReg = Arg->getRegister();
4839 assert(SrcReg.isPhysical() && "Physical register expected");
4840 assert(DstReg.isVirtual() && "Virtual register expected");
4841
4842 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4843 *ArgRC, B.getDebugLoc(), ArgTy);
4844 if (Arg->isMasked()) {
4845 // TODO: Should we try to emit this once in the entry block?
4846 const LLT S32 = LLT::scalar(32);
4847 const unsigned Mask = Arg->getMask();
4848 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4849
4850 Register AndMaskSrc = LiveIn;
4851
4852 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4853 // 0.
// The shift is omitted when the field already starts at bit 0.
4854 if (Shift != 0) {
4855 auto ShiftAmt = B.buildConstant(S32, Shift);
4856 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4857 }
4858
4859 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4860 } else {
4861 B.buildCopy(DstReg, LiveIn);
4862 }
4863}
4864
// Replace MI with code that computes one component of the workgroup ID into
// MI's result register (operand 0).
//
// Without cluster support the value is loaded directly from WorkGroupIdPV.
// With cluster support three inputs are loaded (the value from WorkGroupIdPV,
// which is then a cluster ID, the workgroup ID inside the cluster, and the
// maximum workgroup ID inside the cluster) and the result is chosen by a
// switch on the kind of the function's cluster dimensions.
//
// Returns false, leaving MI in place, when any input cannot be loaded;
// otherwise MI is erased and true is returned.
//
// NOTE(review): the start of the signature and the case labels of the switch
// are missing from this listing. MI, B, WorkGroupIdPV and ClusterMaxIdPV are
// presumably declared on the missing signature lines, and the function name
// is an assumption.
4869 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4870 Register DstReg = MI.getOperand(0).getReg();
4871 if (!ST.hasClusters()) {
4872 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4873 return false;
4874 MI.eraseFromParent();
4875 return true;
4876 }
4877
4878 // Clusters are supported. Return the global position in the grid. If clusters
4879 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4880
4881 // WorkGroupIdXYZ = ClusterId == 0 ?
4882 // ClusterIdXYZ :
4883 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4884 MachineRegisterInfo &MRI = *B.getMRI();
4885 const LLT S32 = LLT::scalar(32);
4886 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4887 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4888 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4889 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4890 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4891 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4892 return false;
4893
// GlobalIdXYZ = ClusterWorkGroupIdXYZ + ClusterIdXYZ * (ClusterMaxIdXYZ + 1).
// These instructions are emitted before the switch, so they exist in every
// case, including the one that only copies ClusterIdXYZ.
4894 auto One = B.buildConstant(S32, 1);
4895 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4896 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4897 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4898
4899 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4900
4901 switch (MFI->getClusterDims().getKind()) {
// Case label(s) missing from the listing: this arm returns the global ID.
4904 B.buildCopy(DstReg, GlobalIdXYZ);
4905 MI.eraseFromParent();
4906 return true;
4907 }
// Case label missing from the listing: this arm returns the value loaded from
// WorkGroupIdPV unchanged.
4909 B.buildCopy(DstReg, ClusterIdXYZ);
4910 MI.eraseFromParent();
4911 return true;
4912 }
// Case label missing from the listing: this arm decides at run time. It reads
// a 4-bit field at offset 6 of the IB_STS2 hardware register and selects
// ClusterIdXYZ when that field is zero, GlobalIdXYZ otherwise.
4914 using namespace AMDGPU::Hwreg;
4915 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4916 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4917 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4918 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4919 .addDef(ClusterId)
4920 .addImm(ClusterIdField);
4921 auto Zero = B.buildConstant(S32, 0);
4922 auto NoClusters =
4923 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4924 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4925 MI.eraseFromParent();
4926 return true;
4927 }
4928 }
4929
4930 llvm_unreachable("nothing should reach here");
4931}
4932
// Define DstReg with the preloaded function input selected by ArgType.
//
// Resolution order:
//  1. Under the condition that starts with ST.hasArchitectedSGPRs(), workgroup
//     and cluster IDs come from fixed TTMP registers (described by the local
//     ArgDescriptors below) or, for fixed cluster dimensions, from constants.
//  2. Otherwise the descriptor is taken from MFI->getPreloadedValue(ArgType).
//  3. If there is still no descriptor, DstReg becomes either the constant 0 or
//     an undef value (see the comments in that branch).
//
// Returns true when DstReg has been defined, false when a descriptor was found
// but is not a valid register.
//
// NOTE(review): several lines are missing from this listing: part of the
// signature (ArgType is presumably declared there), the second half of the
// hasArchitectedSGPRs condition, every case label of the switch, and the
// condition guarding the LoadConstant(0) fallback.
4934 Register DstReg, MachineIRBuilder &B,
4936 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4937 const ArgDescriptor *Arg = nullptr;
4938 const TargetRegisterClass *ArgRC;
4939 LLT ArgTy;
4940
4941 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
// Workgroup ID X is the whole of TTMP9.
4942 const ArgDescriptor WorkGroupIDX =
4943 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4944 // If GridZ is not programmed in an entry function then the hardware will set
4945 // it to all zeros, so there is no need to mask the GridY value in the low
4946 // order bits.
4947 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4948 AMDGPU::TTMP7,
4949 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
// Workgroup ID Z is the high 16 bits of TTMP7.
4950 const ArgDescriptor WorkGroupIDZ =
4951 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
// TTMP6 packs seven 4-bit fields: cluster-local workgroup ID X/Y/Z in bits
// [3:0], [7:4], [11:8]; the maximum ID X/Y/Z in bits [15:12], [19:16],
// [23:20]; and the maximum flat ID in bits [27:24].
4952 const ArgDescriptor ClusterWorkGroupIDX =
4953 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4954 const ArgDescriptor ClusterWorkGroupIDY =
4955 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4956 const ArgDescriptor ClusterWorkGroupIDZ =
4957 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4958 const ArgDescriptor ClusterWorkGroupMaxIDX =
4959 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4960 const ArgDescriptor ClusterWorkGroupMaxIDY =
4961 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4962 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4963 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4964 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4965 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4966
// Helper for inputs whose value is known at compile time.
4967 auto LoadConstant = [&](unsigned N) {
4968 B.buildConstant(DstReg, N);
4969 return true;
4970 };
4971
4972 if (ST.hasArchitectedSGPRs() &&
4974 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4975 bool HasFixedDims = ClusterDims.isFixedDims();
4976
// Each arm below picks one of the descriptors above; the case labels are
// missing from the listing, so the mapping is only visible through the
// descriptor names.
4977 switch (ArgType) {
4979 Arg = &WorkGroupIDX;
4980 ArgRC = &AMDGPU::SReg_32RegClass;
4981 ArgTy = LLT::scalar(32);
4982 break;
4984 Arg = &WorkGroupIDY;
4985 ArgRC = &AMDGPU::SReg_32RegClass;
4986 ArgTy = LLT::scalar(32);
4987 break;
4989 Arg = &WorkGroupIDZ;
4990 ArgRC = &AMDGPU::SReg_32RegClass;
4991 ArgTy = LLT::scalar(32);
4992 break;
// With a fixed cluster dimension of 1 the only possible local ID is 0.
4994 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4995 return LoadConstant(0);
4996 Arg = &ClusterWorkGroupIDX;
4997 ArgRC = &AMDGPU::SReg_32RegClass;
4998 ArgTy = LLT::scalar(32);
4999 break;
5001 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
5002 return LoadConstant(0);
5003 Arg = &ClusterWorkGroupIDY;
5004 ArgRC = &AMDGPU::SReg_32RegClass;
5005 ArgTy = LLT::scalar(32);
5006 break;
5008 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
5009 return LoadConstant(0);
5010 Arg = &ClusterWorkGroupIDZ;
5011 ArgRC = &AMDGPU::SReg_32RegClass;
5012 ArgTy = LLT::scalar(32);
5013 break;
// With fixed cluster dimensions the maximum ID is simply dimension - 1.
5015 if (HasFixedDims)
5016 return LoadConstant(ClusterDims.getDims()[0] - 1);
5017 Arg = &ClusterWorkGroupMaxIDX;
5018 ArgRC = &AMDGPU::SReg_32RegClass;
5019 ArgTy = LLT::scalar(32);
5020 break;
5022 if (HasFixedDims)
5023 return LoadConstant(ClusterDims.getDims()[1] - 1);
5024 Arg = &ClusterWorkGroupMaxIDY;
5025 ArgRC = &AMDGPU::SReg_32RegClass;
5026 ArgTy = LLT::scalar(32);
5027 break;
5029 if (HasFixedDims)
5030 return LoadConstant(ClusterDims.getDims()[2] - 1);
5031 Arg = &ClusterWorkGroupMaxIDZ;
5032 ArgRC = &AMDGPU::SReg_32RegClass;
5033 ArgTy = LLT::scalar(32);
5034 break;
5036 Arg = &ClusterWorkGroupMaxFlatID;
5037 ArgRC = &AMDGPU::SReg_32RegClass;
5038 ArgTy = LLT::scalar(32);
5039 break;
5040 default:
5041 break;
5042 }
5043 }
5044
// Nothing selected above: fall back to the function's argument info.
5045 if (!Arg)
5046 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5047
5048 if (!Arg) {
// NOTE(review): the condition that opens the following inner block is missing
// from the listing; judging by the comment it tests for the kernarg segment
// pointer. Confirm against the full source.
5050 // The intrinsic may appear when we have a 0 sized kernarg segment, in
5051 // which case the pointer argument may be missing and we use null.
5052 return LoadConstant(0);
5053 }
5054
5055 // It's undefined behavior if a function marked with the amdgpu-no-*
5056 // attributes uses the corresponding intrinsic.
5057 B.buildUndef(DstReg);
5058 return true;
5059 }
5060
5061 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5062 return false; // TODO: Handle these
5063 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
5064 return true;
5065}
5066
// Replace MI by a load of the preloaded input ArgType into MI's result
// register (operand 0). Returns false and keeps MI when the input cannot be
// loaded; otherwise erases MI and returns true.
//
// NOTE(review): the whole signature is missing from this listing; MI, B and
// ArgType are presumably its parameters and the function name is an
// assumption.
5070 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
5071 return false;
5072
5073 MI.eraseFromParent();
5074 return true;
5075}
5076
// Define MI's result register (operand 0) with the integer constant C, erase
// MI, and return true.
//
// NOTE(review): the first signature line is missing from this listing; B and
// MI are presumably declared there. The name is taken from the call site in
// the workitem-ID legalization below.
5078 int64_t C) {
5079 B.buildConstant(MI.getOperand(0).getReg(), C);
5080 MI.eraseFromParent();
5081 return true;
5082}
5083
// Legalize a workitem-ID intrinsic for dimension Dim.
//
//  - If the subtarget reports a maximum workitem ID of 0 for this function and
//    dimension, the result is the constant 0.
//  - If the function has no preloaded value for ArgType, the result is undef.
//  - Otherwise the input is loaded; for an unmasked input the loaded value is
//    additionally wrapped in an AssertZExt of bit_width(MaxID) bits.
//
// Returns false, keeping MI, only when loading the input fails.
//
// NOTE(review): the first signature lines and the declaration of TmpReg are
// missing from this listing; MI, MRI and B are presumably parameters and the
// function name is an assumption.
5086 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
5087 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
5088 if (MaxID == 0)
5089 return replaceWithConstant(B, MI, 0);
5090
5091 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5092 const ArgDescriptor *Arg;
5093 const TargetRegisterClass *ArgRC;
5094 LLT ArgTy;
5095 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5096
5097 Register DstReg = MI.getOperand(0).getReg();
5098 if (!Arg) {
5099 // It's undefined behavior if a function marked with the amdgpu-no-*
5100 // attributes uses the corresponding intrinsic.
5101 B.buildUndef(DstReg);
5102 MI.eraseFromParent();
5103 return true;
5104 }
5105
5106 if (Arg->isMasked()) {
5107 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5108 // masking operations anyway.
5109 //
5110 // TODO: We could assert the top bit is 0 for the source copy.
5111 if (!loadInputValue(DstReg, B, ArgType))
5112 return false;
5113 } else {
// TmpReg is declared on a line that is missing from this listing.
5115 if (!loadInputValue(TmpReg, B, ArgType))
5116 return false;
// Record that only the low bit_width(MaxID) bits of the ID can be non-zero.
5117 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5118 }
5119
5120 MI.eraseFromParent();
5121 return true;
5122}
5123
// Tail of a small helper that builds and returns a pointer-info object named
// PtrInfo.
// NOTE(review): the signature and the statements that construct PtrInfo are
// missing from this listing, so what PtrInfo describes cannot be confirmed
// here.
5126 // This isn't really a constant pool but close enough.
5129 return PtrInfo;
5130}
5131
// Build and return a pointer that is Offset bytes past a base pointer loaded
// through loadInputValue. Per the unreachable message, the base is the kernarg
// segment pointer.
//
// A fresh virtual register of type PtrTy receives the base pointer. Failing to
// load it is treated as unreachable. The offset is materialised as a 64-bit
// constant and added with buildObjectPtrOffset.
//
// NOTE(review): the first signature line, the definition of PtrTy, and the
// second argument line of the loadInputValue call are missing from this
// listing. The function name is an assumption.
5133 int64_t Offset) const {
5135 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5136
5137 // TODO: If we passed in the base kernel offset we could have a better
5138 // alignment than 4, but we don't really need it.
5139 if (!loadInputValue(KernArgReg, B,
5141 llvm_unreachable("failed to find kernarg segment ptr");
5142
5143 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5144 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5145}
5146
5147 /// Legalize a value that's loaded from kernel arguments. This is only used by
5148 /// legacy intrinsics.
///
/// MI's result (operand 0) must be a 32-bit scalar. MI is replaced by a load
/// into that register and erased; the function always returns true.
///
/// NOTE(review): the visible load uses a hard-coded Align(4). The Alignment
/// parameter is not used on any line visible here, so confirm whether that is
/// intended. The start of the signature, the definitions of Ptr and PtrInfo,
/// and the trailing arguments of the buildLoad call are missing from this
/// listing. The function name is an assumption.
5152 Align Alignment) const {
5153 Register DstReg = MI.getOperand(0).getReg();
5154
5155 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5156 "unexpected kernarg parameter type");
5157
5160 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
5163 MI.eraseFromParent();
5164 return true;
5165}
5166
// Dispatch a floating-point division to the width-specific legalization
// routine, based on the type of MI's result (operand 0): 16-, 32- and 64-bit
// scalars are handled; every other type makes the function return false.
//
// NOTE(review): the first signature lines are missing from this listing; MI
// and MRI are presumably declared there and the function name is an
// assumption.
5169 MachineIRBuilder &B) const {
5170 Register Dst = MI.getOperand(0).getReg();
5171 LLT DstTy = MRI.getType(Dst);
5172 LLT S16 = LLT::scalar(16);
5173 LLT S32 = LLT::scalar(32);
5174 LLT S64 = LLT::scalar(64);
5175
5176 if (DstTy == S16)
5177 return legalizeFDIV16(MI, MRI, B);
5178 if (DstTy == S32)
5179 return legalizeFDIV32(MI, MRI, B);
5180 if (DstTy == S64)
5181 return legalizeFDIV64(MI, MRI, B);
5182
5183 return false;
5184}
5185
// Emit a 32-bit unsigned division and/or remainder of X by Y.
//
// DstDivReg receives the quotient and DstRemReg the remainder. Either may be
// an invalid (null) Register, in which case that result is not written. The
// quotient chain is also skipped when DstDivReg is null.
//
// Shape of the expansion: a float-based reciprocal estimate of Y, one
// refinement step on that estimate, a quotient/remainder estimate, and two
// conditional "+1 / -Y" corrections.
//
// NOTE(review): the first signature line is missing from this listing; B is
// presumably declared there. The name is taken from the call sites further
// down.
5187 Register DstDivReg,
5188 Register DstRemReg,
5189 Register X,
5190 Register Y) const {
5191 const LLT S1 = LLT::scalar(1);
5192 const LLT S32 = LLT::scalar(32);
5193
5194 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5195 // algorithm used here.
5196
5197 // Initial estimate of inv(y).
5198 auto FloatY = B.buildUITOFP(S32, Y);
5199 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
// The scale factor is given as the raw bit pattern of a float.
5200 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5201 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5202 auto Z = B.buildFPTOUI(S32, ScaledY);
5203
5204 // One round of UNR.
// Z = Z + umulh(Z, -Y * Z)
5205 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5206 auto NegYZ = B.buildMul(S32, NegY, Z);
5207 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5208
5209 // Quotient/remainder estimate.
5210 auto Q = B.buildUMulH(S32, X, Z);
5211 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5212
5213 // First quotient/remainder refinement.
// If R >= Y then bump the quotient by one and take Y off the remainder.
5214 auto One = B.buildConstant(S32, 1);
5215 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5216 if (DstDivReg)
5217 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5218 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5219
5220 // Second quotient/remainder refinement.
// Same correction once more; this time the selects write the final results.
5221 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5222 if (DstDivReg)
5223 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5224
5225 if (DstRemReg)
5226 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5227}
5228
5229// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5230//
5231// Return lo, hi of result
5232//
5233// %cvt.lo = G_UITOFP Val.lo
5234// %cvt.hi = G_UITOFP Val.hi
5235// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5236// %rcp = G_AMDGPU_RCP_IFLAG %mad
5237// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5238// %mul2 = G_FMUL %mul1, 2**(-32)
5239// %trunc = G_INTRINSIC_TRUNC %mul2
5240// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5241// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5242static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5243 Register Val) {
5244 const LLT S32 = LLT::scalar(32);
5245 auto Unmerge = B.buildUnmerge(S32, Val);
5246
5247 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5248 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5249
5250 auto Mad = B.buildFMAD(
5251 S32, CvtHi, // 2**32
5252 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5253
5254 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5255 auto Mul1 = B.buildFMul(
5256 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5257
5258 // 2**(-32)
5259 auto Mul2 = B.buildFMul(
5260 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5261 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5262
5263 // -(2**32)
5264 auto Mad2 = B.buildFMAD(
5265 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5266 Mul1);
5267
5268 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5269 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5270
5271 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5272}
5273
// Emit a 64-bit unsigned division and/or remainder of Numer by Denom.
//
// DstDivReg receives the quotient and DstRemReg the remainder; either may be
// an invalid (null) Register, in which case that result is not written.
//
// Outline:
//  1. Take the reciprocal estimate from emitReciprocalU64.
//  2. Refine it twice with Rcp += umulh(Rcp, -Denom * Rcp), done as 32-bit
//     add-with-carry pairs.
//  3. Quotient estimate MulHi3 = umulh(Numer, Rcp); remainder Sub1 = Numer -
//     Denom * MulHi3.
//  4. Compute up to two corrections (quotient + 1 / + 2, remainder - Denom /
//     - 2 * Denom) unconditionally and pick the right one with selects driven
//     by the comparison masks C3 and C6.
//
// NOTE(review): the first signature line is missing from this listing; B is
// presumably declared there. The name is taken from the call sites further
// down.
5275 Register DstDivReg,
5276 Register DstRemReg,
5277 Register Numer,
5278 Register Denom) const {
5279 const LLT S32 = LLT::scalar(32);
5280 const LLT S64 = LLT::scalar(64);
5281 const LLT S1 = LLT::scalar(1);
5282 Register RcpLo, RcpHi;
5283
5284 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5285
5286 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5287
5288 auto Zero64 = B.buildConstant(S64, 0);
5289 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5290
// First refinement: Add1 = Rcp + umulh(Rcp, -Denom * Rcp).
5291 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5292 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5293
5294 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5295 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5296 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5297
5298 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5299 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5300 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5301
// Second refinement: Add2 = Add1 + umulh(Add1, -Denom * Add1).
5302 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5303 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5304 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5305 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5306 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5307
5308 auto Zero32 = B.buildConstant(S32, 0);
5309 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5310 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5311 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5312
5313 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5314 Register NumerLo = UnmergeNumer.getReg(0);
5315 Register NumerHi = UnmergeNumer.getReg(1);
5316
// Quotient estimate and the matching remainder Sub1 = Numer - Denom * MulHi3.
5317 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5318 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5319 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5320 Register Mul3_Lo = UnmergeMul3.getReg(0);
5321 Register Mul3_Hi = UnmergeMul3.getReg(1);
5322 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5323 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
// High-half difference without the borrow; it feeds the Sub2 chain below.
5324 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5325 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5326
5327 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5328 Register DenomLo = UnmergeDenom.getReg(0);
5329 Register DenomHi = UnmergeDenom.getReg(1);
5330
// C3 is an all-ones / zero mask for the 64-bit test "Sub1 >= Denom": compare
// the high halves, and fall back to the low halves when the high ones match.
5331 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5332 auto C1 = B.buildSExt(S32, CmpHi);
5333
5334 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5335 auto C2 = B.buildSExt(S32, CmpLo);
5336
5337 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5338 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5339
5340 // TODO: Here and below portions of the code can be enclosed into if/endif.
5341 // Currently control flow is unconditional and we have 4 selects after
5342 // potential endif to substitute PHIs.
5343
5344 // if C3 != 0 ...
// First correction candidate: remainder Sub2 and quotient Add3 = MulHi3 + 1.
5345 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5346 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5347 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5348 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5349
5350 auto One64 = B.buildConstant(S64, 1);
5351 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5352
// C6 is the same kind of mask as C3, for "Sub2 >= Denom".
5353 auto C4 =
5354 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5355 auto C5 =
5356 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5357 auto C6 = B.buildSelect(
5358 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5359
5360 // if (C6 != 0)
// Second correction candidate: remainder Sub3 and quotient Add4 = Add3 + 1.
5361 auto Add4 = B.buildAdd(S64, Add3, One64);
5362 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5363
5364 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5365 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5366 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5367
5368 // endif C6
5369 // endif C3
5370
// Pick the uncorrected, once-corrected or twice-corrected result.
5371 if (DstDivReg) {
5372 auto Sel1 = B.buildSelect(
5373 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5374 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5375 Sel1, MulHi3);
5376 }
5377
5378 if (DstRemReg) {
5379 auto Sel2 = B.buildSelect(
5380 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5381 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5382 Sel2, Sub1);
5383 }
5384}
5385
// Legalize G_UDIV, G_UREM and G_UDIVREM on 32- or 64-bit scalars by expanding
// them with the width-specific helpers. Returns false, keeping MI, for any
// other result type; otherwise erases MI and returns true.
//
// NOTE(review): the first signature lines are missing from this listing; MI
// and MRI are presumably declared there and the function name is an
// assumption.
5388 MachineIRBuilder &B) const {
// Only the result register(s) the opcode actually defines are filled in; the
// helpers skip a result whose register is left invalid.
5389 Register DstDivReg, DstRemReg;
5390 switch (MI.getOpcode()) {
5391 default:
5392 llvm_unreachable("Unexpected opcode!");
5393 case AMDGPU::G_UDIV: {
5394 DstDivReg = MI.getOperand(0).getReg();
5395 break;
5396 }
5397 case AMDGPU::G_UREM: {
5398 DstRemReg = MI.getOperand(0).getReg();
5399 break;
5400 }
5401 case AMDGPU::G_UDIVREM: {
5402 DstDivReg = MI.getOperand(0).getReg();
5403 DstRemReg = MI.getOperand(1).getReg();
5404 break;
5405 }
5406 }
5407
5408 const LLT S64 = LLT::scalar(64);
5409 const LLT S32 = LLT::scalar(32);
// The sources follow the explicit defs (one for DIV/REM, two for DIVREM).
5410 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5411 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5412 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5413 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5414
5415 if (Ty == S32)
5416 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5417 else if (Ty == S64)
5418 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5419 else
5420 return false;
5421
5422 MI.eraseFromParent();
5423 return true;
5424}
5425
// Legalize G_SDIV, G_SREM and G_SDIVREM on 32- or 64-bit scalars.
//
// Both operands are turned into their absolute values, the unsigned expansion
// is run on those, and the signs are re-applied to the results. Returns false
// for other types; otherwise erases MI and returns true.
//
// NOTE(review): the first signature lines are missing from this listing; MI
// and MRI are presumably declared there and the function name is an
// assumption.
5428 MachineIRBuilder &B) const {
5429 const LLT S64 = LLT::scalar(64);
5430 const LLT S32 = LLT::scalar(32);
5431
5432 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5433 if (Ty != S32 && Ty != S64)
5434 return false;
5435
5436 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5437 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5438 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5439
// Sign masks: all ones for a negative operand, zero otherwise.
5440 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5441 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5442 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5443
// Absolute value computed as (x + sign) ^ sign.
5444 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5445 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5446
5447 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5448 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5449
// The unsigned expansion writes into temporaries so the sign fix-up below can
// define the real destination registers.
5450 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5451 switch (MI.getOpcode()) {
5452 default:
5453 llvm_unreachable("Unexpected opcode!");
5454 case AMDGPU::G_SDIV: {
5455 DstDivReg = MI.getOperand(0).getReg();
5456 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5457 break;
5458 }
5459 case AMDGPU::G_SREM: {
5460 DstRemReg = MI.getOperand(0).getReg();
5461 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5462 break;
5463 }
5464 case AMDGPU::G_SDIVREM: {
5465 DstDivReg = MI.getOperand(0).getReg();
5466 DstRemReg = MI.getOperand(1).getReg();
5467 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5468 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5469 break;
5470 }
5471 }
5472
5473 if (Ty == S32)
5474 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5475 else
5476 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5477
// Conditional negate computed as (v ^ sign) - sign. The quotient is negative
// when the operand signs differ.
5478 if (DstDivReg) {
5479 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5480 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5481 B.buildSub(DstDivReg, SignXor, Sign);
5482 }
5483
5484 if (DstRemReg) {
5485 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5486 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5487 B.buildSub(DstRemReg, SignXor, Sign);
5488 }
5489
5490 MI.eraseFromParent();
5491 return true;
5492}
5493
// Try to lower an FDIV to a reciprocal-based sequence when the fast-math flags
// on MI allow it.
//
//  - A constant numerator of exactly 1.0 or -1.0 becomes rcp(y) or
//    rcp(fneg(y)). This needs the afn flag unless the result is 16-bit.
//  - Otherwise x / y becomes x * rcp(y). This needs afn, or for a 16-bit
//    result afn or arcp.
//
// Returns true and erases MI when a replacement was emitted, false when the
// flags do not permit it (MI is left untouched).
//
// NOTE(review): the first signature lines are missing from this listing; MI
// and MRI are presumably declared there. The name is taken from the call sites
// in the 16- and 32-bit FDIV routines.
5496 MachineIRBuilder &B) const {
5497 Register Res = MI.getOperand(0).getReg();
5498 Register LHS = MI.getOperand(1).getReg();
5499 Register RHS = MI.getOperand(2).getReg();
5500 uint16_t Flags = MI.getFlags();
5501 LLT ResTy = MRI.getType(Res);
5502
5503 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5504
5505 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
// Any constant numerator bails out here without afn unless the type is
// 16-bit, even if the constant is neither 1.0 nor -1.0.
5506 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5507 return false;
5508
5509 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5510 // the CI documentation has a worst case error of 1 ulp.
5511 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5512 // use it as long as we aren't trying to use denormals.
5513 //
5514 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5515
5516 // 1 / x -> RCP(x)
5517 if (CLHS->isExactlyValue(1.0)) {
5518 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5519 .addUse(RHS)
5520 .setMIFlags(Flags);
5521
5522 MI.eraseFromParent();
5523 return true;
5524 }
5525
5526 // -1 / x -> RCP( FNEG(x) )
5527 if (CLHS->isExactlyValue(-1.0)) {
5528 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5529 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5530 .addUse(FNeg.getReg(0))
5531 .setMIFlags(Flags);
5532
5533 MI.eraseFromParent();
5534 return true;
5535 }
5536 }
5537
5538 // For f16 require afn or arcp.
5539 // For f32 require afn.
5540 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5541 !MI.getFlag(MachineInstr::FmArcp)))
5542 return false;
5543
5544 // x / y -> x * (1.0 / y)
5545 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5546 .addUse(RHS)
5547 .setMIFlags(Flags);
5548 B.buildFMul(Res, LHS, RCP, Flags);
5549
5550 MI.eraseFromParent();
5551 return true;
5552}
5553
// Try to lower an FDIV to a reciprocal refined with FMA steps. This is only
// done when MI carries the afn flag; otherwise false is returned and MI is
// left untouched.
//
// The reciprocal estimate R of Y is improved by two steps of
//   R = fma(fma(-Y, R, 1), R, R).
// When the numerator is the constant 1.0 or -1.0 that refined reciprocal is
// the result. For -1.0 the sign is folded into R and NegY up front.
// Otherwise a quotient X * R is formed and corrected once more.
//
// NOTE(review): the first signature lines are missing from this listing; MI
// and MRI are presumably declared there. The name is taken from the call site
// in the 64-bit FDIV routine.
5556 MachineIRBuilder &B) const {
5557 Register Res = MI.getOperand(0).getReg();
5558 Register X = MI.getOperand(1).getReg();
5559 Register Y = MI.getOperand(2).getReg();
5560 uint16_t Flags = MI.getFlags();
5561 LLT ResTy = MRI.getType(Res);
5562
5563 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5564
5565 if (!AllowInaccurateRcp)
5566 return false;
5567
5568 const ConstantFP *CLHS = getConstantFPVRegVal(X, MRI);
5569 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
5570
5571 // Pull out the negation so it folds for free into the source modifiers.
5572 if (IsNegRcp)
5573 X = B.buildFConstant(ResTy, 1.0).getReg(0);
5574
// For -1/y the roles flip: R below is negated, so "-Y" in the FMA steps is Y.
5575 Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
5576 auto One = B.buildFConstant(ResTy, 1.0);
5577
5578 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5579 .addUse(Y)
5580 .setMIFlags(Flags);
5581 if (IsNegRcp)
5582 R = B.buildFNeg(ResTy, R);
5583
// Two refinement steps on the reciprocal.
5584 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5585 R = B.buildFMA(ResTy, Tmp0, R, R);
5586
5587 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5588 R = B.buildFMA(ResTy, Tmp1, R, R);
5589
5590 // Skip the last 2 correction terms for reciprocal.
5591 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0))) {
5592 B.buildCopy(Res, R);
5593 MI.eraseFromParent();
5594 return true;
5595 }
5596
// Quotient estimate, then one correction: Res = fma(fma(-Y, Ret, X), R, Ret).
5597 auto Ret = B.buildFMul(ResTy, X, R);
5598 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5599
5600 B.buildFMA(Res, Tmp2, R, Ret);
5601 MI.eraseFromParent();
5602 return true;
5603}
5604
// Legalize a 16-bit FDIV.
//
// The fast reciprocal lowering is tried first. Otherwise both operands are
// extended to 32-bit, a reciprocal-based quotient is refined in 32-bit
// arithmetic (with FMAD when the subtarget has MAD/MAC f32 instructions, FMA
// otherwise), truncated back to 16-bit and passed through amdgcn_div_fixup
// together with the original operands. MI is erased and true is returned.
//
// NOTE(review): the first signature lines and the declaration of Err are
// missing from this listing; MI and MRI are presumably parameters. The name
// is taken from the call site in the FDIV dispatch routine.
5607 MachineIRBuilder &B) const {
5608 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5609 return true;
5610
5611 Register Res = MI.getOperand(0).getReg();
5612 Register LHS = MI.getOperand(1).getReg();
5613 Register RHS = MI.getOperand(2).getReg();
5614
5615 uint16_t Flags = MI.getFlags();
5616
5617 LLT S16 = LLT::scalar(16);
5618 LLT S32 = LLT::scalar(32);
5619
5620 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5621 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5622 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5623 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5624 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5625 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = err * rcp + q
5626 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5627 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5628 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5629 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5630 // q16.u = opx(V_CVT_F16_F32, q32.u);
5631 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5632
5633 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5634 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5635 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5636 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5637 .addUse(RHSExt.getReg(0))
5638 .setMIFlags(Flags);
5639 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
// Err is declared on a line that is missing from this listing.
5641 if (ST.hasMadMacF32Insts()) {
5642 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5643 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5644 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5645 } else {
5646 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5647 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5648 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5649 }
5650 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
// Keep only the top nine bits of the correction term (mask 0xff800000).
5651 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5652 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5653 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
// div_fixup takes the quotient, then the denominator, then the numerator.
5654 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5655 .addUse(RDst.getReg(0))
5656 .addUse(RHS)
5657 .addUse(LHS)
5658 .setMIFlags(Flags);
5659
5660 MI.eraseFromParent();
5661 return true;
5662}
5663
// Immediate passed to S_SETREG_IMM32_B32 / S_GETREG_B32 / S_SETREG_B32 below
// to address the FP32 (single precision) denormal-mode field.
// NOTE(review): the initializer line is missing from this listing, so the
// encoded register and bit range cannot be confirmed here.
5664 static constexpr unsigned SPDenormModeBitField =
5666
5667 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5668 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
//
// "Disable" here means restoring the function's default: the FP32 value taken
// from Mode, not an unconditional flush. On subtargets with a denorm-mode
// instruction a single S_DENORM_MODE is emitted whose immediate also carries
// the default FP64/FP16 value; otherwise S_SETREG_IMM32_B32 writes only the
// FP32 field.
//
// NOTE(review): two signature lines are missing from this listing; Enable, B
// and Mode are presumably declared there. The name is taken from the call
// sites in the 32-bit FDIV routine.
5670 const GCNSubtarget &ST,
5672 // Set SP denorm mode to this value.
5673 unsigned SPDenormMode =
5674 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5675
5676 if (ST.hasDenormModeInst()) {
5677 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5678 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5679
// The immediate packs the SP value in the low bits and the DP value shifted
// left by two.
5680 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5681 B.buildInstr(AMDGPU::S_DENORM_MODE)
5682 .addImm(NewDenormModeValue);
5683
5684 } else {
5685 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5686 .addImm(SPDenormMode)
5687 .addImm(SPDenormModeBitField);
5688 }
5689}
5690
// Legalize a 32-bit FDIV.
//
// The fast reciprocal lowering is tried first. Otherwise the expansion is:
// scale both operands with amdgcn_div_scale, take a reciprocal of the scaled
// denominator, refine it and the quotient with a chain of FMAs, then finish
// with amdgcn_div_fmas and amdgcn_div_fixup.
//
// Unless the function's FP32 denormal mode is already IEEE, denormal support
// is switched on around the FMA chain. With a dynamic mode the current setting
// is saved with S_GETREG_B32 and restored with S_SETREG_B32; otherwise the
// function default is re-established via toggleSPDenormMode.
//
// NOTE(review): the first signature lines are missing from this listing; MI
// and MRI are presumably declared there. The name is taken from the call site
// in the FDIV dispatch routine.
5693 MachineIRBuilder &B) const {
5694 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5695 return true;
5696
5697 Register Res = MI.getOperand(0).getReg();
5698 Register LHS = MI.getOperand(1).getReg();
5699 Register RHS = MI.getOperand(2).getReg();
5700 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5701 SIModeRegisterDefaults Mode = MFI->getMode();
5702
5703 uint16_t Flags = MI.getFlags();
5704
5705 LLT S32 = LLT::scalar(32);
5706 LLT S1 = LLT::scalar(1);
5707
5708 auto One = B.buildFConstant(S32, 1.0f);
5709
// div_scale yields two results: the scaled value and an S1 flag. The trailing
// immediate (0 or 1) selects which operand is scaled.
5710 auto DenominatorScaled =
5711 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5712 .addUse(LHS)
5713 .addUse(RHS)
5714 .addImm(0)
5715 .setMIFlags(Flags);
5716 auto NumeratorScaled =
5717 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5718 .addUse(LHS)
5719 .addUse(RHS)
5720 .addImm(1)
5721 .setMIFlags(Flags);
5722
5723 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5724 .addUse(DenominatorScaled.getReg(0))
5725 .setMIFlags(Flags);
5726 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5727
5728 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5729 const bool HasDynamicDenormals =
5730 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5731 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5732
5733 Register SavedSPDenormMode;
5734 if (!PreservesDenormals) {
5735 if (HasDynamicDenormals) {
// The mode is not known statically, so read the current value to restore
// it after the FMA chain.
5736 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5737 B.buildInstr(AMDGPU::S_GETREG_B32)
5738 .addDef(SavedSPDenormMode)
5739 .addImm(SPDenormModeBitField);
5740 }
5741 toggleSPDenormMode(true, B, ST, Mode);
5742 }
5743
// Refinement chain:
//   Fma0 = 1 - d * rcp              (error of the reciprocal)
//   Fma1 = rcp + Fma0 * rcp         (refined reciprocal)
//   Mul  = n * Fma1                 (quotient estimate)
//   Fma2 = n - d * Mul              (residual)
//   Fma3 = Mul + Fma2 * Fma1        (refined quotient)
//   Fma4 = n - d * Fma3             (final residual)
// where d and n are the scaled denominator and numerator.
5744 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5745 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5746 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5747 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5748 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5749 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5750
5751 if (!PreservesDenormals) {
5752 if (HasDynamicDenormals) {
5753 assert(SavedSPDenormMode);
5754 B.buildInstr(AMDGPU::S_SETREG_B32)
5755 .addReg(SavedSPDenormMode)
5756 .addImm(SPDenormModeBitField);
5757 } else
5758 toggleSPDenormMode(false, B, ST, Mode);
5759 }
5760
// div_fmas consumes the flag result of the numerator's div_scale.
5761 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5762 .addUse(Fma4.getReg(0))
5763 .addUse(Fma1.getReg(0))
5764 .addUse(Fma3.getReg(0))
5765 .addUse(NumeratorScaled.getReg(1))
5766 .setMIFlags(Flags);
5767
5768 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5769 .addUse(Fmas.getReg(0))
5770 .addUse(RHS)
5771 .addUse(LHS)
5772 .setMIFlags(Flags);
5773
5774 MI.eraseFromParent();
5775 return true;
5776}
5777
// Legalize a 64-bit FDIV.
//
// The afn-only FMA-refined reciprocal lowering is tried first. Otherwise the
// expansion is: amdgcn_div_scale on both operands, a reciprocal of the scaled
// denominator refined by FMAs, a quotient estimate with its residual, then
// amdgcn_div_fmas and amdgcn_div_fixup.
//
// On subtargets where the condition output of div_scale is not usable, the
// flag fed to div_fmas is recomputed from the high halves of the operands and
// the scaled values.
//
// NOTE(review): the first signature lines are missing from this listing; MI
// and MRI are presumably declared there. The name is taken from the call site
// in the FDIV dispatch routine.
5780 MachineIRBuilder &B) const {
5781 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5782 return true;
5783
5784 Register Res = MI.getOperand(0).getReg();
5785 Register LHS = MI.getOperand(1).getReg();
5786 Register RHS = MI.getOperand(2).getReg();
5787
5788 uint16_t Flags = MI.getFlags();
5789
5790 LLT S64 = LLT::scalar(64);
5791 LLT S1 = LLT::scalar(1);
5792
5793 auto One = B.buildFConstant(S64, 1.0);
5794
5795 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5796 .addUse(LHS)
5797 .addUse(RHS)
5798 .addImm(0)
5799 .setMIFlags(Flags);
5800
5801 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5802
5803 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5804 .addUse(DivScale0.getReg(0))
5805 .setMIFlags(Flags);
5806
// Two refinement steps on the reciprocal of the scaled denominator d:
//   Fma0 = 1 - d * Rcp,  Fma1 = Rcp + Rcp * Fma0
//   Fma2 = 1 - d * Fma1, Fma3 = Fma1 + Fma1 * Fma2   (Fma3 is built below)
5807 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5808 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5809 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5810
5811 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5812 .addUse(LHS)
5813 .addUse(RHS)
5814 .addImm(1)
5815 .setMIFlags(Flags);
5816
// Quotient estimate Mul = n * Fma3 and its residual Fma4 = n - d * Mul.
5817 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5818 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5819 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5820
5821 Register Scale;
5822 if (!ST.hasUsableDivScaleConditionOutput()) {
5823 // Workaround a hardware bug on SI where the condition output from div_scale
5824 // is not usable.
5825
5826 LLT S32 = LLT::scalar(32);
5827
5828 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5829 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5830 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5831 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5832
// Compare the high 32 bits of each operand with the high 32 bits of the
// corresponding scaled value, then combine the two results with XOR.
5833 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5834 Scale1Unmerge.getReg(1));
5835 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5836 Scale0Unmerge.getReg(1));
5837 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5838 } else {
5839 Scale = DivScale1.getReg(1);
5840 }
5841
5842 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5843 .addUse(Fma4.getReg(0))
5844 .addUse(Fma3.getReg(0))
5845 .addUse(Mul.getReg(0))
5846 .addUse(Scale)
5847 .setMIFlags(Flags);
5848
5849 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5850 .addUse(Fmas.getReg(0))
5851 .addUse(RHS)
5852 .addUse(LHS)
5853 .setMIFlags(Flags);
5854
5855 MI.eraseFromParent();
5856 return true;
5857}
5858
5861 MachineIRBuilder &B) const {
5862 Register Res0 = MI.getOperand(0).getReg();
5863 Register Res1 = MI.getOperand(1).getReg();
5864 Register Val = MI.getOperand(2).getReg();
5865 uint16_t Flags = MI.getFlags();
5866
5867 LLT Ty = MRI.getType(Res0);
5868 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5869
5870 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5871 .addUse(Val)
5872 .setMIFlags(Flags);
5873 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5874 .addUse(Val)
5875 .setMIFlags(Flags);
5876
5877 if (ST.hasFractBug()) {
5878 auto Fabs = B.buildFAbs(Ty, Val);
5879 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5880 auto IsFinite =
5881 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5882 auto Zero = B.buildConstant(InstrExpTy, 0);
5883 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5884 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5885 }
5886
5887 B.buildCopy(Res0, Mant);
5888 B.buildSExtOrTrunc(Res1, Exp);
5889
5890 MI.eraseFromParent();
5891 return true;
5892}
5893
5896 MachineIRBuilder &B) const {
5897 Register Res = MI.getOperand(0).getReg();
5898 Register LHS = MI.getOperand(2).getReg();
5899 Register RHS = MI.getOperand(3).getReg();
5900 uint16_t Flags = MI.getFlags();
5901
5902 LLT S32 = LLT::scalar(32);
5903 LLT S1 = LLT::scalar(1);
5904
5905 auto Abs = B.buildFAbs(S32, RHS, Flags);
5906 const APFloat C0Val(1.0f);
5907
5908 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5909 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5910 auto C2 = B.buildFConstant(S32, 1.0f);
5911
5912 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5913 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5914
5915 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5916
5917 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5918 .addUse(Mul0.getReg(0))
5919 .setMIFlags(Flags);
5920
5921 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5922
5923 B.buildFMul(Res, Sel, Mul1, Flags);
5924
5925 MI.eraseFromParent();
5926 return true;
5927}
5928
5931 MachineIRBuilder &B) const {
5932 // Bypass the correct expansion a standard promotion through G_FSQRT would
5933 // get. The f32 op is accurate enough for the f16 case.
5934 unsigned Flags = MI.getFlags();
5935 assert(!ST.has16BitInsts());
5936 const LLT F32 = LLT::scalar(32);
5937 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5938 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5939 .addUse(Ext.getReg(0))
5940 .setMIFlags(Flags);
5941 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5942 MI.eraseFromParent();
5943 return true;
5944}
5945
5948 MachineIRBuilder &B) const {
5949 MachineFunction &MF = B.getMF();
5950 Register Dst = MI.getOperand(0).getReg();
5951 Register X = MI.getOperand(1).getReg();
5952 const unsigned Flags = MI.getFlags();
5953 const LLT S1 = LLT::scalar(1);
5954 const LLT F32 = LLT::scalar(32);
5955 const LLT I32 = LLT::scalar(32);
5956
5957 if (allowApproxFunc(MF, Flags)) {
5958 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5959 .addUse(X)
5960 .setMIFlags(Flags);
5961 MI.eraseFromParent();
5962 return true;
5963 }
5964
5965 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5966 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5967 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5968 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5969 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5970
5972 if (needsDenormHandlingF32(MF, X, Flags)) {
5973 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5974 .addUse(SqrtX.getReg(0))
5975 .setMIFlags(Flags);
5976
5977 auto NegOne = B.buildConstant(I32, -1);
5978 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5979
5980 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5981 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5982
5983 auto PosOne = B.buildConstant(I32, 1);
5984 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5985
5986 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5987 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5988
5989 auto Zero = B.buildFConstant(F32, 0.0f);
5990 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5991
5992 SqrtS =
5993 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5994
5995 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5996 SqrtS =
5997 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5998 } else {
5999 auto SqrtR =
6000 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
6001 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
6002
6003 auto Half = B.buildFConstant(F32, 0.5f);
6004 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
6005 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
6006 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
6007 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
6008 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
6009 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
6010 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
6011 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
6012 }
6013
6014 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
6015
6016 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
6017
6018 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
6019
6020 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
6021 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
6022
6023 MI.eraseFromParent();
6024 return true;
6025}
6026
6029 MachineIRBuilder &B) const {
6030 // For double type, the SQRT and RSQ instructions don't have the required
6031 // precision, so we apply Goldschmidt's algorithm to improve the result:
6032 //
6033 // y0 = rsq(x)
6034 // g0 = x * y0
6035 // h0 = 0.5 * y0
6036 //
6037 // r0 = 0.5 - h0 * g0
6038 // g1 = g0 * r0 + g0
6039 // h1 = h0 * r0 + h0
6040 //
6041 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
6042 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
6043 // h2 = h1 * r1 + h1
6044 //
6045 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
6046 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
6047 //
6048 // sqrt(x) = g3
6049
6050 const LLT S1 = LLT::scalar(1);
6051 const LLT S32 = LLT::scalar(32);
6052 const LLT F64 = LLT::scalar(64);
6053
6054 Register Dst = MI.getOperand(0).getReg();
6055 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
6056
6057 Register X = MI.getOperand(1).getReg();
6058 unsigned Flags = MI.getFlags();
6059
6060 Register SqrtX = X;
6061 Register Scaling, ZeroInt;
6062 if (!MI.getFlag(MachineInstr::FmAfn)) {
6063 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
6064
6065 ZeroInt = B.buildConstant(S32, 0).getReg(0);
6066 Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0);
6067
6068 // Scale up input if it is too small.
6069 auto ScaleUpFactor = B.buildConstant(S32, 256);
6070 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
6071 SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
6072 }
6073
6074 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
6075
6076 auto Half = B.buildFConstant(F64, 0.5);
6077 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
6078 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
6079
6080 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
6081 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
6082
6083 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
6084 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
6085
6086 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
6087 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6088
6089 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6090
6091 Register SqrtRet = SqrtS2.getReg(0);
6092 if (!MI.getFlag(MachineInstr::FmAfn)) {
6093 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6094 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6095 auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6096
6097 // Scale down the result.
6098 auto ScaleDownFactor = B.buildConstant(S32, -128);
6099 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6100 SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6101 }
6102
6103 Register IsZeroOrInf;
6104 if (MI.getFlag(MachineInstr::FmNoInfs)) {
6105 auto ZeroFP = B.buildFConstant(F64, 0.0);
6106 IsZeroOrInf = B.buildFCmp(FCmpInst::FCMP_OEQ, S1, SqrtX, ZeroFP).getReg(0);
6107 } else {
6108 IsZeroOrInf = B.buildIsFPClass(S1, SqrtX, fcZero | fcPosInf).getReg(0);
6109 }
6110
6111 // TODO: Check for DAZ and expand to subnormals
6112
6113 // If x is +INF, +0, or -0, use its original value
6114 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6115
6116 MI.eraseFromParent();
6117 return true;
6118}
6119
6122 MachineIRBuilder &B) const {
6123 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
6124 if (Ty == LLT::scalar(32))
6125 return legalizeFSQRTF32(MI, MRI, B);
6126 if (Ty == LLT::scalar(64))
6127 return legalizeFSQRTF64(MI, MRI, B);
6128 if (Ty == LLT::scalar(16))
6129 return legalizeFSQRTF16(MI, MRI, B);
6130 return false;
6131}
6132
6133// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6134// FIXME: Why do we handle this one but not other removed instructions?
6135//
6136// Reciprocal square root. The clamp prevents infinite results, clamping
6137// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6138// +-max_float.
6141 MachineIRBuilder &B) const {
6142 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6143 return true;
6144
6145 Register Dst = MI.getOperand(0).getReg();
6146 Register Src = MI.getOperand(2).getReg();
6147 auto Flags = MI.getFlags();
6148
6149 LLT Ty = MRI.getType(Dst);
6150
6151 const fltSemantics *FltSemantics;
6152 if (Ty == LLT::scalar(32))
6153 FltSemantics = &APFloat::IEEEsingle();
6154 else if (Ty == LLT::scalar(64))
6155 FltSemantics = &APFloat::IEEEdouble();
6156 else
6157 return false;
6158
6159 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6160 .addUse(Src)
6161 .setMIFlags(Flags);
6162
6163 // We don't need to concern ourselves with the snan handling difference:
6164 // the rsq has quieted it (or not) already, so use the one which will directly select.
6165 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6166 const bool UseIEEE = MFI->getMode().IEEE;
6167
6168 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
6169 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6170 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6171
6172 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
6173
6174 if (UseIEEE)
6175 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6176 else
6177 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6178 MI.eraseFromParent();
6179 return true;
6180}
6181
6182// TODO: Fix pointer type handling
6185 Intrinsic::ID IID) const {
6186
6187 MachineIRBuilder &B = Helper.MIRBuilder;
6188 MachineRegisterInfo &MRI = *B.getMRI();
6189
6190 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6191 IID == Intrinsic::amdgcn_permlanex16;
6192 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6193 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6194 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6195 IID == Intrinsic::amdgcn_permlane_up ||
6196 IID == Intrinsic::amdgcn_permlane_down ||
6197 IID == Intrinsic::amdgcn_permlane_xor;
6198
6199 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6200 Register Src2, LLT VT) -> Register {
6201 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6202 switch (IID) {
6203 case Intrinsic::amdgcn_readfirstlane:
6204 case Intrinsic::amdgcn_permlane64:
6205 return LaneOp.getReg(0);
6206 case Intrinsic::amdgcn_readlane:
6207 case Intrinsic::amdgcn_set_inactive:
6208 case Intrinsic::amdgcn_set_inactive_chain_arg:
6209 return LaneOp.addUse(Src1).getReg(0);
6210 case Intrinsic::amdgcn_writelane:
6211 case Intrinsic::amdgcn_permlane_bcast:
6212 case Intrinsic::amdgcn_permlane_up:
6213 case Intrinsic::amdgcn_permlane_down:
6214 case Intrinsic::amdgcn_permlane_xor:
6215 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6216 case Intrinsic::amdgcn_permlane16:
6217 case Intrinsic::amdgcn_permlanex16: {
6218 Register Src3 = MI.getOperand(5).getReg();
6219 int64_t Src4 = MI.getOperand(6).getImm();
6220 int64_t Src5 = MI.getOperand(7).getImm();
6221 return LaneOp.addUse(Src1)
6222 .addUse(Src2)
6223 .addUse(Src3)
6224 .addImm(Src4)
6225 .addImm(Src5)
6226 .getReg(0);
6227 }
6228 case Intrinsic::amdgcn_mov_dpp8:
6229 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6230 case Intrinsic::amdgcn_update_dpp:
6231 return LaneOp.addUse(Src1)
6232 .addImm(MI.getOperand(4).getImm())
6233 .addImm(MI.getOperand(5).getImm())
6234 .addImm(MI.getOperand(6).getImm())
6235 .addImm(MI.getOperand(7).getImm())
6236 .getReg(0);
6237 default:
6238 llvm_unreachable("unhandled lane op");
6239 }
6240 };
6241
6242 Register DstReg = MI.getOperand(0).getReg();
6243 Register Src0 = MI.getOperand(2).getReg();
6244 Register Src1, Src2;
6245 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6246 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6247 IsPermlaneShuffle) {
6248 Src1 = MI.getOperand(3).getReg();
6249 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6250 IsPermlaneShuffle) {
6251 Src2 = MI.getOperand(4).getReg();
6252 }
6253 }
6254
6255 LLT Ty = MRI.getType(DstReg);
6256 unsigned Size = Ty.getSizeInBits();
6257
6258 unsigned SplitSize = 32;
6259 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6260 ST.hasDPALU_DPP() &&
6261 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6262 SplitSize = 64;
6263
6264 if (Size == SplitSize) {
6265 // Already legal
6266 return true;
6267 }
6268
6269 if (Size < 32) {
6270 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6271
6272 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6273 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6274
6275 if (IID == Intrinsic::amdgcn_writelane)
6276 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6277
6278 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6279 B.buildTrunc(DstReg, LaneOpDst);
6280 MI.eraseFromParent();
6281 return true;
6282 }
6283
6284 if (Size % SplitSize != 0)
6285 return false;
6286
6287 LLT PartialResTy = LLT::scalar(SplitSize);
6288 bool NeedsBitcast = false;
6289 if (Ty.isVector()) {
6290 LLT EltTy = Ty.getElementType();
6291 unsigned EltSize = EltTy.getSizeInBits();
6292 if (EltSize == SplitSize) {
6293 PartialResTy = EltTy;
6294 } else if (EltSize == 16 || EltSize == 32) {
6295 unsigned NElem = SplitSize / EltSize;
6296 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6297 } else {
6298 // Handle all other cases via S32/S64 pieces
6299 NeedsBitcast = true;
6300 }
6301 }
6302
6303 SmallVector<Register, 4> PartialRes;
6304 unsigned NumParts = Size / SplitSize;
6305 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6306 MachineInstrBuilder Src1Parts, Src2Parts;
6307
6308 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6309 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6310
6311 if (IID == Intrinsic::amdgcn_writelane)
6312 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6313
6314 for (unsigned i = 0; i < NumParts; ++i) {
6315 Src0 = Src0Parts.getReg(i);
6316
6317 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6318 Src1 = Src1Parts.getReg(i);
6319
6320 if (IID == Intrinsic::amdgcn_writelane)
6321 Src2 = Src2Parts.getReg(i);
6322
6323 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6324 }
6325
6326 if (NeedsBitcast)
6327 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6328 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6329 else
6330 B.buildMergeLikeInstr(DstReg, PartialRes);
6331
6332 MI.eraseFromParent();
6333 return true;
6334}
6335
6338 MachineIRBuilder &B) const {
6340 ST.getTargetLowering()->getImplicitParameterOffset(
6342 LLT DstTy = MRI.getType(DstReg);
6343 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6344
6345 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6346 if (!loadInputValue(KernargPtrReg, B,
6348 return false;
6349
6350 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6351 B.buildConstant(IdxTy, Offset).getReg(0));
6352 return true;
6353}
6354
6355/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6356/// bits of the pointer and replace them with the stride argument, then
6357/// merge_values everything together. In the common case of a raw buffer (the
6358/// stride component is 0), we can just AND off the upper half.
6361 Register Result = MI.getOperand(0).getReg();
6362 Register Pointer = MI.getOperand(2).getReg();
6363 Register Stride = MI.getOperand(3).getReg();
6364 Register NumRecords = MI.getOperand(4).getReg();
6365 Register Flags = MI.getOperand(5).getReg();
6366
6367 LLT S32 = LLT::scalar(32);
6368 LLT S64 = LLT::scalar(64);
6369
6370 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6371
6372 auto ExtStride = B.buildAnyExt(S32, Stride);
6373
6374 if (ST.has45BitNumRecordsBufferResource()) {
6375 Register Zero = B.buildConstant(S32, 0).getReg(0);
6376 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6377 // num_records.
6378 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6379 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6380 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6381 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6382 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6383
6384 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6385 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
6386 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6387 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6388 auto ExtShiftedStride =
6389 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6390 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6391 auto ExtShiftedFlags =
6392 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6393 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6394 Register HighHalf =
6395 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6396 B.buildMergeValues(Result, {LowHalf, HighHalf});
6397 } else {
6398 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6399 auto Unmerge = B.buildUnmerge(S32, Pointer);
6400 auto LowHalf = Unmerge.getReg(0);
6401 auto HighHalf = Unmerge.getReg(1);
6402
6403 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6404 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6405 auto ShiftConst = B.buildConstant(S32, 16);
6406 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6407 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6408 Register NewHighHalfReg = NewHighHalf.getReg(0);
6409 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6410 }
6411
6412 MI.eraseFromParent();
6413 return true;
6414}
6415
6418 MachineIRBuilder &B) const {
6419 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6420 if (!MFI->isEntryFunction()) {
6421 return legalizePreloadedArgIntrin(MI, MRI, B,
6423 }
6424
6425 Register DstReg = MI.getOperand(0).getReg();
6426 if (!getImplicitArgPtr(DstReg, MRI, B))
6427 return false;
6428
6429 MI.eraseFromParent();
6430 return true;
6431}
6432
6435 MachineIRBuilder &B) const {
6436 Function &F = B.getMF().getFunction();
6437 std::optional<uint32_t> KnownSize =
6439 if (KnownSize.has_value())
6440 B.buildConstant(DstReg, *KnownSize);
6441 return false;
6442}
6443
6446 MachineIRBuilder &B) const {
6447
6448 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6449 if (!MFI->isEntryFunction()) {
6450 return legalizePreloadedArgIntrin(MI, MRI, B,
6452 }
6453
6454 Register DstReg = MI.getOperand(0).getReg();
6455 if (!getLDSKernelId(DstReg, MRI, B))
6456 return false;
6457
6458 MI.eraseFromParent();
6459 return true;
6460}
6461
6465 unsigned AddrSpace) const {
6466 const LLT S32 = LLT::scalar(32);
6467 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6468 Register Hi32 = Unmerge.getReg(1);
6469
6470 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6471 ST.hasGloballyAddressableScratch()) {
6472 Register FlatScratchBaseHi =
6473 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6474 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6475 .getReg(0);
6476 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6477 // Test bits 63..58 against the aperture address.
6478 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6479 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6480 B.buildConstant(S32, 1u << 26));
6481 } else {
6482 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6483 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6484 }
6485 MI.eraseFromParent();
6486 return true;
6487}
6488
6489// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6490// offset (the offset that is included in bounds checking and swizzling, to be
6491// split between the instruction's voffset and immoffset fields) and soffset
6492// (the offset that is excluded from bounds checking and swizzling, to go in
6493// the instruction's soffset field). This function takes the first kind of
6494// offset and figures out how to split it between voffset and immoffset.
6495std::pair<Register, unsigned>
6497 Register OrigOffset) const {
6498 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6499 Register BaseReg;
6500 unsigned ImmOffset;
6501 const LLT S32 = LLT::scalar(32);
6502 MachineRegisterInfo &MRI = *B.getMRI();
6503
6504 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6505 // being added, so we can only safely match a 32-bit addition with no unsigned
6506 // overflow.
6507 bool CheckNUW = ST.hasGFX1250Insts();
6508 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6509 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6510
6511 // If BaseReg is a pointer, convert it to int.
6512 if (MRI.getType(BaseReg).isPointer())
6513 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6514
6515 // If the immediate value is too big for the immoffset field, put only bits
6516 // that would normally fit in the immoffset field. The remaining value that
6517 // is copied/added for the voffset field is a large power of 2, and it
6518 // stands more chance of being CSEd with the copy/add for another similar
6519 // load/store.
6520 // However, do not do that rounding down if that is a negative
6521 // number, as it appears to be illegal to have a negative offset in the
6522 // vgpr, even if adding the immediate offset makes it positive.
6523 unsigned Overflow = ImmOffset & ~MaxImm;
6524 ImmOffset -= Overflow;
6525 if ((int32_t)Overflow < 0) {
6526 Overflow += ImmOffset;
6527 ImmOffset = 0;
6528 }
6529
6530 if (Overflow != 0) {
6531 if (!BaseReg) {
6532 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6533 } else {
6534 auto OverflowVal = B.buildConstant(S32, Overflow);
6535 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6536 }
6537 }
6538
6539 if (!BaseReg)
6540 BaseReg = B.buildConstant(S32, 0).getReg(0);
6541
6542 return std::pair(BaseReg, ImmOffset);
6543}
6544
6545/// Handle register layout difference for f16 images for some subtargets.
6548 Register Reg,
6549 bool ImageStore) const {
6550 const LLT S16 = LLT::scalar(16);
6551 const LLT S32 = LLT::scalar(32);
6552 LLT StoreVT = MRI.getType(Reg);
6553 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6554
6555 if (ST.hasUnpackedD16VMem()) {
6556 auto Unmerge = B.buildUnmerge(S16, Reg);
6557
6558 SmallVector<Register, 4> WideRegs;
6559 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6560 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6561
6562 int NumElts = StoreVT.getNumElements();
6563
6564 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6565 .getReg(0);
6566 }
6567
6568 if (ImageStore && ST.hasImageStoreD16Bug()) {
6569 if (StoreVT.getNumElements() == 2) {
6570 SmallVector<Register, 4> PackedRegs;
6571 Reg = B.buildBitcast(S32, Reg).getReg(0);
6572 PackedRegs.push_back(Reg);
6573 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6574 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6575 .getReg(0);
6576 }
6577
6578 if (StoreVT.getNumElements() == 3) {
6579 SmallVector<Register, 4> PackedRegs;
6580 auto Unmerge = B.buildUnmerge(S16, Reg);
6581 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6582 PackedRegs.push_back(Unmerge.getReg(I));
6583 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6584 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6585 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6586 }
6587
6588 if (StoreVT.getNumElements() == 4) {
6589 SmallVector<Register, 4> PackedRegs;
6590 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6591 auto Unmerge = B.buildUnmerge(S32, Reg);
6592 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6593 PackedRegs.push_back(Unmerge.getReg(I));
6594 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6595 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6596 .getReg(0);
6597 }
6598
6599 llvm_unreachable("invalid data type");
6600 }
6601
6602 if (StoreVT == LLT::fixed_vector(3, S16)) {
6603 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6604 .getReg(0);
6605 }
6606 return Reg;
6607}
6608
6610 Register VData, LLT MemTy,
6611 bool IsFormat) const {
6612 MachineRegisterInfo *MRI = B.getMRI();
6613 LLT Ty = MRI->getType(VData);
6614
6615 const LLT S16 = LLT::scalar(16);
6616
6617 // Fixup buffer resources themselves needing to be v4i32.
6619 return castBufferRsrcToV4I32(VData, B);
6620
6621 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6622 Ty = getBitcastRegisterType(Ty);
6623 VData = B.buildBitcast(Ty, VData).getReg(0);
6624 }
6625 // Fixup illegal register types for i8 stores.
6626 if (Ty == LLT::scalar(8) || Ty == S16) {
6627 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6628 return AnyExt;
6629 }
6630
6631 if (Ty.isVector()) {
6632 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6633 if (IsFormat)
6634 return handleD16VData(B, *MRI, VData);
6635 }
6636 }
6637
6638 return VData;
6639}
6640
6642 LegalizerHelper &Helper,
6643 bool IsTyped,
6644 bool IsFormat) const {
6645 MachineIRBuilder &B = Helper.MIRBuilder;
6646 MachineRegisterInfo &MRI = *B.getMRI();
6647
6648 Register VData = MI.getOperand(1).getReg();
6649 LLT Ty = MRI.getType(VData);
6650 LLT EltTy = Ty.getScalarType();
6651 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6652 const LLT S32 = LLT::scalar(32);
6653
6654 MachineMemOperand *MMO = *MI.memoperands_begin();
6655 const int MemSize = MMO->getSize().getValue();
6656 LLT MemTy = MMO->getMemoryType();
6657
6658 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6659
6661 Register RSrc = MI.getOperand(2).getReg();
6662
6663 unsigned ImmOffset;
6664
6665 // The typed intrinsics add an immediate after the registers.
6666 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6667
6668 // The struct intrinsic variants add one additional operand over raw.
6669 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6670 Register VIndex;
6671 int OpOffset = 0;
6672 if (HasVIndex) {
6673 VIndex = MI.getOperand(3).getReg();
6674 OpOffset = 1;
6675 } else {
6676 VIndex = B.buildConstant(S32, 0).getReg(0);
6677 }
6678
6679 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6680 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6681
6682 unsigned Format = 0;
6683 if (IsTyped) {
6684 Format = MI.getOperand(5 + OpOffset).getImm();
6685 ++OpOffset;
6686 }
6687
6688 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6689
6690 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6691
6692 unsigned Opc;
6693 if (IsTyped) {
6694 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6695 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6696 } else if (IsFormat) {
6697 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6698 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6699 } else {
6700 switch (MemSize) {
6701 case 1:
6702 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6703 break;
6704 case 2:
6705 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6706 break;
6707 default:
6708 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6709 break;
6710 }
6711 }
6712
6713 auto MIB = B.buildInstr(Opc)
6714 .addUse(VData) // vdata
6715 .addUse(RSrc) // rsrc
6716 .addUse(VIndex) // vindex
6717 .addUse(VOffset) // voffset
6718 .addUse(SOffset) // soffset
6719 .addImm(ImmOffset); // offset(imm)
6720
6721 if (IsTyped)
6722 MIB.addImm(Format);
6723
6724 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6725 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6726 .addMemOperand(MMO);
6727
6728 MI.eraseFromParent();
6729 return true;
6730}
6731
/// Emit a single buffer-load instruction with opcode \p Opc through \p B.
///
/// Operands are appended in this fixed order: the result register
/// \p LoadDstReg (def), then \p RSrc, \p VIndex, \p VOffset and \p SOffset
/// (uses), then the immediate \p ImmOffset. \p Format is appended as an extra
/// immediate only when \p IsTyped is set. The instruction is finished with the
/// \p AuxiliaryData immediate, an idxen immediate derived from \p HasVIndex,
/// and the memory operand \p MMO.
///
/// The built instruction is not returned; callers observe the result through
/// \p LoadDstReg.
6732 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6733 Register VIndex, Register VOffset, Register SOffset,
6734 unsigned ImmOffset, unsigned Format,
6735 unsigned AuxiliaryData, MachineMemOperand *MMO,
6736 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6737 auto MIB = B.buildInstr(Opc)
6738 .addDef(LoadDstReg) // vdata
6739 .addUse(RSrc) // rsrc
6740 .addUse(VIndex) // vindex
6741 .addUse(VOffset) // voffset
6742 .addUse(SOffset) // soffset
6743 .addImm(ImmOffset); // offset(imm)
6744
// Only the typed variants carry a format immediate; it sits between the
// offset and the auxiliary-data immediates.
6745 if (IsTyped)
6746 MIB.addImm(Format);
6747
// idxen is encoded as -1 when a vindex operand was supplied and 0 otherwise.
6748 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6749 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6750 .addMemOperand(MMO);
6751}
6752
6754 LegalizerHelper &Helper,
6755 bool IsFormat,
6756 bool IsTyped) const {
// Rewrites a buffer load intrinsic into one of the G_AMDGPU_*BUFFER_LOAD*
// pseudos emitted through buildBufferLoad and then erases the intrinsic.
// Returns false for the TFE combinations that are not handled (typed loads and
// D16 format loads) and true once the replacement has been built.
// NOTE(review): this listing jumps from embedded line 6752 to 6754, so the
// opening line of the signature (presumably
// AMDGPULegalizerInfo::legalizeBufferLoad taking the MachineInstr `MI`) is not
// visible here - confirm against the real file.
6757 MachineIRBuilder &B = Helper.MIRBuilder;
6758 MachineRegisterInfo &MRI = *B.getMRI();
6759 GISelChangeObserver &Observer = Helper.Observer;
6760
6761 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6762 MachineMemOperand *MMO = *MI.memoperands_begin();
6763 const LLT MemTy = MMO->getMemoryType();
6764 const LLT S32 = LLT::scalar(32);
6765
6766 Register Dst = MI.getOperand(0).getReg();
6767
// A second explicit def is the TFE status result. OpOffset accumulates how far
// the remaining operand indices are shifted by such optional operands.
6768 Register StatusDst;
6769 int OpOffset = 0;
6770 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6771 bool IsTFE = MI.getNumExplicitDefs() == 2;
6772 if (IsTFE) {
6773 StatusDst = MI.getOperand(1).getReg();
6774 ++OpOffset;
6775 }
6776
6777 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6778 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6779
6780 // The typed intrinsics add an immediate after the registers.
6781 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6782
6783 // The struct intrinsic variants add one additional operand over raw.
6784 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6785 Register VIndex;
6786 if (HasVIndex) {
6787 VIndex = MI.getOperand(3 + OpOffset).getReg();
6788 ++OpOffset;
6789 } else {
// Without a vindex operand a constant zero is used in its place.
6790 VIndex = B.buildConstant(S32, 0).getReg(0);
6791 }
6792
6793 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6794 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6795
6796 unsigned Format = 0;
6797 if (IsTyped) {
6798 Format = MI.getOperand(5 + OpOffset).getImm();
6799 ++OpOffset;
6800 }
6801
6802 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6803 unsigned ImmOffset;
6804
6805 LLT Ty = MRI.getType(Dst);
6806 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6807 // logic doesn't have to handle that case.
6808 if (hasBufferRsrcWorkaround(Ty)) {
6809 Observer.changingInstr(MI);
6810 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6811 Observer.changedInstr(MI);
// The cast replaced operand 0, so re-read Dst and put the insertion point back
// at MI.
6812 Dst = MI.getOperand(0).getReg();
6813 B.setInsertPt(B.getMBB(), MI);
6814 }
6815 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6816 Ty = getBitcastRegisterType(Ty);
6817 Observer.changingInstr(MI);
6818 Helper.bitcastDst(MI, Ty, 0);
6819 Observer.changedInstr(MI);
6820 Dst = MI.getOperand(0).getReg();
6821 B.setInsertPt(B.getMBB(), MI);
6822 }
6823
6824 LLT EltTy = Ty.getScalarType();
6825 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6826 const bool Unpacked = ST.hasUnpackedD16VMem();
6827
// splitBufferOffsets yields the voffset register and the immediate offset that
// are passed on to the pseudo.
6828 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6829
6830 unsigned Opc;
6831
6832 // TODO: Support TFE for typed and narrow loads.
6833 if (IsTyped) {
6834 if (IsTFE)
6835 return false;
6836 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6837 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6838 } else if (IsFormat) {
6839 if (IsD16) {
6840 if (IsTFE)
6841 return false;
6842 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6843 } else {
6844 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6845 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6846 }
6847 } else {
// Plain loads pick the pseudo from the memory size: 8 and 16 bits use the
// UBYTE/USHORT forms, everything else the generic load.
6848 switch (MemTy.getSizeInBits()) {
6849 case 8:
6850 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6851 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6852 break;
6853 case 16:
6854 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6855 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6856 break;
6857 default:
6858 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6859 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6860 break;
6861 }
6862 }
6863
// TFE: load the value dwords plus one extra dword into a single vector of s32,
// then unmerge it so the last element lands in StatusDst.
6864 if (IsTFE) {
6865 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6866 unsigned NumLoadDWords = NumValueDWords + 1;
6867 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6868 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6869 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6870 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6871 if (MemTy.getSizeInBits() < 32) {
// Sub-dword memory type: the value arrives in one s32 and is truncated to Dst.
6872 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6873 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6874 B.buildTrunc(Dst, ExtDst);
6875 } else if (NumValueDWords == 1) {
6876 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6877 } else {
// Multi-dword value: unmerge into per-dword registers, drop the status entry
// from the list and merge the data dwords back into Dst.
6878 SmallVector<Register, 5> LoadElts;
6879 for (unsigned I = 0; I != NumValueDWords; ++I)
6880 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6881 LoadElts.push_back(StatusDst);
6882 B.buildUnmerge(LoadElts, LoadDstReg);
6883 LoadElts.truncate(NumValueDWords);
6884 B.buildMergeLikeInstr(Dst, LoadElts);
6885 }
6886 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6887 (IsD16 && !Ty.isVector())) {
// Sub-dword non-D16 loads and scalar D16 loads are loaded as s32 and truncated
// to Dst.
6888 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6889 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6890 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
// Step the insertion point forward by one before emitting the truncate.
6891 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6892 B.buildTrunc(Dst, LoadDstReg);
6893 } else if (Unpacked && IsD16 && Ty.isVector()) {
// Unpacked D16 vector: load with 32-bit elements, then truncate each element to
// EltTy and merge the pieces into Dst.
6894 LLT UnpackedTy = Ty.changeElementSize(32);
6895 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6896 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6897 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6898 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6899 // FIXME: G_TRUNC should work, but legalization currently fails
6900 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
// NOTE(review): `Repack` is declared on embedded line 6901, which this listing
// omits.
6902 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6903 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6904 B.buildMergeLikeInstr(Dst, Repack);
6905 } else {
// No repacking needed: the pseudo defines Dst directly.
6906 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6907 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6908 }
6909
6910 MI.eraseFromParent();
6911 return true;
6912}
6913
6914static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6915 switch (IntrID) {
6916 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6926 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6928 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6931 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6933 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6936 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6938 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6941 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6943 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6946 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6948 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6951 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6953 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6956 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6958 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6961 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6962 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6963 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6966 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6968 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6969 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6971 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6972 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6973 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6976 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6977 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6978 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6979 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6980 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6981 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6982 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6983 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6984 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6985 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6986 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6987 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6988 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6989 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6990 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6991 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6992 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6993 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6994 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6995 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6996 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6997 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6998 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6999 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
7000 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
7001 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
7002 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
7003 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
7004 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
7005 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
7006 default:
7007 llvm_unreachable("unhandled atomic opcode");
7008 }
7009}
7010
7013 Intrinsic::ID IID) const {
// Rewrites a buffer atomic intrinsic into the pseudo returned by
// getBufferAtomicPseudo(IID), carrying over the value, resource and offset
// operands plus the memory operand, then erases the intrinsic. Always returns
// true.
// NOTE(review): embedded lines 7011-7012 (the start of the signature,
// presumably AMDGPULegalizerInfo::legalizeBufferAtomic with `MI` and `B`) are
// missing from this listing - confirm against the real file.
7014 const bool IsCmpSwap =
7015 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
7016 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
7017 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
7018 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
7019
7020 Register Dst = MI.getOperand(0).getReg();
7021 // Since we don't have 128-bit atomics, we don't need to handle the case of
7022 // p8 arguments to the atomic itself
7023 Register VData = MI.getOperand(2).getReg();
7024
// cmpswap carries an extra compare value after vdata; OpOffset tracks how far
// the following operand indices are shifted by optional operands.
7025 Register CmpVal;
7026 int OpOffset = 0;
7027
7028 if (IsCmpSwap) {
7029 CmpVal = MI.getOperand(3).getReg();
7030 ++OpOffset;
7031 }
7032
7033 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
7034 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
7035 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7036
7037 // The struct intrinsic variants add one additional operand over raw.
7038 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
7039 Register VIndex;
7040 if (HasVIndex) {
7041 VIndex = MI.getOperand(4 + OpOffset).getReg();
7042 ++OpOffset;
7043 } else {
// Without a vindex operand a constant zero is used in its place.
7044 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
7045 }
7046
7047 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
7048 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
7049 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
7050
7051 MachineMemOperand *MMO = *MI.memoperands_begin();
7052
// splitBufferOffsets yields the voffset register and the immediate offset that
// are passed on to the pseudo.
7053 unsigned ImmOffset;
7054 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
7055
7056 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
7057 .addDef(Dst)
7058 .addUse(VData); // vdata
7059
// The compare value, when present, sits between vdata and rsrc.
7060 if (IsCmpSwap)
7061 MIB.addReg(CmpVal);
7062
7063 MIB.addUse(RSrc) // rsrc
7064 .addUse(VIndex) // vindex
7065 .addUse(VOffset) // voffset
7066 .addUse(SOffset) // soffset
7067 .addImm(ImmOffset) // offset(imm)
7068 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
7069 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
7070 .addMemOperand(MMO);
7071
7072 MI.eraseFromParent();
7073 return true;
7074}
7075
7076/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
7077/// vector with s16 typed elements.
///
/// Walks the address operands of \p MI in the range
/// [Intr->VAddrStart, Intr->VAddrEnd), offset by \p ArgOffset, and appends one
/// <2 x s16> register per produced dword to \p PackedAddrs. Operands that are
/// not registers are skipped. \p IsA16 and \p IsG16 select whether the
/// coordinate and gradient operands, respectively, are packed in pairs or
/// bitcast individually.
///
/// NOTE(review): embedded lines 7078 and 7081 (the start of the signature and
/// the `Intr` parameter) are missing from this listing, and no parameter named
/// AddrRegs is visible here - confirm the \p reference above against the real
/// file.
7079 SmallVectorImpl<Register> &PackedAddrs,
7080 unsigned ArgOffset,
7082 bool IsA16, bool IsG16) {
7083 const LLT S16 = LLT::scalar(16);
7084 const LLT V2S16 = LLT::fixed_vector(2, 16);
7085 auto EndIdx = Intr->VAddrEnd;
7086
7087 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
7088 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7089 if (!SrcOp.isReg())
7090 continue; // _L to _LZ may have eliminated this.
7091
7092 Register AddrReg = SrcOp.getReg();
7093
// Operands taking this branch occupy a dword each: everything before the
// gradients, the gradients when G16 is off, and the coordinates when A16 is
// off.
7094 if ((I < Intr->GradientStart) ||
7095 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
7096 (I >= Intr->CoordStart && !IsA16)) {
7097 if ((I < Intr->GradientStart) && IsA16 &&
7098 (B.getMRI()->getType(AddrReg) == S16)) {
7099 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7100 // Special handling of bias when A16 is on. Bias is of type half but
7101 // occupies full 32-bit.
7102 PackedAddrs.push_back(
7103 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7104 .getReg(0));
7105 } else {
7106 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7107 "Bias needs to be converted to 16 bit in A16 mode");
7108 // Handle any gradient or coordinate operands that should not be packed
7109 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7110 PackedAddrs.push_back(AddrReg);
7111 }
7112 } else {
7113 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7114 // derivatives dx/dh and dx/dv are packed with undef.
7115 if (((I + 1) >= EndIdx) ||
7116 ((Intr->NumGradients / 2) % 2 == 1 &&
7117 (I == static_cast<unsigned>(Intr->GradientStart +
7118 (Intr->NumGradients / 2) - 1) ||
7119 I == static_cast<unsigned>(Intr->GradientStart +
7120 Intr->NumGradients - 1))) ||
7121 // Check for _L to _LZ optimization
7122 !MI.getOperand(ArgOffset + I + 1).isReg()) {
7123 PackedAddrs.push_back(
7124 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7125 .getReg(0));
7126 } else {
// Pack this operand with the next one and skip the operand just consumed.
7127 PackedAddrs.push_back(
7128 B.buildBuildVector(
7129 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7130 .getReg(0));
7131 ++I;
7132 }
7133 }
7134 }
7135}
7136
7137/// Convert from separate vaddr components to a single vector address register,
7138/// and replace the remaining operands with $noreg.
///
/// The \p NumVAddrs operands of \p MI starting at operand index \p DimIdx are
/// examined; the register operands among them are asserted to be s32.
///
/// NOTE(review): embedded line 7139 (the start of the signature, presumably
/// introducing `B` and `MI`) is missing from this listing.
7140 int DimIdx, int NumVAddrs) {
7141 const LLT S32 = LLT::scalar(32);
// S32 is only referenced from the assert below; the void cast keeps it "used"
// when asserts are compiled out.
7142 (void)S32;
7143 SmallVector<Register, 8> AddrRegs;
7144 for (int I = 0; I != NumVAddrs; ++I) {
7145 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7146 if (SrcOp.isReg()) {
7147 AddrRegs.push_back(SrcOp.getReg());
7148 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7149 }
7150 }
7151
// A single address register is left in place; otherwise the collected registers
// are combined with a build_vector that replaces the first operand.
7152 int NumAddrRegs = AddrRegs.size();
7153 if (NumAddrRegs != 1) {
7154 auto VAddr =
7155 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7156 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7157 }
7158
// Clear every register operand after the first.
7159 for (int I = 1; I != NumVAddrs; ++I) {
7160 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7161 if (SrcOp.isReg())
7162 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7163 }
7164}
7165
7166/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7167///
7168/// Depending on the subtarget, load/store with 16-bit element data need to be
7169/// rewritten to use the low half of 32-bit registers, or directly use a packed
7170/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7171/// registers.
7172///
7173/// We don't want to directly select image instructions just yet, but also want
7174/// to expose all register repacking to the legalizer/combiners. We also don't
7175/// want a selected instruction entering RegBankSelect. In order to avoid
7176/// defining a multitude of intermediate image instructions, directly hack on
7177/// the intrinsic's arguments. In cases like a16 addresses, this requires
7178/// padding now unnecessary arguments with $noreg.
///
/// Returns false when a combination is rejected (for example vector atomic
/// data outside the packed 16-bit atomics, A16 on a subtarget without it, or a
/// result type too small for the dmask) and true once the instruction has been
/// rewritten in place or replaced.
///
/// NOTE(review): embedded lines 7179-7180 (the start of the signature,
/// presumably AMDGPULegalizerInfo::legalizeImageIntrinsic with `MI`, `B` and
/// `Observer`) are missing from this listing - confirm against the real file.
7181 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7182
7183 const MachineFunction &MF = *MI.getMF();
7184 const unsigned NumDefs = MI.getNumExplicitDefs();
// Operand indices from Intr are relative to the first argument, which follows
// the defs and one further operand.
7185 const unsigned ArgOffset = NumDefs + 1;
7186 bool IsTFE = NumDefs == 2;
7187 // We are only processing the operands of d16 image operations on subtargets
7188 // that use the unpacked register layout, or need to repack the TFE result.
7189
7190 // TODO: Do we need to guard against already legalized intrinsics?
// NOTE(review): embedded line 7192 (the initializer of BaseOpcode) is missing
// from this listing.
7191 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7193
7194 MachineRegisterInfo *MRI = B.getMRI();
7195 const LLT S32 = LLT::scalar(32);
7196 const LLT S16 = LLT::scalar(16);
7197 const LLT V2S16 = LLT::fixed_vector(2, 16);
7198
7199 unsigned DMask = 0;
7200 Register VData;
7201 LLT Ty;
7202
// VData is operand 0 when the instruction has a def, otherwise operand 1.
7203 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7204 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7205 Ty = MRI->getType(VData);
7206 }
7207
7208 const bool IsAtomicPacked16Bit =
7209 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7210 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7211
7212 // Check for 16 bit addresses and pack if true.
7213 LLT GradTy =
7214 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7215 LLT AddrTy =
7216 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
7217 const bool IsG16 =
7218 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7219 const bool IsA16 = AddrTy == S16;
7220 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7221
// DMaskLanes stays 0 for atomics, is fixed at 4 for gather4 and is otherwise
// the number of set dmask bits.
7222 int DMaskLanes = 0;
7223 if (!BaseOpcode->Atomic) {
7224 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7225 if (BaseOpcode->Gather4) {
7226 DMaskLanes = 4;
7227 } else if (DMask != 0) {
7228 DMaskLanes = llvm::popcount(DMask);
7229 } else if (!IsTFE && !BaseOpcode->Store) {
7230 // If dmask is 0, this is a no-op load. This can be eliminated.
7231 B.buildUndef(MI.getOperand(0));
7232 MI.eraseFromParent();
7233 return true;
7234 }
7235 }
7236
// From here on MI is modified in place; the scope_exit issues the matching
// changedInstr notification on every return path.
7237 Observer.changingInstr(MI);
7238 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7239
7240 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7241 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7242 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7243 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7244 unsigned NewOpcode = LoadOpcode;
7245 if (BaseOpcode->Store)
7246 NewOpcode = StoreOpcode;
7247 else if (BaseOpcode->NoReturn)
7248 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7249
7250 // Track that we legalized this
7251 MI.setDesc(B.getTII().get(NewOpcode));
7252
7253 // Expecting to get an error flag since TFC is on - and dmask is 0 Force
7254 // dmask to be at least 1 otherwise the instruction will fail
7255 if (IsTFE && DMask == 0) {
7256 DMask = 0x1;
7257 DMaskLanes = 1;
7258 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7259 }
7260
7261 if (BaseOpcode->Atomic) {
7262 Register VData0 = MI.getOperand(2).getReg();
7263 LLT Ty = MRI->getType(VData0);
7264
7265 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7266 if (Ty.isVector() && !IsAtomicPacked16Bit)
7267 return false;
7268
7269 if (BaseOpcode->AtomicX2) {
7270 Register VData1 = MI.getOperand(3).getReg();
7271 // The two values are packed in one register.
7272 LLT PackedTy = LLT::fixed_vector(2, Ty);
7273 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7274 MI.getOperand(2).setReg(Concat.getReg(0));
7275 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7276 }
7277 }
7278
7279 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7280
7281 // Rewrite the addressing register layout before doing anything else.
7282 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7283 // 16 bit gradients are supported, but are tied to the A16 control
7284 // so both gradients and addresses must be 16 bit
7285 return false;
7286 }
7287
7288 if (IsA16 && !ST.hasA16()) {
7289 // A16 not supported
7290 return false;
7291 }
7292
7293 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7294 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7295
7296 if (IsA16 || IsG16) {
7297 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7298 // instructions expect VGPR_32
7299 SmallVector<Register, 4> PackedRegs;
7300
7301 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7302
7303 // See also below in the non-a16 branch
7304 const bool UseNSA = ST.hasNSAEncoding() &&
7305 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7306 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7307 const bool UsePartialNSA =
7308 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7309
7310 if (UsePartialNSA) {
7311 // Pack registers that would go over NSAMaxSize into last VAddr register
7312 LLT PackedAddrTy =
7313 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7314 auto Concat = B.buildConcatVectors(
7315 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7316 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7317 PackedRegs.resize(NSAMaxSize);
7318 } else if (!UseNSA && PackedRegs.size() > 1) {
// Without NSA all packed dwords are concatenated into one address register.
7319 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7320 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7321 PackedRegs[0] = Concat.getReg(0);
7322 PackedRegs.resize(1);
7323 }
7324
// Write the packed registers back over the leading address operands and clear
// the register operands that are no longer needed.
7325 const unsigned NumPacked = PackedRegs.size();
7326 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7327 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7328 if (!SrcOp.isReg()) {
7329 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7330 continue;
7331 }
7332
7333 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7334
7335 if (I - Intr->VAddrStart < NumPacked)
7336 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7337 else
7338 SrcOp.setReg(AMDGPU::NoRegister);
7339 }
7340 } else {
7341 // If the register allocator cannot place the address registers contiguously
7342 // without introducing moves, then using the non-sequential address encoding
7343 // is always preferable, since it saves VALU instructions and is usually a
7344 // wash in terms of code size or even better.
7345 //
7346 // However, we currently have no way of hinting to the register allocator
7347 // that MIMG addresses should be placed contiguously when it is possible to
7348 // do so, so force non-NSA for the common 2-address case as a heuristic.
7349 //
7350 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7351 // allocation when possible.
7352 //
7353 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7354 // set of the remaining addresses.
7355 const bool UseNSA = ST.hasNSAEncoding() &&
7356 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7357 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7358 const bool UsePartialNSA =
7359 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7360
7361 if (UsePartialNSA) {
// NOTE(review): embedded line 7362 is missing from this listing; presumably it
// opens a convertImageAddrToPacked call like the one in the branch below.
7363 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7364 Intr->NumVAddrs - NSAMaxSize + 1);
7365 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7366 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7367 Intr->NumVAddrs);
7368 }
7369 }
7370
// Append an immediate recording the address modes: bit 0 for A16, bit 1 for
// G16.
7371 int Flags = 0;
7372 if (IsA16)
7373 Flags |= 1;
7374 if (IsG16)
7375 Flags |= 2;
7376 MI.addOperand(MachineOperand::CreateImm(Flags));
7377
7378 if (BaseOpcode->NoReturn) { // No TFE for stores?
7379 // TODO: Handle dmask trim
7380 if (!Ty.isVector() || !IsD16)
7381 return true;
7382
7383 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7384 if (RepackedReg != VData) {
7385 MI.getOperand(1).setReg(RepackedReg);
7386 }
7387
7388 return true;
7389 }
7390
7391 Register DstReg = MI.getOperand(0).getReg();
7392 const LLT EltTy = Ty.getScalarType();
7393 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7394
7395 // Confirm that the return type is large enough for the dmask specified
7396 if (NumElts < DMaskLanes)
7397 return false;
7398
7399 if (NumElts > 4 || DMaskLanes > 4)
7400 return false;
7401
7402 // Image atomic instructions are using DMask to specify how many bits
7403 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7404 // DMaskLanes for image atomic has default value '0'.
7405 // We must be sure that atomic variants (especially packed) will not be
7406 // truncated from v2s16 or v4s16 to s16 type.
7407 //
7408 // ChangeElementCount will be needed for image load where Ty is always scalar.
7409 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7410 const LLT AdjustedTy =
7411 DMaskLanes == 0
7412 ? Ty
7413 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7414
7415 // The raw dword aligned data component of the load. The only legal cases
7416 // where this matters should be when using the packed D16 format, for
7417 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7418 LLT RoundedTy;
7419
7420 // S32 vector to cover all data, plus TFE result element.
7421 LLT TFETy;
7422
7423 // Register type to use for each loaded component. Will be S32 or V2S16.
7424 LLT RegTy;
7425
7426 if (IsD16 && ST.hasUnpackedD16VMem()) {
7427 RoundedTy =
7428 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7429 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7430 RegTy = S32;
7431 } else {
7432 unsigned EltSize = EltTy.getSizeInBits();
7433 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7434 unsigned RoundedSize = 32 * RoundedElts;
7435 RoundedTy = LLT::scalarOrVector(
7436 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7437 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7438 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7439 }
7440
7441 // The return type does not need adjustment.
7442 // TODO: Should we change s16 case to s32 or <2 x s16>?
7443 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7444 return true;
7445
7446 Register Dst1Reg;
7447
7448 // Insert after the instruction.
7449 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7450
7451 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7452 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7453 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7454 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7455
7456 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7457
7458 MI.getOperand(0).setReg(NewResultReg);
7459
7460 // In the IR, TFE is supposed to be used with a 2 element struct return
7461 // type. The instruction really returns these two values in one contiguous
7462 // register, with one additional dword beyond the loaded data. Rewrite the
7463 // return type to use a single register result.
7464
7465 if (IsTFE) {
7466 Dst1Reg = MI.getOperand(1).getReg();
7467 if (MRI->getType(Dst1Reg) != S32)
7468 return false;
7469
7470 // TODO: Make sure the TFE operand bit is set.
7471 MI.removeOperand(1);
7472
7473 // Handle the easy case that requires no repack instructions.
7474 if (Ty == S32) {
7475 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7476 return true;
7477 }
7478 }
7479
7480 // Now figure out how to copy the new result register back into the old
7481 // result.
// Every slot starts out as Dst1Reg; the data slots are overwritten below, so
// with TFE the final slot keeps pointing at the status register.
7482 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7483
7484 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7485
7486 if (ResultNumRegs == 1) {
7487 assert(!IsTFE);
7488 ResultRegs[0] = NewResultReg;
7489 } else {
7490 // We have to repack into a new vector of some kind.
7491 for (int I = 0; I != NumDataRegs; ++I)
7492 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7493 B.buildUnmerge(ResultRegs, NewResultReg);
7494
7495 // Drop the final TFE element to get the data part. The TFE result is
7496 // directly written to the right place already.
7497 if (IsTFE)
7498 ResultRegs.resize(NumDataRegs);
7499 }
7500
7501 // For an s16 scalar result, we form an s32 result with a truncate regardless
7502 // of packed vs. unpacked.
7503 if (IsD16 && !Ty.isVector()) {
7504 B.buildTrunc(DstReg, ResultRegs[0]);
7505 return true;
7506 }
7507
7508 // Avoid a build/concat_vector of 1 entry.
7509 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7510 B.buildBitcast(DstReg, ResultRegs[0]);
7511 return true;
7512 }
7513
7514 assert(Ty.isVector());
7515
7516 if (IsD16) {
7517 // For packed D16 results with TFE enabled, all the data components are
7518 // S32. Cast back to the expected type.
7519 //
7520 // TODO: We don't really need to use load s32 elements. We would only need one
7521 // cast for the TFE result if a multiple of v2s16 was used.
7522 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7523 for (Register &Reg : ResultRegs)
7524 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7525 } else if (ST.hasUnpackedD16VMem()) {
7526 for (Register &Reg : ResultRegs)
7527 Reg = B.buildTrunc(S16, Reg).getReg(0);
7528 }
7529 }
7530
// Appends NumElts copies of a single undef register of type Ty to ResultRegs.
7531 auto padWithUndef = [&](LLT Ty, int NumElts) {
7532 if (NumElts == 0)
7533 return;
7534 Register Undef = B.buildUndef(Ty).getReg(0);
7535 for (int I = 0; I != NumElts; ++I)
7536 ResultRegs.push_back(Undef);
7537 };
7538
7539 // Pad out any elements eliminated due to the dmask.
7540 LLT ResTy = MRI->getType(ResultRegs[0]);
7541 if (!ResTy.isVector()) {
7542 padWithUndef(ResTy, NumElts - ResultRegs.size());
7543 B.buildBuildVector(DstReg, ResultRegs);
7544 return true;
7545 }
7546
7547 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7548 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7549
7550 // Deal with the one annoying legal case.
7551 const LLT V3S16 = LLT::fixed_vector(3, 16);
7552 if (Ty == V3S16) {
7553 if (IsTFE) {
7554 if (ResultRegs.size() == 1) {
7555 NewResultReg = ResultRegs[0];
7556 } else if (ResultRegs.size() == 2) {
7557 LLT V4S16 = LLT::fixed_vector(4, 16);
7558 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7559 } else {
7560 return false;
7561 }
7562 }
7563
// Trim or pad NewResultReg to the element count of DstReg.
7564 if (MRI->getType(DstReg).getNumElements() <
7565 MRI->getType(NewResultReg).getNumElements()) {
7566 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7567 } else {
7568 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7569 }
7570 return true;
7571 }
7572
7573 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7574 B.buildConcatVectors(DstReg, ResultRegs);
7575 return true;
7576}
7577
7579 MachineInstr &MI) const {
// Rewrites a scalar buffer load intrinsic in place into a
// G_AMDGPU_S_BUFFER_LOAD* pseudo: the intrinsic ID operand is removed, a memory
// operand is attached, and the result is widened or truncated where the
// subtarget requires it. Always returns true.
// NOTE(review): embedded line 7578 (the start of the signature, presumably
// AMDGPULegalizerInfo::legalizeSBufferLoad taking `Helper`) is missing from
// this listing - confirm against the real file.
7580 MachineIRBuilder &B = Helper.MIRBuilder;
7581 GISelChangeObserver &Observer = Helper.Observer;
7582
7583 Register OrigDst = MI.getOperand(0).getReg();
7584 Register Dst;
7585 LLT Ty = B.getMRI()->getType(OrigDst);
7586 unsigned Size = Ty.getSizeInBits();
7587 MachineFunction &MF = B.getMF();
7588 unsigned Opc = 0;
7589 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7590 assert(Size == 8 || Size == 16);
7591 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7592 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7593 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7594 // destination register.
7595 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7596 } else {
7597 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7598 Dst = OrigDst;
7599 }
7600
7601 Observer.changingInstr(MI);
7602
7603 // Handle needing to s.buffer.load() a p8 value.
7604 if (hasBufferRsrcWorkaround(Ty)) {
7605 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7606 B.setInsertPt(B.getMBB(), MI);
7607 }
// NOTE(review): embedded line 7608 is missing from this listing; it presumably
// holds the `if (...) {` condition guarding the bitcast below.
7609 Ty = getBitcastRegisterType(Ty);
7610 Helper.bitcastDst(MI, Ty, 0);
7611 B.setInsertPt(B.getMBB(), MI);
7612 }
7613
7614 // FIXME: We don't really need this intermediate instruction. The intrinsic
7615 // should be fixed to have a memory operand. Since it's readnone, we're not
7616 // allowed to add one.
7617 MI.setDesc(B.getTII().get(Opc));
7618 MI.removeOperand(1); // Remove intrinsic ID
7619
7620 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
// Memory size in bytes, rounding the bit size up.
7621 const unsigned MemSize = (Size + 7) / 8;
// NOTE(review): embedded lines 7623-7627 are missing from this listing; they
// presumably finish the alignment query and start the construction of `MMO`.
7622 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7628 MemSize, MemAlign);
7629 MI.addMemOperand(MF, MMO);
// Sub-dword loads define a fresh s32 register; truncate it back into the
// original destination right after the load.
7630 if (Dst != OrigDst) {
7631 MI.getOperand(0).setReg(Dst);
7632 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7633 B.buildTrunc(OrigDst, Dst);
7634 }
7635
7636 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7637 // always be legal. We may need to restore this to a 96-bit result if it turns
7638 // out this needs to be converted to a vector load during RegBankSelect.
7639 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
// NOTE(review): embedded line 7641 (the statement for the vector case) is
// missing from this listing.
7640 if (Ty.isVector())
7642 else
7643 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7644 }
7645
7646 Observer.changedInstr(MI);
7647 return true;
7648}
7649
// Converts MI in place to G_AMDGPU_S_BUFFER_PREFETCH and drops operand 0,
// bracketing the mutation with observer notifications. Always returns true.
// NOTE(review): the first signature line and the statement at listing line
// 7657 are missing here. Presumably this is legalizeSBufferPrefetch, which the
// dispatch switch calls for amdgcn_s_buffer_prefetch_data -- confirm.
7651 MachineInstr &MI) const {
7652 MachineIRBuilder &B = Helper.MIRBuilder;
7653 GISelChangeObserver &Observer = Helper.Observer;
7654 Observer.changingInstr(MI);
7655 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7656 MI.removeOperand(0); // Remove intrinsic ID
7658 Observer.changedInstr(MI);
7659 return true;
7660}
7661
7662// TODO: Move to selection
// Chooses how a trap is lowered. Without a trap handler, or with a trap handler
// ABI other than AMDHSA, it defers to legalizeTrapEndpgm; otherwise the choice
// depends on ST.supportsGetDoorbellID().
// NOTE(review): the signature head and both arms of the final conditional are
// missing from this listing; the function name is presumably legalizeTrap --
// confirm against the real source.
7665 MachineIRBuilder &B) const {
7666 if (!ST.hasTrapHandler() ||
7667 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7668 return legalizeTrapEndpgm(MI, MRI, B);
7669
7670 return ST.supportsGetDoorbellID() ?
7672}
7673
// Lowers a trap to S_ENDPGM 0 and erases MI. Always returns true.
// NOTE(review): the signature lines and the declaration of TrapBB are missing
// from this listing; presumably this is legalizeTrapEndpgm(MI, MRI, B), as
// called from the trap lowering above -- confirm.
7676 const DebugLoc &DL = MI.getDebugLoc();
7677 MachineBasicBlock &BB = B.getMBB();
7678 MachineFunction *MF = BB.getParent();
7679
// Simple case: MI is the last instruction of a block with no successors, so
// S_ENDPGM can be appended directly at the end of that block.
7680 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7681 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7682 .addImm(0);
7683 MI.eraseFromParent();
7684 return true;
7685 }
7686
7687 // We need a block split to make the real endpgm a terminator. We also don't
7688 // want to break phis in successor blocks, so we can't just delete to the
7689 // end of the block.
7690 BB.splitAt(MI, false /*UpdateLiveIns*/);
// TrapBB is appended to the function and holds only the S_ENDPGM; BB reaches it
// through an S_CBRANCH_EXECNZ inserted before MI.
7692 MF->push_back(TrapBB);
7693 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7694 .addImm(0);
7695 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7696 .addMBB(TrapBB);
7697
7698 BB.addSuccessor(TrapBB);
7699 MI.eraseFromParent();
7700 return true;
7701}
7702
// Lowers a trap by placing a 64-bit value in SGPR0_SGPR1 and emitting S_TRAP
// with the LLVMAMDHSATrap ID and SGPR0_SGPR1 as an implicit use. Two paths are
// visible: one loads the value from memory at KernargPtrReg + Offset, the
// other copies it from LiveIn. Returns false when an input value cannot be
// obtained, true otherwise.
// NOTE(review): the signature and many statements (the condition opening the
// first path, and the definitions of Param, Offset, PtrInfo, MMO, LoadAddr and
// LiveIn) are missing from this listing. Presumably this is
// legalizeTrapHsaQueuePtr -- confirm against the real source.
7705 MachineFunction &MF = B.getMF();
7706 const LLT S64 = LLT::scalar(64);
7707
7708 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7709 // For code object version 5, queue_ptr is passed through implicit kernarg.
7715 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7716
7717 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7719
7720 if (!loadInputValue(KernargPtrReg, B,
7722 return false;
7723
7724 // TODO: can we be smarter about machine pointer info?
7727 PtrInfo.getWithOffset(Offset),
7731
7732 // Pointer address
7735 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7736 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7737 // Load address
7738 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7739 B.buildCopy(SGPR01, Temp);
7740 B.buildInstr(AMDGPU::S_TRAP)
7741 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7742 .addReg(SGPR01, RegState::Implicit);
7743 MI.eraseFromParent();
7744 return true;
7745 }
7746
7747 // Pass queue pointer to trap handler as input, and insert trap instruction
7748 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7749 Register LiveIn =
7752 return false;
7753
7754 B.buildCopy(SGPR01, LiveIn);
7755 B.buildInstr(AMDGPU::S_TRAP)
7756 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7757 .addReg(SGPR01, RegState::Implicit);
7758
7759 MI.eraseFromParent();
7760 return true;
7761}
7762
// Lowers a trap to a bare S_TRAP with the LLVMAMDHSATrap ID, or to a simulated
// trap sequence on subtargets with the PRIV=1 trap-2-is-a-nop bug. MI is erased
// on both paths and the function always returns true.
// NOTE(review): the leading signature lines are missing from this listing;
// presumably this is legalizeTrapHsa(MI, MRI, B) -- confirm.
7765 MachineIRBuilder &B) const {
7766 // We need to simulate the 's_trap 2' instruction on targets that run in
7767 // PRIV=1 (where it is treated as a nop).
7768 if (ST.hasPrivEnabledTrap2NopBug()) {
7769 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7770 MI.getDebugLoc());
7771 MI.eraseFromParent();
7772 return true;
7773 }
7774
7775 B.buildInstr(AMDGPU::S_TRAP)
7776 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7777 MI.eraseFromParent();
7778 return true;
7779}
7780
// Lowers a debug trap. With an AMDHSA trap handler it emits S_TRAP with the
// LLVMAMDHSADebugTrap ID; otherwise it only reports a warning-severity
// diagnostic ("debugtrap handler not supported") and emits no instruction.
// MI is erased either way and the function always returns true.
// NOTE(review): the signature head and the start of the diagnostic call are
// missing from this listing; presumably this is legalizeDebugTrap -- confirm.
7783 MachineIRBuilder &B) const {
7784 // Is non-HSA path or trap-handler disabled? Then, report a warning
7785 // accordingly
7786 if (!ST.hasTrapHandler() ||
7787 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7788 Function &Fn = B.getMF().getFunction();
7790 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7791 } else {
7792 // Insert debug-trap instruction
7793 B.buildInstr(AMDGPU::S_TRAP)
7794 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7795 }
7796
7797 MI.eraseFromParent();
7798 return true;
7799}
7800
// Replaces a BVH intersect-ray intrinsic with G_AMDGPU_BVH_INTERSECT_RAY. The
// new instruction carries the selected MIMG opcode as an immediate, then the
// address operands, the texture descriptor, an A16 flag immediate and MI's
// memory references. Returns false (after a diagnostic) when the subtarget
// lacks the GFX10 A-encoding, true otherwise.
// NOTE(review): the first signature line, the start of the diagnostic call and
// the declarations of Ops and R1/R2/R3 are missing from this listing.
// Presumably this is legalizeBVHIntersectRayIntrinsic -- confirm.
7802 MachineInstr &MI, MachineIRBuilder &B) const {
7803 MachineRegisterInfo &MRI = *B.getMRI();
7804 const LLT S16 = LLT::scalar(16);
7805 const LLT S32 = LLT::scalar(32);
7806 const LLT V2S16 = LLT::fixed_vector(2, 16);
7807 const LLT V3S32 = LLT::fixed_vector(3, 32);
7808
// Operand 1 is not read here (presumably the intrinsic ID -- confirm).
7809 Register DstReg = MI.getOperand(0).getReg();
7810 Register NodePtr = MI.getOperand(2).getReg();
7811 Register RayExtent = MI.getOperand(3).getReg();
7812 Register RayOrigin = MI.getOperand(4).getReg();
7813 Register RayDir = MI.getOperand(5).getReg();
7814 Register RayInvDir = MI.getOperand(6).getReg();
7815 Register TDescr = MI.getOperand(7).getReg();
7816
7817 if (!ST.hasGFX10_AEncoding()) {
7818 Function &Fn = B.getMF().getFunction();
7820 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7821 return false;
7822 }
7823
7824 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7825 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7826 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
// A16: the ray direction has 16-bit elements. Is64: the node pointer is 64-bit.
7827 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7828 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7829 const unsigned NumVDataDwords = 4;
// Dword count of the flattened address as built in the non-GFX11+ path below:
// node pointer (1 or 2) + extent (1) + origin (3) + dir/inv-dir (3 when packed
// as 16-bit pairs, 6 otherwise).
7830 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
// On GFX11+ the address is passed as 4 or 5 separate operands instead (see the
// GFX11+ NSA packing below).
7831 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7832 const bool UseNSA =
7833 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7834
// Indexed as BaseOpcodes[Is64][IsA16].
7835 const unsigned BaseOpcodes[2][2] = {
7836 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7837 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7838 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7839 int Opcode;
7840 if (UseNSA) {
7841 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7842 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7843 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7844 : AMDGPU::MIMGEncGfx10NSA,
7845 NumVDataDwords, NumVAddrDwords);
7846 } else {
7847 assert(!IsGFX12Plus);
7848 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7849 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7850 : AMDGPU::MIMGEncGfx10Default,
7851 NumVDataDwords, NumVAddrDwords);
7852 }
7853 assert(Opcode != -1);
7854
// GFX11+ NSA form: node pointer and extent are passed as-is, and each 3-element
// ray vector is rebuilt as a single <3 x s32> operand.
7856 if (UseNSA && IsGFX11Plus) {
7857 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7858 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7859 auto Merged = B.buildMergeLikeInstr(
7860 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7861 Ops.push_back(Merged.getReg(0));
7862 };
7863
7864 Ops.push_back(NodePtr);
7865 Ops.push_back(RayExtent);
7866 packLanes(RayOrigin);
7867
7868 if (IsA16) {
// Each dword pairs component i of the inverse direction with component i of
// the direction, giving one <3 x s32> operand for both vectors.
7869 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7870 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7871 auto MergedDir = B.buildMergeLikeInstr(
7872 V3S32,
7873 {B.buildBitcast(
7874 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7875 UnmergeRayDir.getReg(0)}))
7876 .getReg(0),
7877 B.buildBitcast(
7878 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7879 UnmergeRayDir.getReg(1)}))
7880 .getReg(0),
7881 B.buildBitcast(
7882 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7883 UnmergeRayDir.getReg(2)}))
7884 .getReg(0)});
7885 Ops.push_back(MergedDir.getReg(0));
7886 } else {
7887 packLanes(RayDir);
7888 packLanes(RayInvDir);
7889 }
7890 } else {
// Other encodings: every address dword becomes its own entry in Ops.
7891 if (Is64) {
7892 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7893 Ops.push_back(Unmerge.getReg(0));
7894 Ops.push_back(Unmerge.getReg(1));
7895 } else {
7896 Ops.push_back(NodePtr);
7897 }
7898 Ops.push_back(RayExtent);
7899
7900 auto packLanes = [&Ops, &S32, &B](Register Src) {
7901 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7902 Ops.push_back(Unmerge.getReg(0));
7903 Ops.push_back(Unmerge.getReg(1));
7904 Ops.push_back(Unmerge.getReg(2));
7905 };
7906
7907 packLanes(RayOrigin);
7908 if (IsA16) {
// The six 16-bit components are packed consecutively into three registers:
// R1 = dir[0], dir[1]; R2 = dir[2], inv_dir[0]; R3 = inv_dir[1], inv_dir[2].
7909 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7910 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7914 B.buildMergeLikeInstr(R1,
7915 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7916 B.buildMergeLikeInstr(
7917 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7918 B.buildMergeLikeInstr(
7919 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7920 Ops.push_back(R1);
7921 Ops.push_back(R2);
7922 Ops.push_back(R3);
7923 } else {
7924 packLanes(RayDir);
7925 packLanes(RayInvDir);
7926 }
7927 }
7928
7929 if (!UseNSA) {
7930 // Build a single vector containing all the operands so far prepared.
7931 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7932 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7933 Ops.clear();
7934 Ops.push_back(MergedOps);
7935 }
7936
7937 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7938 .addDef(DstReg)
7939 .addImm(Opcode);
7940
7941 for (Register R : Ops) {
7942 MIB.addUse(R);
7943 }
7944
7945 MIB.addUse(TDescr)
7946 .addImm(IsA16 ? 1 : 0)
7947 .cloneMemRefs(MI);
7948
7949 MI.eraseFromParent();
7950 return true;
7951}
7952
// Replaces a BVH8 or BVH-dual intersect-ray intrinsic with
// G_AMDGPU_BVH8_INTERSECT_RAY / G_AMDGPU_BVH_DUAL_INTERSECT_RAY. The new
// instruction has three defs, the GFX12 MIMG opcode as an immediate, the
// address operands, the texture descriptor and MI's memory references.
// Returns false (after a diagnostic) when ST.hasBVHDualAndBVH8Insts() is
// false, true otherwise.
// NOTE(review): the first signature line and the start of the diagnostic call
// are missing from this listing; the function name is inferred -- confirm.
7954 MachineInstr &MI, MachineIRBuilder &B) const {
7955 const LLT S32 = LLT::scalar(32);
7956 const LLT V2S32 = LLT::fixed_vector(2, 32);
7957
// Operands 0-2 are the results; operand 3 is not read here (presumably the
// intrinsic ID -- confirm).
7958 Register DstReg = MI.getOperand(0).getReg();
7959 Register DstOrigin = MI.getOperand(1).getReg();
7960 Register DstDir = MI.getOperand(2).getReg();
7961 Register NodePtr = MI.getOperand(4).getReg();
7962 Register RayExtent = MI.getOperand(5).getReg();
7963 Register InstanceMask = MI.getOperand(6).getReg();
7964 Register RayOrigin = MI.getOperand(7).getReg();
7965 Register RayDir = MI.getOperand(8).getReg();
7966 Register Offsets = MI.getOperand(9).getReg();
7967 Register TDescr = MI.getOperand(10).getReg();
7968
7969 if (!ST.hasBVHDualAndBVH8Insts()) {
7970 Function &Fn = B.getMF().getFunction();
7972 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7973 return false;
7974 }
7975
7976 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7977 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7978 const unsigned NumVDataDwords = 10;
7979 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7980 int Opcode = AMDGPU::getMIMGOpcode(
7981 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7982 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7983 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7984 assert(Opcode != -1);
7985
// The ray extent and the instance mask (any-extended to 32 bits) travel
// together as one <2 x s32> operand.
7986 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7987 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7988
7989 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7990 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7991 .addDef(DstReg)
7992 .addDef(DstOrigin)
7993 .addDef(DstDir)
7994 .addImm(Opcode)
7995 .addUse(NodePtr)
7996 .addUse(RayExtentInstanceMaskVec.getReg(0))
7997 .addUse(RayOrigin)
7998 .addUse(RayDir)
7999 .addUse(Offsets)
8000 .addUse(TDescr)
8001 .cloneMemRefs(MI);
8002
8003 MI.eraseFromParent();
8004 return true;
8005}
8006
// Replaces MI with a G_AMDGPU_WAVE_ADDRESS that defines MI's result register
// from StackPtr, then erases MI. Always returns true.
// NOTE(review): the signature head and the definition of StackPtr (listing
// line 8010, presumably derived from TLI) are missing from this listing; the
// function name is presumably legalizeStackSave -- confirm.
8008 MachineIRBuilder &B) const {
8009 const SITargetLowering *TLI = ST.getTargetLowering();
8011 Register DstReg = MI.getOperand(0).getReg();
8012 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
8013 MI.eraseFromParent();
8014 return true;
8015}
8016
// Lowers the wave-ID query to an unsigned bitfield extract of bits [29:25]
// (offset 25, width 5) from a copy of TTMP8. Returns false without changing
// anything when the subtarget has no architected SGPRs; otherwise erases MI
// and returns true.
// NOTE(review): the first signature line is missing from this listing;
// presumably this is legalizeWaveID(MI, B), as called from the dispatch switch
// for amdgcn_wave_id -- confirm.
8018 MachineIRBuilder &B) const {
8019 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8020 if (!ST.hasArchitectedSGPRs())
8021 return false;
8022 LLT S32 = LLT::scalar(32);
8023 Register DstReg = MI.getOperand(0).getReg();
8024 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
8025 auto LSB = B.buildConstant(S32, 25);
8026 auto Width = B.buildConstant(S32, 5);
8027 B.buildUbfx(DstReg, TTMP8, LSB, Width);
8028 MI.eraseFromParent();
8029 return true;
8030}
8031
// Replaces MI with S_GETREG_B32_const reading Width bits starting at LowBit of
// hardware register HwReg into MI's result register. The result register is
// given the SReg_32 class if it has no register class yet. Always returns true.
// NOTE(review): the leading signature lines are missing from this listing; the
// function name is inferred (legalizeConstHwRegRead) -- confirm.
8034 AMDGPU::Hwreg::Id HwReg,
8035 unsigned LowBit,
8036 unsigned Width) const {
8037 MachineRegisterInfo &MRI = *B.getMRI();
8038 Register DstReg = MI.getOperand(0).getReg();
8039 if (!MRI.getRegClassOrNull(DstReg))
8040 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8041 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8042 .addDef(DstReg)
8043 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
8044 MI.eraseFromParent();
8045 return true;
8046}
8047
// Immediate passed to amdgcn_s_getreg / amdgcn_s_setreg for the first (low)
// 32-bit half of the FP environment value in the get/set FP-env lowerings.
// NOTE(review): the initializer line is missing from this listing.
8048static constexpr unsigned FPEnvModeBitField =
8050
// Immediate passed to amdgcn_s_getreg / amdgcn_s_setreg for the second (high)
// 32-bit half of the FP environment value.
// NOTE(review): the initializer line is missing from this listing.
8051static constexpr unsigned FPEnvTrapBitField =
8053
// Lowers an FP-environment read: two amdgcn_s_getreg intrinsics (selected by
// FPEnvModeBitField and FPEnvTrapBitField) each produce an s32, and the pair is
// merged into MI's operand 0. Returns false unless that operand is s64.
// NOTE(review): the leading signature lines are missing from this listing;
// presumably this is legalizeGetFPEnv(MI, MRI, B) -- confirm.
8056 MachineIRBuilder &B) const {
// Despite its name, Src is operand 0 of MI and is written by the merge below.
8057 Register Src = MI.getOperand(0).getReg();
8058 if (MRI.getType(Src) != S64)
8059 return false;
8060
8061 auto ModeReg =
8062 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8063 /*HasSideEffects=*/true, /*isConvergent=*/false)
8064 .addImm(FPEnvModeBitField);
8065 auto TrapReg =
8066 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8067 /*HasSideEffects=*/true, /*isConvergent=*/false)
8068 .addImm(FPEnvTrapBitField);
8069 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8070 MI.eraseFromParent();
8071 return true;
8072}
8073
// Lowers an FP-environment write: MI's operand 0 (required to be s64) is split
// into two s32 halves, and each half is written with an amdgcn_s_setreg
// intrinsic selected by FPEnvModeBitField (first half) or FPEnvTrapBitField
// (second half). Returns false unless the operand is s64.
// NOTE(review): the leading signature lines are missing from this listing;
// presumably this is legalizeSetFPEnv(MI, MRI, B) -- confirm.
8076 MachineIRBuilder &B) const {
8077 Register Src = MI.getOperand(0).getReg();
8078 if (MRI.getType(Src) != S64)
8079 return false;
8080
8081 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
// The bit-field immediates are narrowed to int16_t before being added.
8082 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8083 /*HasSideEffects=*/true, /*isConvergent=*/false)
8084 .addImm(static_cast<int16_t>(FPEnvModeBitField))
8085 .addReg(Unmerge.getReg(0));
8086 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8087 /*HasSideEffects=*/true, /*isConvergent=*/false)
8088 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
8089 .addReg(Unmerge.getReg(1));
8090 MI.eraseFromParent();
8091 return true;
8092}
8093
// Custom legalization entry point for intrinsic instructions: switches on the
// intrinsic ID of MI and either rewrites MI inline or forwards to a dedicated
// legalize* helper. Several cases return true without touching MI, and
// intrinsics with no case fall to the default label at the end.
// NOTE(review): this listing dropped the first signature line and many
// statement lines inside the switch (visible as gaps in the embedded
// numbering), so a number of case labels appear here without their bodies.
// Presumably this is legalizeIntrinsic(Helper, MI) -- confirm against the real
// source before relying on any case shown without a body.
8095 MachineInstr &MI) const {
8096 MachineIRBuilder &B = Helper.MIRBuilder;
8097 MachineRegisterInfo &MRI = *B.getMRI();
8098
8099 // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
8100 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
8101 switch (IntrID) {
8102 case Intrinsic::amdgcn_icmp: {
8103 // amdgcn.icmp(i1 src0, i1 0, NE) -> ballot(src0)
8104 // This is the only valid form of amdgcn.icmp with i1 inputs.
8105 Register Src0 = MI.getOperand(2).getReg();
8106 LLT SrcTy = MRI.getType(Src0);
8107 if (SrcTy != LLT::scalar(1))
8108 return true; // Not i1, leave for default handling.
8109
8110 // Check that src1 is constant 0.
8111 Register Src1 = MI.getOperand(3).getReg();
8112 auto Src1Const = getIConstantVRegValWithLookThrough(Src1, MRI);
8113 if (!Src1Const || Src1Const->Value != 0)
8114 return false; // Invalid i1 icmp form.
8115
8116 // Check that predicate is ICMP_NE.
8117 int64_t Pred = MI.getOperand(4).getImm();
8118 if (Pred != CmpInst::ICMP_NE)
8119 return false; // Invalid i1 icmp form.
8120
8121 // Convert to ballot.
8122 Register Dst = MI.getOperand(0).getReg();
8123 B.buildIntrinsic(Intrinsic::amdgcn_ballot, Dst).addUse(Src0);
8124 MI.eraseFromParent();
8125 return true;
8126 }
8127 case Intrinsic::sponentry:
8128 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8129 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8130 // that we can remove this cast.
8131 const LLT S32 = LLT::scalar(32);
// NOTE(review): the declaration of TmpReg is missing from this listing.
8133 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8134
8135 Register DstReg = MI.getOperand(0).getReg();
8136 B.buildIntToPtr(DstReg, TmpReg);
8137 MI.eraseFromParent();
8138 } else {
// Not bottom-of-stack: the result is the frame index of a newly created
// fixed stack object.
8139 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8140 1, 0, /*IsImmutable=*/false);
8141 B.buildFrameIndex(MI.getOperand(0), FI);
8142 MI.eraseFromParent();
8143 }
8144 return true;
8145 case Intrinsic::amdgcn_if:
8146 case Intrinsic::amdgcn_else: {
8147 MachineInstr *Br = nullptr;
8148 MachineBasicBlock *UncondBrTarget = nullptr;
8149 bool Negated = false;
// The rewrite only happens when verifyCFIntrinsic returns the conditional
// branch; otherwise this case returns false.
8150 if (MachineInstr *BrCond =
8151 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8152 const SIRegisterInfo *TRI
8153 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8154
8155 Register Def = MI.getOperand(1).getReg();
8156 Register Use = MI.getOperand(3).getReg();
8157
8158 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8159
8160 if (Negated)
8161 std::swap(CondBrTarget, UncondBrTarget);
8162
// SI_IF / SI_ELSE is built at BrCond's position and receives UncondBrTarget as
// its block operand; CondBrTarget is reached through the G_BR handled below.
8163 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8164 if (IntrID == Intrinsic::amdgcn_if) {
8165 B.buildInstr(AMDGPU::SI_IF)
8166 .addDef(Def)
8167 .addUse(Use)
8168 .addMBB(UncondBrTarget);
8169 } else {
8170 B.buildInstr(AMDGPU::SI_ELSE)
8171 .addDef(Def)
8172 .addUse(Use)
8173 .addMBB(UncondBrTarget);
8174 }
8175
8176 if (Br) {
8177 Br->getOperand(0).setMBB(CondBrTarget);
8178 } else {
8179 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8180 // since we're swapping branch targets it needs to be reinserted.
8181 // FIXME: IRTranslator should probably not do this
8182 B.buildBr(*CondBrTarget);
8183 }
8184
8185 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8186 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8187 MI.eraseFromParent();
8188 BrCond->eraseFromParent();
8189 return true;
8190 }
8191
8192 return false;
8193 }
8194 case Intrinsic::amdgcn_loop: {
8195 MachineInstr *Br = nullptr;
8196 MachineBasicBlock *UncondBrTarget = nullptr;
8197 bool Negated = false;
// Same shape as the amdgcn_if / amdgcn_else case, but emitting SI_LOOP with a
// single register use.
8198 if (MachineInstr *BrCond =
8199 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8200 const SIRegisterInfo *TRI
8201 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8202
8203 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8204 Register Reg = MI.getOperand(2).getReg();
8205
8206 if (Negated)
8207 std::swap(CondBrTarget, UncondBrTarget);
8208
8209 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8210 B.buildInstr(AMDGPU::SI_LOOP)
8211 .addUse(Reg)
8212 .addMBB(UncondBrTarget);
8213
8214 if (Br)
8215 Br->getOperand(0).setMBB(CondBrTarget);
8216 else
8217 B.buildBr(*CondBrTarget);
8218
8219 MI.eraseFromParent();
8220 BrCond->eraseFromParent();
8221 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8222 return true;
8223 }
8224
8225 return false;
8226 }
8227 case Intrinsic::amdgcn_wave_reduce_min:
8228 case Intrinsic::amdgcn_wave_reduce_umin:
8229 case Intrinsic::amdgcn_wave_reduce_max:
8230 case Intrinsic::amdgcn_wave_reduce_umax:
8231 case Intrinsic::amdgcn_wave_reduce_add:
8232 case Intrinsic::amdgcn_wave_reduce_sub:
8233 case Intrinsic::amdgcn_wave_reduce_and:
8234 case Intrinsic::amdgcn_wave_reduce_or:
8235 case Intrinsic::amdgcn_wave_reduce_xor: {
// Only an s16 source is rewritten here; any other type is left untouched. The
// source is extended to 32 bits (sign-extended for min/max/add/sub,
// zero-extended for the rest), the same intrinsic is rebuilt at 32 bits, and
// its result is truncated into the original destination.
8236 Register SrcReg = MI.getOperand(2).getReg();
8237 if (MRI.getType(SrcReg) != LLT::scalar(16))
8238 return true;
8239 Register DstReg = MI.getOperand(0).getReg();
8240 bool NeedsSignExt = IntrID == Intrinsic::amdgcn_wave_reduce_min ||
8241 IntrID == Intrinsic::amdgcn_wave_reduce_max ||
8242 IntrID == Intrinsic::amdgcn_wave_reduce_add ||
8243 IntrID == Intrinsic::amdgcn_wave_reduce_sub;
8244 auto Ext = NeedsSignExt ? B.buildSExt(LLT::scalar(32), SrcReg)
8245 : B.buildZExt(LLT::scalar(32), SrcReg);
8246 auto NewDst = MRI.createGenericVirtualRegister(LLT::scalar(32));
8247 B.buildIntrinsic(IntrID, ArrayRef<Register>{NewDst},
8248 /*hasSideEffects=*/false, /*isConvergent=*/true)
8249 .addUse(Ext.getReg(0))
8250 .addImm(MI.getOperand(3).getImm()); // strategy
8251 B.buildTrunc(DstReg, NewDst);
8252 MI.eraseFromParent();
8253 return true;
8254 }
8255 case Intrinsic::amdgcn_addrspacecast_nonnull:
8256 return legalizeAddrSpaceCast(MI, MRI, B);
8257 case Intrinsic::amdgcn_make_buffer_rsrc:
8258 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8259 case Intrinsic::amdgcn_kernarg_segment_ptr:
8260 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8261 // This only makes sense to call in a kernel, so just lower to null.
8262 B.buildConstant(MI.getOperand(0).getReg(), 0);
8263 MI.eraseFromParent();
8264 return true;
8265 }
8266
// NOTE(review): the statement handling the in-kernel case is missing from this
// listing. From here on, many cases show only a label or a truncated call
// because the remaining lines were dropped.
8269 case Intrinsic::amdgcn_implicitarg_ptr:
8270 return legalizeImplicitArgPtr(MI, MRI, B);
8271 case Intrinsic::amdgcn_workitem_id_x:
8272 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8274 case Intrinsic::amdgcn_workitem_id_y:
8275 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8277 case Intrinsic::amdgcn_workitem_id_z:
8278 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8280 case Intrinsic::amdgcn_workgroup_id_x:
8281 return legalizeWorkGroupId(
8285 case Intrinsic::amdgcn_workgroup_id_y:
8286 return legalizeWorkGroupId(
8290 case Intrinsic::amdgcn_workgroup_id_z:
8291 return legalizeWorkGroupId(
// Every cluster query below is rejected (returns false) when ST.hasClusters()
// is false; the right-hand side of each && is missing from this listing.
8295 case Intrinsic::amdgcn_cluster_id_x:
8296 return ST.hasClusters() &&
8299 case Intrinsic::amdgcn_cluster_id_y:
8300 return ST.hasClusters() &&
8303 case Intrinsic::amdgcn_cluster_id_z:
8304 return ST.hasClusters() &&
8307 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8308 return ST.hasClusters() &&
8311 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8312 return ST.hasClusters() &&
8315 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8316 return ST.hasClusters() &&
8319 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8320 return ST.hasClusters() &&
8322 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8323 return ST.hasClusters() &&
8326 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8327 return ST.hasClusters() &&
8330 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8331 return ST.hasClusters() &&
8334 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8335 return ST.hasClusters() &&
8337 MI, MRI, B,
8339 case Intrinsic::amdgcn_wave_id:
8340 return legalizeWaveID(MI, B);
8341 case Intrinsic::amdgcn_lds_kernel_id:
8342 return legalizePreloadedArgIntrin(MI, MRI, B,
8344 case Intrinsic::amdgcn_dispatch_ptr:
8345 return legalizePreloadedArgIntrin(MI, MRI, B,
8347 case Intrinsic::amdgcn_queue_ptr:
8348 return legalizePreloadedArgIntrin(MI, MRI, B,
8350 case Intrinsic::amdgcn_implicit_buffer_ptr:
8353 case Intrinsic::amdgcn_dispatch_id:
8354 return legalizePreloadedArgIntrin(MI, MRI, B,
8356 case Intrinsic::r600_read_ngroups_x:
8357 // TODO: Emit error for hsa
8360 case Intrinsic::r600_read_ngroups_y:
8363 case Intrinsic::r600_read_ngroups_z:
8366 case Intrinsic::r600_read_local_size_x:
8367 // TODO: Could insert G_ASSERT_ZEXT from s16
8369 case Intrinsic::r600_read_local_size_y:
8370 // TODO: Could insert G_ASSERT_ZEXT from s16
8372 // TODO: Could insert G_ASSERT_ZEXT from s16
8373 case Intrinsic::r600_read_local_size_z:
8376 case Intrinsic::amdgcn_fdiv_fast:
8377 return legalizeFDIVFastIntrin(MI, MRI, B);
8378 case Intrinsic::amdgcn_is_shared:
8380 case Intrinsic::amdgcn_is_private:
8382 case Intrinsic::amdgcn_wavefrontsize: {
// Folded to a constant taken from the subtarget.
8383 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8384 MI.eraseFromParent();
8385 return true;
8386 }
8387 case Intrinsic::amdgcn_s_buffer_load:
8388 return legalizeSBufferLoad(Helper, MI);
// Buffer stores and loads: the two trailing bool arguments differ per
// intrinsic family (plain, *_format, tbuffer), as shown in each call.
8389 case Intrinsic::amdgcn_raw_buffer_store:
8390 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8391 case Intrinsic::amdgcn_struct_buffer_store:
8392 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8393 return legalizeBufferStore(MI, Helper, false, false);
8394 case Intrinsic::amdgcn_raw_buffer_store_format:
8395 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8396 case Intrinsic::amdgcn_struct_buffer_store_format:
8397 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8398 return legalizeBufferStore(MI, Helper, false, true);
8399 case Intrinsic::amdgcn_raw_tbuffer_store:
8400 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8401 case Intrinsic::amdgcn_struct_tbuffer_store:
8402 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8403 return legalizeBufferStore(MI, Helper, true, true);
8404 case Intrinsic::amdgcn_raw_buffer_load:
8405 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8406 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8407 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8408 case Intrinsic::amdgcn_struct_buffer_load:
8409 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8410 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8411 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8412 return legalizeBufferLoad(MI, Helper, false, false);
8413 case Intrinsic::amdgcn_raw_buffer_load_format:
8414 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8415 case Intrinsic::amdgcn_struct_buffer_load_format:
8416 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8417 return legalizeBufferLoad(MI, Helper, true, false);
8418 case Intrinsic::amdgcn_raw_tbuffer_load:
8419 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8420 case Intrinsic::amdgcn_struct_tbuffer_load:
8421 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8422 return legalizeBufferLoad(MI, Helper, true, true);
// All raw/struct (and *_ptr) buffer atomics share one helper, which receives
// the intrinsic ID.
8423 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8424 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8425 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8426 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8427 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8428 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8429 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8430 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8431 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8433 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8434 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8435 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8436 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8437 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8438 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8439 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8440 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8441 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8442 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8443 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8445 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8446 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8447 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8448 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8449 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8450 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8451 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8452 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8453 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8454 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8455 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8457 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8458 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8459 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8460 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8461 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8462 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8463 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8464 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8465 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8466 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8467 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8469 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8470 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8471 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8473 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8474 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8475 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8476 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8477 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8478 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8479 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8481 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8482 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8483 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8484 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8485 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8486 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8487 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8488 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8489 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8491 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8492 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8493 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8495 return legalizeBufferAtomic(MI, B, IntrID);
8496 case Intrinsic::amdgcn_rsq_clamp:
8497 return legalizeRsqClampIntrinsic(MI, MRI, B);
// NOTE(review): the return statements for the three BVH cases below are
// missing from this listing.
8498 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8500 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8501 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8503 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8504 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8505 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8506 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8507 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8508 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8509 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8510 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
// Normalize the index operand (operand 5) to s64: vectors are bitcast, scalars
// are any-extended. MI itself is kept.
8511 Register Index = MI.getOperand(5).getReg();
8512 LLT S64 = LLT::scalar(64);
8513 LLT IndexArgTy = MRI.getType(Index);
8514 if (IndexArgTy != S64) {
8515 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8516 : B.buildAnyExt(S64, Index);
8517 MI.getOperand(5).setReg(NewIndex.getReg(0));
8518 }
8519 return true;
8520 }
8521 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8522 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8523 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8524 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8525 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8528 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
// Normalize the index operand (operand 5) to s32 by any-extending it.
8529 Register Index = MI.getOperand(5).getReg();
8530 LLT S32 = LLT::scalar(32);
8531 if (MRI.getType(Index) != S32)
8532 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8533 return true;
8534 }
8535 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8536 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8537 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8538 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8539 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8540 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8541 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8542 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8543 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
// Here the index is operand 7; it is normalized to s64 for the 16x16x128 iu8
// variant and to s32 for the others.
8544 Register Index = MI.getOperand(7).getReg();
8545 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8546 ? LLT::scalar(64)
8547 : LLT::scalar(32);
8548 LLT IndexArgTy = MRI.getType(Index);
8549 if (IndexArgTy != IdxTy) {
8550 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8551 : B.buildAnyExt(IdxTy, Index);
8552 MI.getOperand(7).setReg(NewIndex.getReg(0));
8553 }
8554 return true;
8555 }
8556
8557 case Intrinsic::amdgcn_fmed3: {
8558 GISelChangeObserver &Observer = Helper.Observer;
8559
8560 // FIXME: This is to workaround the inability of tablegen match combiners to
8561 // match intrinsics in patterns.
8562 Observer.changingInstr(MI);
8563 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8564 MI.removeOperand(1);
8565 Observer.changedInstr(MI);
8566 return true;
8567 }
8568 case Intrinsic::amdgcn_readlane:
8569 case Intrinsic::amdgcn_writelane:
8570 case Intrinsic::amdgcn_readfirstlane:
8571 case Intrinsic::amdgcn_permlane16:
8572 case Intrinsic::amdgcn_permlanex16:
8573 case Intrinsic::amdgcn_permlane64:
8574 case Intrinsic::amdgcn_set_inactive:
8575 case Intrinsic::amdgcn_set_inactive_chain_arg:
8576 case Intrinsic::amdgcn_mov_dpp8:
8577 case Intrinsic::amdgcn_update_dpp:
8578 case Intrinsic::amdgcn_permlane_bcast:
8579 case Intrinsic::amdgcn_permlane_up:
8580 case Intrinsic::amdgcn_permlane_down:
8581 case Intrinsic::amdgcn_permlane_xor:
8582 return legalizeLaneOp(Helper, MI, IntrID);
8583 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8584 return legalizeSBufferPrefetch(Helper, MI);
8585 case Intrinsic::amdgcn_dead: {
8586 // TODO: Use poison instead of undef
8587 for (const MachineOperand &Def : MI.defs())
8588 B.buildUndef(Def);
8589 MI.eraseFromParent();
8590 return true;
8591 }
// Cooperative atomic loads/stores become ordinary loads/stores that reuse the
// single memory operand already attached to MI.
8592 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8593 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8594 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8595 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8596 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8597 MI.eraseFromParent();
8598 return true;
8599 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8600 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8601 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8602 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8603 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8604 MI.eraseFromParent();
8605 return true;
8606 case Intrinsic::amdgcn_av_load_b128:
8607 case Intrinsic::amdgcn_av_store_b128: {
// Note: this local ST is declared inside a function that already uses the name
// ST above (e.g. ST.hasClusters()), so it shadows that one within this case.
8608 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
8609 if (!ST.hasFlatGlobalInsts()) {
8610 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8611 ? "llvm.amdgcn.av.load.b128"
8612 : "llvm.amdgcn.av.store.b128";
8613 Function &Fn = B.getMF().getFunction();
8615 Fn, Twine(Name) + " not supported on subtarget", MI.getDebugLoc()));
8616 return false;
8617 }
8618 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8619 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8620 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8621 else
8622 B.buildStore(MI.getOperand(2), MI.getOperand(1),
8623 **MI.memoperands_begin());
8624 MI.eraseFromParent();
8625 return true;
8626 }
// Monitor loads map to dedicated generic opcodes, forwarding operand 0,
// operand 2 and the existing memory operand.
8627 case Intrinsic::amdgcn_flat_load_monitor_b32:
8628 case Intrinsic::amdgcn_flat_load_monitor_b64:
8629 case Intrinsic::amdgcn_flat_load_monitor_b128:
8630 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8631 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8632 .add(MI.getOperand(0))
8633 .add(MI.getOperand(2))
8634 .addMemOperand(*MI.memoperands_begin());
8635 MI.eraseFromParent();
8636 return true;
8637 case Intrinsic::amdgcn_global_load_monitor_b32:
8638 case Intrinsic::amdgcn_global_load_monitor_b64:
8639 case Intrinsic::amdgcn_global_load_monitor_b128:
8640 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8641 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8642 .add(MI.getOperand(0))
8643 .add(MI.getOperand(2))
8644 .addMemOperand(*MI.memoperands_begin());
8645 MI.eraseFromParent();
8646 return true;
8647 default: {
// Any other intrinsic: if the lookup yields an ImageDimIntrinsicInfo, hand MI
// to legalizeImageIntrinsic; otherwise return true with MI unchanged.
// NOTE(review): the lookup expression itself is missing from this listing.
8648 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8650 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8651 return true;
8652 }
8653 }
8654
8655 return true;
8656}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:77
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1487
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:297
static const fltSemantics & IEEEdouble()
Definition APFloat.h:298
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1223
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1203
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1163
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:383
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:861
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:558
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:1987
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:656
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:464
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:317
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:204
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1685
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:436
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:78
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.