1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
36#include "llvm/IR/IntrinsicsAMDGPU.h"
37#include "llvm/IR/IntrinsicsR600.h"
38
39#define DEBUG_TYPE "amdgpu-legalinfo"
40
41using namespace llvm;
42using namespace LegalizeActions;
43using namespace LegalizeMutations;
44using namespace LegalityPredicates;
45using namespace MIPatternMatch;
46
47// Hack until load/store selection patterns support any tuple of legal types.
48static cl::opt<bool> EnableNewLegality(
49 "amdgpu-global-isel-new-legality",
50 cl::desc("Use GlobalISel desired legality, rather than try to use "
51 "rules compatible with selection patterns"),
52 cl::init(false),
53 cl::ReallyHidden);
54
55static constexpr unsigned MaxRegisterSize = 1024;
56
57// Round the number of elements up to the next power of two.
58static LLT getPow2VectorType(LLT Ty) {
59 unsigned NElts = Ty.getNumElements();
60 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
61 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
62}
63
64// Round the scalar size in bits up to the next power of two.
65static LLT getPow2ScalarType(LLT Ty) {
66 unsigned Bits = Ty.getSizeInBits();
67 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
68 return LLT::scalar(Pow2Bits);
69}
70
71/// \returns true if this is an odd sized vector which should widen by adding an
72/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
73/// excludes s1 vectors, which should always be scalarized.
74static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75 return [=](const LegalityQuery &Query) {
76 const LLT Ty = Query.Types[TypeIdx];
77 if (!Ty.isVector())
78 return false;
79
80 const LLT EltTy = Ty.getElementType();
81 const unsigned EltSize = EltTy.getSizeInBits();
82 return Ty.getNumElements() % 2 != 0 &&
83 EltSize > 1 && EltSize < 32 &&
84 Ty.getSizeInBits() % 32 != 0;
85 };
86}
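// For example, <3 x s16> has an odd element count, 16-bit elements, and a
// 48-bit total size, so it satisfies this predicate and gets widened to
// <4 x s16>; <2 x s16>, <3 x s32> (already a multiple of 32 bits), and
// <3 x s1> (scalarized instead) all return false.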
87
88static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
89 return [=](const LegalityQuery &Query) {
90 const LLT Ty = Query.Types[TypeIdx];
91 return Ty.getSizeInBits() % 32 == 0;
92 };
93}
94
95static LegalityPredicate isWideVec16(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getScalarType();
99 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
100 };
101}
102
103static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
104 return [=](const LegalityQuery &Query) {
105 const LLT Ty = Query.Types[TypeIdx];
106 const LLT EltTy = Ty.getElementType();
107 return std::pair(TypeIdx,
108 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
109 };
110}
111
112static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
113 return [=](const LegalityQuery &Query) {
114 const LLT Ty = Query.Types[TypeIdx];
115 const LLT EltTy = Ty.getElementType();
116 unsigned Size = Ty.getSizeInBits();
117 unsigned Pieces = (Size + 63) / 64;
118 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
119 return std::pair(TypeIdx, LLT::scalarOrVector(
120 ElementCount::getFixed(NewNumElts), EltTy));
121 };
122}
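// For example, a <4 x s32> value is 128 bits, so Pieces = 2 and the mutation
// picks <2 x s32> pieces (64 bits each); <3 x s32> (96 bits) also yields
// Pieces = 2 and NewNumElts = (3 + 1) / 2 = 2.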
123
124// Increase the number of vector elements to reach the next multiple of 32-bit
125// type.
126static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
127 return [=](const LegalityQuery &Query) {
128 const LLT Ty = Query.Types[TypeIdx];
129
130 const LLT EltTy = Ty.getElementType();
131 const int Size = Ty.getSizeInBits();
132 const int EltSize = EltTy.getSizeInBits();
133 const int NextMul32 = (Size + 31) / 32;
134
135 assert(EltSize < 32);
136
137 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
138 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
139 };
140}
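// For example, <3 x s8> is 24 bits, so NextMul32 = 1 and
// NewNumElts = (32 * 1 + 8 - 1) / 8 = 4, padding the vector out to <4 x s8>,
// i.e. one full dword.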
141
142// Retrieves the scalar type that's the same size as the mem desc
144 return [=](const LegalityQuery &Query) {
145 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
146 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
147 };
148}
149
150// Increase the number of vector elements to reach the next legal RegClass.
151static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
152 return [=](const LegalityQuery &Query) {
153 const LLT Ty = Query.Types[TypeIdx];
154 const unsigned NumElts = Ty.getNumElements();
155 const unsigned EltSize = Ty.getElementType().getSizeInBits();
156 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
157
158 assert(EltSize == 32 || EltSize == 64);
159 assert(Ty.getSizeInBits() < MaxRegisterSize);
160
161 unsigned NewNumElts;
162 // Find the nearest legal RegClass that is larger than the current type.
163 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
164 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
165 break;
166 }
167 return std::pair(TypeIdx,
168 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
169 };
170}
171
172static LLT getBufferRsrcScalarType(const LLT Ty) {
173 if (!Ty.isVector())
174 return LLT::scalar(128);
175 const ElementCount NumElems = Ty.getElementCount();
176 return LLT::vector(NumElems, LLT::scalar(128));
177}
178
179static LLT getBufferRsrcRegisterType(const LLT Ty) {
180 if (!Ty.isVector())
181 return LLT::fixed_vector(4, LLT::scalar(32));
182 const unsigned NumElems = Ty.getElementCount().getFixedValue();
183 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
184}
185
186static LLT getBitcastRegisterType(const LLT Ty) {
187 const unsigned Size = Ty.getSizeInBits();
188
189 if (Size <= 32) {
190 // <2 x s8> -> s16
191 // <4 x s8> -> s32
192 return LLT::scalar(Size);
193 }
194
195 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
196}
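// For example, <2 x s8> becomes s16 and <4 x s8> becomes s32, while wider
// types such as <8 x s8> are mapped onto s32 elements covering the same
// number of bits (here <2 x s32>).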
197
198static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
199 return [=](const LegalityQuery &Query) {
200 const LLT Ty = Query.Types[TypeIdx];
201 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
202 };
203}
204
205static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
206 return [=](const LegalityQuery &Query) {
207 const LLT Ty = Query.Types[TypeIdx];
208 unsigned Size = Ty.getSizeInBits();
209 assert(Size % 32 == 0);
210 return std::pair(
211 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
212 };
213}
214
215static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
216 return [=](const LegalityQuery &Query) {
217 const LLT QueryTy = Query.Types[TypeIdx];
218 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
219 };
220}
221
222static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
223 return [=](const LegalityQuery &Query) {
224 const LLT QueryTy = Query.Types[TypeIdx];
225 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
226 };
227}
228
229static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
230 return [=](const LegalityQuery &Query) {
231 const LLT QueryTy = Query.Types[TypeIdx];
232 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
233 };
234}
235
236static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
237 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
238 Size <= MaxRegisterSize;
239}
240
241static bool isRegisterVectorElementType(LLT EltTy) {
242 const int EltSize = EltTy.getSizeInBits();
243 return EltSize == 16 || EltSize % 32 == 0;
244}
245
246static bool isRegisterVectorType(LLT Ty) {
247 const int EltSize = Ty.getElementType().getSizeInBits();
248 return EltSize == 32 || EltSize == 64 ||
249 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
250 EltSize == 128 || EltSize == 256;
251}
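// For example, <2 x s16>, <4 x s32>, and <2 x s64> are register vector
// types, while <3 x s16> (odd 16-bit element count) and <4 x s8> are not and
// must be widened or bitcast first.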
252
253// TODO: replace all uses of isRegisterType with isRegisterClassType
254static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
255 if (!isRegisterSize(ST, Ty.getSizeInBits()))
256 return false;
257
258 if (Ty.isVector())
259 return isRegisterVectorType(Ty);
260
261 return true;
262}
263
264// Any combination of 32 or 64-bit elements up to the maximum register size,
265// and multiples of v2s16.
266static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
267 unsigned TypeIdx) {
268 return [=, &ST](const LegalityQuery &Query) {
269 return isRegisterType(ST, Query.Types[TypeIdx]);
270 };
271}
272
273// RegisterType that doesn't have a corresponding RegClass.
274// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
275// should be removed.
276static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
277 unsigned TypeIdx) {
278 return [=, &ST](const LegalityQuery &Query) {
279 LLT Ty = Query.Types[TypeIdx];
280 return isRegisterType(ST, Ty) &&
281 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
282 };
283}
284
285static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
286 return [=](const LegalityQuery &Query) {
287 const LLT QueryTy = Query.Types[TypeIdx];
288 if (!QueryTy.isVector())
289 return false;
290 const LLT EltTy = QueryTy.getElementType();
291 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
292 };
293}
294
295constexpr LLT S1 = LLT::scalar(1);
296constexpr LLT S8 = LLT::scalar(8);
297constexpr LLT S16 = LLT::scalar(16);
298constexpr LLT S32 = LLT::scalar(32);
299constexpr LLT F32 = LLT::float32();
300constexpr LLT S64 = LLT::scalar(64);
301constexpr LLT F64 = LLT::float64();
302constexpr LLT S96 = LLT::scalar(96);
303constexpr LLT S128 = LLT::scalar(128);
304constexpr LLT S160 = LLT::scalar(160);
305constexpr LLT S192 = LLT::scalar(192);
306constexpr LLT S224 = LLT::scalar(224);
307constexpr LLT S256 = LLT::scalar(256);
308constexpr LLT S512 = LLT::scalar(512);
309constexpr LLT S1024 = LLT::scalar(1024);
311
312constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
313constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
314constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
315constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
316constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
317constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
318constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
319constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
320
321constexpr LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
322constexpr LLT V2BF16 = V2F16; // FIXME
323
324constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
325constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
326constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
327constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
328constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
329constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
330constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
331constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
332constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
333constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
334constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
335constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
336constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
337
338constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
339constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
340constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
341constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
342constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
343constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
344constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
345constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
346
347constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
348constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
349
350constexpr std::initializer_list<LLT> AllScalarTypes = {
351 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
352
353constexpr std::initializer_list<LLT> AllS16Vectors{
354 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
355
356constexpr std::initializer_list<LLT> AllS32Vectors = {
357 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, V9S32,
358 V10S32, V11S32, V12S32, V16S32, V32S32};
359
360constexpr std::initializer_list<LLT> AllS64Vectors = {
361 V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
362
368
369// Checks whether a type is in the list of legal register types.
370static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
371 if (Ty.isPointerOrPointerVector())
372 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
373
374 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
375 is_contained(AllScalarTypes, Ty) ||
376 (ST.useRealTrue16Insts() && Ty == S16) ||
377 is_contained(AllS16Vectors, Ty);
378}
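// For example, s32, s64, and v2s16 are register class types, s16 only
// qualifies when real true16 instructions are available, and pointer types
// are checked via their integer equivalent (a 64-bit p0 as s64).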
379
380static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
381 unsigned TypeIdx) {
382 return [&ST, TypeIdx](const LegalityQuery &Query) {
383 return isRegisterClassType(ST, Query.Types[TypeIdx]);
384 };
385}
386
387// If we have a truncating store or an extending load with a data size larger
388// than 32-bits, we need to reduce to a 32-bit type.
389static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
390 return [=](const LegalityQuery &Query) {
391 const LLT Ty = Query.Types[TypeIdx];
392 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
393 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
394 };
395}
396
397// If we have a truncating store or an extending load with a data size larger
398// than 32 bits and the memory size is a power of 2.
399static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
400 return [=](const LegalityQuery &Query) {
401 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
402 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
403 isPowerOf2_64(MemSize);
404 };
405}
406
407// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
408// handle some operations by just promoting the register during
409// selection. There are also d16 loads on GFX9+ which preserve the high bits.
410static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
411 bool IsLoad, bool IsAtomic) {
412 switch (AS) {
413 case AMDGPUAS::PRIVATE_ADDRESS:
414 // FIXME: Private element size.
415 return ST.hasFlatScratchEnabled() ? 128 : 32;
416 case AMDGPUAS::LOCAL_ADDRESS:
417 return ST.useDS128() ? 128 : 64;
418 case AMDGPUAS::GLOBAL_ADDRESS:
419 case AMDGPUAS::CONSTANT_ADDRESS:
420 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
421 case AMDGPUAS::BUFFER_RESOURCE:
422 // Treat constant and global as identical. SMRD loads are sometimes usable for
423 // global loads (ideally constant address space should be eliminated)
424 // depending on the context. Legality cannot be context dependent, but
425 // RegBankSelect can split the load as necessary depending on the pointer
426 // register bank/uniformity and if the memory is invariant or not written in a
427 // kernel.
428 return IsLoad ? 512 : 128;
429 default:
430 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
431 // if they may alias scratch depending on the subtarget. This needs to be
432 // moved to custom handling to use addressMayBeAccessedAsPrivate
433 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
434 }
435}
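// For example, a non-atomic global or constant load may be up to 512 bits
// (e.g. a dwordx16 scalar load), while a global store is capped at 128 bits,
// and LDS accesses are limited to 64 bits unless ds_read_b128/ds_write_b128
// can be used.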
436
437static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
438 const LegalityQuery &Query) {
439 const LLT Ty = Query.Types[0];
440
441 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
442 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
443
444 unsigned RegSize = Ty.getSizeInBits();
445 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
446 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
447 unsigned AS = Query.Types[1].getAddressSpace();
448
449 // All of these need to be custom lowered to cast the pointer operand.
450 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
451 return false;
452
453 // Do not handle extending vector loads.
454 if (Ty.isVector() && MemSize != RegSize)
455 return false;
456
457 // TODO: We should be able to widen loads if the alignment is high enough, but
458 // we also need to modify the memory access size.
459#if 0
460 // Accept widening loads based on alignment.
461 if (IsLoad && MemSize < Size)
462 MemSize = std::max(MemSize, Align);
463#endif
464
465 // Only 1-byte and 2-byte to 32-bit extloads are valid.
466 if (MemSize != RegSize && RegSize != 32)
467 return false;
468
469 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
470 Query.MMODescrs[0].Ordering !=
471 AtomicOrdering::NotAtomic))
472 return false;
473
474 switch (MemSize) {
475 case 8:
476 case 16:
477 case 32:
478 case 64:
479 case 128:
480 break;
481 case 96:
482 if (!ST.hasDwordx3LoadStores())
483 return false;
484 break;
485 case 256:
486 case 512:
487 // These may contextually need to be broken down.
488 break;
489 default:
490 return false;
491 }
492
493 assert(RegSize >= MemSize);
494
495 if (AlignBits < MemSize) {
496 const SITargetLowering *TLI = ST.getTargetLowering();
497 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
498 Align(AlignBits / 8)))
499 return false;
500 }
501
502 return true;
503}
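// For example, an extending load of an s8 memory value into an s32 register
// from global memory is accepted, but an extending load of s16 into s64 is
// rejected because only extloads into 32-bit registers are handled here.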
504
505// The newer buffer intrinsic forms take their resource arguments as
506// pointers in address space 8, aka s128 values. However, in order to not break
507// SelectionDAG, the underlying operations have to continue to take v4i32
508// arguments. Therefore, we convert resource pointers - or vectors of them
509// to integer values here.
510static bool hasBufferRsrcWorkaround(const LLT Ty) {
511 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
512 return true;
513 if (Ty.isVector()) {
514 const LLT ElemTy = Ty.getElementType();
515 return hasBufferRsrcWorkaround(ElemTy);
516 }
517 return false;
518}
519
520// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
521// workaround this. Eventually it should ignore the type for loads and only care
522// about the size. Return true in cases where we will workaround this for now by
523// bitcasting.
524static bool loadStoreBitcastWorkaround(const LLT Ty) {
525 if (EnableNewLegality)
526 return false;
527
528 const unsigned Size = Ty.getSizeInBits();
529 if (Ty.isPointerVector())
530 return true;
531 if (Size <= 64)
532 return false;
533 // Address space 8 pointers get their own workaround.
534 if (hasBufferRsrcWorkaround(Ty))
535 return false;
536 if (!Ty.isVector())
537 return true;
538
539 unsigned EltSize = Ty.getScalarSizeInBits();
540 return EltSize != 32 && EltSize != 64;
541}
542
543static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
544 const LLT Ty = Query.Types[0];
545 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
546 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
547}
548
549/// Return true if a load or store of the type should be lowered with a bitcast
550/// to a different type.
551static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
552 const LLT MemTy) {
553 const unsigned MemSizeInBits = MemTy.getSizeInBits();
554 const unsigned Size = Ty.getSizeInBits();
555 if (Size != MemSizeInBits)
556 return Size <= 32 && Ty.isVector();
557
558 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
559 return true;
560
561 // Don't try to handle bitcasting vector ext loads for now.
562 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
563 (Size <= 32 || isRegisterSize(ST, Size)) &&
564 !isRegisterVectorElementType(Ty.getElementType());
565}
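// For example, a <4 x s8> load (32 bits, matching its memory size) is
// bitcast to s32, while a <4 x s32> load is left alone because its element
// type already maps directly to registers.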
566
567/// Return true if we should legalize a load by widening an odd sized memory
568/// access up to the alignment. Note this case when the memory access itself
569/// changes, not the size of the result register.
570static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
571 uint64_t AlignInBits, unsigned AddrSpace,
572 unsigned Opcode) {
573 unsigned SizeInBits = MemoryTy.getSizeInBits();
574 // We don't want to widen cases that are naturally legal.
575 if (isPowerOf2_32(SizeInBits))
576 return false;
577
578 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
579 // end up widening these for a scalar load during RegBankSelect, if we don't
580 // have 96-bit scalar loads.
581 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
582 return false;
583
584 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
585 return false;
586
587 // A load is known dereferenceable up to the alignment, so it's legal to widen
588 // to it.
589 //
590 // TODO: Could check dereferenceable for less aligned cases.
591 unsigned RoundedSize = NextPowerOf2(SizeInBits);
592 if (AlignInBits < RoundedSize)
593 return false;
594
595 // Do not widen if it would introduce a slow unaligned load.
596 const SITargetLowering *TLI = ST.getTargetLowering();
597 unsigned Fast = 0;
598 return TLI->allowsMisalignedMemoryAccessesImpl(
599 RoundedSize, AddrSpace, Align(AlignInBits / 8),
600 MachineMemOperand::MOLoad, &Fast) &&
601 Fast;
602}
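// For example, a 96-bit global load on a subtarget without dwordx3
// load/store that is known to be 128-bit aligned can be widened to a 128-bit
// load, provided the target reports the 128-bit access at that alignment as
// fast.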
603
604static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
605 unsigned Opcode) {
606 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
607 return false;
608
609 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
610 Query.MMODescrs[0].AlignInBits,
611 Query.Types[1].getAddressSpace(), Opcode);
612}
613
614/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
615/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
616/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
617static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
618 MachineRegisterInfo &MRI, unsigned Idx) {
619 MachineOperand &MO = MI.getOperand(Idx);
620
621 const LLT PointerTy = MRI.getType(MO.getReg());
622
623 // Paranoidly prevent us from doing this multiple times.
624 if (!hasBufferRsrcWorkaround(PointerTy))
625 return PointerTy;
626
627 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
628 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
629 if (!PointerTy.isVector()) {
630 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
631 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
632 const LLT S32 = LLT::scalar(32);
633
634 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
635 std::array<Register, 4> VectorElems;
636 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
637 for (unsigned I = 0; I < NumParts; ++I)
638 VectorElems[I] =
639 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
640 B.buildMergeValues(MO, VectorElems);
641 MO.setReg(VectorReg);
642 return VectorTy;
643 }
644 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
645 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
646 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
647 B.buildIntToPtr(MO, Scalar);
648 MO.setReg(BitcastReg);
649
650 return VectorTy;
651}
652
653/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
654/// the form in which the value must be in order to be passed to the low-level
655/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
656/// needed in order to account for the fact that we can't define a register
657/// class for s128 without breaking SelectionDAG.
658static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
659 MachineRegisterInfo &MRI = *B.getMRI();
660 const LLT PointerTy = MRI.getType(Pointer);
661 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
662 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
663
664 if (!PointerTy.isVector()) {
665 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
666 SmallVector<Register, 4> PointerParts;
667 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
668 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
669 for (unsigned I = 0; I < NumParts; ++I)
670 PointerParts.push_back(Unmerged.getReg(I));
671 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
672 }
673 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
674 return B.buildBitcast(VectorTy, Scalar).getReg(0);
675}
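// For example, a scalar p8 resource is unmerged into four s32 values and
// rebuilt as <4 x s32>, while a <2 x p8> operand goes through a ptrtoint to
// <2 x s128> followed by a bitcast to <8 x s32>.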
676
677static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
678 unsigned Idx) {
679 MachineOperand &MO = MI.getOperand(Idx);
680
681 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
682 // Paranoidly prevent us from doing this multiple times.
683 if (!hasBufferRsrcWorkaround(PointerTy))
684 return;
685 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
686}
687
688AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
689 const GCNTargetMachine &TM)
690 : ST(ST_) {
691 using namespace TargetOpcode;
692
693 auto GetAddrSpacePtr = [&TM](unsigned AS) {
694 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
695 };
696
697 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
698 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
699 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
700 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
701 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
702 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
703 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
704 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
705 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
706 const LLT BufferStridedPtr =
707 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
708
709 const LLT CodePtr = FlatPtr;
710
711 const std::initializer_list<LLT> AddrSpaces64 = {
712 GlobalPtr, ConstantPtr, FlatPtr
713 };
714
715 const std::initializer_list<LLT> AddrSpaces32 = {
716 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
717 };
718
719 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
720
721 const std::initializer_list<LLT> FPTypesBase = {
722 S32, S64
723 };
724
725 const std::initializer_list<LLT> FPTypes16 = {
726 S32, S64, S16
727 };
728
729 const std::initializer_list<LLT> FPTypesPK16 = {
730 S32, S64, S16, V2S16
731 };
732
733 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
734
735 // s1 for VCC branches, s32 for SCC branches.
737
738 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
739 // elements for v3s16
740 getActionDefinitionsBuilder(G_PHI)
741 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
742 .legalFor(AllS32Vectors)
743 .legalFor(AllS64Vectors)
744 .legalFor(AddrSpaces64)
745 .legalFor(AddrSpaces32)
746 .legalFor(AddrSpaces128)
747 .legalIf(isPointer(0))
748 .clampScalar(0, S16, S256)
749 .widenScalarToNextPow2(0, 32)
750 .clampMaxNumElements(0, S32, 16)
751 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
752 .scalarize(0);
753
754 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
755 // Full set of gfx9 features.
756 if (ST.hasScalarAddSub64()) {
757 getActionDefinitionsBuilder({G_ADD, G_SUB})
758 .legalFor({S64, S32, S16, V2S16})
759 .clampMaxNumElementsStrict(0, S16, 2)
760 .scalarize(0)
761 .minScalar(0, S16)
763 .maxScalar(0, S32);
764 } else {
765 getActionDefinitionsBuilder({G_ADD, G_SUB})
766 .legalFor({S32, S16, V2S16})
767 .clampMaxNumElementsStrict(0, S16, 2)
768 .scalarize(0)
769 .minScalar(0, S16)
771 .maxScalar(0, S32);
772 }
773
774 if (ST.hasScalarSMulU64()) {
776 .legalFor({S64, S32, S16, V2S16})
777 .clampMaxNumElementsStrict(0, S16, 2)
778 .scalarize(0)
779 .minScalar(0, S16)
781 .custom();
782 } else {
784 .legalFor({S32, S16, V2S16})
785 .clampMaxNumElementsStrict(0, S16, 2)
786 .scalarize(0)
787 .minScalar(0, S16)
789 .custom();
790 }
791 assert(ST.hasMad64_32());
792
793 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
794 .legalFor({S32, S16, V2S16}) // Clamp modifier
795 .minScalarOrElt(0, S16)
797 .scalarize(0)
799 .lower();
800 } else if (ST.has16BitInsts()) {
801 getActionDefinitionsBuilder({G_ADD, G_SUB})
802 .legalFor({S32, S16})
803 .minScalar(0, S16)
805 .maxScalar(0, S32)
806 .scalarize(0);
807
809 .legalFor({S32, S16})
810 .scalarize(0)
811 .minScalar(0, S16)
813 .custom();
814 assert(ST.hasMad64_32());
815
816 // Technically the saturating operations require clamp bit support, but this
817 // was introduced at the same time as 16-bit operations.
818 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
819 .legalFor({S32, S16}) // Clamp modifier
820 .minScalar(0, S16)
821 .scalarize(0)
823 .lower();
824
825 // We're just lowering this, but it helps get a better result to try to
826 // coerce to the desired type first.
827 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
828 .minScalar(0, S16)
829 .scalarize(0)
830 .lower();
831 } else {
832 getActionDefinitionsBuilder({G_ADD, G_SUB})
833 .legalFor({S32})
834 .widenScalarToNextMultipleOf(0, 32)
835 .clampScalar(0, S32, S32)
836 .scalarize(0);
837
838 auto &Mul = getActionDefinitionsBuilder(G_MUL)
839 .legalFor({S32})
840 .scalarize(0)
841 .minScalar(0, S32)
843
844 if (ST.hasMad64_32())
845 Mul.custom();
846 else
847 Mul.maxScalar(0, S32);
848
849 if (ST.hasIntClamp()) {
850 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
851 .legalFor({S32}) // Clamp modifier.
852 .scalarize(0)
854 .lower();
855 } else {
856 // Clamp bit support was added in VI, along with 16-bit operations.
857 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
858 .minScalar(0, S32)
859 .scalarize(0)
860 .lower();
861 }
862
863 // FIXME: DAG expansion gets better results. The widening uses the smaller
864 // range values and goes for the min/max lowering directly.
865 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
866 .minScalar(0, S32)
867 .scalarize(0)
868 .lower();
869 }
870
872 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
873 .customFor({S32, S64})
874 .clampScalar(0, S32, S64)
876 .scalarize(0);
877
878 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
879 .legalFor({S32})
880 .maxScalar(0, S32);
881
882 if (ST.hasVOP3PInsts()) {
883 Mulh
884 .clampMaxNumElements(0, S8, 2)
885 .lowerFor({V2S8});
886 }
887
888 Mulh
889 .scalarize(0)
890 .lower();
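// For example, an s64 G_UMULH is narrowed to 32-bit pieces here, since only
// the s32 form maps to hardware; anything still not legal after that is
// lowered by the generic LegalizerHelper.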
891
892 // Report legal for any types we can handle anywhere. For the cases only legal
893 // on the SALU, RegBankSelect will be able to re-legalize.
894 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
895 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
896 .clampScalar(0, S32, S64)
902 .scalarize(0);
903
905 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
906 .legalFor({{S32, S1}, {S32, S32}})
907 .clampScalar(0, S32, S32)
908 .scalarize(0);
909
911 // Don't worry about the size constraint.
913 .lower();
914
916 .legalFor({S1, S32, S64, S16, GlobalPtr,
917 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
918 .legalIf(isPointer(0))
919 .clampScalar(0, S32, S64)
921
922 getActionDefinitionsBuilder(G_FCONSTANT)
923 .legalFor({S32, S64, S16})
924 .clampScalar(0, S16, S64);
925
926 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
927 .legalIf(isRegisterClassType(ST, 0))
928 // s1 and s16 are special cases because they have legal operations on
929 // them, but don't really occupy registers in the normal way.
930 .legalFor({S1, S16})
931 .clampNumElements(0, V16S32, V32S32)
935 .clampMaxNumElements(0, S32, 16);
936
937 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
938
939 // If the amount is divergent, we have to do a wave reduction to get the
940 // maximum value, so this is expanded during RegBankSelect.
941 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
942 .legalFor({{PrivatePtr, S32}});
943
944 getActionDefinitionsBuilder(G_STACKSAVE)
945 .customFor({PrivatePtr});
946 getActionDefinitionsBuilder(G_STACKRESTORE)
947 .legalFor({PrivatePtr});
948
949 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
950
951 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
952 .customIf(typeIsNot(0, PrivatePtr));
953
954 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
955
956 auto &FPOpActions = getActionDefinitionsBuilder(
957 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
958 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
959 .legalFor({S32, S64});
960 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
961 .customFor({S32, S64});
962 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
963 .customFor({S32, S64});
964
965 if (ST.has16BitInsts()) {
966 if (ST.hasVOP3PInsts())
967 FPOpActions.legalFor({S16, V2S16});
968 else
969 FPOpActions.legalFor({S16});
970
971 TrigActions.customFor({S16});
972 FDIVActions.customFor({S16});
973 }
974
975 if (ST.hasPackedFP32Ops()) {
976 FPOpActions.legalFor({V2S32});
977 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
978 }
979
980 auto &MinNumMaxNumIeee =
981 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
982
983 if (ST.hasVOP3PInsts()) {
984 MinNumMaxNumIeee.legalFor(FPTypesPK16)
985 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
986 .clampMaxNumElements(0, S16, 2)
987 .clampScalar(0, S16, S64)
988 .scalarize(0);
989 } else if (ST.has16BitInsts()) {
990 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
991 } else {
992 MinNumMaxNumIeee.legalFor(FPTypesBase)
993 .clampScalar(0, S32, S64)
994 .scalarize(0);
995 }
996
997 auto &MinNumMaxNum = getActionDefinitionsBuilder(
998 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
999
1000 if (ST.hasVOP3PInsts()) {
1001 MinNumMaxNum.customFor(FPTypesPK16)
1002 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1003 .clampMaxNumElements(0, S16, 2)
1004 .clampScalar(0, S16, S64)
1005 .scalarize(0);
1006 } else if (ST.has16BitInsts()) {
1007 MinNumMaxNum.customFor(FPTypes16)
1008 .clampScalar(0, S16, S64)
1009 .scalarize(0);
1010 } else {
1011 MinNumMaxNum.customFor(FPTypesBase)
1012 .clampScalar(0, S32, S64)
1013 .scalarize(0);
1014 }
1015
1016 if (ST.hasVOP3PInsts())
1017 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1018
1019 FPOpActions
1020 .scalarize(0)
1021 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1022
1023 TrigActions
1024 .scalarize(0)
1025 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1026
1027 FDIVActions
1028 .scalarize(0)
1029 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1030
1031 getActionDefinitionsBuilder({G_FNEG, G_FABS})
1032 .legalFor(FPTypesPK16)
1034 .scalarize(0)
1035 .clampScalar(0, S16, S64);
1036
1037 if (ST.has16BitInsts()) {
1039 .legalFor({S16})
1040 .customFor({S32, S64})
1041 .scalarize(0)
1042 .unsupported();
1044 .legalFor({S32, S64, S16})
1045 .scalarize(0)
1046 .clampScalar(0, S16, S64);
1047
1048 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1049 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1050 .scalarize(0)
1051 .maxScalarIf(typeIs(0, S16), 1, S16)
1052 .clampScalar(1, S32, S32)
1053 .lower();
1054
1056 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1057 .scalarize(0)
1058 .lower();
1059 } else {
1061 .customFor({S32, S64, S16})
1062 .scalarize(0)
1063 .unsupported();
1064
1065
1066 if (ST.hasFractBug()) {
1068 .customFor({S64})
1069 .legalFor({S32, S64})
1070 .scalarize(0)
1071 .clampScalar(0, S32, S64);
1072 } else {
1074 .legalFor({S32, S64})
1075 .scalarize(0)
1076 .clampScalar(0, S32, S64);
1077 }
1078
1079 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1080 .legalFor({{S32, S32}, {S64, S32}})
1081 .scalarize(0)
1082 .clampScalar(0, S32, S64)
1083 .clampScalar(1, S32, S32)
1084 .lower();
1085
1087 .customFor({{S32, S32}, {S64, S32}})
1088 .scalarize(0)
1089 .minScalar(0, S32)
1090 .clampScalar(1, S32, S32)
1091 .lower();
1092 }
1093
1094 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1095 if (ST.hasCvtPkF16F32Inst()) {
1096 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1097 .clampMaxNumElements(0, S16, 2);
1098 } else {
1099 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1100 }
1101 FPTruncActions.scalarize(0).lower();
1102
1104 .legalFor({{S64, S32}, {S32, S16}})
1105 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1106 .scalarize(0);
1107
1108 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1109 if (ST.has16BitInsts()) {
1110 FSubActions
1111 // Use actual fsub instruction
1112 .legalFor({S32, S16})
1113 // Must use fadd + fneg
1114 .lowerFor({S64, V2S16});
1115 } else {
1116 FSubActions
1117 // Use actual fsub instruction
1118 .legalFor({S32})
1119 // Must use fadd + fneg
1120 .lowerFor({S64, S16, V2S16});
1121 }
1122
1123 FSubActions
1124 .scalarize(0)
1125 .clampScalar(0, S32, S64);
1126
1127 // Whether this is legal depends on the floating point mode for the function.
1128 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1129 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1130 FMad.customFor({S32, S16});
1131 else if (ST.hasMadMacF32Insts())
1132 FMad.customFor({S32});
1133 else if (ST.hasMadF16())
1134 FMad.customFor({S16});
1135 FMad.scalarize(0)
1136 .lower();
1137
1138 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1139 if (ST.has16BitInsts()) {
1140 FRem.customFor({S16, S32, S64});
1141 } else {
1142 FRem.minScalar(0, S32)
1143 .customFor({S32, S64});
1144 }
1145 FRem.scalarize(0);
1146
1147 // TODO: Do we need to clamp maximum bitwidth?
1149 .legalIf(isScalar(0))
1150 .legalFor({{V2S16, V2S32}})
1151 .clampMaxNumElements(0, S16, 2)
1152 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1153 // situations (like an invalid implicit use), we don't want to infinite loop
1154 // in the legalizer.
1156 .alwaysLegal();
1157
1158 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1159 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1160 {S32, S1}, {S64, S1}, {S16, S1}})
1161 .scalarize(0)
1162 .clampScalar(0, S32, S64)
1163 .widenScalarToNextPow2(1, 32);
1164
1165 // TODO: Split s1->s64 during regbankselect for VALU.
1166 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1167 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1168 .lowerIf(typeIs(1, S1))
1169 .customFor({{S32, S64}, {S64, S64}});
1170 if (ST.has16BitInsts())
1171 IToFP.legalFor({{S16, S16}});
1172 IToFP.clampScalar(1, S32, S64)
1173 .minScalar(0, S32)
1174 .scalarize(0)
1176
1177 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1178 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1179 .customFor({{S64, S32}, {S64, S64}})
1180 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1181 if (ST.has16BitInsts())
1182 FPToI.legalFor({{S16, S16}});
1183 else
1184 FPToI.minScalar(1, S32);
1185
1186 FPToI.minScalar(0, S32)
1187 .widenScalarToNextPow2(0, 32)
1188 .scalarize(0)
1189 .lower();
1190
1191 // clang-format off
1192 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1193 .legalFor({{S32, S32}, {S32, S64}})
1194 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1195 FPToISat.minScalar(1, S32);
1196 FPToISat.minScalar(0, S32)
1197 .widenScalarToNextPow2(0, 32)
1198 .scalarize(0)
1199 .lower();
1200 // clang-format on
1201
1202 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1203 .clampScalar(0, S16, S64)
1204 .scalarize(0)
1205 .lower();
1206
1207 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1208 .legalFor({S16, S32})
1209 .scalarize(0)
1210 .lower();
1211
1212 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1213 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1214 .scalarize(0)
1215 .lower();
1216
1217 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1218 .clampScalar(0, S16, S64)
1219 .scalarize(0)
1220 .lower();
1221
1222 if (ST.has16BitInsts()) {
1224 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1225 .legalFor({S16, S32, S64})
1226 .clampScalar(0, S16, S64)
1227 .scalarize(0);
1228 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1230 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1231 .legalFor({S32, S64})
1232 .clampScalar(0, S32, S64)
1233 .scalarize(0);
1234 } else {
1236 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1237 .legalFor({S32})
1238 .customFor({S64})
1239 .clampScalar(0, S32, S64)
1240 .scalarize(0);
1241 }
1242
1244 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1245 .legalIf(all(isPointer(0), sameSize(0, 1)))
1246 .scalarize(0)
1247 .scalarSameSizeAs(1, 0);
1248
1250 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1251 .scalarSameSizeAs(1, 0)
1252 .scalarize(0);
1253
1254 auto &CmpBuilder =
1256 // The compare output type differs based on the register bank of the output,
1257 // so make both s1 and s32 legal.
1258 //
1259 // Scalar compares producing output in scc will be promoted to s32, as that
1260 // is the allocatable register type that will be needed for the copy from
1261 // scc. This will be promoted during RegBankSelect, and we assume something
1262 // before that won't try to use s32 result types.
1263 //
1264 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1265 // bank.
1267 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1268 .legalForCartesianProduct(
1269 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1270 if (ST.has16BitInsts()) {
1271 CmpBuilder.legalFor({{S1, S16}});
1272 }
1273
1274 CmpBuilder
1276 .clampScalar(1, S32, S64)
1277 .scalarize(0)
1278 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1279
1280 auto &FCmpBuilder =
1282 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1283
1284 if (ST.hasSALUFloatInsts())
1285 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1286
1287 FCmpBuilder
1289 .clampScalar(1, S32, S64)
1290 .scalarize(0);
1291
1292 // FIXME: fpow has a selection pattern that should move to custom lowering.
1293 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1294 if (ST.has16BitInsts())
1295 ExpOps.customFor({{S32}, {S16}});
1296 else
1297 ExpOps.customFor({S32});
1298 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1299 .scalarize(0);
1300
1302 .clampScalar(0, MinScalarFPTy, S32)
1303 .lower();
1304
1305 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1306 Log2Ops.customFor({S32});
1307 if (ST.has16BitInsts())
1308 Log2Ops.legalFor({S16});
1309 else
1310 Log2Ops.customFor({S16});
1311 Log2Ops.scalarize(0)
1312 .lower();
1313
1314 auto &LogOps =
1315 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1316 LogOps.customFor({S32, S16});
1317 LogOps.clampScalar(0, MinScalarFPTy, S32)
1318 .scalarize(0);
1319
1320 // The 64-bit versions produce 32-bit results, but only on the SALU.
1322 .legalFor({{S32, S32}, {S32, S64}})
1323 .clampScalar(0, S32, S32)
1324 .widenScalarToNextPow2(1, 32)
1325 .clampScalar(1, S32, S64)
1326 .scalarize(0)
1327 .widenScalarToNextPow2(0, 32);
1328
1329 // If no 16 bit instr is available, lower into different instructions.
1330 if (ST.has16BitInsts())
1331 getActionDefinitionsBuilder(G_IS_FPCLASS)
1332 .legalForCartesianProduct({S1}, FPTypes16)
1333 .widenScalarToNextPow2(1)
1334 .scalarize(0)
1335 .lower();
1336 else
1337 getActionDefinitionsBuilder(G_IS_FPCLASS)
1338 .legalForCartesianProduct({S1}, FPTypesBase)
1339 .lowerFor({S1, S16})
1340 .widenScalarToNextPow2(1)
1341 .scalarize(0)
1342 .lower();
1343
1344 // The hardware instructions return a different result on 0 than the generic
1345 // instructions expect. The hardware produces -1, but these produce the
1346 // bitwidth.
1347 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1348 .scalarize(0)
1349 .clampScalar(0, S32, S32)
1350 .clampScalar(1, S32, S64)
1351 .widenScalarToNextPow2(0, 32)
1352 .widenScalarToNextPow2(1, 32)
1353 .custom();
1354
1355 // The 64-bit versions produce 32-bit results, but only on the SALU.
1356 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1357 .legalFor({{S32, S32}, {S32, S64}})
1358 .customIf(scalarNarrowerThan(1, 32))
1359 .clampScalar(0, S32, S32)
1360 .clampScalar(1, S32, S64)
1361 .scalarize(0)
1362 .widenScalarToNextPow2(0, 32)
1363 .widenScalarToNextPow2(1, 32);
1364
1365 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1366 .legalFor({{S32, S32}, {S32, S64}})
1367 .clampScalar(0, S32, S32)
1368 .clampScalar(1, S32, S64)
1369 .scalarize(0)
1370 .widenScalarToNextPow2(0, 32)
1371 .widenScalarToNextPow2(1, 32);
1372
1373 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1374 // RegBankSelect.
1375 getActionDefinitionsBuilder(G_BITREVERSE)
1376 .legalFor({S32, S64})
1377 .clampScalar(0, S32, S64)
1378 .scalarize(0)
1380
1381 if (ST.has16BitInsts()) {
1383 .legalFor({S16, S32, V2S16})
1384 .clampMaxNumElementsStrict(0, S16, 2)
1385 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1386 // narrowScalar limitation.
1388 .clampScalar(0, S16, S32)
1389 .scalarize(0);
1390
1391 if (ST.hasVOP3PInsts()) {
1393 .legalFor({S32, S16, V2S16})
1394 .clampMaxNumElements(0, S16, 2)
1395 .minScalar(0, S16)
1397 .scalarize(0)
1398 .lower();
1399 if (ST.hasIntMinMax64()) {
1400 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1401 .legalFor({S32, S16, S64, V2S16})
1402 .clampMaxNumElements(0, S16, 2)
1403 .minScalar(0, S16)
1405 .scalarize(0)
1406 .lower();
1407 } else {
1408 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1409 .legalFor({S32, S16, V2S16})
1410 .clampMaxNumElements(0, S16, 2)
1411 .minScalar(0, S16)
1413 .scalarize(0)
1414 .lower();
1415 }
1416 } else {
1417 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1418 .legalFor({S32, S16})
1419 .widenScalarToNextPow2(0)
1420 .minScalar(0, S16)
1421 .scalarize(0)
1422 .lower();
1423 }
1424 } else {
1425 // TODO: Should have same legality without v_perm_b32
1427 .legalFor({S32})
1428 .lowerIf(scalarNarrowerThan(0, 32))
1429 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1430 // narrowScalar limitation.
1432 .maxScalar(0, S32)
1433 .scalarize(0)
1434 .lower();
1435
1436 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1437 .legalFor({S32})
1438 .minScalar(0, S32)
1440 .scalarize(0)
1441 .lower();
1442 }
1443
1444 getActionDefinitionsBuilder(G_INTTOPTR)
1445 // List the common cases
1446 .legalForCartesianProduct(AddrSpaces64, {S64})
1447 .legalForCartesianProduct(AddrSpaces32, {S32})
1448 .scalarize(0)
1449 // Accept any address space as long as the size matches
1450 .legalIf(sameSize(0, 1))
1452 [](const LegalityQuery &Query) {
1453 return std::pair(
1454 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1455 })
1456 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1457 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1458 });
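// For example, G_INTTOPTR from s64 to p1 (global) or from s32 to p3 (LDS) is
// legal as-is; a narrower source such as s16 for a 32-bit address space is
// first widened to the pointer width, and a wider source is narrowed, since
// only the low pointer-sized bits matter.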
1459
1460 getActionDefinitionsBuilder(G_PTRTOINT)
1461 // List the common cases
1462 .legalForCartesianProduct(AddrSpaces64, {S64})
1463 .legalForCartesianProduct(AddrSpaces32, {S32})
1464 .scalarize(0)
1465 // Accept any address space as long as the size matches
1466 .legalIf(sameSize(0, 1))
1468 [](const LegalityQuery &Query) {
1469 return std::pair(
1470 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1471 })
1472 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1473 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1474 });
1475
1476 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1477 .scalarize(0)
1478 .custom();
1479
1480 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1481 bool IsLoad) -> bool {
1482 const LLT DstTy = Query.Types[0];
1483
1484 // Split vector extloads.
1485 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1486
1487 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1488 return true;
1489
1490 const LLT PtrTy = Query.Types[1];
1491 unsigned AS = PtrTy.getAddressSpace();
1492 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1493 Query.MMODescrs[0].Ordering !=
1494 AtomicOrdering::NotAtomic))
1495 return true;
1496
1497 // Catch weird sized loads that don't evenly divide into the access sizes
1498 // TODO: May be able to widen depending on alignment etc.
1499 unsigned NumRegs = (MemSize + 31) / 32;
1500 if (NumRegs == 3) {
1501 if (!ST.hasDwordx3LoadStores())
1502 return true;
1503 } else {
1504 // If the alignment allows, these should have been widened.
1505 if (!isPowerOf2_32(NumRegs))
1506 return true;
1507 }
1508
1509 return false;
1510 };
1511
1512 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1513 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1514 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1515
1516 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1517 // LDS
1518 // TODO: Unsupported flat for SI.
1519
1520 for (unsigned Op : {G_LOAD, G_STORE}) {
1521 const bool IsStore = Op == G_STORE;
1522
1523 auto &Actions = getActionDefinitionsBuilder(Op);
1524 // Explicitly list some common cases.
1525 // TODO: Does this help compile time at all?
1526 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1527 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1528 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1529 {S64, GlobalPtr, S64, GlobalAlign32},
1530 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1531 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1532 {S32, GlobalPtr, S8, GlobalAlign8},
1533 {S32, GlobalPtr, S16, GlobalAlign16},
1534
1535 {S32, LocalPtr, S32, 32},
1536 {S64, LocalPtr, S64, 32},
1537 {V2S32, LocalPtr, V2S32, 32},
1538 {S32, LocalPtr, S8, 8},
1539 {S32, LocalPtr, S16, 16},
1540 {V2S16, LocalPtr, S32, 32},
1541
1542 {S32, PrivatePtr, S32, 32},
1543 {S32, PrivatePtr, S8, 8},
1544 {S32, PrivatePtr, S16, 16},
1545 {V2S16, PrivatePtr, S32, 32},
1546
1547 {S32, ConstantPtr, S32, GlobalAlign32},
1548 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1549 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1550 {S64, ConstantPtr, S64, GlobalAlign32},
1551 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1552 Actions.legalIf(
1553 [=](const LegalityQuery &Query) -> bool {
1554 return isLoadStoreLegal(ST, Query);
1555 });
1556
1557 // The custom pointers (fat pointers, buffer resources) don't work with load
1558 // and store at this level. Fat pointers should have been lowered to
1559 // intrinsics before the translation to MIR.
1560 Actions.unsupportedIf(
1561 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1562
1563 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1564 // ptrtoint. This is needed to account for the fact that we can't have i128
1565 // as a register class for SelectionDAG reasons.
1566 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1567 return hasBufferRsrcWorkaround(Query.Types[0]);
1568 });
1569
1570 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1571 // 64-bits.
1572 //
1573 // TODO: Should generalize bitcast action into coerce, which will also cover
1574 // inserting addrspacecasts.
1575 Actions.customIf(typeIs(1, Constant32Ptr));
1576
1577 // Turn any illegal element vectors into something easier to deal
1578 // with. These will ultimately produce 32-bit scalar shifts to extract the
1579 // parts anyway.
1580 //
1581 // For odd 16-bit element vectors, prefer to split those into pieces with
1582 // 16-bit vector parts.
1583 Actions.bitcastIf(
1584 [=](const LegalityQuery &Query) -> bool {
1585 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1586 Query.MMODescrs[0].MemoryTy);
1587 }, bitcastToRegisterType(0));
1588
1589 if (!IsStore) {
1590 // Widen suitably aligned loads by loading extra bytes. The standard
1591 // legalization actions can't properly express widening memory operands.
1592 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1593 return shouldWidenLoad(ST, Query, G_LOAD);
1594 });
1595 }
1596
1597 // FIXME: load/store narrowing should be moved to lower action
1598 Actions
1599 .narrowScalarIf(
1600 [=](const LegalityQuery &Query) -> bool {
1601 return !Query.Types[0].isVector() &&
1602 needToSplitMemOp(Query, Op == G_LOAD);
1603 },
1604 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1605 const LLT DstTy = Query.Types[0];
1606 const LLT PtrTy = Query.Types[1];
1607
1608 const unsigned DstSize = DstTy.getSizeInBits();
1609 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1610
1611 // Split extloads.
1612 if (DstSize > MemSize)
1613 return std::pair(0, LLT::scalar(MemSize));
1614
1615 unsigned MaxSize = maxSizeForAddrSpace(
1616 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1617 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1618 if (MemSize > MaxSize)
1619 return std::pair(0, LLT::scalar(MaxSize));
1620
1621 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1622 return std::pair(0, LLT::scalar(Align));
1623 })
1624 .fewerElementsIf(
1625 [=](const LegalityQuery &Query) -> bool {
1626 return Query.Types[0].isVector() &&
1627 needToSplitMemOp(Query, Op == G_LOAD);
1628 },
1629 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1630 const LLT DstTy = Query.Types[0];
1631 const LLT PtrTy = Query.Types[1];
1632
1633 LLT EltTy = DstTy.getElementType();
1634 unsigned MaxSize = maxSizeForAddrSpace(
1635 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1636 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1637
1638 // FIXME: Handle widened to power of 2 results better. This ends
1639 // up scalarizing.
1640 // FIXME: 3 element stores scalarized on SI
1641
1642 // Split if it's too large for the address space.
1643 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1644 if (MemSize > MaxSize) {
1645 unsigned NumElts = DstTy.getNumElements();
1646 unsigned EltSize = EltTy.getSizeInBits();
1647
1648 if (MaxSize % EltSize == 0) {
1649 return std::pair(
1650 0, LLT::scalarOrVector(
1651 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1652 }
1653
1654 unsigned NumPieces = MemSize / MaxSize;
1655
1656 // FIXME: Refine when odd breakdowns handled
1657 // The scalars will need to be re-legalized.
1658 if (NumPieces == 1 || NumPieces >= NumElts ||
1659 NumElts % NumPieces != 0)
1660 return std::pair(0, EltTy);
1661
1662 return std::pair(0,
1663 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1664 }
1665
1666 // FIXME: We could probably handle weird extending loads better.
1667 if (DstTy.getSizeInBits() > MemSize)
1668 return std::pair(0, EltTy);
1669
1670 unsigned EltSize = EltTy.getSizeInBits();
1671 unsigned DstSize = DstTy.getSizeInBits();
1672 if (!isPowerOf2_32(DstSize)) {
1673 // We're probably decomposing an odd sized store. Try to split
1674 // to the widest type. TODO: Account for alignment. As-is it
1675 // should be OK, since the new parts will be further legalized.
1676 unsigned FloorSize = llvm::bit_floor(DstSize);
1677 return std::pair(
1678 0, LLT::scalarOrVector(
1679 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1680 }
1681
1682 // May need relegalization for the scalars.
1683 return std::pair(0, EltTy);
1684 })
1685 .minScalar(0, S32)
1686 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1688 .widenScalarToNextPow2(0)
1689 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1690 .lower();
1691 }
1692
1693 // FIXME: Unaligned accesses not lowered.
1694 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1695 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1696 {S32, GlobalPtr, S16, 2 * 8},
1697 {S32, LocalPtr, S8, 8},
1698 {S32, LocalPtr, S16, 16},
1699 {S32, PrivatePtr, S8, 8},
1700 {S32, PrivatePtr, S16, 16},
1701 {S32, ConstantPtr, S8, 8},
1702 {S32, ConstantPtr, S16, 2 * 8}})
1703 .legalIf(
1704 [=](const LegalityQuery &Query) -> bool {
1705 return isLoadStoreLegal(ST, Query);
1706 });
1707
1708 if (ST.hasFlatAddressSpace()) {
1709 ExtLoads.legalForTypesWithMemDesc(
1710 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1711 }
1712
1713 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1714 // 64-bits.
1715 //
1716 // TODO: Should generalize bitcast action into coerce, which will also cover
1717 // inserting addrspacecasts.
1718 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1719
1720 ExtLoads.clampScalar(0, S32, S32)
1722 .lower();
1723
1724 auto &Atomics = getActionDefinitionsBuilder(
1725 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1726 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1727 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1728 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1729 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1730 {S64, GlobalPtr}, {S64, LocalPtr},
1731 {S32, RegionPtr}, {S64, RegionPtr}});
1732 if (ST.hasFlatAddressSpace()) {
1733 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1734 }
1735
1736 auto &Atomics32 =
1737 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1738 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1739 if (ST.hasFlatAddressSpace()) {
1740 Atomics32.legalFor({{S32, FlatPtr}});
1741 }
1742
1743 // TODO: v2bf16 operations, and fat buffer pointer support.
1744 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1745 if (ST.hasLDSFPAtomicAddF32()) {
1746 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1747 if (ST.hasLdsAtomicAddF64())
1748 Atomic.legalFor({{S64, LocalPtr}});
1749 if (ST.hasAtomicDsPkAdd16Insts())
1750 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1751 }
1752 if (ST.hasAtomicFaddInsts())
1753 Atomic.legalFor({{S32, GlobalPtr}});
1754 if (ST.hasFlatAtomicFaddF32Inst())
1755 Atomic.legalFor({{S32, FlatPtr}});
1756
1757 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1758 // These are legal with some caveats, and should have undergone expansion in
1759 // the IR in most situations
1760 // TODO: Move atomic expansion into legalizer
1761 Atomic.legalFor({
1762 {S32, GlobalPtr},
1763 {S64, GlobalPtr},
1764 {S64, FlatPtr}
1765 });
1766 }
1767
1768 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1769 ST.hasAtomicBufferGlobalPkAddF16Insts())
1770 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1771 if (ST.hasAtomicGlobalPkAddBF16Inst())
1772 Atomic.legalFor({{V2BF16, GlobalPtr}});
1773 if (ST.hasAtomicFlatPkAdd16Insts())
1774 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1775
1776
1777 // Most of the legalization work here is done by AtomicExpand. We could
1778 // probably use a simpler legality rule that just assumes anything is OK.
1779 auto &AtomicFMinFMax =
1780 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1781 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1782
1783 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1784 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1785 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1786 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1787 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1788 AtomicFMinFMax.legalFor({F32, FlatPtr});
1789 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1790 AtomicFMinFMax.legalFor({F64, FlatPtr});
1791
1792 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1793 // demarshalling
1794 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1795 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1796 {S32, FlatPtr}, {S64, FlatPtr}})
1797 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1798 {S32, RegionPtr}, {S64, RegionPtr}});
1799 // TODO: Pointer types, any 32-bit or 64-bit vector
1800
1801 // Condition should be s32 for scalar, s1 for vector.
1804 LocalPtr, FlatPtr, PrivatePtr,
1805 LLT::fixed_vector(2, LocalPtr),
1806 LLT::fixed_vector(2, PrivatePtr)},
1807 {S1, S32})
1808 .clampScalar(0, S16, S64)
1809 .scalarize(1)
1812 .clampMaxNumElements(0, S32, 2)
1813 .clampMaxNumElements(0, LocalPtr, 2)
1814 .clampMaxNumElements(0, PrivatePtr, 2)
1815 .scalarize(0)
1817 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1818
1819 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1820 // be more flexible with the shift amount type.
1821 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1822 .legalFor({{S32, S32}, {S64, S32}});
1823 if (ST.has16BitInsts()) {
1824 if (ST.hasVOP3PInsts()) {
1825 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1826 .clampMaxNumElements(0, S16, 2);
1827 } else
1828 Shifts.legalFor({{S16, S16}});
1829
1830 // TODO: Support 16-bit shift amounts for all types
1831 Shifts.widenScalarIf(
1832 [=](const LegalityQuery &Query) {
1833 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1834 // 32-bit amount.
1835 const LLT ValTy = Query.Types[0];
1836 const LLT AmountTy = Query.Types[1];
1837 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1838 AmountTy.getSizeInBits() < 16;
1839 }, changeTo(1, S16));
1840 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1841 Shifts.clampScalar(1, S32, S32);
1842 Shifts.widenScalarToNextPow2(0, 16);
1843 Shifts.clampScalar(0, S16, S64);
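// Illustrative summary of where the rules above converge (a sketch, not an
// exhaustive trace of the rule ordering): with 16-bit instructions available,
// a 16-bit shift ends up as {s16, s16}, sub-16-bit values are widened to s16
// first, and 32/64-bit shifts keep a 32-bit amount, i.e. {s32, s32} and
// {s64, s32}.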
1844
1845 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1846 .minScalar(0, S16)
1847 .scalarize(0)
1848 .lower();
1849 } else {
1850 // Make sure we legalize the shift amount type first, as the general
1851 // expansion for the shifted type will produce much worse code if it hasn't
1852 // been truncated already.
1853 Shifts.clampScalar(1, S32, S32);
1854 Shifts.widenScalarToNextPow2(0, 32);
1855 Shifts.clampScalar(0, S32, S64);
1856
1857 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1858 .minScalar(0, S32)
1859 .scalarize(0)
1860 .lower();
1861 }
1862 Shifts.scalarize(0);
1863
1864 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1865 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1866 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1867 unsigned IdxTypeIdx = 2;
1868
1869 getActionDefinitionsBuilder(Op)
1870 .customIf([=](const LegalityQuery &Query) {
1871 const LLT EltTy = Query.Types[EltTypeIdx];
1872 const LLT VecTy = Query.Types[VecTypeIdx];
1873 const LLT IdxTy = Query.Types[IdxTypeIdx];
1874 const unsigned EltSize = EltTy.getSizeInBits();
1875 const bool isLegalVecType =
1877 // Address space 8 pointers are 128-bit wide values, but the logic
1878 // below will try to bitcast them to 2N x s64, which will fail.
1879 // Therefore, as an intermediate step, wrap the extract/insert by
1880 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1881 // extraction result) so that it becomes a vector operation that the
1882 // logic below can handle.
1883 if (EltTy.isPointer() && EltSize > 64)
1884 return true;
1885 return (EltSize == 32 || EltSize == 64) &&
1886 VecTy.getSizeInBits() % 32 == 0 &&
1887 VecTy.getSizeInBits() <= MaxRegisterSize &&
1888 IdxTy.getSizeInBits() == 32 &&
1889 isLegalVecType;
1890 })
1891 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1892 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1893 bitcastToVectorElement32(VecTypeIdx))
1894 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1895 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1896 scalarOrEltWiderThan(VecTypeIdx, 64)),
1897 [=](const LegalityQuery &Query) {
1898 // For > 64-bit element types, try to turn this into a
1899 // 64-bit element vector since we may be able to do better
1900 // indexing if this is scalar. If not, fall back to 32.
1901 const LLT EltTy = Query.Types[EltTypeIdx];
1902 const LLT VecTy = Query.Types[VecTypeIdx];
1903 const unsigned DstEltSize = EltTy.getSizeInBits();
1904 const unsigned VecSize = VecTy.getSizeInBits();
1905
1906 const unsigned TargetEltSize =
1907 DstEltSize % 64 == 0 ? 64 : 32;
1908 return std::pair(VecTypeIdx,
1909 LLT::fixed_vector(VecSize / TargetEltSize,
1910 TargetEltSize));
1911 })
1912 .clampScalar(EltTypeIdx, S32, S64)
1913 .clampScalar(VecTypeIdx, S32, S64)
1914 .clampScalar(IdxTypeIdx, S32, S32)
1915 .clampMaxNumElements(VecTypeIdx, S32, 32)
1916 // TODO: Clamp elements for 64-bit vectors?
1917 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1919 // It should only be necessary with variable indexes.
1920 // As a last resort, lower to the stack
1921 .lower();
1922 }
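// A few concrete cases for the rules above (illustrative): an s64 extract
// from <4 x s64> with an s32 index matches the custom action (handled by
// legalizeExtractVectorElt further down); an s16 extract from <8 x s16> is
// first bitcast to work on <4 x s32>; element types wider than 64 bits are
// bitcast to s64 (or s32) element vectors; and anything still illegal after
// clamping is lowered, as a last resort through the stack.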
1923
1924 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1925 .unsupportedIf([=](const LegalityQuery &Query) {
1926 const LLT &EltTy = Query.Types[1].getElementType();
1927 return Query.Types[0] != EltTy;
1928 });
1929
1930 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1931 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1932 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1933
1934 // FIXME: Doesn't handle extract of illegal sizes.
1935 getActionDefinitionsBuilder(Op)
1936 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1937 .lowerIf([=](const LegalityQuery &Query) {
1938 // Sub-vector (or single element) insert and extract.
1939 // TODO: verify immediate offset here since lower only works with
1940 // whole elements.
1941 const LLT BigTy = Query.Types[BigTyIdx];
1942 return BigTy.isVector();
1943 })
1944 // FIXME: Multiples of 16 should not be legal.
1945 .legalIf([=](const LegalityQuery &Query) {
1946 const LLT BigTy = Query.Types[BigTyIdx];
1947 const LLT LitTy = Query.Types[LitTyIdx];
1948 return (BigTy.getSizeInBits() % 32 == 0) &&
1949 (LitTy.getSizeInBits() % 16 == 0);
1950 })
1951 .widenScalarIf(
1952 [=](const LegalityQuery &Query) {
1953 const LLT BigTy = Query.Types[BigTyIdx];
1954 return (BigTy.getScalarSizeInBits() < 16);
1955 },
1957 .widenScalarIf(
1958 [=](const LegalityQuery &Query) {
1959 const LLT LitTy = Query.Types[LitTyIdx];
1960 return (LitTy.getScalarSizeInBits() < 16);
1961 },
1963 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1964 .widenScalarToNextPow2(BigTyIdx, 32);
1965
1966 }
1967
1968 auto &BuildVector =
1969 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1970 .legalForCartesianProduct(AllS32Vectors, {S32})
1971 .legalForCartesianProduct(AllS64Vectors, {S64})
1972 .clampNumElements(0, V16S32, V32S32)
1977
1978 if (ST.hasScalarPackInsts()) {
1979 BuildVector
1980 // FIXME: Should probably widen s1 vectors straight to s32
1981 .minScalarOrElt(0, S16)
1982 .minScalar(1, S16);
1983
1984 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1985 .legalFor({V2S16, S32})
1986 .lower();
1987 } else {
1988 BuildVector.customFor({V2S16, S16});
1989 BuildVector.minScalarOrElt(0, S32);
1990
1991 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1992 .customFor({V2S16, S32})
1993 .lower();
1994 }
1995
1996 BuildVector.legalIf(isRegisterType(ST, 0));
1997
1998 // FIXME: Clamp maximum size
1999 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2000 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2001 .clampMaxNumElements(0, S32, 32)
2002 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2003 .clampMaxNumElements(0, S16, 64);
2004
2005 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2006
2007 // Merge/Unmerge
2008 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2009 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2010 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2011
2012 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2013 const LLT Ty = Query.Types[TypeIdx];
2014 if (Ty.isVector()) {
2015 const LLT &EltTy = Ty.getElementType();
2016 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2017 return true;
2019 return true;
2020 }
2021 return false;
2022 };
2023
2024 auto &Builder =
2025 getActionDefinitionsBuilder(Op)
2026 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2027 .lowerFor({{S16, V2S16}})
2028 .lowerIf([=](const LegalityQuery &Query) {
2029 const LLT BigTy = Query.Types[BigTyIdx];
2030 return BigTy.getSizeInBits() == 32;
2031 })
2032 // Try to widen to s16 first for small types.
2033 // TODO: Only do this on targets with legal s16 shifts
2034 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2035 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2037 oneMoreElement(BigTyIdx))
2039 elementTypeIs(1, S16)),
2040 changeTo(1, V2S16))
2041 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2042 // not worth considering the multiples of 64 since 2*192 and 2*384
2043 // are not valid.
2044 .clampScalar(LitTyIdx, S32, S512)
2045 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2046 // Break up vectors with weird elements into scalars
2048 [=](const LegalityQuery &Query) {
2049 return notValidElt(Query, LitTyIdx);
2050 },
2051 scalarize(0))
2052 .fewerElementsIf(
2053 [=](const LegalityQuery &Query) {
2054 return notValidElt(Query, BigTyIdx);
2055 },
2056 scalarize(1))
2057 .clampScalar(BigTyIdx, S32, MaxScalar);
2058
2059 if (Op == G_MERGE_VALUES) {
2060 Builder.widenScalarIf(
2061 // TODO: Use 16-bit shifts if legal for 8-bit values?
2062 [=](const LegalityQuery &Query) {
2063 const LLT Ty = Query.Types[LitTyIdx];
2064 return Ty.getSizeInBits() < 32;
2065 },
2066 changeTo(LitTyIdx, S32));
2067 }
2068
2069 Builder.widenScalarIf(
2070 [=](const LegalityQuery &Query) {
2071 const LLT Ty = Query.Types[BigTyIdx];
2072 return Ty.getSizeInBits() % 16 != 0;
2073 },
2074 [=](const LegalityQuery &Query) {
2075 // Pick the next power of 2, or a multiple of 64 once the size is
2076 // above 128, whichever is smaller.
2077 const LLT &Ty = Query.Types[BigTyIdx];
2078 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2079 if (NewSizeInBits >= 256) {
2080 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2081 if (RoundedTo < NewSizeInBits)
2082 NewSizeInBits = RoundedTo;
2083 }
2084 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2085 })
2086 // Any vectors left are the wrong size. Scalarize them.
2087 .scalarize(0)
2088 .scalarize(1);
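// Worked examples for the widening rule above (illustrative): a 24-bit type
// becomes 1 << Log2_32_Ceil(25) = 32 bits; a 264-bit type would round up to
// 512, but alignTo<64>(265) = 320 is smaller, so 320 is used instead.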
2089 }
2090
2091 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2092 // RegBankSelect.
2093 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2094 .legalFor({{S32}, {S64}})
2095 .clampScalar(0, S32, S64);
2096
2097 if (ST.hasVOP3PInsts()) {
2098 SextInReg.lowerFor({{V2S16}})
2099 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2100 // get more vector shift opportunities, since we'll get those when
2101 // expanded.
2102 .clampMaxNumElementsStrict(0, S16, 2);
2103 } else if (ST.has16BitInsts()) {
2104 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2105 } else {
2106 // Prefer to promote to s32 before lowering if we don't have 16-bit
2107 // shifts. This avoids a lot of intermediate truncate and extend operations.
2108 SextInReg.lowerFor({{S32}, {S64}});
2109 }
2110
2111 SextInReg
2112 .scalarize(0)
2113 .clampScalar(0, S32, S64)
2114 .lower();
2115
2116 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2117 .scalarize(0)
2118 .lower();
2119
2120 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2121 FSHRActionDefs.legalFor({{S32, S32}})
2122 .clampMaxNumElementsStrict(0, S16, 2);
2123 if (ST.hasVOP3PInsts())
2124 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2125 FSHRActionDefs.scalarize(0).lower();
2126
2127 if (ST.hasVOP3PInsts()) {
2128 getActionDefinitionsBuilder(G_FSHL)
2129 .lowerFor({{V2S16, V2S16}})
2130 .clampMaxNumElementsStrict(0, S16, 2)
2131 .scalarize(0)
2132 .lower();
2133 } else {
2134 getActionDefinitionsBuilder(G_FSHL)
2135 .scalarize(0)
2136 .lower();
2137 }
2138
2139 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2140 .legalFor({S64});
2141
2142 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2143
2145 .alwaysLegal();
2146
2147 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2148 .scalarize(0)
2149 .minScalar(0, S32)
2150 .lower();
2151
2152 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2153 .legalFor({{S32, S32}, {S64, S32}})
2154 .clampScalar(1, S32, S32)
2155 .clampScalar(0, S32, S64)
2157 .scalarize(0);
2158
2160 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2161 G_FCOPYSIGN,
2162
2163 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2164 G_READ_REGISTER, G_WRITE_REGISTER,
2165
2166 G_SADDO, G_SSUBO})
2167 .lower();
2168
2169 if (ST.hasIEEEMinimumMaximumInsts()) {
2170 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2171 .legalFor(FPTypesPK16)
2172 .clampMaxNumElements(0, S16, 2)
2173 .scalarize(0);
2174 } else if (ST.hasVOP3PInsts()) {
2175 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2176 .lowerFor({V2S16})
2177 .clampMaxNumElementsStrict(0, S16, 2)
2178 .scalarize(0)
2179 .lower();
2180 } else {
2181 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2182 .scalarize(0)
2183 .clampScalar(0, S32, S64)
2184 .lower();
2185 }
2186
2187 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2188 .lower();
2189
2190 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2191
2192 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2193 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2194 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2195 .unsupported();
2196
2198
2200 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2201 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2202 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2203 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2204 .legalFor(AllVectors)
2205 .scalarize(1)
2206 .lower();
2207
2208 getLegacyLegalizerInfo().computeTables();
2209 verify(*ST.getInstrInfo());
2210}
2211
2212bool AMDGPULegalizerInfo::legalizeCustom(
2213 LegalizerHelper &Helper, MachineInstr &MI,
2214 LostDebugLocObserver &LocObserver) const {
2215 MachineIRBuilder &B = Helper.MIRBuilder;
2216 MachineRegisterInfo &MRI = *B.getMRI();
2217
2218 switch (MI.getOpcode()) {
2219 case TargetOpcode::G_ADDRSPACE_CAST:
2220 return legalizeAddrSpaceCast(MI, MRI, B);
2221 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2222 return legalizeFroundeven(MI, MRI, B);
2223 case TargetOpcode::G_FCEIL:
2224 return legalizeFceil(MI, MRI, B);
2225 case TargetOpcode::G_FREM:
2226 return legalizeFrem(MI, MRI, B);
2227 case TargetOpcode::G_INTRINSIC_TRUNC:
2228 return legalizeIntrinsicTrunc(MI, MRI, B);
2229 case TargetOpcode::G_SITOFP:
2230 return legalizeITOFP(MI, MRI, B, true);
2231 case TargetOpcode::G_UITOFP:
2232 return legalizeITOFP(MI, MRI, B, false);
2233 case TargetOpcode::G_FPTOSI:
2234 return legalizeFPTOI(MI, MRI, B, true);
2235 case TargetOpcode::G_FPTOUI:
2236 return legalizeFPTOI(MI, MRI, B, false);
2237 case TargetOpcode::G_FMINNUM:
2238 case TargetOpcode::G_FMAXNUM:
2239 case TargetOpcode::G_FMINIMUMNUM:
2240 case TargetOpcode::G_FMAXIMUMNUM:
2241 return legalizeMinNumMaxNum(Helper, MI);
2242 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2243 return legalizeExtractVectorElt(MI, MRI, B);
2244 case TargetOpcode::G_INSERT_VECTOR_ELT:
2245 return legalizeInsertVectorElt(MI, MRI, B);
2246 case TargetOpcode::G_FSIN:
2247 case TargetOpcode::G_FCOS:
2248 return legalizeSinCos(MI, MRI, B);
2249 case TargetOpcode::G_GLOBAL_VALUE:
2250 return legalizeGlobalValue(MI, MRI, B);
2251 case TargetOpcode::G_LOAD:
2252 case TargetOpcode::G_SEXTLOAD:
2253 case TargetOpcode::G_ZEXTLOAD:
2254 return legalizeLoad(Helper, MI);
2255 case TargetOpcode::G_STORE:
2256 return legalizeStore(Helper, MI);
2257 case TargetOpcode::G_FMAD:
2258 return legalizeFMad(MI, MRI, B);
2259 case TargetOpcode::G_FDIV:
2260 return legalizeFDIV(MI, MRI, B);
2261 case TargetOpcode::G_FFREXP:
2262 return legalizeFFREXP(MI, MRI, B);
2263 case TargetOpcode::G_FSQRT:
2264 return legalizeFSQRT(MI, MRI, B);
2265 case TargetOpcode::G_UDIV:
2266 case TargetOpcode::G_UREM:
2267 case TargetOpcode::G_UDIVREM:
2268 return legalizeUnsignedDIV_REM(MI, MRI, B);
2269 case TargetOpcode::G_SDIV:
2270 case TargetOpcode::G_SREM:
2271 case TargetOpcode::G_SDIVREM:
2272 return legalizeSignedDIV_REM(MI, MRI, B);
2273 case TargetOpcode::G_ATOMIC_CMPXCHG:
2274 return legalizeAtomicCmpXChg(MI, MRI, B);
2275 case TargetOpcode::G_FLOG2:
2276 return legalizeFlog2(MI, B);
2277 case TargetOpcode::G_FLOG:
2278 case TargetOpcode::G_FLOG10:
2279 return legalizeFlogCommon(MI, B);
2280 case TargetOpcode::G_FEXP2:
2281 return legalizeFExp2(MI, B);
2282 case TargetOpcode::G_FEXP:
2283 case TargetOpcode::G_FEXP10:
2284 return legalizeFExp(MI, B);
2285 case TargetOpcode::G_FPOW:
2286 return legalizeFPow(MI, B);
2287 case TargetOpcode::G_FFLOOR:
2288 return legalizeFFloor(MI, MRI, B);
2289 case TargetOpcode::G_BUILD_VECTOR:
2290 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2291 return legalizeBuildVector(MI, MRI, B);
2292 case TargetOpcode::G_MUL:
2293 return legalizeMul(Helper, MI);
2294 case TargetOpcode::G_CTLZ:
2295 case TargetOpcode::G_CTTZ:
2296 return legalizeCTLZ_CTTZ(MI, MRI, B);
2297 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2298 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2299 case TargetOpcode::G_STACKSAVE:
2300 return legalizeStackSave(MI, B);
2301 case TargetOpcode::G_GET_FPENV:
2302 return legalizeGetFPEnv(MI, MRI, B);
2303 case TargetOpcode::G_SET_FPENV:
2304 return legalizeSetFPEnv(MI, MRI, B);
2305 case TargetOpcode::G_TRAP:
2306 return legalizeTrap(MI, MRI, B);
2307 case TargetOpcode::G_DEBUGTRAP:
2308 return legalizeDebugTrap(MI, MRI, B);
2309 default:
2310 return false;
2311 }
2312
2313 llvm_unreachable("expected switch to return");
2314}
2315
2316Register AMDGPULegalizerInfo::getSegmentAperture(
2317 unsigned AS,
2318 MachineRegisterInfo &MRI,
2319 MachineIRBuilder &B) const {
2320 MachineFunction &MF = B.getMF();
2321 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2322 const LLT S32 = LLT::scalar(32);
2323 const LLT S64 = LLT::scalar(64);
2324
2326
2327 if (ST.hasApertureRegs()) {
2328 // Note: this register is somewhat broken. When used as a 32-bit operand,
2329 // it only returns zeroes. The real value is in the upper 32 bits.
2330 // Thus, we must extract the high 32 bits.
2331 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2332 ? AMDGPU::SRC_SHARED_BASE
2333 : AMDGPU::SRC_PRIVATE_BASE;
2334 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2335 !ST.hasGloballyAddressableScratch()) &&
2336 "Cannot use src_private_base with globally addressable scratch!");
2337 Register Dst = MRI.createGenericVirtualRegister(S64);
2338 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2339 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2340 return B.buildUnmerge(S32, Dst).getReg(1);
2341 }
2342
2343 Register LoadAddr = MRI.createGenericVirtualRegister(
2345 // For code object version 5, private_base and shared_base are passed through
2346 // implicit kernargs.
2350
2355 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2356
2357 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2359
2360 if (!loadInputValue(KernargPtrReg, B,
2362 return Register();
2363
2365 PtrInfo.getWithOffset(Offset),
2369
2370 // Pointer address
2371 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2372 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2373 // Load address
2374 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2375 }
2376
2377 Register QueuePtr = MRI.createGenericVirtualRegister(
2379
2381 return Register();
2382
2383 // TODO: Use custom PseudoSourceValue
2385
2386 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2387 // private_segment_aperture_base_hi.
2388 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2389
2391 PtrInfo,
2394 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2395
2396 B.buildObjectPtrOffset(
2397 LoadAddr, QueuePtr,
2398 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2399 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2400}
2401
2402/// Return true if the value is a known valid address, such that a null check is
2403/// not necessary.
2405 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2406 MachineInstr *Def = MRI.getVRegDef(Val);
2407 switch (Def->getOpcode()) {
2408 case AMDGPU::G_FRAME_INDEX:
2409 case AMDGPU::G_GLOBAL_VALUE:
2410 case AMDGPU::G_BLOCK_ADDR:
2411 return true;
2412 case AMDGPU::G_CONSTANT: {
2413 const ConstantInt *CI = Def->getOperand(1).getCImm();
2414 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2415 }
2416 default:
2417 return false;
2418 }
2419
2420 return false;
2421}
2422
2425 MachineIRBuilder &B) const {
2426 MachineFunction &MF = B.getMF();
2427
2428 // MI can either be a G_ADDRSPACE_CAST or a
2429 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2430 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2431 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2432 Intrinsic::amdgcn_addrspacecast_nonnull));
2433
2434 const LLT S32 = LLT::scalar(32);
2435 Register Dst = MI.getOperand(0).getReg();
2436 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2437 : MI.getOperand(1).getReg();
2438 LLT DstTy = MRI.getType(Dst);
2439 LLT SrcTy = MRI.getType(Src);
2440 unsigned DestAS = DstTy.getAddressSpace();
2441 unsigned SrcAS = SrcTy.getAddressSpace();
2442
2443 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2444 // vector element.
2445 assert(!DstTy.isVector());
2446
2447 const AMDGPUTargetMachine &TM
2448 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2449
2450 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2451 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2452 return true;
2453 }
2454
2455 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2456 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2457 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2458 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2459 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2460 ST.hasGloballyAddressableScratch()) {
2461 // flat -> private with globally addressable scratch: subtract
2462 // src_flat_scratch_base_lo.
2463 const LLT S32 = LLT::scalar(32);
2464 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2465 Register FlatScratchBaseLo =
2466 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2467 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2468 .getReg(0);
2469 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2470 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2471 return B.buildIntToPtr(Dst, Sub).getReg(0);
2472 }
2473
2474 // Extract low 32-bits of the pointer.
2475 return B.buildExtract(Dst, Src, 0).getReg(0);
2476 };
2477
2478 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2479 // G_ADDRSPACE_CAST we need to guess.
2480 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2481 castFlatToLocalOrPrivate(Dst);
2482 MI.eraseFromParent();
2483 return true;
2484 }
2485
2486 unsigned NullVal = TM.getNullPointerValue(DestAS);
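 // The sequence built below is, in effect (illustrative sketch):
 //   %lo  = low 32 bits of %src
 //   %cmp = icmp ne %src, flat-null (0)
 //   %dst = select %cmp, %lo, segment-null
 // so a null flat pointer maps to the segment's null value rather than to an
 // arbitrary low half.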
2487
2488 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2489 auto FlatNull = B.buildConstant(SrcTy, 0);
2490
2491 // Extract low 32-bits of the pointer.
2492 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2493
2494 auto CmpRes =
2495 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2496 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2497
2498 MI.eraseFromParent();
2499 return true;
2500 }
2501
2502 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2503 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2504 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2505 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2506 // Coerce the type of the low half of the result so we can use
2507 // merge_values.
2508 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2509
2510 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2511 ST.hasGloballyAddressableScratch()) {
2512 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2513 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2514 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2515 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2516 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2517 .addUse(AllOnes)
2518 .addUse(ThreadID)
2519 .getReg(0);
2520 if (ST.isWave64()) {
2521 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2522 .addUse(AllOnes)
2523 .addUse(ThreadID)
2524 .getReg(0);
2525 }
2526 Register ShAmt =
2527 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2528 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2529 Register CvtPtr =
2530 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2531 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2532 // 64-bit hi:lo value.
2533 Register FlatScratchBase =
2534 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2535 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2536 .getReg(0);
2537 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2538 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2539 }
2540
2541 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2542 if (!ApertureReg.isValid())
2543 return false;
2544
2545 // TODO: Should we allow mismatched types but matching sizes in merges to
2546 // avoid the ptrtoint?
2547 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2548 };
2549
2550 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2551 // G_ADDRSPACE_CAST we need to guess.
2552 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2553 castLocalOrPrivateToFlat(Dst);
2554 MI.eraseFromParent();
2555 return true;
2556 }
2557
2558 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2559
2560 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2561 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2562
2563 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2564 SegmentNull.getReg(0));
2565
2566 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2567
2568 MI.eraseFromParent();
2569 return true;
2570 }
2571
2572 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2573 SrcTy.getSizeInBits() == 64) {
2574 // Truncate.
2575 B.buildExtract(Dst, Src, 0);
2576 MI.eraseFromParent();
2577 return true;
2578 }
2579
2580 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2581 DstTy.getSizeInBits() == 64) {
2583 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2584 auto PtrLo = B.buildPtrToInt(S32, Src);
2585 if (AddrHiVal == 0) {
2586 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2587 B.buildIntToPtr(Dst, Zext);
2588 } else {
2589 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2590 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2591 }
2592
2593 MI.eraseFromParent();
2594 return true;
2595 }
2596
2597 // Invalid casts are poison.
2598 // TODO: Should return poison
2599 B.buildUndef(Dst);
2600 MI.eraseFromParent();
2601 return true;
2602}
2603
2604bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2605 MachineRegisterInfo &MRI,
2606 MachineIRBuilder &B) const {
2607 Register Src = MI.getOperand(1).getReg();
2608 LLT Ty = MRI.getType(Src);
2609 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2610
2611 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2612 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
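 // C1 is 2^52 and C2 is the largest double below 2^52. For |src| < 2^52,
 // adding copysign(2^52, src) and subtracting it again rounds to the nearest
 // integer with ties to even in the default rounding mode; e.g. 2.5 + 2^52
 // rounds to 2^52 + 2, giving 2.0 after the subtraction. Inputs with
 // |src| > C2 are already integral and are returned unchanged by the select.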
2613
2614 auto C1 = B.buildFConstant(Ty, C1Val);
2615 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2616
2617 // TODO: Should this propagate fast-math-flags?
2618 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2619 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2620
2621 auto C2 = B.buildFConstant(Ty, C2Val);
2622 auto Fabs = B.buildFAbs(Ty, Src);
2623
2624 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2625 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2626 MI.eraseFromParent();
2627 return true;
2628}
2629
2630bool AMDGPULegalizerInfo::legalizeFceil(MachineInstr &MI,
2631 MachineRegisterInfo &MRI,
2632 MachineIRBuilder &B) const {
2633
2634 const LLT S1 = LLT::scalar(1);
2635 const LLT S64 = LLT::scalar(64);
2636
2637 Register Src = MI.getOperand(1).getReg();
2638 assert(MRI.getType(Src) == S64);
2639
2640 // result = trunc(src)
2641 // if (src > 0.0 && src != result)
2642 // result += 1.0
2643
2644 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2645
2646 const auto Zero = B.buildFConstant(S64, 0.0);
2647 const auto One = B.buildFConstant(S64, 1.0);
2648 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2649 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2650 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2651 auto Add = B.buildSelect(S64, And, One, Zero);
2652
2653 // TODO: Should this propagate fast-math-flags?
2654 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2655 MI.eraseFromParent();
2656 return true;
2657}
2658
2659bool AMDGPULegalizerInfo::legalizeFrem(MachineInstr &MI,
2660 MachineRegisterInfo &MRI,
2661 MachineIRBuilder &B) const {
2662 Register DstReg = MI.getOperand(0).getReg();
2663 Register Src0Reg = MI.getOperand(1).getReg();
2664 Register Src1Reg = MI.getOperand(2).getReg();
2665 auto Flags = MI.getFlags();
2666 LLT Ty = MRI.getType(DstReg);
2667
2668 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2669 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2670 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2671 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2672 MI.eraseFromParent();
2673 return true;
2674}
2675
2676static MachineInstrBuilder extractF64Exponent(Register Hi,
2677 MachineIRBuilder &B) {
2678 const unsigned FractBits = 52;
2679 const unsigned ExpBits = 11;
2680 LLT S32 = LLT::scalar(32);
2681
2682 auto Const0 = B.buildConstant(S32, FractBits - 32);
2683 auto Const1 = B.buildConstant(S32, ExpBits);
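 // amdgcn.ubfe(Hi, 20, 11) extracts the 11-bit biased exponent, which sits
 // in bits [52..62] of the f64 and therefore bits [20..30] of the high
 // 32-bit word; subtracting 1023 removes the IEEE-754 bias (e.g. 8.0 has a
 // biased exponent field of 1026, giving 3).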
2684
2685 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2686 .addUse(Hi)
2687 .addUse(Const0.getReg(0))
2688 .addUse(Const1.getReg(0));
2689
2690 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2691}
2692
2693bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(MachineInstr &MI,
2694 MachineRegisterInfo &MRI,
2695 MachineIRBuilder &B) const {
2696 const LLT S1 = LLT::scalar(1);
2697 const LLT S32 = LLT::scalar(32);
2698 const LLT S64 = LLT::scalar(64);
2699
2700 Register Src = MI.getOperand(1).getReg();
2701 assert(MRI.getType(Src) == S64);
2702
2703 // TODO: Should this use extract since the low half is unused?
2704 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2705 Register Hi = Unmerge.getReg(1);
2706
2707 // Extract the upper half, since this is where we will find the sign and
2708 // exponent.
2709 auto Exp = extractF64Exponent(Hi, B);
2710
2711 const unsigned FractBits = 52;
2712
2713 // Extract the sign bit.
2714 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2715 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2716
2717 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2718
2719 const auto Zero32 = B.buildConstant(S32, 0);
2720
2721 // Extend back to 64-bits.
2722 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2723
2724 auto Shr = B.buildAShr(S64, FractMask, Exp);
2725 auto Not = B.buildNot(S64, Shr);
2726 auto Tmp0 = B.buildAnd(S64, Src, Not);
2727 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2728
2729 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2730 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2731
2732 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2733 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2734 MI.eraseFromParent();
2735 return true;
2736}
2737
2738bool AMDGPULegalizerInfo::legalizeITOFP(MachineInstr &MI,
2739 MachineRegisterInfo &MRI,
2740 MachineIRBuilder &B, bool Signed) const {
2741
2742 Register Dst = MI.getOperand(0).getReg();
2743 Register Src = MI.getOperand(1).getReg();
2744
2745 const LLT S64 = LLT::scalar(64);
2746 const LLT S32 = LLT::scalar(32);
2747
2748 assert(MRI.getType(Src) == S64);
2749
2750 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2751 auto ThirtyTwo = B.buildConstant(S32, 32);
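 // For a 64-bit destination the sequence below is, in effect:
 //   fp(x) = ldexp(fp(hi32(x)), 32) + fp(lo32(x))
 // where the high half is converted signed or unsigned as appropriate and
 // the low half is always converted unsigned.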
2752
2753 if (MRI.getType(Dst) == S64) {
2754 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2755 : B.buildUITOFP(S64, Unmerge.getReg(1));
2756
2757 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2758 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2759
2760 // TODO: Should this propagate fast-math-flags?
2761 B.buildFAdd(Dst, LdExp, CvtLo);
2762 MI.eraseFromParent();
2763 return true;
2764 }
2765
2766 assert(MRI.getType(Dst) == S32);
2767
2768 auto One = B.buildConstant(S32, 1);
2769
2770 MachineInstrBuilder ShAmt;
2771 if (Signed) {
2772 auto ThirtyOne = B.buildConstant(S32, 31);
2773 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2774 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2775 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2776 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2777 .addUse(Unmerge.getReg(1));
2778 auto LS2 = B.buildSub(S32, LS, One);
2779 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2780 } else
2781 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2782 auto Norm = B.buildShl(S64, Src, ShAmt);
2783 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2784 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2785 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2786 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2787 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2788 B.buildFLdexp(Dst, FVal, Scale);
2789 MI.eraseFromParent();
2790 return true;
2791}
2792
2793// TODO: Copied from DAG implementation. Verify logic and document how this
2794// actually works.
2795bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2796 MachineRegisterInfo &MRI,
2797 MachineIRBuilder &B,
2798 bool Signed) const {
2799
2800 Register Dst = MI.getOperand(0).getReg();
2801 Register Src = MI.getOperand(1).getReg();
2802
2803 const LLT S64 = LLT::scalar(64);
2804 const LLT S32 = LLT::scalar(32);
2805
2806 const LLT SrcLT = MRI.getType(Src);
2807 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2808
2809 unsigned Flags = MI.getFlags();
2810
2811 // The basic idea of converting a floating point number into a pair of 32-bit
2812 // integers is illustrated as follows:
2813 //
2814 // tf := trunc(val);
2815 // hif := floor(tf * 2^-32);
2816 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2817 // hi := fptoi(hif);
2818 // lo := fptoi(lof);
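 // As an illustrative example, for val = 3.0 * 2^32 (exactly representable):
 //   tf = 3.0 * 2^32, hif = 3.0, lof = 0.0, so hi = 3 and lo = 0, and the
 //   merged result is 3 << 32.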
2819 //
2820 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2821 MachineInstrBuilder Sign;
2822 if (Signed && SrcLT == S32) {
2823 // However, a 32-bit floating point number has only a 23-bit mantissa,
2824 // which is not enough to hold all the significant bits of `lof` if val is
2825 // negative. To avoid the loss of precision, we need to take the absolute
2826 // value after truncating and flip the result back based on the original
2827 // signedness.
2828 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2829 Trunc = B.buildFAbs(S32, Trunc, Flags);
2830 }
2831 MachineInstrBuilder K0, K1;
2832 if (SrcLT == S64) {
2833 K0 = B.buildFConstant(
2834 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2835 K1 = B.buildFConstant(
2836 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2837 } else {
2838 K0 = B.buildFConstant(
2839 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2840 K1 = B.buildFConstant(
2841 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2842 }
2843
2844 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2845 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2846 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2847
2848 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2849 : B.buildFPTOUI(S32, FloorMul);
2850 auto Lo = B.buildFPTOUI(S32, Fma);
2851
2852 if (Signed && SrcLT == S32) {
2853 // Flip the result based on the signedness, which is either all 0s or 1s.
2854 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2855 // r := xor({lo, hi}, sign) - sign;
2856 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2857 Sign);
2858 } else
2859 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2860 MI.eraseFromParent();
2861
2862 return true;
2863}
2864
2865bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2866 MachineInstr &MI) const {
2867 MachineFunction &MF = Helper.MIRBuilder.getMF();
2868 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2869
2870 // With ieee_mode disabled, the instructions have the correct behavior.
2871 if (!MFI->getMode().IEEE)
2872 return true;
2873
2874 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2875}
2876
2877bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2878 MachineInstr &MI, MachineRegisterInfo &MRI,
2879 MachineIRBuilder &B) const {
2880 // TODO: Should move some of this into LegalizerHelper.
2881
2882 // TODO: Promote dynamic indexing of s16 to s32
2883
2884 Register Dst = MI.getOperand(0).getReg();
2885 Register Vec = MI.getOperand(1).getReg();
2886
2887 LLT VecTy = MRI.getType(Vec);
2888 LLT EltTy = VecTy.getElementType();
2889 assert(EltTy == MRI.getType(Dst));
2890
2891 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2892 // but we can't go directly to that logic because you can't bitcast a vector
2893 // of pointers to a vector of integers. Therefore, introduce an intermediate
2894 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2895 // drive the legalization forward.
2896 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2897 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2898 LLT IntVecTy = VecTy.changeElementType(IntTy);
2899
2900 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2901 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2902 B.buildIntToPtr(Dst, IntElt);
2903
2904 MI.eraseFromParent();
2905 return true;
2906 }
2907
2908 // FIXME: Artifact combiner probably should have replaced the truncated
2909 // constant before this, so we shouldn't need
2910 // getIConstantVRegValWithLookThrough.
2911 std::optional<ValueAndVReg> MaybeIdxVal =
2912 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2913 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2914 return true;
2915 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2916
2917 if (IdxVal < VecTy.getNumElements()) {
2918 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2919 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2920 } else {
2921 B.buildUndef(Dst);
2922 }
2923
2924 MI.eraseFromParent();
2925 return true;
2926}
2927
2928bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2929 MachineInstr &MI, MachineRegisterInfo &MRI,
2930 MachineIRBuilder &B) const {
2931 // TODO: Should move some of this into LegalizerHelper.
2932
2933 // TODO: Promote dynamic indexing of s16 to s32
2934
2935 Register Dst = MI.getOperand(0).getReg();
2936 Register Vec = MI.getOperand(1).getReg();
2937 Register Ins = MI.getOperand(2).getReg();
2938
2939 LLT VecTy = MRI.getType(Vec);
2940 LLT EltTy = VecTy.getElementType();
2941 assert(EltTy == MRI.getType(Ins));
2942
2943 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2944 // but we can't go directly to that logic because you can't bitcast a vector
2945 // of pointers to a vector of integers. Therefore, make the pointer vector
2946 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2947 // new value, and then inttoptr the result vector back. This will then allow
2948 // the rest of legalization to take over.
2949 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2950 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2951 LLT IntVecTy = VecTy.changeElementType(IntTy);
2952
2953 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2954 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2955 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2956 MI.getOperand(3));
2957 B.buildIntToPtr(Dst, IntVecDest);
2958 MI.eraseFromParent();
2959 return true;
2960 }
2961
2962 // FIXME: Artifact combiner probably should have replaced the truncated
2963 // constant before this, so we shouldn't need
2964 // getIConstantVRegValWithLookThrough.
2965 std::optional<ValueAndVReg> MaybeIdxVal =
2966 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2967 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2968 return true;
2969
2970 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2971
2972 unsigned NumElts = VecTy.getNumElements();
2973 if (IdxVal < NumElts) {
2974 SmallVector<Register, 8> SrcRegs;
2975 for (unsigned i = 0; i < NumElts; ++i)
2976 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2977 B.buildUnmerge(SrcRegs, Vec);
2978
2979 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2980 B.buildMergeLikeInstr(Dst, SrcRegs);
2981 } else {
2982 B.buildUndef(Dst);
2983 }
2984
2985 MI.eraseFromParent();
2986 return true;
2987}
2988
2989bool AMDGPULegalizerInfo::legalizeSinCos(
2990 MachineInstr &MI, MachineRegisterInfo &MRI,
2991 MachineIRBuilder &B) const {
2992
2993 Register DstReg = MI.getOperand(0).getReg();
2994 Register SrcReg = MI.getOperand(1).getReg();
2995 LLT Ty = MRI.getType(DstReg);
2996 unsigned Flags = MI.getFlags();
2997
2998 Register TrigVal;
2999 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
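 // The hardware sin/cos intrinsics take an argument already scaled by
 // 1/(2*pi), i.e. they evaluate sin(2*pi*x), so the source is pre-multiplied
 // here; on subtargets with a reduced-range implementation the product is
 // additionally passed through amdgcn.fract below.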
3000 if (ST.hasTrigReducedRange()) {
3001 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3002 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3003 .addUse(MulVal.getReg(0))
3004 .setMIFlags(Flags)
3005 .getReg(0);
3006 } else
3007 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3008
3009 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3010 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3011 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3012 .addUse(TrigVal)
3013 .setMIFlags(Flags);
3014 MI.eraseFromParent();
3015 return true;
3016}
3017
3018bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
3019 MachineIRBuilder &B,
3020 const GlobalValue *GV,
3021 int64_t Offset,
3022 unsigned GAFlags) const {
3023 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3024 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3025 // to the following code sequence:
3026 //
3027 // For constant address space:
3028 // s_getpc_b64 s[0:1]
3029 // s_add_u32 s0, s0, $symbol
3030 // s_addc_u32 s1, s1, 0
3031 //
3032 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3033 // a fixup or relocation is emitted to replace $symbol with a literal
3034 // constant, which is a pc-relative offset from the encoding of the $symbol
3035 // operand to the global variable.
3036 //
3037 // For global address space:
3038 // s_getpc_b64 s[0:1]
3039 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3040 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3041 //
3042 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3043 // fixups or relocations are emitted to replace $symbol@*@lo and
3044 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3045 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3046 // operand to the global variable.
3047
3048 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3049
3050 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3051 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3052
3053 if (ST.has64BitLiterals()) {
3054 assert(GAFlags != SIInstrInfo::MO_NONE);
3055
3057 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3058 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3059 } else {
3061 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3062
3063 MIB.addGlobalAddress(GV, Offset, GAFlags);
3064 if (GAFlags == SIInstrInfo::MO_NONE)
3065 MIB.addImm(0);
3066 else
3067 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3068 }
3069
3070 if (!B.getMRI()->getRegClassOrNull(PCReg))
3071 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3072
3073 if (PtrTy.getSizeInBits() == 32)
3074 B.buildExtract(DstReg, PCReg, 0);
3075 return true;
3076}
3077
3078// Emit an ABS32_LO / ABS32_HI relocation stub.
3080 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3081 MachineRegisterInfo &MRI) const {
3082 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3083
3084 if (RequiresHighHalf && ST.has64BitLiterals()) {
3085 if (!MRI.getRegClassOrNull(DstReg))
3086 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3087 B.buildInstr(AMDGPU::S_MOV_B64)
3088 .addDef(DstReg)
3089 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3090 return;
3091 }
3092
3093 LLT S32 = LLT::scalar(32);
3094
3095 // Use the destination directly if and only if we only store the lower
3096 // address part and no register class has been set.
3097 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3098 ? DstReg
3099 : MRI.createGenericVirtualRegister(S32);
3100
3101 if (!MRI.getRegClassOrNull(AddrLo))
3102 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3103
3104 // Write the lower half.
3105 B.buildInstr(AMDGPU::S_MOV_B32)
3106 .addDef(AddrLo)
3107 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3108
3109 // If required, write the upper half as well.
3110 if (RequiresHighHalf) {
3111 assert(PtrTy.getSizeInBits() == 64 &&
3112 "Must provide a 64-bit pointer type!");
3113
3114 Register AddrHi = MRI.createGenericVirtualRegister(S32);
3115 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3116
3117 B.buildInstr(AMDGPU::S_MOV_B32)
3118 .addDef(AddrHi)
3119 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3120
3121 // Use the destination directly, if and only if we don't have a register
3122 // class being set.
3123 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3124 ? DstReg
3125 : MRI.createGenericVirtualRegister(LLT::scalar(64));
3126
3127 if (!MRI.getRegClassOrNull(AddrDst))
3128 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3129
3130 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3131
3132 // If we created a new register for the destination, cast the result into
3133 // the final output.
3134 if (AddrDst != DstReg)
3135 B.buildCast(DstReg, AddrDst);
3136 } else if (AddrLo != DstReg) {
3137 // If we created a new register for the destination, cast the result into
3138 // the final output.
3139 B.buildCast(DstReg, AddrLo);
3140 }
3141}
3142
3143bool AMDGPULegalizerInfo::legalizeGlobalValue(
3144 MachineInstr &MI, MachineRegisterInfo &MRI,
3145 MachineIRBuilder &B) const {
3146 Register DstReg = MI.getOperand(0).getReg();
3147 LLT Ty = MRI.getType(DstReg);
3148 unsigned AS = Ty.getAddressSpace();
3149
3150 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3151 MachineFunction &MF = B.getMF();
3153
3155 if (!MFI->isModuleEntryFunction() &&
3156 GV->getName() != "llvm.amdgcn.module.lds" &&
3158 const Function &Fn = MF.getFunction();
3160 Fn, "local memory global used by non-kernel function",
3161 MI.getDebugLoc(), DS_Warning));
3162
3163 // We currently don't have a way to correctly allocate LDS objects that
3164 // aren't directly associated with a kernel. We do force inlining of
3165 // functions that use local objects. However, if these dead functions are
3166 // not eliminated, we don't want a compile time error. Just emit a warning
3167 // and a trap, since there should be no callable path here.
3168 B.buildTrap();
3169 B.buildUndef(DstReg);
3170 MI.eraseFromParent();
3171 return true;
3172 }
3173
3174 // TODO: We could emit code to handle the initialization somewhere.
3175 // We ignore the initializer for now and legalize it to allow selection.
3176 // The initializer will be rejected during assembly emission anyway.
3177 const SITargetLowering *TLI = ST.getTargetLowering();
3178 if (!TLI->shouldUseLDSConstAddress(GV)) {
3179 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3180 return true; // Leave in place;
3181 }
3182
3183 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3184 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3185 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3186 // zero-sized type in other languages to declare the dynamic shared
3187 // memory whose size is not known at compile time. These variables are
3188 // allocated by the runtime and placed directly after the statically
3189 // allocated ones. They all share the same offset.
3190 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3191 // Adjust alignment for that dynamic shared memory array.
3192 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3193 LLT S32 = LLT::scalar(32);
3194 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3195 B.buildIntToPtr(DstReg, Sz);
3196 MI.eraseFromParent();
3197 return true;
3198 }
3199 }
3200
3201 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3202 MI.eraseFromParent();
3203 return true;
3204 }
3205
3206 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3207 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3208 MI.eraseFromParent();
3209 return true;
3210 }
3211
3212 const SITargetLowering *TLI = ST.getTargetLowering();
3213
3214 if (TLI->shouldEmitFixup(GV)) {
3215 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3216 MI.eraseFromParent();
3217 return true;
3218 }
3219
3220 if (TLI->shouldEmitPCReloc(GV)) {
3221 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3222 MI.eraseFromParent();
3223 return true;
3224 }
3225
3227 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3228
3229 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3234 LoadTy, Align(8));
3235
3236 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3237
3238 if (Ty.getSizeInBits() == 32) {
3239 // Truncate if this is a 32-bit constant address.
3240 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3241 B.buildExtract(DstReg, Load, 0);
3242 } else
3243 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3244
3245 MI.eraseFromParent();
3246 return true;
3247}
3248
3249static LLT widenToNextPowerOf2(LLT Ty) {
3250 if (Ty.isVector())
3251 return Ty.changeElementCount(
3252 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3253 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3254}
3255
3256bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3257 MachineInstr &MI) const {
3258 MachineIRBuilder &B = Helper.MIRBuilder;
3259 MachineRegisterInfo &MRI = *B.getMRI();
3260 GISelChangeObserver &Observer = Helper.Observer;
3261
3262 Register PtrReg = MI.getOperand(1).getReg();
3263 LLT PtrTy = MRI.getType(PtrReg);
3264 unsigned AddrSpace = PtrTy.getAddressSpace();
3265
3266 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3268 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3269 Observer.changingInstr(MI);
3270 MI.getOperand(1).setReg(Cast.getReg(0));
3271 Observer.changedInstr(MI);
3272 return true;
3273 }
3274
3275 if (MI.getOpcode() != AMDGPU::G_LOAD)
3276 return false;
3277
3278 Register ValReg = MI.getOperand(0).getReg();
3279 LLT ValTy = MRI.getType(ValReg);
3280
3281 if (hasBufferRsrcWorkaround(ValTy)) {
3282 Observer.changingInstr(MI);
3284 Observer.changedInstr(MI);
3285 return true;
3286 }
3287
3288 MachineMemOperand *MMO = *MI.memoperands_begin();
3289 const unsigned ValSize = ValTy.getSizeInBits();
3290 const LLT MemTy = MMO->getMemoryType();
3291 const Align MemAlign = MMO->getAlign();
3292 const unsigned MemSize = MemTy.getSizeInBits();
3293 const uint64_t AlignInBits = 8 * MemAlign.value();
3294
3295 // Widen non-power-of-2 loads to the alignment if needed
3296 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3297 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
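 // For instance (illustrative): a 96-bit value loaded with sufficient
 // alignment is widened to a 128-bit memory access. If the result type was
 // already 128 bits (an extending load), only the MMO is adjusted below;
 // otherwise the wide load is truncated, or has its trailing vector elements
 // dropped, to recover the original type.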
3298
3299 // This was already the correct extending load result type, so just adjust
3300 // the memory type.
3301 if (WideMemSize == ValSize) {
3302 MachineFunction &MF = B.getMF();
3303
3304 MachineMemOperand *WideMMO =
3305 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3306 Observer.changingInstr(MI);
3307 MI.setMemRefs(MF, {WideMMO});
3308 Observer.changedInstr(MI);
3309 return true;
3310 }
3311
3312 // Don't bother handling an edge case that should probably never be produced.
3313 if (ValSize > WideMemSize)
3314 return false;
3315
3316 LLT WideTy = widenToNextPowerOf2(ValTy);
3317
3318 Register WideLoad;
3319 if (!WideTy.isVector()) {
3320 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3321 B.buildTrunc(ValReg, WideLoad).getReg(0);
3322 } else {
3323 // Extract the subvector.
3324
3325 if (isRegisterType(ST, ValTy)) {
3326 // If this is a case where G_EXTRACT is legal, use it.
3327 // (e.g. <3 x s32> -> <4 x s32>)
3328 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3329 B.buildExtract(ValReg, WideLoad, 0);
3330 } else {
3331 // For cases where the widened type isn't a nice register value, unmerge
3332 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3333 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3334 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3335 }
3336 }
3337
3338 MI.eraseFromParent();
3339 return true;
3340 }
3341
3342 return false;
3343}
3344
3345bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3346 MachineInstr &MI) const {
3347 MachineIRBuilder &B = Helper.MIRBuilder;
3348 MachineRegisterInfo &MRI = *B.getMRI();
3349 GISelChangeObserver &Observer = Helper.Observer;
3350
3351 Register DataReg = MI.getOperand(0).getReg();
3352 LLT DataTy = MRI.getType(DataReg);
3353
3354 if (hasBufferRsrcWorkaround(DataTy)) {
3355 Observer.changingInstr(MI);
3357 Observer.changedInstr(MI);
3358 return true;
3359 }
3360 return false;
3361}
3362
3363bool AMDGPULegalizerInfo::legalizeFMad(
3364 MachineInstr &MI, MachineRegisterInfo &MRI,
3365 MachineIRBuilder &B) const {
3366 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3367 assert(Ty.isScalar());
3368
3369 MachineFunction &MF = B.getMF();
3371
3372 // TODO: Always legal with future ftz flag.
3373 // FIXME: Do we need just output?
3374 if (Ty == LLT::float32() &&
3376 return true;
3377 if (Ty == LLT::float16() &&
3379 return true;
3380
3381 MachineIRBuilder HelperBuilder(MI);
3382 GISelObserverWrapper DummyObserver;
3383 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3384 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3385}
3386
3387bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3388 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3389 Register DstReg = MI.getOperand(0).getReg();
3390 Register PtrReg = MI.getOperand(1).getReg();
3391 Register CmpVal = MI.getOperand(2).getReg();
3392 Register NewVal = MI.getOperand(3).getReg();
3393
3394 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3395 "this should not have been custom lowered");
3396
3397 LLT ValTy = MRI.getType(CmpVal);
3398 LLT VecTy = LLT::fixed_vector(2, ValTy);
3399
3400 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3401
3402 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3403 .addDef(DstReg)
3404 .addUse(PtrReg)
3405 .addUse(PackedVal)
3406 .setMemRefs(MI.memoperands());
3407
3408 MI.eraseFromParent();
3409 return true;
3410}
3411
3412/// Return true if it's known that \p Src can never be an f32 denormal value.
3413static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3414 Register Src) {
3415 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3416 switch (DefMI->getOpcode()) {
3417 case TargetOpcode::G_INTRINSIC: {
3419 case Intrinsic::amdgcn_frexp_mant:
3420 case Intrinsic::amdgcn_log:
3421 case Intrinsic::amdgcn_log_clamp:
3422 case Intrinsic::amdgcn_exp2:
3423 case Intrinsic::amdgcn_sqrt:
3424 return true;
3425 default:
3426 break;
3427 }
3428
3429 break;
3430 }
3431 case TargetOpcode::G_FSQRT:
3432 return true;
3433 case TargetOpcode::G_FFREXP: {
3434 if (DefMI->getOperand(0).getReg() == Src)
3435 return true;
3436 break;
3437 }
3438 case TargetOpcode::G_FPEXT: {
3439 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3440 }
3441 default:
3442 return false;
3443 }
3444
3445 return false;
3446}
3447
3448static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3449 return Flags & MachineInstr::FmAfn;
3450}
3451
3452static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3453 unsigned Flags) {
3454 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3457}
3458
3459std::pair<Register, Register>
3460AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3461 unsigned Flags) const {
3462 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3463 return {};
3464
3465 const LLT F32 = LLT::scalar(32);
3466 auto SmallestNormal = B.buildFConstant(
3468 auto IsLtSmallestNormal =
3469 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3470
3471 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3472 auto One = B.buildFConstant(F32, 1.0);
3473 auto ScaleFactor =
3474 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3475 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3476
3477 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3478}
3479
3480bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3481 MachineIRBuilder &B) const {
3482 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3483 // If we have to handle denormals, scale up the input and adjust the result.
3484
3485 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3486 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
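 // For example (illustrative), with x = 0x1.0p-130 (an f32 denormal): the
 // scaled input is 0x1.0p-98, the hardware log2 of that is -98.0, and
 // subtracting 32.0 recovers log2(x) = -130.0.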
3487
3488 Register Dst = MI.getOperand(0).getReg();
3489 Register Src = MI.getOperand(1).getReg();
3490 LLT Ty = B.getMRI()->getType(Dst);
3491 unsigned Flags = MI.getFlags();
3492
3493 if (Ty == LLT::scalar(16)) {
3494 const LLT F32 = LLT::scalar(32);
3495 // Nothing in half is a denormal when promoted to f32.
3496 auto Ext = B.buildFPExt(F32, Src, Flags);
3497 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3498 .addUse(Ext.getReg(0))
3499 .setMIFlags(Flags);
3500 B.buildFPTrunc(Dst, Log2, Flags);
3501 MI.eraseFromParent();
3502 return true;
3503 }
3504
3505 assert(Ty == LLT::scalar(32));
3506
3507 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3508 if (!ScaledInput) {
3509 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3510 .addUse(Src)
3511 .setMIFlags(Flags);
3512 MI.eraseFromParent();
3513 return true;
3514 }
3515
3516 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3517 .addUse(ScaledInput)
3518 .setMIFlags(Flags);
3519
3520 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3521 auto Zero = B.buildFConstant(Ty, 0.0);
3522 auto ResultOffset =
3523 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3524 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3525
3526 MI.eraseFromParent();
3527 return true;
3528}
3529
3530static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3531 Register Z, unsigned Flags) {
3532 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3533 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3534}
3535
3536bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3537 MachineIRBuilder &B) const {
3538 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3539 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3540
3541 MachineRegisterInfo &MRI = *B.getMRI();
3542 Register Dst = MI.getOperand(0).getReg();
3543 Register X = MI.getOperand(1).getReg();
3544 unsigned Flags = MI.getFlags();
3545 const LLT Ty = MRI.getType(X);
3546 MachineFunction &MF = B.getMF();
3547
3548 const LLT F32 = LLT::scalar(32);
3549 const LLT F16 = LLT::scalar(16);
3550
3551 const AMDGPUTargetMachine &TM =
3552 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3553
3554 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3555 if (Ty == F16 && !ST.has16BitInsts()) {
3556 Register LogVal = MRI.createGenericVirtualRegister(F32);
3557 auto PromoteSrc = B.buildFPExt(F32, X);
3558 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3559 B.buildFPTrunc(Dst, LogVal);
3560 } else {
3561 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3562 }
3563
3564 MI.eraseFromParent();
3565 return true;
3566 }
3567
3568 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3569 if (ScaledInput)
3570 X = ScaledInput;
3571
3572 auto Y =
3573 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3574
3575 Register R;
3576 if (ST.hasFastFMAF32()) {
3577 // c + cc is ln(2)/ln(10) to more than 49 bits
3578 const float c_log10 = 0x1.344134p-2f;
3579 const float cc_log10 = 0x1.09f79ep-26f;
3580
3581 // c + cc is ln(2) to more than 49 bits
3582 const float c_log = 0x1.62e42ep-1f;
3583 const float cc_log = 0x1.efa39ep-25f;
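// The product Y * (C + CC) is evaluated in compensated form below: R holds the
// rounded head product Y*C, FMA0 recovers the rounding error Y*C - R, FMA1
// folds in the tail term Y*CC, and the final add reassembles R plus the error,
// giving roughly extended precision for the constant multiplication.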
3584
3585 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3586 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3587 // This adds correction terms for which contraction may lead to an increase
3588 // in the error of the approximation, so disable it.
3589 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3590 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3591 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3592 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3593 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3594 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3595 } else {
3596 // ch+ct is ln(2)/ln(10) to more than 36 bits
3597 const float ch_log10 = 0x1.344000p-2f;
3598 const float ct_log10 = 0x1.3509f6p-18f;
3599
3600 // ch + ct is ln(2) to more than 36 bits
3601 const float ch_log = 0x1.62e000p-1f;
3602 const float ct_log = 0x1.0bfbe8p-15f;
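// Without fast FMA the extra precision is approximated by also splitting the
// input: YH below is Y with its low 12 mantissa bits cleared, YT is the
// remainder, and the partial products are accumulated from the smallest term
// (YT*CT) up to the largest (YH*CH) so little significance is lost.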
3603
3604 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3605 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3606
3607 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3608 auto YH = B.buildAnd(Ty, Y, MaskConst);
3609 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3610 // This adds correction terms for which contraction may lead to an increase
3611 // in the error of the approximation, so disable it.
3612 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3613 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3614
3615 Register Mad0 =
3616 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3617 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3618 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3619 }
3620
3621 const bool IsFiniteOnly =
3622 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3623 MI.getFlag(MachineInstr::FmNoInfs);
3624
3625 if (!IsFiniteOnly) {
3626 // Expand isfinite(x) => fabs(x) < inf
3627 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3628 auto Fabs = B.buildFAbs(Ty, Y);
3629 auto IsFinite =
3630 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3631 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3632 }
3633
3634 if (ScaledInput) {
3635 auto Zero = B.buildFConstant(Ty, 0.0);
3636 auto ShiftK =
3637 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3638 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3639 B.buildFSub(Dst, R, Shift, Flags);
3640 } else {
3641 B.buildCopy(Dst, R);
3642 }
3643
3644 MI.eraseFromParent();
3645 return true;
3646}
3647
3648bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3649 Register Src, bool IsLog10,
3650 unsigned Flags) const {
3651 const double Log2BaseInverted =
3652 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3653
3654 LLT Ty = B.getMRI()->getType(Dst);
3655
3656 if (Ty == LLT::scalar(32)) {
3657 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3658 if (ScaledInput) {
3659 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3660 .addUse(Src)
3661 .setMIFlags(Flags);
3662 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3663 auto Zero = B.buildFConstant(Ty, 0.0);
3664 auto ResultOffset =
3665 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3666 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3667
3668 if (ST.hasFastFMAF32())
3669 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3670 else {
3671 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3672 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3673 }
3674
3675 return true;
3676 }
3677 }
3678
3679 auto Log2Operand = Ty == LLT::scalar(16)
3680 ? B.buildFLog2(Ty, Src, Flags)
3681 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3682 .addUse(Src)
3683 .setMIFlags(Flags);
3684 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3685 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3686 return true;
3687}
3688
3689bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3690 MachineIRBuilder &B) const {
3691 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3692 // If we have to handle denormals, scale up the input and adjust the result.
3693
3694 Register Dst = MI.getOperand(0).getReg();
3695 Register Src = MI.getOperand(1).getReg();
3696 unsigned Flags = MI.getFlags();
3697 LLT Ty = B.getMRI()->getType(Dst);
3698 const LLT F16 = LLT::scalar(16);
3699 const LLT F32 = LLT::scalar(32);
3700
3701 if (Ty == F16) {
3702 // Nothing in half is a denormal when promoted to f32.
3703 auto Ext = B.buildFPExt(F32, Src, Flags);
3704 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3705 .addUse(Ext.getReg(0))
3706 .setMIFlags(Flags);
3707 B.buildFPTrunc(Dst, Log2, Flags);
3708 MI.eraseFromParent();
3709 return true;
3710 }
3711
3712 assert(Ty == F32);
3713
3714 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3715 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3716 .addUse(Src)
3717 .setMIFlags(Flags);
3718 MI.eraseFromParent();
3719 return true;
3720 }
3721
3722 // bool needs_scaling = x < -0x1.f80000p+6f;
3723 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
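// Roughly: the hardware exp2 flushes results that would be f32 denormals
// (inputs below about -126), so for such x we evaluate exp2(x + 64), which is
// a normal value, and rescale by 2^-64. For example, x = -130 gives
// exp2(-66) * 2^-64 == 2^-130.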
3724
3725 // -nextafter(128.0, -1)
3726 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3727 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3728 RangeCheckConst, Flags);
3729
3730 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3731 auto Zero = B.buildFConstant(Ty, 0.0);
3732 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3733 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3734
3735 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3736 .addUse(AddInput.getReg(0))
3737 .setMIFlags(Flags);
3738
3739 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3740 auto One = B.buildFConstant(Ty, 1.0);
3741 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3742 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3743 MI.eraseFromParent();
3744 return true;
3745}
3746
3747static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
3748 const SrcOp &Src, unsigned Flags) {
3749 LLT Ty = Dst.getLLTTy(*B.getMRI());
3750
3751 if (Ty == LLT::scalar(32)) {
3752 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3753 .addUse(Src.getReg())
3754 .setMIFlags(Flags);
3755 }
3756 return B.buildFExp2(Dst, Src, Flags);
3757}
3758
3759bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
3760 Register Dst, Register X,
3761 unsigned Flags,
3762 bool IsExp10) const {
3763 LLT Ty = B.getMRI()->getType(X);
3764
3765 // exp(x) -> exp2(M_LOG2E_F * x);
3766 // exp10(x) -> exp2(log2(10) * x);
3767 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3768 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3769 buildExp(B, Dst, Mul, Flags);
3770 return true;
3771}
3772
3773bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3774 Register X, unsigned Flags) const {
3775 LLT Ty = B.getMRI()->getType(Dst);
3776 LLT F32 = LLT::scalar(32);
3777
3778 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3779 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3780 }
3781
3782 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3783 auto NeedsScaling =
3784 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3785 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3786 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3787 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3788
3789 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3790 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3791
3792 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3793 .addUse(ExpInput.getReg(0))
3794 .setMIFlags(Flags);
3795
3796 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3797 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3798 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3799 return true;
3800}
3801
3802bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
3803 Register Dst, Register X,
3804 unsigned Flags) const {
3805 LLT Ty = B.getMRI()->getType(Dst);
3806 LLT F32 = LLT::scalar(32);
3807
3808 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3809 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3810 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3811 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3812
3813 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3814 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3815 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3816 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3817 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3818 return true;
3819 }
3820
3821 // bool s = x < -0x1.2f7030p+5f;
3822 // x += s ? 0x1.0p+5f : 0.0f;
3823 // exp10 = exp2(x * 0x1.a92000p+1f) *
3824 // exp2(x * 0x1.4f0978p-11f) *
3825 // (s ? 0x1.9f623ep-107f : 1.0f);
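// In the sketch above, 0x1.a92000p+1f + 0x1.4f0978p-11f is approximately
// log2(10) split into a coarse part and a small correction, so multiplying the
// two exp2 results gives roughly exp2(x * log2(10)) == 10^x with more accuracy
// than a single f32 constant would allow.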
3826
3827 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3828 auto NeedsScaling =
3829 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3830
3831 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3832 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3833 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3834
3835 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3836 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3837
3838 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3839 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3840 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3841 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3842
3843 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3844 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
3845 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3846
3847 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3848 return true;
3849}
3850
3851bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3852 MachineIRBuilder &B) const {
3853 Register Dst = MI.getOperand(0).getReg();
3854 Register X = MI.getOperand(1).getReg();
3855 const unsigned Flags = MI.getFlags();
3856 MachineFunction &MF = B.getMF();
3857 MachineRegisterInfo &MRI = *B.getMRI();
3858 LLT Ty = MRI.getType(Dst);
3859 const LLT F16 = LLT::scalar(16);
3860 const LLT F32 = LLT::scalar(32);
3861 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3862
3863 if (Ty == F16) {
3864 // v_exp_f16 (fmul x, log2e)
3865 if (allowApproxFunc(MF, Flags)) {
3866 // TODO: Does this really require fast?
3867 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
3868 : legalizeFExpUnsafe(B, Dst, X, Flags);
3869 MI.eraseFromParent();
3870 return true;
3871 }
3872
3873 // Nothing in half is a denormal when promoted to f32.
3874 //
3875 // exp(f16 x) ->
3876 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3877 //
3878 // exp10(f16 x) ->
3879 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3880 auto Ext = B.buildFPExt(F32, X, Flags);
3881 Register Lowered = MRI.createGenericVirtualRegister(F32);
3882 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
3883 B.buildFPTrunc(Dst, Lowered, Flags);
3884 MI.eraseFromParent();
3885 return true;
3886 }
3887
3888 assert(Ty == F32);
3889
3890 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3891 // library behavior. Also, is known-not-daz source sufficient?
3892 if (allowApproxFunc(MF, Flags)) {
3893 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
3894 : legalizeFExpUnsafe(B, Dst, X, Flags);
3895 MI.eraseFromParent();
3896 return true;
3897 }
3898
3899 // Algorithm:
3900 //
3901 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3902 //
3903 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3904 // n = 64*m + j, 0 <= j < 64
3905 //
3906 // e^x = 2^((64*m + j + f)/64)
3907 // = (2^m) * (2^(j/64)) * 2^(f/64)
3908 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3909 //
3910 // f = x*(64/ln(2)) - n
3911 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3912 //
3913 // e^x = (2^m) * (2^(j/64)) * e^r
3914 //
3915 // (2^(j/64)) is precomputed
3916 //
3917 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3918 // e^r = 1 + q
3919 //
3920 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3921 //
3922 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
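// The emitted code below uses a simplified form of this reduction: PH/PL hold
// the high and low parts of x*log2(e) (or x*log2(10)), E = roundeven(PH) plays
// the role of n, A = (PH - E) + PL is the reduced argument handed to the
// hardware exp2, and ldexp applies the remaining 2^E scaling.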
3923 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3924 Register PH, PL;
3925
3926 if (ST.hasFastFMAF32()) {
3927 const float c_exp = numbers::log2ef;
3928 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3929 const float c_exp10 = 0x1.a934f0p+1f;
3930 const float cc_exp10 = 0x1.2f346ep-24f;
3931
3932 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3933 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3934 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3935 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3936
3937 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3938 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3939 } else {
3940 const float ch_exp = 0x1.714000p+0f;
3941 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3942
3943 const float ch_exp10 = 0x1.a92000p+1f;
3944 const float cl_exp10 = 0x1.4f0978p-11f;
3945
3946 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3947 auto XH = B.buildAnd(Ty, X, MaskConst);
3948 auto XL = B.buildFSub(Ty, X, XH, Flags);
3949
3950 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3951 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3952
3953 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3954 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3955
3956 Register Mad0 =
3957 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3958 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3959 }
3960
3961 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3962
3963 // It is unsafe to contract this fsub into the PH multiply.
3964 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3965 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3966 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3967
3968 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3969 .addUse(A.getReg(0))
3970 .setMIFlags(Flags);
3971 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3972
3973 auto UnderflowCheckConst =
3974 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3975 auto Zero = B.buildFConstant(Ty, 0.0);
3976 auto Underflow =
3977 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3978
3979 R = B.buildSelect(Ty, Underflow, Zero, R);
3980
3981 if (!(Flags & MachineInstr::FmNoInfs)) {
3982 auto OverflowCheckConst =
3983 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3984
3985 auto Overflow =
3986 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3987 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3988 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3989 }
3990
3991 B.buildCopy(Dst, R);
3992 MI.eraseFromParent();
3993 return true;
3994}
3995
3996bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3997 MachineIRBuilder &B) const {
3998 Register Dst = MI.getOperand(0).getReg();
3999 Register Src0 = MI.getOperand(1).getReg();
4000 Register Src1 = MI.getOperand(2).getReg();
4001 unsigned Flags = MI.getFlags();
4002 LLT Ty = B.getMRI()->getType(Dst);
4003 const LLT F16 = LLT::float16();
4004 const LLT F32 = LLT::float32();
4005
4006 if (Ty == F32) {
4007 auto Log = B.buildFLog2(F32, Src0, Flags);
4008 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4009 .addUse(Log.getReg(0))
4010 .addUse(Src1)
4011 .setMIFlags(Flags);
4012 B.buildFExp2(Dst, Mul, Flags);
4013 } else if (Ty == F16) {
4014 // There's no f16 fmul_legacy, so convert to f32 for the multiply and truncate back.
4015 auto Log = B.buildFLog2(F16, Src0, Flags);
4016 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4017 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4018 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4019 .addUse(Ext0.getReg(0))
4020 .addUse(Ext1.getReg(0))
4021 .setMIFlags(Flags);
4022 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4023 } else
4024 return false;
4025
4026 MI.eraseFromParent();
4027 return true;
4028}
4029
4030// Find a source register, ignoring any possible source modifiers.
4031static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
4032 Register ModSrc = OrigSrc;
4033 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4034 ModSrc = SrcFNeg->getOperand(1).getReg();
4035 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4036 ModSrc = SrcFAbs->getOperand(1).getReg();
4037 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4038 ModSrc = SrcFAbs->getOperand(1).getReg();
4039 return ModSrc;
4040}
4041
4042bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
4043 MachineRegisterInfo &MRI,
4044 MachineIRBuilder &B) const {
4045
4046 const LLT S1 = LLT::scalar(1);
4047 const LLT F64 = LLT::float64();
4048 Register Dst = MI.getOperand(0).getReg();
4049 Register OrigSrc = MI.getOperand(1).getReg();
4050 unsigned Flags = MI.getFlags();
4051 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4052 "this should not have been custom lowered");
4053
4054 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4055 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4056 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4057 // V_FRACT bug is:
4058 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4059 //
4060 // Convert floor(x) to (x - fract(x))
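// The clamp constant used below, 0x3fefffffffffffff, is the largest double
// strictly less than 1.0; taking min(V_FRACT(x), that constant) keeps the
// buggy V_FRACT from ever reporting a fractional part of exactly 1.0, which
// would make x - fract(x) land one unit too low.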
4061
4062 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4063 .addUse(OrigSrc)
4064 .setMIFlags(Flags);
4065
4066 // Give source modifier matching some assistance before obscuring a foldable
4067 // pattern.
4068
4069 // TODO: We can avoid the neg on the fract? The input sign to fract
4070 // shouldn't matter?
4071 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4072
4073 auto Const =
4074 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4075
4076 Register Min = MRI.createGenericVirtualRegister(F64);
4077
4078 // We don't need to concern ourselves with the snan handling difference, so
4079 // use the one which will directly select.
4080 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4081 if (MFI->getMode().IEEE)
4082 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4083 else
4084 B.buildFMinNum(Min, Fract, Const, Flags);
4085
4086 Register CorrectedFract = Min;
4087 if (!MI.getFlag(MachineInstr::FmNoNans)) {
4088 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
4089 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4090 }
4091
4092 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4093 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4094
4095 MI.eraseFromParent();
4096 return true;
4097}
4098
4099// Turn an illegal packed v2s16 build vector into bit operations.
4100// TODO: This should probably be a bitcast action in LegalizerHelper.
4101bool AMDGPULegalizerInfo::legalizeBuildVector(
4102 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4103 Register Dst = MI.getOperand(0).getReg();
4104 const LLT S32 = LLT::scalar(32);
4105 const LLT S16 = LLT::scalar(16);
4106 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4107
4108 Register Src0 = MI.getOperand(1).getReg();
4109 Register Src1 = MI.getOperand(2).getReg();
4110
4111 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4112 assert(MRI.getType(Src0) == S32);
4113 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4114 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4115 }
4116
4117 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4118 B.buildBitcast(Dst, Merge);
4119
4120 MI.eraseFromParent();
4121 return true;
4122}
4123
4124// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4125//
4126// Source and accumulation registers must all be 32-bits.
4127//
4128// TODO: When the multiply is uniform, we should produce a code sequence
4129// that is better suited to instruction selection on the SALU. Instead of
4130// the outer loop going over parts of the result, the outer loop should go
4131// over parts of one of the factors. This should result in instruction
4132// selection that makes full use of S_ADDC_U32 instructions.
4133void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
4134 MutableArrayRef<Register> Accum,
4135 ArrayRef<Register> Src0,
4136 ArrayRef<Register> Src1,
4137 bool UsePartialMad64_32,
4138 bool SeparateOddAlignedProducts) const {
4139 // Use (possibly empty) vectors of S1 registers to represent the set of
4140 // carries from one pair of positions to the next.
4141 using Carry = SmallVector<Register, 2>;
4142
4143 MachineIRBuilder &B = Helper.MIRBuilder;
4144 GISelValueTracking &VT = *Helper.getValueTracking();
4145
4146 const LLT S1 = LLT::scalar(1);
4147 const LLT S32 = LLT::scalar(32);
4148 const LLT S64 = LLT::scalar(64);
4149
4150 Register Zero32;
4151 Register Zero64;
4152
4153 auto getZero32 = [&]() -> Register {
4154 if (!Zero32)
4155 Zero32 = B.buildConstant(S32, 0).getReg(0);
4156 return Zero32;
4157 };
4158 auto getZero64 = [&]() -> Register {
4159 if (!Zero64)
4160 Zero64 = B.buildConstant(S64, 0).getReg(0);
4161 return Zero64;
4162 };
4163
4164 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4165 for (unsigned i = 0; i < Src0.size(); ++i) {
4166 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4167 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4168 }
4169
4170 // Merge the given carries into the 32-bit LocalAccum, which is modified
4171 // in-place.
4172 //
4173 // Returns the carry-out, which is a single S1 register or null.
4174 auto mergeCarry =
4175 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4176 if (CarryIn.empty())
4177 return Register();
4178
4179 bool HaveCarryOut = true;
4180 Register CarryAccum;
4181 if (CarryIn.size() == 1) {
4182 if (!LocalAccum) {
4183 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4184 return Register();
4185 }
4186
4187 CarryAccum = getZero32();
4188 } else {
4189 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4190 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4191 CarryAccum =
4192 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4193 .getReg(0);
4194 }
4195
4196 if (!LocalAccum) {
4197 LocalAccum = getZero32();
4198 HaveCarryOut = false;
4199 }
4200 }
4201
4202 auto Add =
4203 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4204 LocalAccum = Add.getReg(0);
4205 return HaveCarryOut ? Add.getReg(1) : Register();
4206 };
4207
4208 // Build a multiply-add chain to compute
4209 //
4210 // LocalAccum + (partial products at DstIndex)
4211 // + (opportunistic subset of CarryIn)
4212 //
4213 // LocalAccum is an array of one or two 32-bit registers that are updated
4214 // in-place. The incoming registers may be null.
4215 //
4216 // In some edge cases, carry-ins can be consumed "for free". In that case,
4217 // the consumed carry bits are removed from CarryIn in-place.
4218 auto buildMadChain =
4219 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4220 -> Carry {
4221 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4222 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4223
4224 Carry CarryOut;
4225 unsigned j0 = 0;
4226
4227 // Use plain 32-bit multiplication for the most significant part of the
4228 // result by default.
4229 if (LocalAccum.size() == 1 &&
4230 (!UsePartialMad64_32 || !CarryIn.empty())) {
4231 do {
4232 // Skip multiplication if one of the operands is 0
4233 unsigned j1 = DstIndex - j0;
4234 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4235 ++j0;
4236 continue;
4237 }
4238 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4239 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4240 LocalAccum[0] = Mul.getReg(0);
4241 } else {
4242 if (CarryIn.empty()) {
4243 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4244 } else {
4245 LocalAccum[0] =
4246 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4247 .getReg(0);
4248 CarryIn.pop_back();
4249 }
4250 }
4251 ++j0;
4252 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4253 }
4254
4255 // Build full 64-bit multiplies.
4256 if (j0 <= DstIndex) {
4257 bool HaveSmallAccum = false;
4258 Register Tmp;
4259
4260 if (LocalAccum[0]) {
4261 if (LocalAccum.size() == 1) {
4262 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4263 HaveSmallAccum = true;
4264 } else if (LocalAccum[1]) {
4265 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4266 HaveSmallAccum = false;
4267 } else {
4268 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4269 HaveSmallAccum = true;
4270 }
4271 } else {
4272 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4273 Tmp = getZero64();
4274 HaveSmallAccum = true;
4275 }
4276
4277 do {
4278 unsigned j1 = DstIndex - j0;
4279 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4280 ++j0;
4281 continue;
4282 }
4283 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4284 {Src0[j0], Src1[j1], Tmp});
4285 Tmp = Mad.getReg(0);
4286 if (!HaveSmallAccum)
4287 CarryOut.push_back(Mad.getReg(1));
4288 HaveSmallAccum = false;
4289
4290 ++j0;
4291 } while (j0 <= DstIndex);
4292
4293 auto Unmerge = B.buildUnmerge(S32, Tmp);
4294 LocalAccum[0] = Unmerge.getReg(0);
4295 if (LocalAccum.size() > 1)
4296 LocalAccum[1] = Unmerge.getReg(1);
4297 }
4298
4299 return CarryOut;
4300 };
4301
4302 // Outer multiply loop, iterating over destination parts from least
4303 // significant to most significant parts.
4304 //
4305 // The columns of the following diagram correspond to the destination parts
4306 // affected by one iteration of the outer loop (ignoring boundary
4307 // conditions).
4308 //
4309 // Dest index relative to 2 * i: 1 0 -1
4310 // ------
4311 // Carries from previous iteration: e o
4312 // Even-aligned partial product sum: E E .
4313 // Odd-aligned partial product sum: O O
4314 //
4315 // 'o' is OddCarry, 'e' is EvenCarry.
4316 // EE and OO are computed from partial products via buildMadChain and use
4317 // accumulation where possible and appropriate.
4318 //
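// For example, with a 128-bit result (Accum.size() == 4), iteration i == 1
// builds the even-aligned products into parts 2..3 and the odd-aligned products
// into parts 1..2, while the even carries produced at i == 0 are folded into
// part 2, either inside the mad chain or by the explicit carry merge below.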
4319 Register SeparateOddCarry;
4320 Carry EvenCarry;
4321 Carry OddCarry;
4322
4323 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4324 Carry OddCarryIn = std::move(OddCarry);
4325 Carry EvenCarryIn = std::move(EvenCarry);
4326 OddCarry.clear();
4327 EvenCarry.clear();
4328
4329 // Partial products at offset 2 * i.
4330 if (2 * i < Accum.size()) {
4331 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4332 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4333 }
4334
4335 // Partial products at offset 2 * i - 1.
4336 if (i > 0) {
4337 if (!SeparateOddAlignedProducts) {
4338 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4339 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4340 } else {
4341 bool IsHighest = 2 * i >= Accum.size();
4342 Register SeparateOddOut[2];
4343 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4344 .take_front(IsHighest ? 1 : 2);
4345 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4346
4347 MachineInstr *Lo;
4348
4349 if (i == 1) {
4350 if (!IsHighest)
4351 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4352 else
4353 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4354 } else {
4355 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4356 SeparateOddCarry);
4357 }
4358 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4359
4360 if (!IsHighest) {
4361 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4362 Lo->getOperand(1).getReg());
4363 Accum[2 * i] = Hi.getReg(0);
4364 SeparateOddCarry = Hi.getReg(1);
4365 }
4366 }
4367 }
4368
4369 // Add in the carries from the previous iteration
4370 if (i > 0) {
4371 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4372 EvenCarryIn.push_back(CarryOut);
4373
4374 if (2 * i < Accum.size()) {
4375 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4376 OddCarry.push_back(CarryOut);
4377 }
4378 }
4379 }
4380}
4381
4382// Custom narrowing of wide multiplies using wide multiply-add instructions.
4383//
4384// TODO: If the multiply is followed by an addition, we should attempt to
4385// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4386bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4387 MachineInstr &MI) const {
4388 assert(ST.hasMad64_32());
4389 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4390
4391 MachineIRBuilder &B = Helper.MIRBuilder;
4392 MachineRegisterInfo &MRI = *B.getMRI();
4393
4394 Register DstReg = MI.getOperand(0).getReg();
4395 Register Src0 = MI.getOperand(1).getReg();
4396 Register Src1 = MI.getOperand(2).getReg();
4397
4398 LLT Ty = MRI.getType(DstReg);
4399 assert(Ty.isScalar());
4400
4401 unsigned Size = Ty.getSizeInBits();
4402 if (ST.hasVectorMulU64() && Size == 64)
4403 return true;
4404
4405 unsigned NumParts = Size / 32;
4406 assert((Size % 32) == 0);
4407 assert(NumParts >= 2);
4408
4409 // Whether to use MAD_64_32 for partial products whose high half is
4410 // discarded. This avoids some ADD instructions but risks false dependency
4411 // stalls on some subtargets in some cases.
4412 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4413
4414 // Whether to compute odd-aligned partial products separately. This is
4415 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4416 // in an even-aligned VGPR.
4417 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4418
4419 LLT S32 = LLT::scalar(32);
4420 SmallVector<Register, 2> Src0Parts, Src1Parts;
4421 for (unsigned i = 0; i < NumParts; ++i) {
4422 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4423 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4424 }
4425 B.buildUnmerge(Src0Parts, Src0);
4426 B.buildUnmerge(Src1Parts, Src1);
4427
4428 SmallVector<Register, 2> AccumRegs(NumParts);
4429 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4430 SeparateOddAlignedProducts);
4431
4432 B.buildMergeLikeInstr(DstReg, AccumRegs);
4433 MI.eraseFromParent();
4434 return true;
4435}
4436
4437// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4438// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4439// case with a single min instruction instead of a compare+select.
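// A quick example of why the min is enough: for a zero input, FFBH/FFBL return
// all ones, which reads as a very large unsigned value, and umin with the bit
// width folds that back to the defined G_CTLZ/G_CTTZ result for zero.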
4440bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4441 MachineRegisterInfo &MRI,
4442 MachineIRBuilder &B) const {
4443 Register Dst = MI.getOperand(0).getReg();
4444 Register Src = MI.getOperand(1).getReg();
4445 LLT DstTy = MRI.getType(Dst);
4446 LLT SrcTy = MRI.getType(Src);
4447
4448 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4449 ? AMDGPU::G_AMDGPU_FFBH_U32
4450 : AMDGPU::G_AMDGPU_FFBL_B32;
4451 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4452 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4453
4454 MI.eraseFromParent();
4455 return true;
4456}
4457
4458bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4459 MachineRegisterInfo &MRI,
4460 MachineIRBuilder &B) const {
4461 Register Dst = MI.getOperand(0).getReg();
4462 Register Src = MI.getOperand(1).getReg();
4463 LLT SrcTy = MRI.getType(Src);
4464 TypeSize NumBits = SrcTy.getSizeInBits();
4465
4466 assert(NumBits < 32u);
4467
4468 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4469 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4470 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4471 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4472 B.buildTrunc(Dst, Ctlz);
4473 MI.eraseFromParent();
4474 return true;
4475}
4476
4477// Check that this is a G_XOR x, -1
4478static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4479 if (MI.getOpcode() != TargetOpcode::G_XOR)
4480 return false;
4481 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4482 return ConstVal == -1;
4483}
4484
4485// Return the branch instruction that uses the condition, or null if the usage is invalid.
4486static MachineInstr *
4488 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4489 Register CondDef = MI.getOperand(0).getReg();
4490 if (!MRI.hasOneNonDBGUse(CondDef))
4491 return nullptr;
4492
4493 MachineBasicBlock *Parent = MI.getParent();
4494 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4495
4496 if (isNot(MRI, *UseMI)) {
4497 Register NegatedCond = UseMI->getOperand(0).getReg();
4498 if (!MRI.hasOneNonDBGUse(NegatedCond))
4499 return nullptr;
4500
4501 // We're deleting the def of this value, so we need to remove it.
4502 eraseInstr(*UseMI, MRI);
4503
4504 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4505 Negated = true;
4506 }
4507
4508 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4509 return nullptr;
4510
4511 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4512 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4513 if (Next == Parent->end()) {
4514 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4515 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4516 return nullptr;
4517 UncondBrTarget = &*NextMBB;
4518 } else {
4519 if (Next->getOpcode() != AMDGPU::G_BR)
4520 return nullptr;
4521 Br = &*Next;
4522 UncondBrTarget = Br->getOperand(0).getMBB();
4523 }
4524
4525 return UseMI;
4526}
4527
4528void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4529 MachineIRBuilder &B,
4530 const ArgDescriptor *Arg,
4531 const TargetRegisterClass *ArgRC,
4532 LLT ArgTy) const {
4533 MCRegister SrcReg = Arg->getRegister();
4534 assert(SrcReg.isPhysical() && "Physical register expected");
4535 assert(DstReg.isVirtual() && "Virtual register expected");
4536
4537 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4538 *ArgRC, B.getDebugLoc(), ArgTy);
4539 if (Arg->isMasked()) {
4540 // TODO: Should we try to emit this once in the entry block?
4541 const LLT S32 = LLT::scalar(32);
4542 const unsigned Mask = Arg->getMask();
4543 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4544
4545 Register AndMaskSrc = LiveIn;
4546
4547 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4548 // 0.
4549 if (Shift != 0) {
4550 auto ShiftAmt = B.buildConstant(S32, Shift);
4551 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4552 }
4553
4554 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4555 } else {
4556 B.buildCopy(DstReg, LiveIn);
4557 }
4558}
4559
4564 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4565 Register DstReg = MI.getOperand(0).getReg();
4566 if (!ST.hasClusters()) {
4567 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4568 return false;
4569 MI.eraseFromParent();
4570 return true;
4571 }
4572
4573 // Clusters are supported. Return the global position in the grid. If clusters
4574 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4575
4576 // WorkGroupIdXYZ = ClusterId == 0 ?
4577 // ClusterIdXYZ :
4578 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4579 MachineRegisterInfo &MRI = *B.getMRI();
4580 const LLT S32 = LLT::scalar(32);
4581 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4582 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4583 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4584 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4585 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4586 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4587 return false;
4588
4589 auto One = B.buildConstant(S32, 1);
4590 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4591 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4592 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4593
4594 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4595
4596 switch (MFI->getClusterDims().getKind()) {
4599 B.buildCopy(DstReg, GlobalIdXYZ);
4600 MI.eraseFromParent();
4601 return true;
4602 }
4604 B.buildCopy(DstReg, ClusterIdXYZ);
4605 MI.eraseFromParent();
4606 return true;
4607 }
4609 using namespace AMDGPU::Hwreg;
4610 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4611 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4612 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4613 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4614 .addDef(ClusterId)
4615 .addImm(ClusterIdField);
4616 auto Zero = B.buildConstant(S32, 0);
4617 auto NoClusters =
4618 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4619 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4620 MI.eraseFromParent();
4621 return true;
4622 }
4623 }
4624
4625 llvm_unreachable("nothing should reach here");
4626}
4627
4628bool AMDGPULegalizerInfo::loadInputValue(
4629 Register DstReg, MachineIRBuilder &B,
4630 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4631 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4632 const ArgDescriptor *Arg = nullptr;
4633 const TargetRegisterClass *ArgRC;
4634 LLT ArgTy;
4635
4636 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4637 const ArgDescriptor WorkGroupIDX =
4638 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4639 // If GridZ is not programmed in an entry function then the hardware will set
4640 // it to all zeros, so there is no need to mask the GridY value in the low
4641 // order bits.
4642 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4643 AMDGPU::TTMP7,
4644 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4645 const ArgDescriptor WorkGroupIDZ =
4646 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4647 const ArgDescriptor ClusterWorkGroupIDX =
4648 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4649 const ArgDescriptor ClusterWorkGroupIDY =
4650 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4651 const ArgDescriptor ClusterWorkGroupIDZ =
4652 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4653 const ArgDescriptor ClusterWorkGroupMaxIDX =
4654 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4655 const ArgDescriptor ClusterWorkGroupMaxIDY =
4656 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4657 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4658 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4659 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4660 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4661
4662 auto LoadConstant = [&](unsigned N) {
4663 B.buildConstant(DstReg, N);
4664 return true;
4665 };
4666
4667 if (ST.hasArchitectedSGPRs() &&
4669 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4670 bool HasFixedDims = ClusterDims.isFixedDims();
4671
4672 switch (ArgType) {
4674 Arg = &WorkGroupIDX;
4675 ArgRC = &AMDGPU::SReg_32RegClass;
4676 ArgTy = LLT::scalar(32);
4677 break;
4679 Arg = &WorkGroupIDY;
4680 ArgRC = &AMDGPU::SReg_32RegClass;
4681 ArgTy = LLT::scalar(32);
4682 break;
4684 Arg = &WorkGroupIDZ;
4685 ArgRC = &AMDGPU::SReg_32RegClass;
4686 ArgTy = LLT::scalar(32);
4687 break;
4689 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4690 return LoadConstant(0);
4691 Arg = &ClusterWorkGroupIDX;
4692 ArgRC = &AMDGPU::SReg_32RegClass;
4693 ArgTy = LLT::scalar(32);
4694 break;
4696 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4697 return LoadConstant(0);
4698 Arg = &ClusterWorkGroupIDY;
4699 ArgRC = &AMDGPU::SReg_32RegClass;
4700 ArgTy = LLT::scalar(32);
4701 break;
4703 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4704 return LoadConstant(0);
4705 Arg = &ClusterWorkGroupIDZ;
4706 ArgRC = &AMDGPU::SReg_32RegClass;
4707 ArgTy = LLT::scalar(32);
4708 break;
4710 if (HasFixedDims)
4711 return LoadConstant(ClusterDims.getDims()[0] - 1);
4712 Arg = &ClusterWorkGroupMaxIDX;
4713 ArgRC = &AMDGPU::SReg_32RegClass;
4714 ArgTy = LLT::scalar(32);
4715 break;
4717 if (HasFixedDims)
4718 return LoadConstant(ClusterDims.getDims()[1] - 1);
4719 Arg = &ClusterWorkGroupMaxIDY;
4720 ArgRC = &AMDGPU::SReg_32RegClass;
4721 ArgTy = LLT::scalar(32);
4722 break;
4724 if (HasFixedDims)
4725 return LoadConstant(ClusterDims.getDims()[2] - 1);
4726 Arg = &ClusterWorkGroupMaxIDZ;
4727 ArgRC = &AMDGPU::SReg_32RegClass;
4728 ArgTy = LLT::scalar(32);
4729 break;
4731 Arg = &ClusterWorkGroupMaxFlatID;
4732 ArgRC = &AMDGPU::SReg_32RegClass;
4733 ArgTy = LLT::scalar(32);
4734 break;
4735 default:
4736 break;
4737 }
4738 }
4739
4740 if (!Arg)
4741 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4742
4743 if (!Arg) {
4744 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4745 // The intrinsic may appear when we have a 0 sized kernarg segment, in
4746 // which case the pointer argument may be missing and we use null.
4747 return LoadConstant(0);
4748 }
4749
4750 // It's undefined behavior if a function marked with the amdgpu-no-*
4751 // attributes uses the corresponding intrinsic.
4752 B.buildUndef(DstReg);
4753 return true;
4754 }
4755
4756 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4757 return false; // TODO: Handle these
4758 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4759 return true;
4760}
4761
4762bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4763 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4764 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4765 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4766 return false;
4767
4768 MI.eraseFromParent();
4769 return true;
4770}
4771
4772static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4773 int64_t C) {
4774 B.buildConstant(MI.getOperand(0).getReg(), C);
4775 MI.eraseFromParent();
4776 return true;
4777}
4778
4779bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4780 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4781 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4782 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4783 if (MaxID == 0)
4784 return replaceWithConstant(B, MI, 0);
4785
4786 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4787 const ArgDescriptor *Arg;
4788 const TargetRegisterClass *ArgRC;
4789 LLT ArgTy;
4790 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4791
4792 Register DstReg = MI.getOperand(0).getReg();
4793 if (!Arg) {
4794 // It's undefined behavior if a function marked with the amdgpu-no-*
4795 // attributes uses the corresponding intrinsic.
4796 B.buildUndef(DstReg);
4797 MI.eraseFromParent();
4798 return true;
4799 }
4800
4801 if (Arg->isMasked()) {
4802 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4803 // masking operations anyway.
4804 //
4805 // TODO: We could assert the top bit is 0 for the source copy.
4806 if (!loadInputValue(DstReg, B, ArgType))
4807 return false;
4808 } else {
4809 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4810 if (!loadInputValue(TmpReg, B, ArgType))
4811 return false;
4812 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4813 }
4814
4815 MI.eraseFromParent();
4816 return true;
4817}
4818
4821 // This isn't really a constant pool but close enough.
4824 return PtrInfo;
4825}
4826
4828 int64_t Offset) const {
4830 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4831
4832 // TODO: If we passed in the base kernel offset we could have a better
4833 // alignment than 4, but we don't really need it.
4834 if (!loadInputValue(KernArgReg, B,
4835 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4836 llvm_unreachable("failed to find kernarg segment ptr");
4837
4838 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4839 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
4840}
4841
4842/// Legalize a value that's loaded from kernel arguments. This is only used by
4843/// legacy intrinsics.
4844bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4845 MachineIRBuilder &B,
4846 uint64_t Offset,
4847 Align Alignment) const {
4848 Register DstReg = MI.getOperand(0).getReg();
4849
4850 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4851 "unexpected kernarg parameter type");
4852
4855 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
4858 MI.eraseFromParent();
4859 return true;
4860}
4861
4862bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4863 MachineRegisterInfo &MRI,
4864 MachineIRBuilder &B) const {
4865 Register Dst = MI.getOperand(0).getReg();
4866 LLT DstTy = MRI.getType(Dst);
4867 LLT S16 = LLT::scalar(16);
4868 LLT S32 = LLT::scalar(32);
4869 LLT S64 = LLT::scalar(64);
4870
4871 if (DstTy == S16)
4872 return legalizeFDIV16(MI, MRI, B);
4873 if (DstTy == S32)
4874 return legalizeFDIV32(MI, MRI, B);
4875 if (DstTy == S64)
4876 return legalizeFDIV64(MI, MRI, B);
4877
4878 return false;
4879}
4880
4881void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4882 Register DstDivReg,
4883 Register DstRemReg,
4884 Register X,
4885 Register Y) const {
4886 const LLT S1 = LLT::scalar(1);
4887 const LLT S32 = LLT::scalar(32);
4888
4889 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4890 // algorithm used here.
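// In short: Z below is a fixed-point estimate of 2^32 / Y obtained from the
// float reciprocal and refined by one Newton-Raphson style step; the quotient
// estimate Q = umulh(X, Z) can then be at most a couple short of the true
// quotient, which the two compare/select refinement rounds correct.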
4891
4892 // Initial estimate of inv(y).
4893 auto FloatY = B.buildUITOFP(S32, Y);
4894 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4895 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4896 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4897 auto Z = B.buildFPTOUI(S32, ScaledY);
4898
4899 // One round of UNR.
4900 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4901 auto NegYZ = B.buildMul(S32, NegY, Z);
4902 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4903
4904 // Quotient/remainder estimate.
4905 auto Q = B.buildUMulH(S32, X, Z);
4906 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4907
4908 // First quotient/remainder refinement.
4909 auto One = B.buildConstant(S32, 1);
4910 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4911 if (DstDivReg)
4912 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4913 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4914
4915 // Second quotient/remainder refinement.
4916 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4917 if (DstDivReg)
4918 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4919
4920 if (DstRemReg)
4921 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4922}
4923
4924// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4925//
4926// Return lo, hi of result
4927//
4928// %cvt.lo = G_UITOFP Val.lo
4929// %cvt.hi = G_UITOFP Val.hi
4930// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4931// %rcp = G_AMDGPU_RCP_IFLAG %mad
4932// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4933// %mul2 = G_FMUL %mul1, 2**(-32)
4934// %trunc = G_INTRINSIC_TRUNC %mul2
4935// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4936// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
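// Roughly, the sequence treats Val as hi*2^32 + lo in float, takes the hardware
// reciprocal, and scales it by a constant just under 2^64 (0x5f7ffffc), so the
// {lo, hi} pair returned is a 64-bit fixed-point estimate of 2^64 / Val that
// the 64-bit division below refines with further Newton-Raphson style steps.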
4937static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4938 Register Val) {
4939 const LLT S32 = LLT::scalar(32);
4940 auto Unmerge = B.buildUnmerge(S32, Val);
4941
4942 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4943 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4944
4945 auto Mad = B.buildFMAD(
4946 S32, CvtHi, // 2**32
4947 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4948
4949 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4950 auto Mul1 = B.buildFMul(
4951 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4952
4953 // 2**(-32)
4954 auto Mul2 = B.buildFMul(
4955 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4956 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4957
4958 // -(2**32)
4959 auto Mad2 = B.buildFMAD(
4960 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4961 Mul1);
4962
4963 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4964 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4965
4966 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4967}
4968
4969void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4970 Register DstDivReg,
4971 Register DstRemReg,
4972 Register Numer,
4973 Register Denom) const {
4974 const LLT S32 = LLT::scalar(32);
4975 const LLT S64 = LLT::scalar(64);
4976 const LLT S1 = LLT::scalar(1);
4977 Register RcpLo, RcpHi;
4978
4979 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4980
4981 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4982
4983 auto Zero64 = B.buildConstant(S64, 0);
4984 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4985
4986 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4987 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4988
4989 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4990 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4991 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4992
4993 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4994 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4995 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4996
4997 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4998 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4999 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5000 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5001 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5002
5003 auto Zero32 = B.buildConstant(S32, 0);
5004 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5005 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5006 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5007
5008 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5009 Register NumerLo = UnmergeNumer.getReg(0);
5010 Register NumerHi = UnmergeNumer.getReg(1);
5011
5012 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5013 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5014 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5015 Register Mul3_Lo = UnmergeMul3.getReg(0);
5016 Register Mul3_Hi = UnmergeMul3.getReg(1);
5017 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5018 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5019 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5020 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5021
5022 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5023 Register DenomLo = UnmergeDenom.getReg(0);
5024 Register DenomHi = UnmergeDenom.getReg(1);
5025
5026 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5027 auto C1 = B.buildSExt(S32, CmpHi);
5028
5029 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5030 auto C2 = B.buildSExt(S32, CmpLo);
5031
5032 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5033 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5034
5035 // TODO: Here and below, portions of the code could be enclosed in if/endif
5036 // blocks. Currently the control flow is unconditional and we have 4 selects
5037 // after the potential endif to substitute for PHIs.
5038
5039 // if C3 != 0 ...
5040 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5041 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5042 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5043 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5044
5045 auto One64 = B.buildConstant(S64, 1);
5046 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5047
5048 auto C4 =
5049 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5050 auto C5 =
5051 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5052 auto C6 = B.buildSelect(
5053 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5054
5055 // if (C6 != 0)
5056 auto Add4 = B.buildAdd(S64, Add3, One64);
5057 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5058
5059 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5060 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5061 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5062
5063 // endif C6
5064 // endif C3
5065
5066 if (DstDivReg) {
5067 auto Sel1 = B.buildSelect(
5068 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5069 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5070 Sel1, MulHi3);
5071 }
5072
5073 if (DstRemReg) {
5074 auto Sel2 = B.buildSelect(
5075 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5076 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5077 Sel2, Sub1);
5078 }
5079}
5080
5081bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
5082 MachineRegisterInfo &MRI,
5083 MachineIRBuilder &B) const {
5084 Register DstDivReg, DstRemReg;
5085 switch (MI.getOpcode()) {
5086 default:
5087 llvm_unreachable("Unexpected opcode!");
5088 case AMDGPU::G_UDIV: {
5089 DstDivReg = MI.getOperand(0).getReg();
5090 break;
5091 }
5092 case AMDGPU::G_UREM: {
5093 DstRemReg = MI.getOperand(0).getReg();
5094 break;
5095 }
5096 case AMDGPU::G_UDIVREM: {
5097 DstDivReg = MI.getOperand(0).getReg();
5098 DstRemReg = MI.getOperand(1).getReg();
5099 break;
5100 }
5101 }
5102
5103 const LLT S64 = LLT::scalar(64);
5104 const LLT S32 = LLT::scalar(32);
5105 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5106 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5107 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5108 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5109
5110 if (Ty == S32)
5111 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5112 else if (Ty == S64)
5113 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5114 else
5115 return false;
5116
5117 MI.eraseFromParent();
5118 return true;
5119}
5120
5121bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
5122 MachineRegisterInfo &MRI,
5123 MachineIRBuilder &B) const {
5124 const LLT S64 = LLT::scalar(64);
5125 const LLT S32 = LLT::scalar(32);
5126
5127 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5128 if (Ty != S32 && Ty != S64)
5129 return false;
5130
5131 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5132 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5133 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5134
5135 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5136 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5137 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5138
5139 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5140 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5141
5142 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5143 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5144
5145 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5146 switch (MI.getOpcode()) {
5147 default:
5148 llvm_unreachable("Unexpected opcode!");
5149 case AMDGPU::G_SDIV: {
5150 DstDivReg = MI.getOperand(0).getReg();
5151 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5152 break;
5153 }
5154 case AMDGPU::G_SREM: {
5155 DstRemReg = MI.getOperand(0).getReg();
5156 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5157 break;
5158 }
5159 case AMDGPU::G_SDIVREM: {
5160 DstDivReg = MI.getOperand(0).getReg();
5161 DstRemReg = MI.getOperand(1).getReg();
5162 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5163 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5164 break;
5165 }
5166 }
5167
5168 if (Ty == S32)
5169 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5170 else
5171 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5172
5173 if (DstDivReg) {
5174 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5175 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5176 B.buildSub(DstDivReg, SignXor, Sign);
5177 }
5178
5179 if (DstRemReg) {
5180 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5181 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5182 B.buildSub(DstRemReg, SignXor, Sign);
5183 }
5184
5185 MI.eraseFromParent();
5186 return true;
5187}
5188
5189bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
5190 MachineRegisterInfo &MRI,
5191 MachineIRBuilder &B) const {
5192 Register Res = MI.getOperand(0).getReg();
5193 Register LHS = MI.getOperand(1).getReg();
5194 Register RHS = MI.getOperand(2).getReg();
5195 uint16_t Flags = MI.getFlags();
5196 LLT ResTy = MRI.getType(Res);
5197
5198 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5199
5200 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5201 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5202 return false;
5203
5204 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5205 // the CI documentation have a worst case error of 1 ulp.
5206 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5207 // use it as long as we aren't trying to use denormals.
5208 //
5209 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a worst case error of 0.51 ulp.
5210
5211 // 1 / x -> RCP(x)
5212 if (CLHS->isExactlyValue(1.0)) {
5213 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5214 .addUse(RHS)
5215 .setMIFlags(Flags);
5216
5217 MI.eraseFromParent();
5218 return true;
5219 }
5220
5221 // -1 / x -> RCP( FNEG(x) )
5222 if (CLHS->isExactlyValue(-1.0)) {
5223 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5224 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5225 .addUse(FNeg.getReg(0))
5226 .setMIFlags(Flags);
5227
5228 MI.eraseFromParent();
5229 return true;
5230 }
5231 }
5232
5233 // For f16 require afn or arcp.
5234 // For f32 require afn.
5235 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5236 !MI.getFlag(MachineInstr::FmArcp)))
5237 return false;
5238
5239 // x / y -> x * (1.0 / y)
5240 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5241 .addUse(RHS)
5242 .setMIFlags(Flags);
5243 B.buildFMul(Res, LHS, RCP, Flags);
5244
5245 MI.eraseFromParent();
5246 return true;
5247}
5248
5249bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5250 MachineRegisterInfo &MRI,
5251 MachineIRBuilder &B) const {
5252 Register Res = MI.getOperand(0).getReg();
5253 Register X = MI.getOperand(1).getReg();
5254 Register Y = MI.getOperand(2).getReg();
5255 uint16_t Flags = MI.getFlags();
5256 LLT ResTy = MRI.getType(Res);
5257
5258 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5259
5260 if (!AllowInaccurateRcp)
5261 return false;
5262
5263 auto NegY = B.buildFNeg(ResTy, Y);
5264 auto One = B.buildFConstant(ResTy, 1.0);
5265
5266 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5267 .addUse(Y)
5268 .setMIFlags(Flags);
5269
5270 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5271 R = B.buildFMA(ResTy, Tmp0, R, R);
5272
5273 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5274 R = B.buildFMA(ResTy, Tmp1, R, R);
5275
5276 auto Ret = B.buildFMul(ResTy, X, R);
5277 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5278
5279 B.buildFMA(Res, Tmp2, R, Ret);
5280 MI.eraseFromParent();
5281 return true;
5282}
5283
5286 MachineIRBuilder &B) const {
5287 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5288 return true;
5289
5290 Register Res = MI.getOperand(0).getReg();
5291 Register LHS = MI.getOperand(1).getReg();
5292 Register RHS = MI.getOperand(2).getReg();
5293
5294 uint16_t Flags = MI.getFlags();
5295
5296 LLT S16 = LLT::scalar(16);
5297 LLT S32 = LLT::scalar(32);
5298
5299 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5300 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5301 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5302 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5303 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5304 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5305 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5306 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5307 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5308 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5309 // q16.u = opx(V_CVT_F16_F32, q32.u);
5310 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5311
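 // The sequence below is the GlobalISel form of the pseudocode above; the
 // 0xff800000 mask keeps only the sign and exponent bits of the f32
 // correction term (err * rcp) before it is folded back into the quotient.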
5312 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5313 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5314 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5315 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5316 .addUse(RHSExt.getReg(0))
5317 .setMIFlags(Flags);
5318 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5319 MachineInstrBuilder Err;
5320 if (ST.hasMadMacF32Insts()) {
5321 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5322 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5323 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5324 } else {
5325 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5326 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5327 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5328 }
5329 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5330 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5331 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5332 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5333 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5334 .addUse(RDst.getReg(0))
5335 .addUse(RHS)
5336 .addUse(LHS)
5337 .setMIFlags(Flags);
5338
5339 MI.eraseFromParent();
5340 return true;
5341}
5342
5343static constexpr unsigned SPDenormModeBitField =
5345
5346// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5347// to enable denorm mode. When 'Enable' is false, disable denorm mode.
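// The 4-bit denorm field packs the FP32 (SP) mode in bits [1:0] and the
// FP64/FP16 (DP) mode in bits [3:2], which is why the DP default is shifted
// left by 2 when building the S_DENORM_MODE immediate below.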
5348 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5349 const GCNSubtarget &ST,
5350 SIModeRegisterDefaults Mode) {
5351 // Set SP denorm mode to this value.
5352 unsigned SPDenormMode =
5353 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5354
5355 if (ST.hasDenormModeInst()) {
5356 // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
5357 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5358
5359 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5360 B.buildInstr(AMDGPU::S_DENORM_MODE)
5361 .addImm(NewDenormModeValue);
5362
5363 } else {
5364 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5365 .addImm(SPDenormMode)
5366 .addImm(SPDenormModeBitField);
5367 }
5368}
5369
5372 MachineIRBuilder &B) const {
5373 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5374 return true;
5375
5376 Register Res = MI.getOperand(0).getReg();
5377 Register LHS = MI.getOperand(1).getReg();
5378 Register RHS = MI.getOperand(2).getReg();
5379 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5380 SIModeRegisterDefaults Mode = MFI->getMode();
5381
5382 uint16_t Flags = MI.getFlags();
5383
5384 LLT S32 = LLT::scalar(32);
5385 LLT S1 = LLT::scalar(1);
5386
5387 auto One = B.buildFConstant(S32, 1.0f);
5388
5389 auto DenominatorScaled =
5390 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5391 .addUse(LHS)
5392 .addUse(RHS)
5393 .addImm(0)
5394 .setMIFlags(Flags);
5395 auto NumeratorScaled =
5396 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5397 .addUse(LHS)
5398 .addUse(RHS)
5399 .addImm(1)
5400 .setMIFlags(Flags);
5401
5402 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5403 .addUse(DenominatorScaled.getReg(0))
5404 .setMIFlags(Flags);
5405 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5406
5407 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5408 const bool HasDynamicDenormals =
5409 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5410 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5411
5412 Register SavedSPDenormMode;
5413 if (!PreservesDenormals) {
5414 if (HasDynamicDenormals) {
5415 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5416 B.buildInstr(AMDGPU::S_GETREG_B32)
5417 .addDef(SavedSPDenormMode)
5418 .addImm(SPDenormModeBitField);
5419 }
5420 toggleSPDenormMode(true, B, ST, Mode);
5421 }
5422
5423 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5424 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5425 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5426 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5427 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5428 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5429
5430 if (!PreservesDenormals) {
5431 if (HasDynamicDenormals) {
5432 assert(SavedSPDenormMode);
5433 B.buildInstr(AMDGPU::S_SETREG_B32)
5434 .addReg(SavedSPDenormMode)
5435 .addImm(SPDenormModeBitField);
5436 } else
5437 toggleSPDenormMode(false, B, ST, Mode);
5438 }
5439
5440 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5441 .addUse(Fma4.getReg(0))
5442 .addUse(Fma1.getReg(0))
5443 .addUse(Fma3.getReg(0))
5444 .addUse(NumeratorScaled.getReg(1))
5445 .setMIFlags(Flags);
5446
5447 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5448 .addUse(Fmas.getReg(0))
5449 .addUse(RHS)
5450 .addUse(LHS)
5451 .setMIFlags(Flags);
5452
5453 MI.eraseFromParent();
5454 return true;
5455}
5456
5459 MachineIRBuilder &B) const {
5460 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5461 return true;
5462
5463 Register Res = MI.getOperand(0).getReg();
5464 Register LHS = MI.getOperand(1).getReg();
5465 Register RHS = MI.getOperand(2).getReg();
5466
5467 uint16_t Flags = MI.getFlags();
5468
5469 LLT S64 = LLT::scalar(64);
5470 LLT S1 = LLT::scalar(1);
5471
5472 auto One = B.buildFConstant(S64, 1.0);
5473
5474 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5475 .addUse(LHS)
5476 .addUse(RHS)
5477 .addImm(0)
5478 .setMIFlags(Flags);
5479
5480 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5481
5482 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5483 .addUse(DivScale0.getReg(0))
5484 .setMIFlags(Flags);
5485
5486 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5487 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5488 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5489
5490 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5491 .addUse(LHS)
5492 .addUse(RHS)
5493 .addImm(1)
5494 .setMIFlags(Flags);
5495
5496 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5497 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5498 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5499
5500 Register Scale;
5501 if (!ST.hasUsableDivScaleConditionOutput()) {
5502 // Work around a hardware bug on SI where the condition output from div_scale
5503 // is not usable.
5504
5505 LLT S32 = LLT::scalar(32);
5506
5507 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5508 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5509 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5510 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5511
5512 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5513 Scale1Unmerge.getReg(1));
5514 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5515 Scale0Unmerge.getReg(1));
5516 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5517 } else {
5518 Scale = DivScale1.getReg(1);
5519 }
5520
5521 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5522 .addUse(Fma4.getReg(0))
5523 .addUse(Fma3.getReg(0))
5524 .addUse(Mul.getReg(0))
5525 .addUse(Scale)
5526 .setMIFlags(Flags);
5527
5528 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5529 .addUse(Fmas.getReg(0))
5530 .addUse(RHS)
5531 .addUse(LHS)
5532 .setMIFlags(Flags);
5533
5534 MI.eraseFromParent();
5535 return true;
5536}
5537
5540 MachineIRBuilder &B) const {
5541 Register Res0 = MI.getOperand(0).getReg();
5542 Register Res1 = MI.getOperand(1).getReg();
5543 Register Val = MI.getOperand(2).getReg();
5544 uint16_t Flags = MI.getFlags();
5545
5546 LLT Ty = MRI.getType(Res0);
5547 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5548
5549 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5550 .addUse(Val)
5551 .setMIFlags(Flags);
5552 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5553 .addUse(Val)
5554 .setMIFlags(Flags);
5555
5556 if (ST.hasFractBug()) {
5557 auto Fabs = B.buildFAbs(Ty, Val);
5558 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5559 auto IsFinite =
5560 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5561 auto Zero = B.buildConstant(InstrExpTy, 0);
5562 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5563 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5564 }
5565
5566 B.buildCopy(Res0, Mant);
5567 B.buildSExtOrTrunc(Res1, Exp);
5568
5569 MI.eraseFromParent();
5570 return true;
5571}
5572
5575 MachineIRBuilder &B) const {
5576 Register Res = MI.getOperand(0).getReg();
5577 Register LHS = MI.getOperand(2).getReg();
5578 Register RHS = MI.getOperand(3).getReg();
5579 uint16_t Flags = MI.getFlags();
5580
5581 LLT S32 = LLT::scalar(32);
5582 LLT S1 = LLT::scalar(1);
5583
5584 auto Abs = B.buildFAbs(S32, RHS, Flags);
5585 const APFloat C0Val(1.0f);
5586
5587 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5588 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5589 auto C2 = B.buildFConstant(S32, 1.0f);
5590
5591 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5592 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5593
5594 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5595
5596 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5597 .addUse(Mul0.getReg(0))
5598 .setMIFlags(Flags);
5599
5600 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5601
5602 B.buildFMul(Res, Sel, Mul1, Flags);
5603
5604 MI.eraseFromParent();
5605 return true;
5606}
5607
5608 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5609 MachineRegisterInfo &MRI,
5610 MachineIRBuilder &B) const {
5611 // Bypass the correct expansion that a standard promotion through G_FSQRT
5612 // would get. The f32 op is accurate enough for the f16 case.
5613 unsigned Flags = MI.getFlags();
5614 assert(!ST.has16BitInsts());
5615 const LLT F32 = LLT::scalar(32);
5616 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5617 auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5618 .addUse(Ext.getReg(0))
5619 .setMIFlags(Flags);
5620 B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
5621 MI.eraseFromParent();
5622 return true;
5623}
5624
5625 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5626 MachineRegisterInfo &MRI,
5627 MachineIRBuilder &B) const {
5628 MachineFunction &MF = B.getMF();
5629 Register Dst = MI.getOperand(0).getReg();
5630 Register X = MI.getOperand(1).getReg();
5631 const unsigned Flags = MI.getFlags();
5632 const LLT S1 = LLT::scalar(1);
5633 const LLT F32 = LLT::scalar(32);
5634 const LLT I32 = LLT::scalar(32);
5635
5636 if (allowApproxFunc(MF, Flags)) {
5637 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5638 .addUse(X)
5639 .setMIFlags(Flags);
5640 MI.eraseFromParent();
5641 return true;
5642 }
5643
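 // Inputs below 2^-96 are scaled up by 2^32 so the hardware sqrt stays in a
 // well-conditioned range; since sqrt(x * 2^32) == sqrt(x) * 2^16, the result
 // is rescaled by 2^-16 further down.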
5644 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5645 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5646 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5647 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5648 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5649
5650 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5651 if (needsDenormHandlingF32(MF, X, Flags)) {
5652 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5653 .addUse(SqrtX.getReg(0))
5654 .setMIFlags(Flags);
5655
5656 auto NegOne = B.buildConstant(I32, -1);
5657 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5658
5659 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5660 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5661
5662 auto PosOne = B.buildConstant(I32, 1);
5663 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5664
5665 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5666 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5667
5668 auto Zero = B.buildFConstant(F32, 0.0f);
5669 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5670
5671 SqrtS =
5672 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5673
5674 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5675 SqrtS =
5676 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5677 } else {
5678 auto SqrtR =
5679 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5680 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5681
5682 auto Half = B.buildFConstant(F32, 0.5f);
5683 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5684 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5685 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5686 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5687 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5688 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5689 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5690 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5691 }
5692
5693 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5694
5695 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5696
5697 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5698
5699 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5700 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5701
5702 MI.eraseFromParent();
5703 return true;
5704}
5705
5706 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5707 MachineRegisterInfo &MRI,
5708 MachineIRBuilder &B) const {
5709 // For the double type, the SQRT and RSQ instructions don't have the required
5710 // precision, so we apply Goldschmidt's algorithm to improve the result:
5711 //
5712 // y0 = rsq(x)
5713 // g0 = x * y0
5714 // h0 = 0.5 * y0
5715 //
5716 // r0 = 0.5 - h0 * g0
5717 // g1 = g0 * r0 + g0
5718 // h1 = h0 * r0 + h0
5719 //
5720 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5721 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5722 // h2 = h1 * r1 + h1
5723 //
5724 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5725 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5726 //
5727 // sqrt(x) = g3
5728
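 // In the code below, SqrtY is y0, SqrtS0/SqrtS1/SqrtS2 are the g iterates,
 // SqrtH0/SqrtH1 are the h iterates, SqrtR0 is r0, and SqrtD0/SqrtD1 are the
 // d terms; SqrtRet is g3.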
5729 const LLT S1 = LLT::scalar(1);
5730 const LLT S32 = LLT::scalar(32);
5731 const LLT F64 = LLT::scalar(64);
5732
5733 Register Dst = MI.getOperand(0).getReg();
5734 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5735
5736 Register X = MI.getOperand(1).getReg();
5737 unsigned Flags = MI.getFlags();
5738
5739 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5740
5741 auto ZeroInt = B.buildConstant(S32, 0);
5742 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5743
5744 // Scale up input if it is too small.
5745 auto ScaleUpFactor = B.buildConstant(S32, 256);
5746 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5747 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5748
5749 auto SqrtY =
5750 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5751
5752 auto Half = B.buildFConstant(F64, 0.5);
5753 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5754 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5755
5756 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5757 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5758
5759 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5760 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5761
5762 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5763 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5764
5765 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5766
5767 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5768 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5769
5770 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5771
5772 // Scale down the result.
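 // The ldexp by -128 undoes the earlier scale-up by 256 exponent steps:
 // sqrt(x * 2^256) == sqrt(x) * 2^128.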
5773 auto ScaleDownFactor = B.buildConstant(S32, -128);
5774 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5775 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5776
5777 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5778 // with finite only or nsz because rsq(+/-0) = +/-inf
5779
5780 // TODO: Check for DAZ and expand to subnormals
5781 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5782
5783 // If x is +INF, +0, or -0, use its original value
5784 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5785
5786 MI.eraseFromParent();
5787 return true;
5788}
5789
5792 MachineIRBuilder &B) const {
5793 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5794 if (Ty == LLT::scalar(32))
5795 return legalizeFSQRTF32(MI, MRI, B);
5796 if (Ty == LLT::scalar(64))
5797 return legalizeFSQRTF64(MI, MRI, B);
5798 if (Ty == LLT::scalar(16))
5799 return legalizeFSQRTF16(MI, MRI, B);
5800 return false;
5801}
5802
5803// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5804// FIXME: Why do we handle this one but not other removed instructions?
5805//
5806// Reciprocal square root. The clamp prevents infinite results, clamping
5807// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5808// +-max_float.
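// The clamp below is implemented as min(rsq(x), +max_float) followed by
// max(..., -max_float), using the IEEE or non-IEEE min/max variants
// depending on the function's FP mode.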
5811 MachineIRBuilder &B) const {
5812 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5813 return true;
5814
5815 Register Dst = MI.getOperand(0).getReg();
5816 Register Src = MI.getOperand(2).getReg();
5817 auto Flags = MI.getFlags();
5818
5819 LLT Ty = MRI.getType(Dst);
5820
5821 const fltSemantics *FltSemantics;
5822 if (Ty == LLT::scalar(32))
5823 FltSemantics = &APFloat::IEEEsingle();
5824 else if (Ty == LLT::scalar(64))
5825 FltSemantics = &APFloat::IEEEdouble();
5826 else
5827 return false;
5828
5829 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5830 .addUse(Src)
5831 .setMIFlags(Flags);
5832
5833 // We don't need to concern ourselves with the snan handling difference, since
5834 // the rsq has already quieted the snan (or not); use the variant that will directly select.
5835 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5836 const bool UseIEEE = MFI->getMode().IEEE;
5837
5838 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5839 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5840 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5841
5842 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5843
5844 if (UseIEEE)
5845 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5846 else
5847 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5848 MI.eraseFromParent();
5849 return true;
5850}
5851
5852// TODO: Fix pointer type handling
5855 Intrinsic::ID IID) const {
5856
5857 MachineIRBuilder &B = Helper.MIRBuilder;
5858 MachineRegisterInfo &MRI = *B.getMRI();
5859
5860 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5861 IID == Intrinsic::amdgcn_permlanex16;
5862 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5863 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5864
5865 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5866 Register Src2, LLT VT) -> Register {
5867 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5868 switch (IID) {
5869 case Intrinsic::amdgcn_readfirstlane:
5870 case Intrinsic::amdgcn_permlane64:
5871 return LaneOp.getReg(0);
5872 case Intrinsic::amdgcn_readlane:
5873 case Intrinsic::amdgcn_set_inactive:
5874 case Intrinsic::amdgcn_set_inactive_chain_arg:
5875 return LaneOp.addUse(Src1).getReg(0);
5876 case Intrinsic::amdgcn_writelane:
5877 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5878 case Intrinsic::amdgcn_permlane16:
5879 case Intrinsic::amdgcn_permlanex16: {
5880 Register Src3 = MI.getOperand(5).getReg();
5881 int64_t Src4 = MI.getOperand(6).getImm();
5882 int64_t Src5 = MI.getOperand(7).getImm();
5883 return LaneOp.addUse(Src1)
5884 .addUse(Src2)
5885 .addUse(Src3)
5886 .addImm(Src4)
5887 .addImm(Src5)
5888 .getReg(0);
5889 }
5890 case Intrinsic::amdgcn_mov_dpp8:
5891 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5892 case Intrinsic::amdgcn_update_dpp:
5893 return LaneOp.addUse(Src1)
5894 .addImm(MI.getOperand(4).getImm())
5895 .addImm(MI.getOperand(5).getImm())
5896 .addImm(MI.getOperand(6).getImm())
5897 .addImm(MI.getOperand(7).getImm())
5898 .getReg(0);
5899 default:
5900 llvm_unreachable("unhandled lane op");
5901 }
5902 };
5903
5904 Register DstReg = MI.getOperand(0).getReg();
5905 Register Src0 = MI.getOperand(2).getReg();
5906 Register Src1, Src2;
5907 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5908 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5909 Src1 = MI.getOperand(3).getReg();
5910 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5911 Src2 = MI.getOperand(4).getReg();
5912 }
5913 }
5914
5915 LLT Ty = MRI.getType(DstReg);
5916 unsigned Size = Ty.getSizeInBits();
5917
5918 unsigned SplitSize = 32;
5919 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5920 ST.hasDPALU_DPP() &&
5921 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
5922 SplitSize = 64;
5923
5924 if (Size == SplitSize) {
5925 // Already legal
5926 return true;
5927 }
5928
5929 if (Size < 32) {
5930 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5931
5932 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5933 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5934
5935 if (IID == Intrinsic::amdgcn_writelane)
5936 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5937
5938 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5939 B.buildTrunc(DstReg, LaneOpDst);
5940 MI.eraseFromParent();
5941 return true;
5942 }
5943
5944 if (Size % SplitSize != 0)
5945 return false;
5946
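 // Values wider than SplitSize are broken into SplitSize-bit pieces, the lane
 // op is applied to each piece, and the pieces are merged back together below.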
5947 LLT PartialResTy = LLT::scalar(SplitSize);
5948 bool NeedsBitcast = false;
5949 if (Ty.isVector()) {
5950 LLT EltTy = Ty.getElementType();
5951 unsigned EltSize = EltTy.getSizeInBits();
5952 if (EltSize == SplitSize) {
5953 PartialResTy = EltTy;
5954 } else if (EltSize == 16 || EltSize == 32) {
5955 unsigned NElem = SplitSize / EltSize;
5956 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5957 } else {
5958 // Handle all other cases via S32/S64 pieces
5959 NeedsBitcast = true;
5960 }
5961 }
5962
5963 SmallVector<Register, 4> PartialRes;
5964 unsigned NumParts = Size / SplitSize;
5965 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5966 MachineInstrBuilder Src1Parts, Src2Parts;
5967
5968 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5969 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5970
5971 if (IID == Intrinsic::amdgcn_writelane)
5972 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5973
5974 for (unsigned i = 0; i < NumParts; ++i) {
5975 Src0 = Src0Parts.getReg(i);
5976
5977 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5978 Src1 = Src1Parts.getReg(i);
5979
5980 if (IID == Intrinsic::amdgcn_writelane)
5981 Src2 = Src2Parts.getReg(i);
5982
5983 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5984 }
5985
5986 if (NeedsBitcast)
5987 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
5988 LLT::scalar(Ty.getSizeInBits()), PartialRes));
5989 else
5990 B.buildMergeLikeInstr(DstReg, PartialRes);
5991
5992 MI.eraseFromParent();
5993 return true;
5994}
5995
5996 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5997 MachineRegisterInfo &MRI,
5998 MachineIRBuilder &B) const {
5999 uint64_t Offset =
6000 ST.getTargetLowering()->getImplicitParameterOffset(
6001 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
6002 LLT DstTy = MRI.getType(DstReg);
6003 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6004
6005 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6006 if (!loadInputValue(KernargPtrReg, B,
6007 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6008 return false;
6009
6010 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6011 B.buildConstant(IdxTy, Offset).getReg(0));
6012 return true;
6013}
6014
6015/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6016/// bits of the pointer and replace them with the stride argument, then
6017/// merge_values everything together. In the common case of a raw buffer (the
6018/// stride component is 0), we can just AND off the upper half.
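/// In the legacy (non-45-bit num_records) layout built below, the descriptor
/// is four dwords: {base_lo, (base_hi & 0xffff) | (stride << 16), num_records,
/// flags}.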
6021 Register Result = MI.getOperand(0).getReg();
6022 Register Pointer = MI.getOperand(2).getReg();
6023 Register Stride = MI.getOperand(3).getReg();
6024 Register NumRecords = MI.getOperand(4).getReg();
6025 Register Flags = MI.getOperand(5).getReg();
6026
6027 LLT S32 = LLT::scalar(32);
6028 LLT S64 = LLT::scalar(64);
6029
6030 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6031
6032 auto ExtStride = B.buildAnyExt(S32, Stride);
6033
6034 if (ST.has45BitNumRecordsBufferResource()) {
6035 Register Zero = B.buildConstant(S32, 0).getReg(0);
6036 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6037 // num_records.
6038 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6039 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6040 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6041 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6042 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6043
6044 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6045 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
6046 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6047 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6048 auto ExtShiftedStride =
6049 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6050 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6051 auto ExtShiftedFlags =
6052 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6053 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6054 Register HighHalf =
6055 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6056 B.buildMergeValues(Result, {LowHalf, HighHalf});
6057 } else {
6058 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6059 auto Unmerge = B.buildUnmerge(S32, Pointer);
6060 auto LowHalf = Unmerge.getReg(0);
6061 auto HighHalf = Unmerge.getReg(1);
6062
6063 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6064 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6065 auto ShiftConst = B.buildConstant(S32, 16);
6066 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6067 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6068 Register NewHighHalfReg = NewHighHalf.getReg(0);
6069 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6070 }
6071
6072 MI.eraseFromParent();
6073 return true;
6074}
6075
6078 MachineIRBuilder &B) const {
6079 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6080 if (!MFI->isEntryFunction()) {
6081 return legalizePreloadedArgIntrin(MI, MRI, B,
6082 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6083 }
6084
6085 Register DstReg = MI.getOperand(0).getReg();
6086 if (!getImplicitArgPtr(DstReg, MRI, B))
6087 return false;
6088
6089 MI.eraseFromParent();
6090 return true;
6091}
6092
6093 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
6094 MachineRegisterInfo &MRI,
6095 MachineIRBuilder &B) const {
6096 Function &F = B.getMF().getFunction();
6097 std::optional<uint32_t> KnownSize =
6098 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
6099 if (KnownSize.has_value())
6100 B.buildConstant(DstReg, *KnownSize);
6101 return false;
6102}
6103
6106 MachineIRBuilder &B) const {
6107
6108 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6109 if (!MFI->isEntryFunction()) {
6110 return legalizePreloadedArgIntrin(MI, MRI, B,
6111 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6112 }
6113
6114 Register DstReg = MI.getOperand(0).getReg();
6115 if (!getLDSKernelId(DstReg, MRI, B))
6116 return false;
6117
6118 MI.eraseFromParent();
6119 return true;
6120}
6121
6125 unsigned AddrSpace) const {
6126 const LLT S32 = LLT::scalar(32);
6127 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6128 Register Hi32 = Unmerge.getReg(1);
6129
6130 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6131 ST.hasGloballyAddressableScratch()) {
6132 Register FlatScratchBaseHi =
6133 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6134 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6135 .getReg(0);
6136 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6137 // Test bits 63..58 against the aperture address.
6138 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6139 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6140 B.buildConstant(S32, 1u << 26));
6141 } else {
6142 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6143 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6144 }
6145 MI.eraseFromParent();
6146 return true;
6147}
6148
6149// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6150// offset (the offset that is included in bounds checking and swizzling, to be
6151// split between the instruction's voffset and immoffset fields) and soffset
6152// (the offset that is excluded from bounds checking and swizzling, to go in
6153// the instruction's soffset field). This function takes the first kind of
6154// offset and figures out how to split it between voffset and immoffset.
6155std::pair<Register, unsigned>
6156 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
6157 Register OrigOffset) const {
6158 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6159 Register BaseReg;
6160 unsigned ImmOffset;
6161 const LLT S32 = LLT::scalar(32);
6162 MachineRegisterInfo &MRI = *B.getMRI();
6163
6164 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6165 // being added, so we can only safely match a 32-bit addition with no unsigned
6166 // overflow.
6167 bool CheckNUW = ST.hasGFX1250Insts();
6168 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6169 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6170
6171 // If BaseReg is a pointer, convert it to int.
6172 if (MRI.getType(BaseReg).isPointer())
6173 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6174
6175 // If the immediate value is too big for the immoffset field, put only bits
6176 // that would normally fit in the immoffset field. The remaining value that
6177 // is copied/added for the voffset field is a large power of 2, and it
6178 // stands more chance of being CSEd with the copy/add for another similar
6179 // load/store.
6180 // However, do not do that rounding down if that is a negative
6181 // number, as it appears to be illegal to have a negative offset in the
6182 // vgpr, even if adding the immediate offset makes it positive.
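 // For example, if MaxImm were 4095, an offset of 4100 would be split into
 // Overflow = 4096 (materialized into the voffset add below) and ImmOffset = 4.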
6183 unsigned Overflow = ImmOffset & ~MaxImm;
6184 ImmOffset -= Overflow;
6185 if ((int32_t)Overflow < 0) {
6186 Overflow += ImmOffset;
6187 ImmOffset = 0;
6188 }
6189
6190 if (Overflow != 0) {
6191 if (!BaseReg) {
6192 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6193 } else {
6194 auto OverflowVal = B.buildConstant(S32, Overflow);
6195 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6196 }
6197 }
6198
6199 if (!BaseReg)
6200 BaseReg = B.buildConstant(S32, 0).getReg(0);
6201
6202 return std::pair(BaseReg, ImmOffset);
6203}
6204
6205/// Handle register layout difference for f16 images for some subtargets.
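/// With an unpacked D16 layout each 16-bit element is widened to occupy the
/// low half of its own 32-bit register; targets with the image-store D16 bug
/// instead have the data repacked and padded out to a dword-aligned vector.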
6206 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
6207 MachineRegisterInfo &MRI,
6208 Register Reg,
6209 bool ImageStore) const {
6210 const LLT S16 = LLT::scalar(16);
6211 const LLT S32 = LLT::scalar(32);
6212 LLT StoreVT = MRI.getType(Reg);
6213 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6214
6215 if (ST.hasUnpackedD16VMem()) {
6216 auto Unmerge = B.buildUnmerge(S16, Reg);
6217
6218 SmallVector<Register, 4> WideRegs;
6219 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6220 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6221
6222 int NumElts = StoreVT.getNumElements();
6223
6224 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6225 .getReg(0);
6226 }
6227
6228 if (ImageStore && ST.hasImageStoreD16Bug()) {
6229 if (StoreVT.getNumElements() == 2) {
6230 SmallVector<Register, 4> PackedRegs;
6231 Reg = B.buildBitcast(S32, Reg).getReg(0);
6232 PackedRegs.push_back(Reg);
6233 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6234 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6235 .getReg(0);
6236 }
6237
6238 if (StoreVT.getNumElements() == 3) {
6239 SmallVector<Register, 4> PackedRegs;
6240 auto Unmerge = B.buildUnmerge(S16, Reg);
6241 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6242 PackedRegs.push_back(Unmerge.getReg(I));
6243 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6244 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6245 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6246 }
6247
6248 if (StoreVT.getNumElements() == 4) {
6249 SmallVector<Register, 4> PackedRegs;
6250 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6251 auto Unmerge = B.buildUnmerge(S32, Reg);
6252 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6253 PackedRegs.push_back(Unmerge.getReg(I));
6254 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6255 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6256 .getReg(0);
6257 }
6258
6259 llvm_unreachable("invalid data type");
6260 }
6261
6262 if (StoreVT == LLT::fixed_vector(3, S16)) {
6263 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6264 .getReg(0);
6265 }
6266 return Reg;
6267}
6268
6269 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6270 Register VData, LLT MemTy,
6271 bool IsFormat) const {
6272 MachineRegisterInfo *MRI = B.getMRI();
6273 LLT Ty = MRI->getType(VData);
6274
6275 const LLT S16 = LLT::scalar(16);
6276
6277 // Fixup buffer resources themselves needing to be v4i32.
6278 if (hasBufferRsrcWorkaround(Ty))
6279 return castBufferRsrcToV4I32(VData, B);
6280
6281 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6282 Ty = getBitcastRegisterType(Ty);
6283 VData = B.buildBitcast(Ty, VData).getReg(0);
6284 }
6285 // Fixup illegal register types for i8 stores.
6286 if (Ty == LLT::scalar(8) || Ty == S16) {
6287 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6288 return AnyExt;
6289 }
6290
6291 if (Ty.isVector()) {
6292 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6293 if (IsFormat)
6294 return handleD16VData(B, *MRI, VData);
6295 }
6296 }
6297
6298 return VData;
6299}
6300
6302 LegalizerHelper &Helper,
6303 bool IsTyped,
6304 bool IsFormat) const {
6305 MachineIRBuilder &B = Helper.MIRBuilder;
6306 MachineRegisterInfo &MRI = *B.getMRI();
6307
6308 Register VData = MI.getOperand(1).getReg();
6309 LLT Ty = MRI.getType(VData);
6310 LLT EltTy = Ty.getScalarType();
6311 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6312 const LLT S32 = LLT::scalar(32);
6313
6314 MachineMemOperand *MMO = *MI.memoperands_begin();
6315 const int MemSize = MMO->getSize().getValue();
6316 LLT MemTy = MMO->getMemoryType();
6317
6318 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6319
6320 castBufferRsrcArgToV4I32(MI, B, 2);
6321 Register RSrc = MI.getOperand(2).getReg();
6322
6323 unsigned ImmOffset;
6324
6325 // The typed intrinsics add an immediate after the registers.
6326 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6327
6328 // The struct intrinsic variants add one additional operand over raw.
6329 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6330 Register VIndex;
6331 int OpOffset = 0;
6332 if (HasVIndex) {
6333 VIndex = MI.getOperand(3).getReg();
6334 OpOffset = 1;
6335 } else {
6336 VIndex = B.buildConstant(S32, 0).getReg(0);
6337 }
6338
6339 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6340 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6341
6342 unsigned Format = 0;
6343 if (IsTyped) {
6344 Format = MI.getOperand(5 + OpOffset).getImm();
6345 ++OpOffset;
6346 }
6347
6348 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6349
6350 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6351
6352 unsigned Opc;
6353 if (IsTyped) {
6354 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6355 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6356 } else if (IsFormat) {
6357 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6358 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6359 } else {
6360 switch (MemSize) {
6361 case 1:
6362 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6363 break;
6364 case 2:
6365 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6366 break;
6367 default:
6368 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6369 break;
6370 }
6371 }
6372
6373 auto MIB = B.buildInstr(Opc)
6374 .addUse(VData) // vdata
6375 .addUse(RSrc) // rsrc
6376 .addUse(VIndex) // vindex
6377 .addUse(VOffset) // voffset
6378 .addUse(SOffset) // soffset
6379 .addImm(ImmOffset); // offset(imm)
6380
6381 if (IsTyped)
6382 MIB.addImm(Format);
6383
6384 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6385 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6386 .addMemOperand(MMO);
6387
6388 MI.eraseFromParent();
6389 return true;
6390}
6391
6392static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6393 Register VIndex, Register VOffset, Register SOffset,
6394 unsigned ImmOffset, unsigned Format,
6395 unsigned AuxiliaryData, MachineMemOperand *MMO,
6396 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6397 auto MIB = B.buildInstr(Opc)
6398 .addDef(LoadDstReg) // vdata
6399 .addUse(RSrc) // rsrc
6400 .addUse(VIndex) // vindex
6401 .addUse(VOffset) // voffset
6402 .addUse(SOffset) // soffset
6403 .addImm(ImmOffset); // offset(imm)
6404
6405 if (IsTyped)
6406 MIB.addImm(Format);
6407
6408 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6409 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6410 .addMemOperand(MMO);
6411}
6412
6414 LegalizerHelper &Helper,
6415 bool IsFormat,
6416 bool IsTyped) const {
6417 MachineIRBuilder &B = Helper.MIRBuilder;
6418 MachineRegisterInfo &MRI = *B.getMRI();
6419 GISelChangeObserver &Observer = Helper.Observer;
6420
6421 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6422 MachineMemOperand *MMO = *MI.memoperands_begin();
6423 const LLT MemTy = MMO->getMemoryType();
6424 const LLT S32 = LLT::scalar(32);
6425
6426 Register Dst = MI.getOperand(0).getReg();
6427
6428 Register StatusDst;
6429 int OpOffset = 0;
6430 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6431 bool IsTFE = MI.getNumExplicitDefs() == 2;
6432 if (IsTFE) {
6433 StatusDst = MI.getOperand(1).getReg();
6434 ++OpOffset;
6435 }
6436
6437 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6438 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6439
6440 // The typed intrinsics add an immediate after the registers.
6441 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6442
6443 // The struct intrinsic variants add one additional operand over raw.
6444 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6445 Register VIndex;
6446 if (HasVIndex) {
6447 VIndex = MI.getOperand(3 + OpOffset).getReg();
6448 ++OpOffset;
6449 } else {
6450 VIndex = B.buildConstant(S32, 0).getReg(0);
6451 }
6452
6453 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6454 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6455
6456 unsigned Format = 0;
6457 if (IsTyped) {
6458 Format = MI.getOperand(5 + OpOffset).getImm();
6459 ++OpOffset;
6460 }
6461
6462 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6463 unsigned ImmOffset;
6464
6465 LLT Ty = MRI.getType(Dst);
6466 // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
6467 // logic doesn't have to handle that case.
6468 if (hasBufferRsrcWorkaround(Ty)) {
6469 Observer.changingInstr(MI);
6470 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6471 Observer.changedInstr(MI);
6472 Dst = MI.getOperand(0).getReg();
6473 B.setInsertPt(B.getMBB(), MI);
6474 }
6475 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6476 Ty = getBitcastRegisterType(Ty);
6477 Observer.changingInstr(MI);
6478 Helper.bitcastDst(MI, Ty, 0);
6479 Observer.changedInstr(MI);
6480 Dst = MI.getOperand(0).getReg();
6481 B.setInsertPt(B.getMBB(), MI);
6482 }
6483
6484 LLT EltTy = Ty.getScalarType();
6485 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6486 const bool Unpacked = ST.hasUnpackedD16VMem();
6487
6488 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6489
6490 unsigned Opc;
6491
6492 // TODO: Support TFE for typed and narrow loads.
6493 if (IsTyped) {
6494 if (IsTFE)
6495 return false;
6496 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6497 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6498 } else if (IsFormat) {
6499 if (IsD16) {
6500 if (IsTFE)
6501 return false;
6502 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6503 } else {
6504 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6505 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6506 }
6507 } else {
6508 switch (MemTy.getSizeInBits()) {
6509 case 8:
6510 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6511 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6512 break;
6513 case 16:
6514 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6515 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6516 break;
6517 default:
6518 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6519 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6520 break;
6521 }
6522 }
6523
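 // TFE loads return an extra status dword, so the load below is widened by
 // one dword and the status is then split back out of the result.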
6524 if (IsTFE) {
6525 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6526 unsigned NumLoadDWords = NumValueDWords + 1;
6527 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6528 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6529 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6530 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6531 if (MemTy.getSizeInBits() < 32) {
6532 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6533 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6534 B.buildTrunc(Dst, ExtDst);
6535 } else if (NumValueDWords == 1) {
6536 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6537 } else {
6538 SmallVector<Register, 5> LoadElts;
6539 for (unsigned I = 0; I != NumValueDWords; ++I)
6540 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6541 LoadElts.push_back(StatusDst);
6542 B.buildUnmerge(LoadElts, LoadDstReg);
6543 LoadElts.truncate(NumValueDWords);
6544 B.buildMergeLikeInstr(Dst, LoadElts);
6545 }
6546 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6547 (IsD16 && !Ty.isVector())) {
6548 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6549 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6550 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6551 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6552 B.buildTrunc(Dst, LoadDstReg);
6553 } else if (Unpacked && IsD16 && Ty.isVector()) {
6554 LLT UnpackedTy = Ty.changeElementSize(32);
6555 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6556 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6557 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6558 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6559 // FIXME: G_TRUNC should work, but legalization currently fails
6560 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6561 SmallVector<Register, 4> Repack;
6562 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6563 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6564 B.buildMergeLikeInstr(Dst, Repack);
6565 } else {
6566 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6567 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6568 }
6569
6570 MI.eraseFromParent();
6571 return true;
6572}
6573
6574static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6575 switch (IntrID) {
6576 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6577 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6578 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6579 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6580 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6581 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6582 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6583 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6584 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6585 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6586 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6587 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6588 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6589 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6590 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6591 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6592 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6593 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6594 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6595 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6596 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6597 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6598 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6599 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6600 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6601 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6602 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6603 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6605 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6606 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6607 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6608 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6609 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6610 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6611 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6612 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6613 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6614 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6615 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6616 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6617 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6618 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6619 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6620 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6621 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6622 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6623 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6624 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6625 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6626 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6627 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6628 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6629 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6630 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6631 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6632 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6633 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6634 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6635 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6636 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6637 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6638 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6639 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6640 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6641 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6642 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6643 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6644 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6645 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6646 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6647 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6648 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6649 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6650 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6651 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6653 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6654 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6655 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6656 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6657 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6658 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6659 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6660 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6661 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6662 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6663 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6664 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6665 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6666 default:
6667 llvm_unreachable("unhandled atomic opcode");
6668 }
6669}
6670
6673 Intrinsic::ID IID) const {
6674 const bool IsCmpSwap =
6675 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6676 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6677 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6678 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6679
6680 Register Dst = MI.getOperand(0).getReg();
6681 // Since we don't have 128-bit atomics, we don't need to handle the case of
6682 // p8 arguments to the atomic itself.
6683 Register VData = MI.getOperand(2).getReg();
6684
6685 Register CmpVal;
6686 int OpOffset = 0;
6687
6688 if (IsCmpSwap) {
6689 CmpVal = MI.getOperand(3).getReg();
6690 ++OpOffset;
6691 }
6692
6693 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6694 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6695 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6696
6697 // The struct intrinsic variants add one additional operand over raw.
6698 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6699 Register VIndex;
6700 if (HasVIndex) {
6701 VIndex = MI.getOperand(4 + OpOffset).getReg();
6702 ++OpOffset;
6703 } else {
6704 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6705 }
6706
6707 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6708 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6709 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6710
6711 MachineMemOperand *MMO = *MI.memoperands_begin();
6712
6713 unsigned ImmOffset;
6714 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6715
6716 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6717 .addDef(Dst)
6718 .addUse(VData); // vdata
6719
6720 if (IsCmpSwap)
6721 MIB.addReg(CmpVal);
6722
6723 MIB.addUse(RSrc) // rsrc
6724 .addUse(VIndex) // vindex
6725 .addUse(VOffset) // voffset
6726 .addUse(SOffset) // soffset
6727 .addImm(ImmOffset) // offset(imm)
6728 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6729 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6730 .addMemOperand(MMO);
6731
6732 MI.eraseFromParent();
6733 return true;
6734}
6735
6736/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6737/// vector with s16 typed elements.
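/// Operands that must stay 32-bit (for example coordinates when A16 is off)
/// are bitcast to v2s16 so every entry in the packed list has a uniform
/// dword-sized type.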
6738 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6739 SmallVectorImpl<Register> &PackedAddrs,
6740 unsigned ArgOffset,
6741 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6742 bool IsA16, bool IsG16) {
6743 const LLT S16 = LLT::scalar(16);
6744 const LLT V2S16 = LLT::fixed_vector(2, 16);
6745 auto EndIdx = Intr->VAddrEnd;
6746
6747 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6748 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6749 if (!SrcOp.isReg())
6750 continue; // _L to _LZ may have eliminated this.
6751
6752 Register AddrReg = SrcOp.getReg();
6753
6754 if ((I < Intr->GradientStart) ||
6755 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6756 (I >= Intr->CoordStart && !IsA16)) {
6757 if ((I < Intr->GradientStart) && IsA16 &&
6758 (B.getMRI()->getType(AddrReg) == S16)) {
6759 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6760 // Special handling of bias when A16 is on. Bias is of type half but
6761 // occupies full 32-bit.
6762 PackedAddrs.push_back(
6763 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6764 .getReg(0));
6765 } else {
6766 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6767 "Bias needs to be converted to 16 bit in A16 mode");
6768 // Handle any gradient or coordinate operands that should not be packed
6769 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6770 PackedAddrs.push_back(AddrReg);
6771 }
6772 } else {
6773 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6774 // derivatives dx/dh and dx/dv are packed with undef.
6775 if (((I + 1) >= EndIdx) ||
6776 ((Intr->NumGradients / 2) % 2 == 1 &&
6777 (I == static_cast<unsigned>(Intr->GradientStart +
6778 (Intr->NumGradients / 2) - 1) ||
6779 I == static_cast<unsigned>(Intr->GradientStart +
6780 Intr->NumGradients - 1))) ||
6781 // Check for _L to _LZ optimization
6782 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6783 PackedAddrs.push_back(
6784 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6785 .getReg(0));
6786 } else {
6787 PackedAddrs.push_back(
6788 B.buildBuildVector(
6789 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6790 .getReg(0));
6791 ++I;
6792 }
6793 }
6794 }
6795}
6796
6797/// Convert from separate vaddr components to a single vector address register,
6798/// and replace the remaining operands with $noreg.
6800 int DimIdx, int NumVAddrs) {
6801 const LLT S32 = LLT::scalar(32);
6802 (void)S32;
6803 SmallVector<Register, 8> AddrRegs;
6804 for (int I = 0; I != NumVAddrs; ++I) {
6805 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6806 if (SrcOp.isReg()) {
6807 AddrRegs.push_back(SrcOp.getReg());
6808 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6809 }
6810 }
6811
6812 int NumAddrRegs = AddrRegs.size();
6813 if (NumAddrRegs != 1) {
6814 auto VAddr =
6815 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6816 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6817 }
6818
6819 for (int I = 1; I != NumVAddrs; ++I) {
6820 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6821 if (SrcOp.isReg())
6822 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6823 }
6824}
6825
6826/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6827///
6828/// Depending on the subtarget, load/store with 16-bit element data need to be
6829/// rewritten to use the low half of 32-bit registers, or directly use a packed
6830/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6831/// registers.
6832///
6833/// We don't want to directly select image instructions just yet, but also want
6834 /// to expose all register repacking to the legalizer/combiners. We also don't
6835/// want a selected instruction entering RegBankSelect. In order to avoid
6836/// defining a multitude of intermediate image instructions, directly hack on
6837 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6838 /// padding the now-unnecessary arguments with $noreg.
6841 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6842
6843 const MachineFunction &MF = *MI.getMF();
6844 const unsigned NumDefs = MI.getNumExplicitDefs();
6845 const unsigned ArgOffset = NumDefs + 1;
6846 bool IsTFE = NumDefs == 2;
6847 // We are only processing the operands of d16 image operations on subtargets
6848 // that use the unpacked register layout, or need to repack the TFE result.
6849
6850 // TODO: Do we need to guard against already legalized intrinsics?
6851 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6852 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6853
6854 MachineRegisterInfo *MRI = B.getMRI();
6855 const LLT S32 = LLT::scalar(32);
6856 const LLT S16 = LLT::scalar(16);
6857 const LLT V2S16 = LLT::fixed_vector(2, 16);
6858
6859 unsigned DMask = 0;
6860 Register VData;
6861 LLT Ty;
6862
6863 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6864 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6865 Ty = MRI->getType(VData);
6866 }
6867
6868 const bool IsAtomicPacked16Bit =
6869 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6870 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6871
6872 // Check for 16 bit addresses and pack if true.
6873 LLT GradTy =
6874 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6875 LLT AddrTy =
6876 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6877 const bool IsG16 =
6878 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6879 const bool IsA16 = AddrTy == S16;
6880 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6881
6882 int DMaskLanes = 0;
6883 if (!BaseOpcode->Atomic) {
6884 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6885 if (BaseOpcode->Gather4) {
6886 DMaskLanes = 4;
6887 } else if (DMask != 0) {
6888 DMaskLanes = llvm::popcount(DMask);
6889 } else if (!IsTFE && !BaseOpcode->Store) {
6890 // If dmask is 0, this is a no-op load. This can be eliminated.
6891 B.buildUndef(MI.getOperand(0));
6892 MI.eraseFromParent();
6893 return true;
6894 }
6895 }
6896
6897 Observer.changingInstr(MI);
6898 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
6899
6900 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6901 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6902 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6903 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6904 unsigned NewOpcode = LoadOpcode;
6905 if (BaseOpcode->Store)
6906 NewOpcode = StoreOpcode;
6907 else if (BaseOpcode->NoReturn)
6908 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6909
6910 // Track that we legalized this
6911 MI.setDesc(B.getTII().get(NewOpcode));
6912
6913 // Expecting to get an error flag since TFE is on and dmask is 0. Force
6914 // dmask to be at least 1, otherwise the instruction will fail.
6915 if (IsTFE && DMask == 0) {
6916 DMask = 0x1;
6917 DMaskLanes = 1;
6918 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6919 }
6920
6921 if (BaseOpcode->Atomic) {
6922 Register VData0 = MI.getOperand(2).getReg();
6923 LLT Ty = MRI->getType(VData0);
6924
6925 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6926 if (Ty.isVector() && !IsAtomicPacked16Bit)
6927 return false;
6928
6929 if (BaseOpcode->AtomicX2) {
6930 Register VData1 = MI.getOperand(3).getReg();
6931 // The two values are packed in one register.
6932 LLT PackedTy = LLT::fixed_vector(2, Ty);
6933 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6934 MI.getOperand(2).setReg(Concat.getReg(0));
6935 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6936 }
6937 }
6938
6939 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6940
6941 // Rewrite the addressing register layout before doing anything else.
6942 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6943 // 16 bit gradients are supported, but are tied to the A16 control
6944 // so both gradients and addresses must be 16 bit
6945 return false;
6946 }
6947
6948 if (IsA16 && !ST.hasA16()) {
6949 // A16 not supported
6950 return false;
6951 }
6952
6953 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6954 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6955
6956 if (IsA16 || IsG16) {
6957 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6958 // instructions expect VGPR_32
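// For example (sketch): a lone s16 coordinate %u is widened to
//   %p:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %undef:_(s16)
// so that it still occupies a full 32-bit register.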
6959 SmallVector<Register, 4> PackedRegs;
6960
6961 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6962
6963 // See also below in the non-a16 branch
6964 const bool UseNSA = ST.hasNSAEncoding() &&
6965 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6966 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6967 const bool UsePartialNSA =
6968 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6969
6970 if (UsePartialNSA) {
6971 // Pack registers that would go over NSAMaxSize into last VAddr register
6972 LLT PackedAddrTy =
6973 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6974 auto Concat = B.buildConcatVectors(
6975 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6976 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6977 PackedRegs.resize(NSAMaxSize);
6978 } else if (!UseNSA && PackedRegs.size() > 1) {
6979 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6980 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6981 PackedRegs[0] = Concat.getReg(0);
6982 PackedRegs.resize(1);
6983 }
6984
6985 const unsigned NumPacked = PackedRegs.size();
6986 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6987 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6988 if (!SrcOp.isReg()) {
6989 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6990 continue;
6991 }
6992
6993 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6994
6995 if (I - Intr->VAddrStart < NumPacked)
6996 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6997 else
6998 SrcOp.setReg(AMDGPU::NoRegister);
6999 }
7000 } else {
7001 // If the register allocator cannot place the address registers contiguously
7002 // without introducing moves, then using the non-sequential address encoding
7003 // is always preferable, since it saves VALU instructions and is usually a
7004 // wash in terms of code size or even better.
7005 //
7006 // However, we currently have no way of hinting to the register allocator
7007 // that MIMG addresses should be placed contiguously when it is possible to
7008 // do so, so force non-NSA for the common 2-address case as a heuristic.
7009 //
7010 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7011 // allocation when possible.
7012 //
7013 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7014 // set of the remaining addresses.
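// Illustrative numbers only (the real threshold and NSAMaxSize come from the
// subtarget): with a threshold of 3 and NSAMaxSize of 5, two addresses are
// packed into a single vector register, three to five stay as separate NSA
// operands, and six or more are either fully packed (no partial NSA) or keep
// the first four separate and pack the remainder into the final operand.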
7015 const bool UseNSA = ST.hasNSAEncoding() &&
7016 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7017 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7018 const bool UsePartialNSA =
7019 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7020
7021 if (UsePartialNSA) {
7022 convertImageAddrToPacked(B, MI,
7023 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7024 Intr->NumVAddrs - NSAMaxSize + 1);
7025 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7026 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7027 Intr->NumVAddrs);
7028 }
7029 }
7030
7031 int Flags = 0;
7032 if (IsA16)
7033 Flags |= 1;
7034 if (IsG16)
7035 Flags |= 2;
7036 MI.addOperand(MachineOperand::CreateImm(Flags));
7037
7038 if (BaseOpcode->NoReturn) { // No TFE for stores?
7039 // TODO: Handle dmask trim
7040 if (!Ty.isVector() || !IsD16)
7041 return true;
7042
7043 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7044 if (RepackedReg != VData) {
7045 MI.getOperand(1).setReg(RepackedReg);
7046 }
7047
7048 return true;
7049 }
7050
7051 Register DstReg = MI.getOperand(0).getReg();
7052 const LLT EltTy = Ty.getScalarType();
7053 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7054
7055 // Confirm that the return type is large enough for the dmask specified
7056 if (NumElts < DMaskLanes)
7057 return false;
7058
7059 if (NumElts > 4 || DMaskLanes > 4)
7060 return false;
7061
7062 // Image atomic instructions use DMask to specify how many bits the
7063 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
7064 // DMaskLanes for an image atomic defaults to '0'.
7065 // We must make sure that atomic variants (especially packed ones) are not
7066 // truncated from v2s16 or v4s16 to s16.
7067 //
7068 // ChangeElementCount will be needed for image load where Ty is always scalar.
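// Sketch: for an atomic with DMaskLanes == 0, AdjustedNumElts stays 1 and
// AdjustedTy keeps Ty (e.g. v2s16 for a packed f16 atomic) instead of being
// narrowed to its scalar element type.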
7069 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7070 const LLT AdjustedTy =
7071 DMaskLanes == 0
7072 ? Ty
7073 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7074
7075 // The raw dword aligned data component of the load. The only legal cases
7076 // where this matters should be when using the packed D16 format, for
7077 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
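// Worked example for the packed d16 path: a <3 x s16> result occupies 48 bits,
// so RoundedElts = 2 and RoundedTy becomes <4 x s16>; the load is widened to
// <4 x s16> and the trailing element is dropped again further down.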
7078 LLT RoundedTy;
7079
7080 // S32 vector to cover all data, plus TFE result element.
7081 LLT TFETy;
7082
7083 // Register type to use for each loaded component. Will be S32 or V2S16.
7084 LLT RegTy;
7085
7086 if (IsD16 && ST.hasUnpackedD16VMem()) {
7087 RoundedTy =
7088 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7089 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7090 RegTy = S32;
7091 } else {
7092 unsigned EltSize = EltTy.getSizeInBits();
7093 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7094 unsigned RoundedSize = 32 * RoundedElts;
7095 RoundedTy = LLT::scalarOrVector(
7096 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7097 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7098 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7099 }
7100
7101 // The return type does not need adjustment.
7102 // TODO: Should we change s16 case to s32 or <2 x s16>?
7103 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7104 return true;
7105
7106 Register Dst1Reg;
7107
7108 // Insert after the instruction.
7109 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7110
7111 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7112 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7113 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7114 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7115
7116 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7117
7118 MI.getOperand(0).setReg(NewResultReg);
7119
7120 // In the IR, TFE is supposed to be used with a 2 element struct return
7121 // type. The instruction really returns these two values in one contiguous
7122 // register, with one additional dword beyond the loaded data. Rewrite the
7123 // return type to use a single register result.
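// Illustrative sketch: a TFE load returning <2 x s32> data is rewritten to
// produce one <3 x s32> register; the first two elements are unmerged back
// into the data result and the final element becomes the TFE status value.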
7124
7125 if (IsTFE) {
7126 Dst1Reg = MI.getOperand(1).getReg();
7127 if (MRI->getType(Dst1Reg) != S32)
7128 return false;
7129
7130 // TODO: Make sure the TFE operand bit is set.
7131 MI.removeOperand(1);
7132
7133 // Handle the easy case that requires no repack instructions.
7134 if (Ty == S32) {
7135 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7136 return true;
7137 }
7138 }
7139
7140 // Now figure out how to copy the new result register back into the old
7141 // result.
7142 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7143
7144 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7145
7146 if (ResultNumRegs == 1) {
7147 assert(!IsTFE);
7148 ResultRegs[0] = NewResultReg;
7149 } else {
7150 // We have to repack into a new vector of some kind.
7151 for (int I = 0; I != NumDataRegs; ++I)
7152 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7153 B.buildUnmerge(ResultRegs, NewResultReg);
7154
7155 // Drop the final TFE element to get the data part. The TFE result is
7156 // directly written to the right place already.
7157 if (IsTFE)
7158 ResultRegs.resize(NumDataRegs);
7159 }
7160
7161 // For an s16 scalar result, we form an s32 result with a truncate regardless
7162 // of packed vs. unpacked.
7163 if (IsD16 && !Ty.isVector()) {
7164 B.buildTrunc(DstReg, ResultRegs[0]);
7165 return true;
7166 }
7167
7168 // Avoid a build/concat_vector of 1 entry.
7169 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7170 B.buildBitcast(DstReg, ResultRegs[0]);
7171 return true;
7172 }
7173
7174 assert(Ty.isVector());
7175
7176 if (IsD16) {
7177 // For packed D16 results with TFE enabled, all the data components are
7178 // S32. Cast back to the expected type.
7179 //
7180 // TODO: We don't really need to load s32 elements. We would only need one
7181 // cast for the TFE result if a multiple of v2s16 was used.
7182 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7183 for (Register &Reg : ResultRegs)
7184 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7185 } else if (ST.hasUnpackedD16VMem()) {
7186 for (Register &Reg : ResultRegs)
7187 Reg = B.buildTrunc(S16, Reg).getReg(0);
7188 }
7189 }
7190
7191 auto padWithUndef = [&](LLT Ty, int NumElts) {
7192 if (NumElts == 0)
7193 return;
7194 Register Undef = B.buildUndef(Ty).getReg(0);
7195 for (int I = 0; I != NumElts; ++I)
7196 ResultRegs.push_back(Undef);
7197 };
7198
7199 // Pad out any elements eliminated due to the dmask.
7200 LLT ResTy = MRI->getType(ResultRegs[0]);
7201 if (!ResTy.isVector()) {
7202 padWithUndef(ResTy, NumElts - ResultRegs.size());
7203 B.buildBuildVector(DstReg, ResultRegs);
7204 return true;
7205 }
7206
7207 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7208 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7209
7210 // Deal with the one annoying legal case.
7211 const LLT V3S16 = LLT::fixed_vector(3, 16);
7212 if (Ty == V3S16) {
7213 if (IsTFE) {
7214 if (ResultRegs.size() == 1) {
7215 NewResultReg = ResultRegs[0];
7216 } else if (ResultRegs.size() == 2) {
7217 LLT V4S16 = LLT::fixed_vector(4, 16);
7218 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7219 } else {
7220 return false;
7221 }
7222 }
7223
7224 if (MRI->getType(DstReg).getNumElements() <
7225 MRI->getType(NewResultReg).getNumElements()) {
7226 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7227 } else {
7228 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7229 }
7230 return true;
7231 }
7232
7233 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7234 B.buildConcatVectors(DstReg, ResultRegs);
7235 return true;
7236}
7237
7238bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7239 MachineInstr &MI) const {
7240 MachineIRBuilder &B = Helper.MIRBuilder;
7241 GISelChangeObserver &Observer = Helper.Observer;
7242
7243 Register OrigDst = MI.getOperand(0).getReg();
7244 Register Dst;
7245 LLT Ty = B.getMRI()->getType(OrigDst);
7246 unsigned Size = Ty.getSizeInBits();
7247 MachineFunction &MF = B.getMF();
7248 unsigned Opc = 0;
7249 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7250 assert(Size == 8 || Size == 16);
7251 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7252 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7253 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7254 // destination register.
7255 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7256 } else {
7257 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7258 Dst = OrigDst;
7259 }
7260
7261 Observer.changingInstr(MI);
7262
7263 // Handle needing to s.buffer.load() a p8 value.
7264 if (hasBufferRsrcWorkaround(Ty)) {
7265 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7266 B.setInsertPt(B.getMBB(), MI);
7267 }
7268 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
7269 Ty = getBitcastRegisterType(Ty);
7270 Helper.bitcastDst(MI, Ty, 0);
7271 B.setInsertPt(B.getMBB(), MI);
7272 }
7273
7274 // FIXME: We don't really need this intermediate instruction. The intrinsic
7275 // should be fixed to have a memory operand. Since it's readnone, we're not
7276 // allowed to add one.
7277 MI.setDesc(B.getTII().get(Opc));
7278 MI.removeOperand(1); // Remove intrinsic ID
7279
7280 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7281 const unsigned MemSize = (Size + 7) / 8;
7282 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7283 getTypeForLLT(Ty, MF.getFunction().getContext()));
7284 MachineMemOperand *MMO = MF.getMachineMemOperand(
7285 MachinePointerInfo(),
7286 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7287 MachineMemOperand::MOInvariant,
7288 MemSize, MemAlign);
7289 MI.addMemOperand(MF, MMO);
7290 if (Dst != OrigDst) {
7291 MI.getOperand(0).setReg(Dst);
7292 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7293 B.buildTrunc(OrigDst, Dst);
7294 }
7295
7296 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7297 // always be legal. We may need to restore this to a 96-bit result if it turns
7298 // out this needs to be converted to a vector load during RegBankSelect.
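// For example (sketch): an s96 result on a subtarget without scalar dwordx3
// loads is widened here to s128, while a v3s32 result is padded to the next
// power-of-two element count (v4s32).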
7299 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7300 if (Ty.isVector())
7301 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
7302 else
7303 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7304 }
7305
7306 Observer.changedInstr(MI);
7307 return true;
7308}
7309
7310bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7311 MachineInstr &MI) const {
7312 MachineIRBuilder &B = Helper.MIRBuilder;
7313 GISelChangeObserver &Observer = Helper.Observer;
7314 Observer.changingInstr(MI);
7315 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7316 MI.removeOperand(0); // Remove intrinsic ID
7317 castBufferRsrcArgToV4I32(MI, B, 0);
7318 Observer.changedInstr(MI);
7319 return true;
7320}
7321
7322// TODO: Move to selection
7323bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7324 MachineRegisterInfo &MRI,
7325 MachineIRBuilder &B) const {
7326 if (!ST.hasTrapHandler() ||
7327 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7328 return legalizeTrapEndpgm(MI, MRI, B);
7329
7330 return ST.supportsGetDoorbellID() ?
7331 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7332}
7333
7334bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7335 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7336 const DebugLoc &DL = MI.getDebugLoc();
7337 MachineBasicBlock &BB = B.getMBB();
7338 MachineFunction *MF = BB.getParent();
7339
7340 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7341 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7342 .addImm(0);
7343 MI.eraseFromParent();
7344 return true;
7345 }
7346
7347 // We need a block split to make the real endpgm a terminator. We also don't
7348 // want to break phis in successor blocks, so we can't just delete to the
7349 // end of the block.
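// Resulting shape (sketch): the original block now ends in
// S_CBRANCH_EXECNZ %TrapBB and falls through to the split-off remainder,
// while the new TrapBB contains only S_ENDPGM 0.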
7350 BB.splitAt(MI, false /*UpdateLiveIns*/);
7351 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7352 MF->push_back(TrapBB);
7353 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7354 .addImm(0);
7355 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7356 .addMBB(TrapBB);
7357
7358 BB.addSuccessor(TrapBB);
7359 MI.eraseFromParent();
7360 return true;
7361}
7362
7363bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
7364 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7365 MachineFunction &MF = B.getMF();
7366 const LLT S64 = LLT::scalar(64);
7367
7368 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7369 // For code object version 5, queue_ptr is passed through implicit kernarg.
7370 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
7371 AMDGPU::AMDHSA_COV5) {
7372 AMDGPUTargetLowering::ImplicitParameter Param =
7373 AMDGPUTargetLowering::QUEUE_PTR;
7374 uint64_t Offset =
7375 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7376
7377 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7378 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7379
7380 if (!loadInputValue(KernargPtrReg, B,
7381 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
7382 return false;
7383
7384 // TODO: can we be smarter about machine pointer info?
7385 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7386 MachineMemOperand *MMO = MF.getMachineMemOperand(
7387 PtrInfo.getWithOffset(Offset),
7388 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7389 MachineMemOperand::MOInvariant,
7390 LLT::scalar(64), commonAlignment(Align(64), Offset));
7391
7392 // Pointer address
7393 Register LoadAddr = MRI.createGenericVirtualRegister(
7394 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7395 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7396 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7397 // Load address
7398 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7399 B.buildCopy(SGPR01, Temp);
7400 B.buildInstr(AMDGPU::S_TRAP)
7401 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7402 .addReg(SGPR01, RegState::Implicit);
7403 MI.eraseFromParent();
7404 return true;
7405 }
7406
7407 // Pass queue pointer to trap handler as input, and insert trap instruction
7408 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7409 Register LiveIn =
7410 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7411 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
7412 return false;
7413
7414 B.buildCopy(SGPR01, LiveIn);
7415 B.buildInstr(AMDGPU::S_TRAP)
7416 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7417 .addReg(SGPR01, RegState::Implicit);
7418
7419 MI.eraseFromParent();
7420 return true;
7421}
7422
7423bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7424 MachineRegisterInfo &MRI,
7425 MachineIRBuilder &B) const {
7426 // We need to simulate the 's_trap 2' instruction on targets that run in
7427 // PRIV=1 (where it is treated as a nop).
7428 if (ST.hasPrivEnabledTrap2NopBug()) {
7429 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7430 MI.getDebugLoc());
7431 MI.eraseFromParent();
7432 return true;
7433 }
7434
7435 B.buildInstr(AMDGPU::S_TRAP)
7436 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7437 MI.eraseFromParent();
7438 return true;
7439}
7440
7441bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7442 MachineRegisterInfo &MRI,
7443 MachineIRBuilder &B) const {
7444 // If this is a non-HSA path or the trap handler is disabled, report a
7445 // warning accordingly.
7446 if (!ST.hasTrapHandler() ||
7447 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7448 Function &Fn = B.getMF().getFunction();
7450 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7451 } else {
7452 // Insert debug-trap instruction
7453 B.buildInstr(AMDGPU::S_TRAP)
7454 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7455 }
7456
7457 MI.eraseFromParent();
7458 return true;
7459}
7460
7461bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7462 MachineInstr &MI, MachineIRBuilder &B) const {
7463 MachineRegisterInfo &MRI = *B.getMRI();
7464 const LLT S16 = LLT::scalar(16);
7465 const LLT S32 = LLT::scalar(32);
7466 const LLT V2S16 = LLT::fixed_vector(2, 16);
7467 const LLT V3S32 = LLT::fixed_vector(3, 32);
7468
7469 Register DstReg = MI.getOperand(0).getReg();
7470 Register NodePtr = MI.getOperand(2).getReg();
7471 Register RayExtent = MI.getOperand(3).getReg();
7472 Register RayOrigin = MI.getOperand(4).getReg();
7473 Register RayDir = MI.getOperand(5).getReg();
7474 Register RayInvDir = MI.getOperand(6).getReg();
7475 Register TDescr = MI.getOperand(7).getReg();
7476
7477 if (!ST.hasGFX10_AEncoding()) {
7478 Function &Fn = B.getMF().getFunction();
7480 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7481 return false;
7482 }
7483
7484 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7485 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7486 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7487 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7488 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7489 const unsigned NumVDataDwords = 4;
7490 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7491 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7492 const bool UseNSA =
7493 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7494
7495 const unsigned BaseOpcodes[2][2] = {
7496 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7497 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7498 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7499 int Opcode;
7500 if (UseNSA) {
7501 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7502 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7503 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7504 : AMDGPU::MIMGEncGfx10NSA,
7505 NumVDataDwords, NumVAddrDwords);
7506 } else {
7507 assert(!IsGFX12Plus);
7508 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7509 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7510 : AMDGPU::MIMGEncGfx10Default,
7511 NumVDataDwords, NumVAddrDwords);
7512 }
7513 assert(Opcode != -1);
7514
7516 if (UseNSA && IsGFX11Plus) {
7517 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7518 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7519 auto Merged = B.buildMergeLikeInstr(
7520 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7521 Ops.push_back(Merged.getReg(0));
7522 };
7523
7524 Ops.push_back(NodePtr);
7525 Ops.push_back(RayExtent);
7526 packLanes(RayOrigin);
7527
7528 if (IsA16) {
7529 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7530 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7531 auto MergedDir = B.buildMergeLikeInstr(
7532 V3S32,
7533 {B.buildBitcast(
7534 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7535 UnmergeRayDir.getReg(0)}))
7536 .getReg(0),
7537 B.buildBitcast(
7538 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7539 UnmergeRayDir.getReg(1)}))
7540 .getReg(0),
7541 B.buildBitcast(
7542 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7543 UnmergeRayDir.getReg(2)}))
7544 .getReg(0)});
7545 Ops.push_back(MergedDir.getReg(0));
7546 } else {
7547 packLanes(RayDir);
7548 packLanes(RayInvDir);
7549 }
7550 } else {
7551 if (Is64) {
7552 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7553 Ops.push_back(Unmerge.getReg(0));
7554 Ops.push_back(Unmerge.getReg(1));
7555 } else {
7556 Ops.push_back(NodePtr);
7557 }
7558 Ops.push_back(RayExtent);
7559
7560 auto packLanes = [&Ops, &S32, &B](Register Src) {
7561 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7562 Ops.push_back(Unmerge.getReg(0));
7563 Ops.push_back(Unmerge.getReg(1));
7564 Ops.push_back(Unmerge.getReg(2));
7565 };
7566
7567 packLanes(RayOrigin);
7568 if (IsA16) {
7569 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7570 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7571 Register R1 = MRI.createGenericVirtualRegister(S32);
7572 Register R2 = MRI.createGenericVirtualRegister(S32);
7573 Register R3 = MRI.createGenericVirtualRegister(S32);
7574 B.buildMergeLikeInstr(R1,
7575 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7576 B.buildMergeLikeInstr(
7577 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7578 B.buildMergeLikeInstr(
7579 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7580 Ops.push_back(R1);
7581 Ops.push_back(R2);
7582 Ops.push_back(R3);
7583 } else {
7584 packLanes(RayDir);
7585 packLanes(RayInvDir);
7586 }
7587 }
7588
7589 if (!UseNSA) {
7590 // Build a single vector containing all the operands so far prepared.
7591 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7592 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7593 Ops.clear();
7594 Ops.push_back(MergedOps);
7595 }
7596
7597 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7598 .addDef(DstReg)
7599 .addImm(Opcode);
7600
7601 for (Register R : Ops) {
7602 MIB.addUse(R);
7603 }
7604
7605 MIB.addUse(TDescr)
7606 .addImm(IsA16 ? 1 : 0)
7607 .cloneMemRefs(MI);
7608
7609 MI.eraseFromParent();
7610 return true;
7611}
7612
7613bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7614 MachineInstr &MI, MachineIRBuilder &B) const {
7615 const LLT S32 = LLT::scalar(32);
7616 const LLT V2S32 = LLT::fixed_vector(2, 32);
7617
7618 Register DstReg = MI.getOperand(0).getReg();
7619 Register DstOrigin = MI.getOperand(1).getReg();
7620 Register DstDir = MI.getOperand(2).getReg();
7621 Register NodePtr = MI.getOperand(4).getReg();
7622 Register RayExtent = MI.getOperand(5).getReg();
7623 Register InstanceMask = MI.getOperand(6).getReg();
7624 Register RayOrigin = MI.getOperand(7).getReg();
7625 Register RayDir = MI.getOperand(8).getReg();
7626 Register Offsets = MI.getOperand(9).getReg();
7627 Register TDescr = MI.getOperand(10).getReg();
7628
7629 if (!ST.hasBVHDualAndBVH8Insts()) {
7630 Function &Fn = B.getMF().getFunction();
7632 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7633 return false;
7634 }
7635
7636 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7637 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7638 const unsigned NumVDataDwords = 10;
7639 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7640 int Opcode = AMDGPU::getMIMGOpcode(
7641 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7642 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7643 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7644 assert(Opcode != -1);
7645
7646 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7647 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7648
7649 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7650 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7651 .addDef(DstReg)
7652 .addDef(DstOrigin)
7653 .addDef(DstDir)
7654 .addImm(Opcode)
7655 .addUse(NodePtr)
7656 .addUse(RayExtentInstanceMaskVec.getReg(0))
7657 .addUse(RayOrigin)
7658 .addUse(RayDir)
7659 .addUse(Offsets)
7660 .addUse(TDescr)
7661 .cloneMemRefs(MI);
7662
7663 MI.eraseFromParent();
7664 return true;
7665}
7666
7667bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7668 MachineIRBuilder &B) const {
7669 const SITargetLowering *TLI = ST.getTargetLowering();
7670 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7671 Register DstReg = MI.getOperand(0).getReg();
7672 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7673 MI.eraseFromParent();
7674 return true;
7675}
7676
7677bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7678 MachineIRBuilder &B) const {
7679 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
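// Equivalently (sketch): DstReg = (TTMP8 >> 25) & 0x1f, which is what the
// G_UBFX with lsb 25 and width 5 built below computes.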
7680 if (!ST.hasArchitectedSGPRs())
7681 return false;
7682 LLT S32 = LLT::scalar(32);
7683 Register DstReg = MI.getOperand(0).getReg();
7684 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7685 auto LSB = B.buildConstant(S32, 25);
7686 auto Width = B.buildConstant(S32, 5);
7687 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7688 MI.eraseFromParent();
7689 return true;
7690}
7691
7692bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
7693 MachineIRBuilder &B,
7694 AMDGPU::Hwreg::Id HwReg,
7695 unsigned LowBit,
7696 unsigned Width) const {
7697 MachineRegisterInfo &MRI = *B.getMRI();
7698 Register DstReg = MI.getOperand(0).getReg();
7699 if (!MRI.getRegClassOrNull(DstReg))
7700 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7701 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7702 .addDef(DstReg)
7703 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7704 MI.eraseFromParent();
7705 return true;
7706}
7707
7708static constexpr unsigned FPEnvModeBitField =
7710
7711static constexpr unsigned FPEnvTrapBitField =
7713
7714bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7715 MachineRegisterInfo &MRI,
7716 MachineIRBuilder &B) const {
7717 Register Src = MI.getOperand(0).getReg();
7718 if (MRI.getType(Src) != S64)
7719 return false;
7720
7721 auto ModeReg =
7722 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7723 /*HasSideEffects=*/true, /*isConvergent=*/false)
7724 .addImm(FPEnvModeBitField);
7725 auto TrapReg =
7726 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7727 /*HasSideEffects=*/true, /*isConvergent=*/false)
7728 .addImm(FPEnvTrapBitField);
7729 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7730 MI.eraseFromParent();
7731 return true;
7732}
7733
7734bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7735 MachineRegisterInfo &MRI,
7736 MachineIRBuilder &B) const {
7737 Register Src = MI.getOperand(0).getReg();
7738 if (MRI.getType(Src) != S64)
7739 return false;
7740
7741 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7742 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7743 /*HasSideEffects=*/true, /*isConvergent=*/false)
7744 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7745 .addReg(Unmerge.getReg(0));
7746 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7747 /*HasSideEffects=*/true, /*isConvergent=*/false)
7748 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7749 .addReg(Unmerge.getReg(1));
7750 MI.eraseFromParent();
7751 return true;
7752}
7753
7754bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7755 MachineInstr &MI) const {
7756 MachineIRBuilder &B = Helper.MIRBuilder;
7757 MachineRegisterInfo &MRI = *B.getMRI();
7758
7759 // Replace the G_BRCOND using the intrinsic result with the exec-manipulating branch pseudos.
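// Rough shape of the rewrite (illustrative; see SI_IF/SI_ELSE for the exact
// semantics): the G_BRCOND fed by the intrinsic result is erased and an
// SI_IF/SI_ELSE pseudo defining the wave mask and carrying a branch target is
// built in its place, with the remaining unconditional branch retargeted so
// both successors stay reachable.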
7760 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7761 switch (IntrID) {
7762 case Intrinsic::amdgcn_if:
7763 case Intrinsic::amdgcn_else: {
7764 MachineInstr *Br = nullptr;
7765 MachineBasicBlock *UncondBrTarget = nullptr;
7766 bool Negated = false;
7767 if (MachineInstr *BrCond =
7768 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7769 const SIRegisterInfo *TRI
7770 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7771
7772 Register Def = MI.getOperand(1).getReg();
7773 Register Use = MI.getOperand(3).getReg();
7774
7775 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7776
7777 if (Negated)
7778 std::swap(CondBrTarget, UncondBrTarget);
7779
7780 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7781 if (IntrID == Intrinsic::amdgcn_if) {
7782 B.buildInstr(AMDGPU::SI_IF)
7783 .addDef(Def)
7784 .addUse(Use)
7785 .addMBB(UncondBrTarget);
7786 } else {
7787 B.buildInstr(AMDGPU::SI_ELSE)
7788 .addDef(Def)
7789 .addUse(Use)
7790 .addMBB(UncondBrTarget);
7791 }
7792
7793 if (Br) {
7794 Br->getOperand(0).setMBB(CondBrTarget);
7795 } else {
7796 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7797 // since we're swapping branch targets it needs to be reinserted.
7798 // FIXME: IRTranslator should probably not do this
7799 B.buildBr(*CondBrTarget);
7800 }
7801
7802 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7803 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7804 MI.eraseFromParent();
7805 BrCond->eraseFromParent();
7806 return true;
7807 }
7808
7809 return false;
7810 }
7811 case Intrinsic::amdgcn_loop: {
7812 MachineInstr *Br = nullptr;
7813 MachineBasicBlock *UncondBrTarget = nullptr;
7814 bool Negated = false;
7815 if (MachineInstr *BrCond =
7816 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7817 const SIRegisterInfo *TRI
7818 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7819
7820 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7821 Register Reg = MI.getOperand(2).getReg();
7822
7823 if (Negated)
7824 std::swap(CondBrTarget, UncondBrTarget);
7825
7826 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7827 B.buildInstr(AMDGPU::SI_LOOP)
7828 .addUse(Reg)
7829 .addMBB(UncondBrTarget);
7830
7831 if (Br)
7832 Br->getOperand(0).setMBB(CondBrTarget);
7833 else
7834 B.buildBr(*CondBrTarget);
7835
7836 MI.eraseFromParent();
7837 BrCond->eraseFromParent();
7838 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7839 return true;
7840 }
7841
7842 return false;
7843 }
7844 case Intrinsic::amdgcn_addrspacecast_nonnull:
7845 return legalizeAddrSpaceCast(MI, MRI, B);
7846 case Intrinsic::amdgcn_make_buffer_rsrc:
7847 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7848 case Intrinsic::amdgcn_kernarg_segment_ptr:
7849 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
7850 // This only makes sense to call in a kernel, so just lower to null.
7851 B.buildConstant(MI.getOperand(0).getReg(), 0);
7852 MI.eraseFromParent();
7853 return true;
7854 }
7855
7856 return legalizePreloadedArgIntrin(
7857 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7858 case Intrinsic::amdgcn_implicitarg_ptr:
7859 return legalizeImplicitArgPtr(MI, MRI, B);
7860 case Intrinsic::amdgcn_workitem_id_x:
7861 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7862 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7863 case Intrinsic::amdgcn_workitem_id_y:
7864 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7865 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7866 case Intrinsic::amdgcn_workitem_id_z:
7867 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7868 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7869 case Intrinsic::amdgcn_workgroup_id_x:
7870 return legalizeWorkGroupId(
7874 case Intrinsic::amdgcn_workgroup_id_y:
7875 return legalizeWorkGroupId(
7879 case Intrinsic::amdgcn_workgroup_id_z:
7880 return legalizeWorkGroupId(
7884 case Intrinsic::amdgcn_cluster_id_x:
7885 return ST.hasClusters() &&
7888 case Intrinsic::amdgcn_cluster_id_y:
7889 return ST.hasClusters() &&
7892 case Intrinsic::amdgcn_cluster_id_z:
7893 return ST.hasClusters() &&
7896 case Intrinsic::amdgcn_cluster_workgroup_id_x:
7897 return ST.hasClusters() &&
7900 case Intrinsic::amdgcn_cluster_workgroup_id_y:
7901 return ST.hasClusters() &&
7904 case Intrinsic::amdgcn_cluster_workgroup_id_z:
7905 return ST.hasClusters() &&
7908 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7909 return ST.hasClusters() &&
7911 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7912 return ST.hasClusters() &&
7915 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7916 return ST.hasClusters() &&
7919 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7920 return ST.hasClusters() &&
7923 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7924 return ST.hasClusters() &&
7926 MI, MRI, B,
7928 case Intrinsic::amdgcn_wave_id:
7929 return legalizeWaveID(MI, B);
7930 case Intrinsic::amdgcn_lds_kernel_id:
7933 case Intrinsic::amdgcn_dispatch_ptr:
7936 case Intrinsic::amdgcn_queue_ptr:
7939 case Intrinsic::amdgcn_implicit_buffer_ptr:
7942 case Intrinsic::amdgcn_dispatch_id:
7945 case Intrinsic::r600_read_ngroups_x:
7946 // TODO: Emit error for hsa
7949 case Intrinsic::r600_read_ngroups_y:
7952 case Intrinsic::r600_read_ngroups_z:
7955 case Intrinsic::r600_read_local_size_x:
7956 // TODO: Could insert G_ASSERT_ZEXT from s16
7958 case Intrinsic::r600_read_local_size_y:
7959 // TODO: Could insert G_ASSERT_ZEXT from s16
7961 // TODO: Could insert G_ASSERT_ZEXT from s16
7962 case Intrinsic::r600_read_local_size_z:
7965 case Intrinsic::amdgcn_fdiv_fast:
7966 return legalizeFDIVFastIntrin(MI, MRI, B);
7967 case Intrinsic::amdgcn_is_shared:
7968 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7969 case Intrinsic::amdgcn_is_private:
7970 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7971 case Intrinsic::amdgcn_wavefrontsize: {
7972 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7973 MI.eraseFromParent();
7974 return true;
7975 }
7976 case Intrinsic::amdgcn_s_buffer_load:
7977 return legalizeSBufferLoad(Helper, MI);
7978 case Intrinsic::amdgcn_raw_buffer_store:
7979 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7980 case Intrinsic::amdgcn_struct_buffer_store:
7981 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7982 return legalizeBufferStore(MI, Helper, false, false);
7983 case Intrinsic::amdgcn_raw_buffer_store_format:
7984 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7985 case Intrinsic::amdgcn_struct_buffer_store_format:
7986 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7987 return legalizeBufferStore(MI, Helper, false, true);
7988 case Intrinsic::amdgcn_raw_tbuffer_store:
7989 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7990 case Intrinsic::amdgcn_struct_tbuffer_store:
7991 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7992 return legalizeBufferStore(MI, Helper, true, true);
7993 case Intrinsic::amdgcn_raw_buffer_load:
7994 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7995 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7996 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7997 case Intrinsic::amdgcn_struct_buffer_load:
7998 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7999 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8000 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8001 return legalizeBufferLoad(MI, Helper, false, false);
8002 case Intrinsic::amdgcn_raw_buffer_load_format:
8003 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8004 case Intrinsic::amdgcn_struct_buffer_load_format:
8005 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8006 return legalizeBufferLoad(MI, Helper, true, false);
8007 case Intrinsic::amdgcn_raw_tbuffer_load:
8008 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8009 case Intrinsic::amdgcn_struct_tbuffer_load:
8010 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8011 return legalizeBufferLoad(MI, Helper, true, true);
8012 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8013 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8014 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8015 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8016 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8017 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8018 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8019 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8020 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8021 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8022 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8023 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8024 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8025 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8026 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8027 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8028 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8029 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8030 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8031 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8032 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8033 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8034 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8035 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8036 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8037 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8038 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8039 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8040 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8041 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8042 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8043 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8044 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8045 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8046 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8047 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8048 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8049 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8050 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8051 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8052 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8053 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8054 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8055 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8056 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8057 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8058 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8059 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8060 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8061 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8062 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8064 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8065 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8066 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8067 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8068 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8069 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8070 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8071 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8072 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8073 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8074 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8075 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8076 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8077 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8078 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8079 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8080 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8082 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8084 return legalizeBufferAtomic(MI, B, IntrID);
8085 case Intrinsic::amdgcn_rsq_clamp:
8086 return legalizeRsqClampIntrinsic(MI, MRI, B);
8087 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8088 return legalizeBVHIntersectRayIntrinsic(MI, B);
8089 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8090 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8091 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
8092 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8093 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8094 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8095 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8096 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8097 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8098 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8099 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8100 Register Index = MI.getOperand(5).getReg();
8101 LLT S64 = LLT::scalar(64);
8102 if (MRI.getType(Index) != S64)
8103 MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
8104 return true;
8105 }
8106 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8107 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8108 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8109 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8110 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8111 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8112 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8113 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8114 Register Index = MI.getOperand(5).getReg();
8115 LLT S32 = LLT::scalar(32);
8116 if (MRI.getType(Index) != S32)
8117 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8118 return true;
8119 }
8120 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8121 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8122 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8123 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8124 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8125 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8126 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8127 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8128 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8129 Register Index = MI.getOperand(7).getReg();
8130 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8131 ? LLT::scalar(64)
8132 : LLT::scalar(32);
8133 if (MRI.getType(Index) != IdxTy)
8134 MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));
8135 return true;
8136 }
8137
8138 case Intrinsic::amdgcn_fmed3: {
8139 GISelChangeObserver &Observer = Helper.Observer;
8140
8141 // FIXME: This is to workaround the inability of tablegen match combiners to
8142 // match intrinsics in patterns.
8143 Observer.changingInstr(MI);
8144 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8145 MI.removeOperand(1);
8146 Observer.changedInstr(MI);
8147 return true;
8148 }
8149 case Intrinsic::amdgcn_readlane:
8150 case Intrinsic::amdgcn_writelane:
8151 case Intrinsic::amdgcn_readfirstlane:
8152 case Intrinsic::amdgcn_permlane16:
8153 case Intrinsic::amdgcn_permlanex16:
8154 case Intrinsic::amdgcn_permlane64:
8155 case Intrinsic::amdgcn_set_inactive:
8156 case Intrinsic::amdgcn_set_inactive_chain_arg:
8157 case Intrinsic::amdgcn_mov_dpp8:
8158 case Intrinsic::amdgcn_update_dpp:
8159 return legalizeLaneOp(Helper, MI, IntrID);
8160 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8161 return legalizeSBufferPrefetch(Helper, MI);
8162 case Intrinsic::amdgcn_dead: {
8163 // TODO: Use poison instead of undef
8164 for (const MachineOperand &Def : MI.defs())
8165 B.buildUndef(Def);
8166 MI.eraseFromParent();
8167 return true;
8168 }
8169 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8170 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8171 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8172 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8173 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8174 MI.eraseFromParent();
8175 return true;
8176 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8177 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8178 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8179 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8180 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8181 MI.eraseFromParent();
8182 return true;
8183 default: {
8184 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8185 AMDGPU::getImageDimIntrinsicInfo(IntrID))
8186 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8187 return true;
8188 }
8189 }
8190
8191 return true;
8192}
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
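The two hooks above drive how addrspacecast is handled: a flat/global-style cast is a pure no-op, while other casts must account for the fact that the null pointer is not encoded the same way in every address space. A minimal sketch of querying them (hypothetical helper names, not the in-tree lowering):
#include "AMDGPUTargetMachine.h"
#include <cstdint>

// Hypothetical helpers showing how the two hooks might be consulted.
static bool addrSpaceCastIsFree(const llvm::AMDGPUTargetMachine &TM,
                                unsigned SrcAS, unsigned DestAS) {
  // e.g. a global <-> flat cast needs no code at all.
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
}

static int64_t nullValueFor(unsigned AS) {
  // Null is not always the bit pattern 0 (some segments use an all-ones
  // sentinel), which is why this hook exists instead of hard-coding 0.
  return llvm::AMDGPUTargetMachine::getNullPointerValue(AS);
}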
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1221
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1201
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1161
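These APFloat factories are the usual way a custom expansion materializes range-check constants for a particular floating-point format. A small standalone sketch (the helper names are invented for illustration):
#include "llvm/ADT/APFloat.h"

// Range-check constants built with the factories listed above.
static llvm::APFloat smallestNormalF32() {
  return llvm::APFloat::getSmallestNormalized(llvm::APFloat::IEEEsingle());
}
static llvm::APFloat largestFiniteF64() {
  return llvm::APFloat::getLargest(llvm::APFloat::IEEEdouble());
}
static llvm::APFloat negativeInfF32() {
  return llvm::APFloat::getInf(llvm::APFloat::IEEEsingle(), /*Negative=*/true);
}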
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
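These predicate enumerators are what MachineIRBuilder's compare builders take when an expansion needs to emit G_ICMP or G_FCMP. A hedged sketch (the registers are assumed to already hold suitably typed values):
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Emit "LHS < RHS" as a signed integer compare and an ordered FP compare;
// s1 is the usual 1-bit condition type for the results.
static void emitCompares(MachineIRBuilder &B, Register LHS, Register RHS,
                         Register FLHS, Register FRHS) {
  const LLT S1 = LLT::scalar(1);
  B.buildICmp(CmpInst::ICMP_SLT, S1, LHS, RHS);
  B.buildFCmp(CmpInst::FCMP_OLT, S1, FLHS, FRHS);
}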
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the type of this constant.
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:133
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:561
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
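The LLT helpers above form the type vocabulary every legality rule is written in. A self-contained sketch of the common constructions and compile-time queries (the particular types chosen are arbitrary; the include path reflects recent LLVM trees):
#include "llvm/CodeGenTypes/LowLevelType.h"
using llvm::LLT;

// Scalars, fixed vectors and pointers as LLTs.
constexpr LLT S32   = LLT::scalar(32);
constexpr LLT V2S16 = LLT::fixed_vector(2, 16);                  // <2 x s16>
constexpr LLT P1    = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);

// The query/transform methods compose at compile time.
static_assert(V2S16.getNumElements() == 2, "two elements");
static_assert(V2S16.changeElementSize(32).getScalarSizeInBits() == 32,
              "element widened to s32");
static_assert(P1.isPointer() && !S32.isVector(), "basic queries");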
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & lowerIf(LegalityPredicate Predicate)
The instruction is lowered if predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most MaxTy's number of elements.
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
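Taken together, these LegalizeRuleSet combinators are the vocabulary in which each opcode's legalization is described: rules are tried in order, with later ones acting as fallbacks. A condensed, hypothetical rule set follows; the opcode choice and exact clamps are illustrative only, not the rules this file installs, and the fragment is assumed to sit inside a LegalizerInfo-derived constructor so the builder and TargetOpcode names are in scope.
// Sketch only; not the in-tree G_ADD rules.
const LLT S16   = LLT::scalar(16);
const LLT S32   = LLT::scalar(32);
const LLT S64   = LLT::scalar(64);
const LLT V2S16 = LLT::fixed_vector(2, 16);

getActionDefinitionsBuilder(TargetOpcode::G_ADD)
    .legalFor({S32, S64, V2S16})           // directly selectable forms
    .clampMaxNumElementsStrict(0, S16, 2)  // keep 16-bit vectors at <2 x s16>
    .widenScalarToNextPow2(0, /*MinSize=*/32)
    .clampScalar(0, S32, S64)              // nothing narrower or wider survives
    .scalarize(0);                         // break up any remaining vectors
A constructor of this kind typically finishes with a call such as getLegacyLegalizerInfo().computeTables() once every opcode has been described.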
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with additional vector elements.
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from CastTy.
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's type to WideTy.
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
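getMachineMemOperand is how a custom load/store expansion describes the memory it touches so later passes can reason about aliasing and invariance. A hedged helper sketch (the function name, pointer info and 4-byte size are placeholders):
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// Hypothetical helper: emit a 4-byte dereferenceable, invariant load through
// PtrReg, attaching an explicit MachineMemOperand that says so.
static MachineInstrBuilder buildInvariantLoad(MachineIRBuilder &B,
                                              Register DstReg, Register PtrReg,
                                              MachinePointerInfo PtrInfo) {
  MachineMemOperand *MMO = B.getMF().getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));
  return B.buildLoad(DstReg, PtrReg, *MMO);
}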
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition ArrayRef.h:298
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:387
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
unsigned getPointerSizeInBits(unsigned AS) const
TargetOptions Options
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command line.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
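In GlobalISel each of these enumerators becomes the address-space component of a pointer LLT, and the pointer width varies by segment: 64-bit for flat/global/constant, 32-bit for local/private, and the wider buffer forms noted above. A short sketch (the include path is assumed from within the AMDGPU backend tree):
#include "AMDGPU.h"  // brings in the AMDGPUAS enumerators in-tree
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

const LLT GlobalPtr  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
const LLT FlatPtr    = LLT::pointer(AMDGPUAS::FLAT_ADDRESS, 64);
const LLT LocalPtr   = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);
const LLT PrivatePtr = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
const LLT RsrcPtr    = LLT::pointer(AMDGPUAS::BUFFER_RESOURCE, 128);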
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
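Predicates gate a rule and mutations say which type to produce; the combinators above are the reusable building blocks for both. A hypothetical conditional rule (not one this file installs), assumed to sit inside a LegalizerInfo-derived constructor so the builder and TargetOpcode names are in scope:
// Widen a narrow scalar G_AND to at least 32 bits, then accept s32/s64.
getActionDefinitionsBuilder(TargetOpcode::G_AND)
    .widenScalarIf(LegalityPredicates::scalarNarrowerThan(0, 32),
                   LegalizeMutations::widenScalarOrEltToNextPow2(0, 32))
    .legalFor({LLT::scalar(32), LLT::scalar(64)});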
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:922
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:2042
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:654
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:462
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
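BuildMI is the lower-level counterpart of MachineIRBuilder, used once target instructions rather than generic opcodes must be emitted. A hedged sketch (the opcode and register are placeholders, not tied to any lowering in this file):
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Hypothetical helper: materialize zero into an SGPR right before MI,
// reusing MI's debug location.
static void emitZero(MachineBasicBlock &MBB, MachineInstr &MI,
                     const SIInstrInfo &TII, Register DstReg) {
  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(0);
}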
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition Utils.cpp:315
int countr_zero(T Val)
Count the number of 0 bits from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1726
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
Definition Utils.cpp:434
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
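Several of the integer helpers listed above (Log2_32_Ceil, PowerOf2Ceil, NextPowerOf2, isPowerOf2_32, divideCeil) differ only in rounding direction or strictness, which is easy to get wrong. A standalone check that pins down the conventions (values chosen arbitrarily):
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

int main() {
  assert(Log2_32_Ceil(40) == 6);    // ceil(log2(40))
  assert(PowerOf2Ceil(64) == 64);   // >= semantics: already a power of two
  assert(NextPowerOf2(64) == 128);  // strictly greater than the argument
  assert(isPowerOf2_32(64) && !isPowerOf2_32(0)); // zero is not a power of two
  assert(divideCeil(7, 2) == 4);    // integer ceiling division
  return 0;
}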
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
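Custom FP expansions consult these mode bits to decide whether extra work is needed, for example pre-scaling an operand when f32 denormals are flushed. A hedged sketch of that kind of query (the helper name is hypothetical):
#include "SIModeRegisterDefaults.h"
#include "llvm/ADT/FloatingPointMode.h"
using namespace llvm;

// Hypothetical query: does an f32 expansion need to pre-scale its input
// because input denormals are flushed to zero in the current mode?
static bool fp32InputDenormsFlushed(const SIModeRegisterDefaults &Mode) {
  return Mode.FP32Denormals.Input != DenormalMode::IEEE;
}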