1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
36#include "llvm/IR/IntrinsicsAMDGPU.h"
37#include "llvm/IR/IntrinsicsR600.h"
38
39#define DEBUG_TYPE "amdgpu-legalinfo"
40
41using namespace llvm;
42using namespace LegalizeActions;
43using namespace LegalizeMutations;
44using namespace LegalityPredicates;
45using namespace MIPatternMatch;
46
47// Hack until load/store selection patterns support any tuple of legal types.
49 "amdgpu-global-isel-new-legality",
50 cl::desc("Use GlobalISel desired legality, rather than try to use"
51 "rules compatible with selection patterns"),
52 cl::init(false),
54
55static constexpr unsigned MaxRegisterSize = 1024;
56
57// Round the number of elements to the next power of two elements
58static LLT getPow2VectorType(LLT Ty) {
 59 unsigned NElts = Ty.getNumElements();
60 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
61 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
62}
63
64// Round the number of bits to the next power of two bits
65static LLT getPow2ScalarType(LLT Ty) {
 66 unsigned Bits = Ty.getSizeInBits();
67 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
68 return LLT::scalar(Pow2Bits);
69}
70
71/// \returns true if this is an odd-sized vector which should be widened by adding an
72/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
73/// excludes s1 vectors, which should always be scalarized.
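/// For example, <3 x s32> is not matched here, since its elements are already 32 bits wide.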
74static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75 return [=](const LegalityQuery &Query) {
76 const LLT Ty = Query.Types[TypeIdx];
77 if (!Ty.isVector())
78 return false;
79
80 const LLT EltTy = Ty.getElementType();
81 const unsigned EltSize = EltTy.getSizeInBits();
82 return Ty.getNumElements() % 2 != 0 &&
83 EltSize > 1 && EltSize < 32 &&
84 Ty.getSizeInBits() % 32 != 0;
85 };
86}
87
88static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
89 return [=](const LegalityQuery &Query) {
90 const LLT Ty = Query.Types[TypeIdx];
91 return Ty.getSizeInBits() % 32 == 0;
92 };
93}
94
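// True for vectors of more than two 16-bit elements, e.g. <4 x s16> or <3 x s16>.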
95static LegalityPredicate isWideVec16(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getScalarType();
99 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
100 };
101}
102
103static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
104 return [=](const LegalityQuery &Query) {
105 const LLT Ty = Query.Types[TypeIdx];
106 const LLT EltTy = Ty.getElementType();
107 return std::pair(TypeIdx,
108 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
109 };
110}
111
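// Reduce the element count so the vector splits into pieces of at most roughly
// 64 bits; e.g. <5 x s16> (80 bits) is reduced to <3 x s16>.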
112static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
 113 return [=](const LegalityQuery &Query) {
114 const LLT Ty = Query.Types[TypeIdx];
115 const LLT EltTy = Ty.getElementType();
116 unsigned Size = Ty.getSizeInBits();
117 unsigned Pieces = (Size + 63) / 64;
118 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
119 return std::pair(TypeIdx, LLT::scalarOrVector(
120 ElementCount::getFixed(NewNumElts), EltTy));
121 };
122}
123
124// Increase the number of vector elements to reach the next multiple of 32-bit
125// type.
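// e.g. <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).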
126static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
127 return [=](const LegalityQuery &Query) {
128 const LLT Ty = Query.Types[TypeIdx];
129
130 const LLT EltTy = Ty.getElementType();
131 const int Size = Ty.getSizeInBits();
132 const int EltSize = EltTy.getSizeInBits();
133 const int NextMul32 = (Size + 31) / 32;
134
135 assert(EltSize < 32);
136
137 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
138 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
139 };
140}
141
142// Retrieves the scalar type that's the same size as the mem desc
144 return [=](const LegalityQuery &Query) {
145 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
146 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
147 };
148}
149
150// Increase the number of vector elements to reach the next legal RegClass.
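// e.g. a 448-bit <7 x s64> steps up to <8 x s64> (512 bits) if no SGPR class
// covers 448 bits.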
152 return [=](const LegalityQuery &Query) {
153 const LLT Ty = Query.Types[TypeIdx];
154 const unsigned NumElts = Ty.getNumElements();
155 const unsigned EltSize = Ty.getElementType().getSizeInBits();
156 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
157
158 assert(EltSize == 32 || EltSize == 64);
159 assert(Ty.getSizeInBits() < MaxRegisterSize);
160
161 unsigned NewNumElts;
162 // Find the nearest legal RegClass that is larger than the current type.
163 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
164 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
165 break;
166 }
167 return std::pair(TypeIdx,
168 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
169 };
170}
171
172static LLT getBufferRsrcScalarType(const LLT Ty) {
 173 if (!Ty.isVector())
174 return LLT::scalar(128);
175 const ElementCount NumElems = Ty.getElementCount();
176 return LLT::vector(NumElems, LLT::scalar(128));
177}
178
179static LLT getBufferRsrcRegisterType(const LLT Ty) {
 180 if (!Ty.isVector())
181 return LLT::fixed_vector(4, LLT::scalar(32));
182 const unsigned NumElems = Ty.getElementCount().getFixedValue();
183 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
184}
185
186static LLT getBitcastRegisterType(const LLT Ty) {
 187 const unsigned Size = Ty.getSizeInBits();
188
189 if (Size <= 32) {
190 // <2 x s8> -> s16
191 // <4 x s8> -> s32
192 return LLT::scalar(Size);
193 }
194
 195 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
196}
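// For types wider than 32 bits the register type is a vector of 32-bit
// elements, e.g. <6 x s16> (96 bits) maps to <3 x s32>.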
197
198static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
199 return [=](const LegalityQuery &Query) {
200 const LLT Ty = Query.Types[TypeIdx];
201 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
202 };
203}
204
205static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
 206 return [=](const LegalityQuery &Query) {
207 const LLT Ty = Query.Types[TypeIdx];
208 unsigned Size = Ty.getSizeInBits();
209 assert(Size % 32 == 0);
 210 return std::pair(
 211 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
 212 };
213}
214
215static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
216 return [=](const LegalityQuery &Query) {
217 const LLT QueryTy = Query.Types[TypeIdx];
218 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
219 };
220}
221
222static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
223 return [=](const LegalityQuery &Query) {
224 const LLT QueryTy = Query.Types[TypeIdx];
225 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
226 };
227}
228
229static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
230 return [=](const LegalityQuery &Query) {
231 const LLT QueryTy = Query.Types[TypeIdx];
232 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
233 };
234}
235
236static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
237 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
 238 Size <= MaxRegisterSize;
239}
240
241static bool isRegisterVectorElementType(LLT EltTy) {
 242 const int EltSize = EltTy.getSizeInBits();
243 return EltSize == 16 || EltSize % 32 == 0;
244}
245
246static bool isRegisterVectorType(LLT Ty) {
247 const int EltSize = Ty.getElementType().getSizeInBits();
248 return EltSize == 32 || EltSize == 64 ||
249 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
250 EltSize == 128 || EltSize == 256;
251}
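// e.g. <2 x s16>, <4 x s32>, and <2 x s64> qualify; <3 x s16> does not (odd
// count of 16-bit elements).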
252
253// TODO: replace all uses of isRegisterType with isRegisterClassType
254static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
255 if (!isRegisterSize(ST, Ty.getSizeInBits()))
256 return false;
257
258 if (Ty.isVector())
259 return isRegisterVectorType(Ty);
260
261 return true;
262}
263
264// Any combination of 32- or 64-bit elements up to the maximum register size, and
265// multiples of v2s16.
266static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
 267 unsigned TypeIdx) {
268 return [=, &ST](const LegalityQuery &Query) {
269 return isRegisterType(ST, Query.Types[TypeIdx]);
270 };
271}
272
273// RegisterType that doesn't have a corresponding RegClass.
274// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
275// should be removed.
276static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
 277 unsigned TypeIdx) {
278 return [=, &ST](const LegalityQuery &Query) {
279 LLT Ty = Query.Types[TypeIdx];
280 return isRegisterType(ST, Ty) &&
281 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
282 };
283}
284
285static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
286 return [=](const LegalityQuery &Query) {
287 const LLT QueryTy = Query.Types[TypeIdx];
288 if (!QueryTy.isVector())
289 return false;
290 const LLT EltTy = QueryTy.getElementType();
291 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
292 };
293}
294
295constexpr LLT S1 = LLT::scalar(1);
296constexpr LLT S8 = LLT::scalar(8);
297constexpr LLT S16 = LLT::scalar(16);
298constexpr LLT S32 = LLT::scalar(32);
299constexpr LLT F32 = LLT::float32();
300constexpr LLT S64 = LLT::scalar(64);
301constexpr LLT F64 = LLT::float64();
302constexpr LLT S96 = LLT::scalar(96);
303constexpr LLT S128 = LLT::scalar(128);
304constexpr LLT S160 = LLT::scalar(160);
305constexpr LLT S192 = LLT::scalar(192);
306constexpr LLT S224 = LLT::scalar(224);
307constexpr LLT S256 = LLT::scalar(256);
308constexpr LLT S512 = LLT::scalar(512);
309constexpr LLT S1024 = LLT::scalar(1024);
311
312constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
313constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
314constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
315constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
316constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
317constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
318constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
319constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
320
321constexpr LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
322constexpr LLT V2BF16 = V2F16; // FIXME
323
324constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
325constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
326constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
327constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
328constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
329constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
330constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
331constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
332constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
333constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
334constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
335constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
336constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
337
338constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
339constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
340constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
341constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
342constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
343constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
344constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
345constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
346
347constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
348constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
349
350constexpr std::initializer_list<LLT> AllScalarTypes = {
352
353constexpr std::initializer_list<LLT> AllS16Vectors{
355
356constexpr std::initializer_list<LLT> AllS32Vectors = {
359
360constexpr std::initializer_list<LLT> AllS64Vectors = {
362
368
369// Checks whether a type is in the list of legal register types.
370static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
371 if (Ty.isPointerOrPointerVector())
372 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
373
376 (ST.useRealTrue16Insts() && Ty == S16) ||
378}
379
380static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
 381 unsigned TypeIdx) {
382 return [&ST, TypeIdx](const LegalityQuery &Query) {
383 return isRegisterClassType(ST, Query.Types[TypeIdx]);
384 };
385}
386
387// If we have a truncating store or an extending load with a data size larger
388// than 32-bits, we need to reduce to a 32-bit type.
389static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
 390 return [=](const LegalityQuery &Query) {
391 const LLT Ty = Query.Types[TypeIdx];
392 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
393 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
394 };
395}
396
397// If we have a truncating store or an extending load with a data size larger
398// than 32-bits and mem location is a power of 2
399static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
 400 return [=](const LegalityQuery &Query) {
401 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
402 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
403 isPowerOf2_64(MemSize);
404 };
405}
406
407// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
408// handle some operations by just promoting the register during
409// selection. There are also d16 loads on GFX9+ which preserve the high bits.
410static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
411 bool IsLoad, bool IsAtomic) {
412 switch (AS) {
 413 case AMDGPUAS::PRIVATE_ADDRESS:
 414 // FIXME: Private element size.
415 return ST.enableFlatScratch() ? 128 : 32;
 416 case AMDGPUAS::LOCAL_ADDRESS:
 417 return ST.useDS128() ? 128 : 64;
422 // Treat constant and global as identical. SMRD loads are sometimes usable for
423 // global loads (ideally constant address space should be eliminated)
424 // depending on the context. Legality cannot be context dependent, but
425 // RegBankSelect can split the load as necessary depending on the pointer
426 // register bank/uniformity and if the memory is invariant or not written in a
427 // kernel.
428 return IsLoad ? 512 : 128;
429 default:
430 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
431 // if they may alias scratch depending on the subtarget. This needs to be
432 // moved to custom handling to use addressMayBeAccessedAsPrivate
433 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
434 }
435}
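// e.g. constant/global loads are allowed up to 512 bits here while stores are
// capped at 128 bits, and LDS accesses are capped at 128 or 64 bits depending
// on useDS128().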
436
437static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
438 const LegalityQuery &Query) {
439 const LLT Ty = Query.Types[0];
440
441 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
442 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
443
444 unsigned RegSize = Ty.getSizeInBits();
445 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
446 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
447 unsigned AS = Query.Types[1].getAddressSpace();
448
449 // All of these need to be custom lowered to cast the pointer operand.
 450 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
 451 return false;
452
453 // Do not handle extending vector loads.
454 if (Ty.isVector() && MemSize != RegSize)
455 return false;
456
457 // TODO: We should be able to widen loads if the alignment is high enough, but
458 // we also need to modify the memory access size.
459#if 0
460 // Accept widening loads based on alignment.
461 if (IsLoad && MemSize < Size)
462 MemSize = std::max(MemSize, Align);
463#endif
464
465 // Only 1-byte and 2-byte to 32-bit extloads are valid.
466 if (MemSize != RegSize && RegSize != 32)
467 return false;
468
469 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
470 Query.MMODescrs[0].Ordering !=
 471 AtomicOrdering::NotAtomic))
 472 return false;
473
474 switch (MemSize) {
475 case 8:
476 case 16:
477 case 32:
478 case 64:
479 case 128:
480 break;
481 case 96:
482 if (!ST.hasDwordx3LoadStores())
483 return false;
484 break;
485 case 256:
486 case 512:
487 // These may contextually need to be broken down.
488 break;
489 default:
490 return false;
491 }
492
493 assert(RegSize >= MemSize);
494
495 if (AlignBits < MemSize) {
496 const SITargetLowering *TLI = ST.getTargetLowering();
497 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
498 Align(AlignBits / 8)))
499 return false;
500 }
501
502 return true;
503}
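// In short: extending loads may only widen 8- or 16-bit memory accesses into a
// 32-bit register, the memory size must be one of the sizes accepted by the
// switch above, and under-aligned accesses must be permitted by the target for
// this address space.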
504
505// The newer buffer intrinsic forms take their resource arguments as
506// pointers in address space 8, aka s128 values. However, in order to not break
507// SelectionDAG, the underlying operations have to continue to take v4i32
508// arguments. Therefore, we convert resource pointers - or vectors of them -
509// to integer values here.
510static bool hasBufferRsrcWorkaround(const LLT Ty) {
511 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
512 return true;
513 if (Ty.isVector()) {
514 const LLT ElemTy = Ty.getElementType();
515 return hasBufferRsrcWorkaround(ElemTy);
516 }
517 return false;
518}
519
520// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
521// work around this. Eventually it should ignore the type for loads and only care
522// about the size. Return true in cases where we will work around this for now by
523// bitcasting.
524static bool loadStoreBitcastWorkaround(const LLT Ty) {
 525 if (EnableNewLegality)
 526 return false;
527
528 const unsigned Size = Ty.getSizeInBits();
529 if (Ty.isPointerVector())
530 return true;
531 if (Size <= 64)
532 return false;
533 // Address space 8 pointers get their own workaround.
 534 if (hasBufferRsrcWorkaround(Ty))
 535 return false;
536 if (!Ty.isVector())
537 return true;
538
539 unsigned EltSize = Ty.getScalarSizeInBits();
540 return EltSize != 32 && EltSize != 64;
541}
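// e.g. s96 and <8 x s16> take the bitcast workaround, while <4 x s32> and
// <2 x s64> do not.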
542
543static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
544 const LLT Ty = Query.Types[0];
545 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
 546 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
547}
548
549/// Return true if a load or store of the type should be lowered with a bitcast
550/// to a different type.
551static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
552 const LLT MemTy) {
553 const unsigned MemSizeInBits = MemTy.getSizeInBits();
554 const unsigned Size = Ty.getSizeInBits();
555 if (Size != MemSizeInBits)
556 return Size <= 32 && Ty.isVector();
557
 558 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
 559 return true;
560
561 // Don't try to handle bitcasting vector ext loads for now.
562 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
563 (Size <= 32 || isRegisterSize(ST, Size)) &&
564 !isRegisterVectorElementType(Ty.getElementType());
565}
566
567/// Return true if we should legalize a load by widening an odd sized memory
568/// access up to the alignment. Note this is the case where the memory access itself
569/// changes, not the size of the result register.
570static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
571 uint64_t AlignInBits, unsigned AddrSpace,
572 unsigned Opcode) {
573 unsigned SizeInBits = MemoryTy.getSizeInBits();
574 // We don't want to widen cases that are naturally legal.
575 if (isPowerOf2_32(SizeInBits))
576 return false;
577
578 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
579 // end up widening these for a scalar load during RegBankSelect, if we don't
580 // have 96-bit scalar loads.
581 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
582 return false;
583
584 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
585 return false;
586
587 // A load is known dereferenceable up to the alignment, so it's legal to widen
588 // to it.
589 //
590 // TODO: Could check dereferenceable for less aligned cases.
591 unsigned RoundedSize = NextPowerOf2(SizeInBits);
592 if (AlignInBits < RoundedSize)
593 return false;
594
595 // Do not widen if it would introduce a slow unaligned load.
596 const SITargetLowering *TLI = ST.getTargetLowering();
597 unsigned Fast = 0;
 598 return TLI->allowsMisalignedMemoryAccessesImpl(
 599 RoundedSize, AddrSpace, Align(AlignInBits / 8),
 600 MachineMemOperand::MOLoad, &Fast) &&
 601 Fast;
602}
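// e.g. a 96-bit load that is 128-bit aligned may be widened to a 128-bit load
// on subtargets without dwordx3 load/stores, subject to the fast-access check
// above.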
603
604static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
605 unsigned Opcode) {
606 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
607 return false;
608
609 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
610 Query.MMODescrs[0].AlignInBits,
611 Query.Types[1].getAddressSpace(), Opcode);
612}
613
614/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
615/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
616/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
617static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
 618 MachineRegisterInfo &MRI, unsigned Idx) {
619 MachineOperand &MO = MI.getOperand(Idx);
620
621 const LLT PointerTy = MRI.getType(MO.getReg());
622
623 // Paranoidly prevent us from doing this multiple times.
 624 if (!hasBufferRsrcWorkaround(PointerTy))
 625 return PointerTy;
626
627 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
628 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
629 if (!PointerTy.isVector()) {
630 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
631 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
632 const LLT S32 = LLT::scalar(32);
633
634 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
635 std::array<Register, 4> VectorElems;
636 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
637 for (unsigned I = 0; I < NumParts; ++I)
638 VectorElems[I] =
639 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
640 B.buildMergeValues(MO, VectorElems);
641 MO.setReg(VectorReg);
642 return VectorTy;
643 }
644 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
645 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
646 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
647 B.buildIntToPtr(MO, Scalar);
648 MO.setReg(BitcastReg);
649
650 return VectorTy;
651}
652
653/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
654/// the form in which the value must be in order to be passed to the low-level
655/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
656/// needed in order to account for the fact that we can't define a register
657/// class for s128 without breaking SelectionDAG.
658static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
 659 MachineRegisterInfo &MRI = *B.getMRI();
660 const LLT PointerTy = MRI.getType(Pointer);
661 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
662 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
663
664 if (!PointerTy.isVector()) {
665 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
666 SmallVector<Register, 4> PointerParts;
667 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
668 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
669 for (unsigned I = 0; I < NumParts; ++I)
670 PointerParts.push_back(Unmerged.getReg(I));
671 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
672 }
673 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
674 return B.buildBitcast(VectorTy, Scalar).getReg(0);
675}
676
677static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
 678 unsigned Idx) {
679 MachineOperand &MO = MI.getOperand(Idx);
680
681 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
682 // Paranoidly prevent us from doing this multiple times.
 683 if (!hasBufferRsrcWorkaround(PointerTy))
 684 return;
 685 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
686}
687
689 const GCNTargetMachine &TM)
690 : ST(ST_) {
691 using namespace TargetOpcode;
692
693 auto GetAddrSpacePtr = [&TM](unsigned AS) {
694 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
695 };
696
697 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
698 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
699 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
700 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
701 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
702 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
703 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
704 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
705 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
706 const LLT BufferStridedPtr =
707 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
708
709 const LLT CodePtr = FlatPtr;
710
711 const std::initializer_list<LLT> AddrSpaces64 = {
712 GlobalPtr, ConstantPtr, FlatPtr
713 };
714
715 const std::initializer_list<LLT> AddrSpaces32 = {
716 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
717 };
718
719 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
720
721 const std::initializer_list<LLT> FPTypesBase = {
722 S32, S64
723 };
724
725 const std::initializer_list<LLT> FPTypes16 = {
726 S32, S64, S16
727 };
728
729 const std::initializer_list<LLT> FPTypesPK16 = {
730 S32, S64, S16, V2S16
731 };
732
733 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
734
735 // s1 for VCC branches, s32 for SCC branches.
 736 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
 737
738 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
739 // elements for v3s16
742 .legalFor(AllS32Vectors)
744 .legalFor(AddrSpaces64)
745 .legalFor(AddrSpaces32)
746 .legalFor(AddrSpaces128)
747 .legalIf(isPointer(0))
748 .clampScalar(0, S16, S256)
750 .clampMaxNumElements(0, S32, 16)
752 .scalarize(0);
753
754 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
755 // Full set of gfx9 features.
756 if (ST.hasScalarAddSub64()) {
757 getActionDefinitionsBuilder({G_ADD, G_SUB})
758 .legalFor({S64, S32, S16, V2S16})
759 .clampMaxNumElementsStrict(0, S16, 2)
760 .scalarize(0)
761 .minScalar(0, S16)
763 .maxScalar(0, S32);
764 } else {
765 getActionDefinitionsBuilder({G_ADD, G_SUB})
766 .legalFor({S32, S16, V2S16})
767 .clampMaxNumElementsStrict(0, S16, 2)
768 .scalarize(0)
769 .minScalar(0, S16)
771 .maxScalar(0, S32);
772 }
773
774 if (ST.hasScalarSMulU64()) {
776 .legalFor({S64, S32, S16, V2S16})
777 .clampMaxNumElementsStrict(0, S16, 2)
778 .scalarize(0)
779 .minScalar(0, S16)
781 .custom();
782 } else {
784 .legalFor({S32, S16, V2S16})
785 .clampMaxNumElementsStrict(0, S16, 2)
786 .scalarize(0)
787 .minScalar(0, S16)
789 .custom();
790 }
791 assert(ST.hasMad64_32());
792
793 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
794 .legalFor({S32, S16, V2S16}) // Clamp modifier
795 .minScalarOrElt(0, S16)
797 .scalarize(0)
799 .lower();
800 } else if (ST.has16BitInsts()) {
801 getActionDefinitionsBuilder({G_ADD, G_SUB})
802 .legalFor({S32, S16})
803 .minScalar(0, S16)
805 .maxScalar(0, S32)
806 .scalarize(0);
807
809 .legalFor({S32, S16})
810 .scalarize(0)
811 .minScalar(0, S16)
813 .custom();
814 assert(ST.hasMad64_32());
815
816 // Technically the saturating operations require clamp bit support, but this
817 // was introduced at the same time as 16-bit operations.
818 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
819 .legalFor({S32, S16}) // Clamp modifier
820 .minScalar(0, S16)
821 .scalarize(0)
823 .lower();
824
825 // We're just lowering this, but it helps get a better result to try to
826 // coerce to the desired type first.
827 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
828 .minScalar(0, S16)
829 .scalarize(0)
830 .lower();
831 } else {
832 getActionDefinitionsBuilder({G_ADD, G_SUB})
833 .legalFor({S32})
834 .widenScalarToNextMultipleOf(0, 32)
835 .clampScalar(0, S32, S32)
836 .scalarize(0);
837
838 auto &Mul = getActionDefinitionsBuilder(G_MUL)
839 .legalFor({S32})
840 .scalarize(0)
841 .minScalar(0, S32)
843
844 if (ST.hasMad64_32())
845 Mul.custom();
846 else
847 Mul.maxScalar(0, S32);
848
849 if (ST.hasIntClamp()) {
850 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
851 .legalFor({S32}) // Clamp modifier.
852 .scalarize(0)
854 .lower();
855 } else {
856 // Clamp bit support was added in VI, along with 16-bit operations.
857 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
858 .minScalar(0, S32)
859 .scalarize(0)
860 .lower();
861 }
862
863 // FIXME: DAG expansion gets better results. The widening uses the smaller
864 // range values and goes for the min/max lowering directly.
865 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
866 .minScalar(0, S32)
867 .scalarize(0)
868 .lower();
869 }
870
872 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
873 .customFor({S32, S64})
874 .clampScalar(0, S32, S64)
876 .scalarize(0);
877
878 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
879 .legalFor({S32})
880 .maxScalar(0, S32);
881
882 if (ST.hasVOP3PInsts()) {
883 Mulh
884 .clampMaxNumElements(0, S8, 2)
885 .lowerFor({V2S8});
886 }
887
888 Mulh
889 .scalarize(0)
890 .lower();
891
892 // Report legal for any types we can handle anywhere. For the cases only legal
893 // on the SALU, RegBankSelect will be able to re-legalize.
894 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
895 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
896 .clampScalar(0, S32, S64)
902 .scalarize(0);
903
905 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
906 .legalFor({{S32, S1}, {S32, S32}})
907 .clampScalar(0, S32, S32)
908 .scalarize(0);
909
911 // Don't worry about the size constraint.
913 .lower();
914
916 .legalFor({S1, S32, S64, S16, GlobalPtr,
917 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
918 .legalIf(isPointer(0))
919 .clampScalar(0, S32, S64)
921
922 getActionDefinitionsBuilder(G_FCONSTANT)
923 .legalFor({S32, S64, S16})
924 .clampScalar(0, S16, S64);
925
926 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
927 .legalIf(isRegisterClassType(ST, 0))
928 // s1 and s16 are special cases because they have legal operations on
929 // them, but don't really occupy registers in the normal way.
930 .legalFor({S1, S16})
931 .clampNumElements(0, V16S32, V32S32)
935 .clampMaxNumElements(0, S32, 16);
936
937 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
938
939 // If the amount is divergent, we have to do a wave reduction to get the
940 // maximum value, so this is expanded during RegBankSelect.
941 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
942 .legalFor({{PrivatePtr, S32}});
943
944 getActionDefinitionsBuilder(G_STACKSAVE)
945 .customFor({PrivatePtr});
946 getActionDefinitionsBuilder(G_STACKRESTORE)
947 .legalFor({PrivatePtr});
948
949 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
950
951 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
952 .customIf(typeIsNot(0, PrivatePtr));
953
954 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
955
956 auto &FPOpActions = getActionDefinitionsBuilder(
957 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
958 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
959 .legalFor({S32, S64});
960 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
961 .customFor({S32, S64});
962 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
963 .customFor({S32, S64});
964
965 if (ST.has16BitInsts()) {
966 if (ST.hasVOP3PInsts())
967 FPOpActions.legalFor({S16, V2S16});
968 else
969 FPOpActions.legalFor({S16});
970
971 TrigActions.customFor({S16});
972 FDIVActions.customFor({S16});
973 }
974
975 if (ST.hasPackedFP32Ops()) {
976 FPOpActions.legalFor({V2S32});
977 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
978 }
979
980 auto &MinNumMaxNumIeee =
981 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
982
983 if (ST.hasVOP3PInsts()) {
984 MinNumMaxNumIeee.legalFor(FPTypesPK16)
985 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
986 .clampMaxNumElements(0, S16, 2)
987 .clampScalar(0, S16, S64)
988 .scalarize(0);
989 } else if (ST.has16BitInsts()) {
990 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
991 } else {
992 MinNumMaxNumIeee.legalFor(FPTypesBase)
993 .clampScalar(0, S32, S64)
994 .scalarize(0);
995 }
996
997 auto &MinNumMaxNum = getActionDefinitionsBuilder(
998 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
999
1000 if (ST.hasVOP3PInsts()) {
1001 MinNumMaxNum.customFor(FPTypesPK16)
1002 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1003 .clampMaxNumElements(0, S16, 2)
1004 .clampScalar(0, S16, S64)
1005 .scalarize(0);
1006 } else if (ST.has16BitInsts()) {
1007 MinNumMaxNum.customFor(FPTypes16)
1008 .clampScalar(0, S16, S64)
1009 .scalarize(0);
1010 } else {
1011 MinNumMaxNum.customFor(FPTypesBase)
1012 .clampScalar(0, S32, S64)
1013 .scalarize(0);
1014 }
1015
1016 if (ST.hasVOP3PInsts())
1017 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1018
1019 FPOpActions
1020 .scalarize(0)
1021 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1022
1023 TrigActions
1024 .scalarize(0)
1025 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1026
1027 FDIVActions
1028 .scalarize(0)
1029 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1030
1031 getActionDefinitionsBuilder({G_FNEG, G_FABS})
1032 .legalFor(FPTypesPK16)
1034 .scalarize(0)
1035 .clampScalar(0, S16, S64);
1036
1037 if (ST.has16BitInsts()) {
1039 .legalFor({S16})
1040 .customFor({S32, S64})
1041 .scalarize(0)
1042 .unsupported();
1044 .legalFor({S32, S64, S16})
1045 .scalarize(0)
1046 .clampScalar(0, S16, S64);
1047
1048 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1049 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1050 .scalarize(0)
1051 .maxScalarIf(typeIs(0, S16), 1, S16)
1052 .clampScalar(1, S32, S32)
1053 .lower();
1054
1056 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1057 .scalarize(0)
1058 .lower();
1059 } else {
1061 .customFor({S32, S64, S16})
1062 .scalarize(0)
1063 .unsupported();
1064
1065
1066 if (ST.hasFractBug()) {
1068 .customFor({S64})
1069 .legalFor({S32, S64})
1070 .scalarize(0)
1071 .clampScalar(0, S32, S64);
1072 } else {
1074 .legalFor({S32, S64})
1075 .scalarize(0)
1076 .clampScalar(0, S32, S64);
1077 }
1078
1079 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1080 .legalFor({{S32, S32}, {S64, S32}})
1081 .scalarize(0)
1082 .clampScalar(0, S32, S64)
1083 .clampScalar(1, S32, S32)
1084 .lower();
1085
1087 .customFor({{S32, S32}, {S64, S32}})
1088 .scalarize(0)
1089 .minScalar(0, S32)
1090 .clampScalar(1, S32, S32)
1091 .lower();
1092 }
1093
1094 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1095 if (ST.hasCvtPkF16F32Inst()) {
1096 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1097 .clampMaxNumElements(0, S16, 2);
1098 } else {
1099 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1100 }
1101 FPTruncActions.scalarize(0).lower();
1102
1104 .legalFor({{S64, S32}, {S32, S16}})
1105 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1106 .scalarize(0);
1107
1108 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1109 if (ST.has16BitInsts()) {
1110 FSubActions
1111 // Use actual fsub instruction
1112 .legalFor({S32, S16})
1113 // Must use fadd + fneg
1114 .lowerFor({S64, V2S16});
1115 } else {
1116 FSubActions
1117 // Use actual fsub instruction
1118 .legalFor({S32})
1119 // Must use fadd + fneg
1120 .lowerFor({S64, S16, V2S16});
1121 }
1122
1123 FSubActions
1124 .scalarize(0)
1125 .clampScalar(0, S32, S64);
1126
1127 // Whether this is legal depends on the floating point mode for the function.
1128 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1129 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1130 FMad.customFor({S32, S16});
1131 else if (ST.hasMadMacF32Insts())
1132 FMad.customFor({S32});
1133 else if (ST.hasMadF16())
1134 FMad.customFor({S16});
1135 FMad.scalarize(0)
1136 .lower();
1137
1138 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1139 if (ST.has16BitInsts()) {
1140 FRem.customFor({S16, S32, S64});
1141 } else {
1142 FRem.minScalar(0, S32)
1143 .customFor({S32, S64});
1144 }
1145 FRem.scalarize(0);
1146
1147 // TODO: Do we need to clamp maximum bitwidth?
1149 .legalIf(isScalar(0))
1150 .legalFor({{V2S16, V2S32}})
1151 .clampMaxNumElements(0, S16, 2)
1152 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1153 // situations (like an invalid implicit use), we don't want to infinite loop
1154 // in the legalizer.
1156 .alwaysLegal();
1157
1158 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1159 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1160 {S32, S1}, {S64, S1}, {S16, S1}})
1161 .scalarize(0)
1162 .clampScalar(0, S32, S64)
1163 .widenScalarToNextPow2(1, 32);
1164
1165 // TODO: Split s1->s64 during regbankselect for VALU.
1166 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1167 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1168 .lowerIf(typeIs(1, S1))
1169 .customFor({{S32, S64}, {S64, S64}});
1170 if (ST.has16BitInsts())
1171 IToFP.legalFor({{S16, S16}});
1172 IToFP.clampScalar(1, S32, S64)
1173 .minScalar(0, S32)
1174 .scalarize(0)
1176
1177 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1178 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1179 .customFor({{S64, S32}, {S64, S64}})
1180 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1181 if (ST.has16BitInsts())
1182 FPToI.legalFor({{S16, S16}});
1183 else
1184 FPToI.minScalar(1, S32);
1185
1186 FPToI.minScalar(0, S32)
1187 .widenScalarToNextPow2(0, 32)
1188 .scalarize(0)
1189 .lower();
1190
1191 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1192 .clampScalar(0, S16, S64)
1193 .scalarize(0)
1194 .lower();
1195
1196 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1197 .legalFor({S16, S32})
1198 .scalarize(0)
1199 .lower();
1200
1201 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1202 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1203 .scalarize(0)
1204 .lower();
1205
1206 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1207 .clampScalar(0, S16, S64)
1208 .scalarize(0)
1209 .lower();
1210
1211 if (ST.has16BitInsts()) {
1213 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1214 .legalFor({S16, S32, S64})
1215 .clampScalar(0, S16, S64)
1216 .scalarize(0);
1217 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1219 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1220 .legalFor({S32, S64})
1221 .clampScalar(0, S32, S64)
1222 .scalarize(0);
1223 } else {
1225 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1226 .legalFor({S32})
1227 .customFor({S64})
1228 .clampScalar(0, S32, S64)
1229 .scalarize(0);
1230 }
1231
1233 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1234 .legalIf(all(isPointer(0), sameSize(0, 1)))
1235 .scalarize(0)
1236 .scalarSameSizeAs(1, 0);
1237
1239 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1240 .scalarSameSizeAs(1, 0)
1241 .scalarize(0);
1242
1243 auto &CmpBuilder =
1245 // The compare output type differs based on the register bank of the output,
1246 // so make both s1 and s32 legal.
1247 //
1248 // Scalar compares producing output in scc will be promoted to s32, as that
1249 // is the allocatable register type that will be needed for the copy from
1250 // scc. This will be promoted during RegBankSelect, and we assume something
1251 // before that won't try to use s32 result types.
1252 //
1253 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1254 // bank.
1256 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1257 .legalForCartesianProduct(
1258 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1259 if (ST.has16BitInsts()) {
1260 CmpBuilder.legalFor({{S1, S16}});
1261 }
1262
1263 CmpBuilder
1265 .clampScalar(1, S32, S64)
1266 .scalarize(0)
1267 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1268
1269 auto &FCmpBuilder =
1271 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1272
1273 if (ST.hasSALUFloatInsts())
1274 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1275
1276 FCmpBuilder
1278 .clampScalar(1, S32, S64)
1279 .scalarize(0);
1280
1281 // FIXME: fpow has a selection pattern that should move to custom lowering.
1282 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1283 if (ST.has16BitInsts())
1284 ExpOps.customFor({{S32}, {S16}});
1285 else
1286 ExpOps.customFor({S32});
1287 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1288 .scalarize(0);
1289
1291 .clampScalar(0, MinScalarFPTy, S32)
1292 .lower();
1293
1294 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1295 Log2Ops.customFor({S32});
1296 if (ST.has16BitInsts())
1297 Log2Ops.legalFor({S16});
1298 else
1299 Log2Ops.customFor({S16});
1300 Log2Ops.scalarize(0)
1301 .lower();
1302
1303 auto &LogOps =
1304 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1305 LogOps.customFor({S32, S16});
1306 LogOps.clampScalar(0, MinScalarFPTy, S32)
1307 .scalarize(0);
1308
1309 // The 64-bit versions produce 32-bit results, but only on the SALU.
1311 .legalFor({{S32, S32}, {S32, S64}})
1312 .clampScalar(0, S32, S32)
1313 .widenScalarToNextPow2(1, 32)
1314 .clampScalar(1, S32, S64)
1315 .scalarize(0)
1316 .widenScalarToNextPow2(0, 32);
1317
 1318 // If no 16-bit instruction is available, lower into different instructions.
1319 if (ST.has16BitInsts())
1320 getActionDefinitionsBuilder(G_IS_FPCLASS)
1321 .legalForCartesianProduct({S1}, FPTypes16)
1322 .widenScalarToNextPow2(1)
1323 .scalarize(0)
1324 .lower();
1325 else
1326 getActionDefinitionsBuilder(G_IS_FPCLASS)
1327 .legalForCartesianProduct({S1}, FPTypesBase)
1328 .lowerFor({S1, S16})
1329 .widenScalarToNextPow2(1)
1330 .scalarize(0)
1331 .lower();
1332
1333 // The hardware instructions return a different result on 0 than the generic
1334 // instructions expect. The hardware produces -1, but these produce the
1335 // bitwidth.
1336 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1337 .scalarize(0)
1338 .clampScalar(0, S32, S32)
1339 .clampScalar(1, S32, S64)
1340 .widenScalarToNextPow2(0, 32)
1341 .widenScalarToNextPow2(1, 32)
1342 .custom();
1343
1344 // The 64-bit versions produce 32-bit results, but only on the SALU.
1345 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1346 .legalFor({{S32, S32}, {S32, S64}})
1347 .customIf(scalarNarrowerThan(1, 32))
1348 .clampScalar(0, S32, S32)
1349 .clampScalar(1, S32, S64)
1350 .scalarize(0)
1351 .widenScalarToNextPow2(0, 32)
1352 .widenScalarToNextPow2(1, 32);
1353
1354 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1355 .legalFor({{S32, S32}, {S32, S64}})
1356 .clampScalar(0, S32, S32)
1357 .clampScalar(1, S32, S64)
1358 .scalarize(0)
1359 .widenScalarToNextPow2(0, 32)
1360 .widenScalarToNextPow2(1, 32);
1361
1362 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1363 // RegBankSelect.
1364 getActionDefinitionsBuilder(G_BITREVERSE)
1365 .legalFor({S32, S64})
1366 .clampScalar(0, S32, S64)
1367 .scalarize(0)
1369
1370 if (ST.has16BitInsts()) {
1372 .legalFor({S16, S32, V2S16})
1373 .clampMaxNumElementsStrict(0, S16, 2)
1374 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1375 // narrowScalar limitation.
1377 .clampScalar(0, S16, S32)
1378 .scalarize(0);
1379
1380 if (ST.hasVOP3PInsts()) {
1382 .legalFor({S32, S16, V2S16})
1383 .clampMaxNumElements(0, S16, 2)
1384 .minScalar(0, S16)
1386 .scalarize(0)
1387 .lower();
1388 if (ST.hasIntMinMax64()) {
1389 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1390 .legalFor({S32, S16, S64, V2S16})
1391 .clampMaxNumElements(0, S16, 2)
1392 .minScalar(0, S16)
1394 .scalarize(0)
1395 .lower();
1396 } else {
1397 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1398 .legalFor({S32, S16, V2S16})
1399 .clampMaxNumElements(0, S16, 2)
1400 .minScalar(0, S16)
1402 .scalarize(0)
1403 .lower();
1404 }
1405 } else {
1406 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1407 .legalFor({S32, S16})
1408 .widenScalarToNextPow2(0)
1409 .minScalar(0, S16)
1410 .scalarize(0)
1411 .lower();
1412 }
1413 } else {
1414 // TODO: Should have same legality without v_perm_b32
1416 .legalFor({S32})
1417 .lowerIf(scalarNarrowerThan(0, 32))
1418 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1419 // narrowScalar limitation.
1421 .maxScalar(0, S32)
1422 .scalarize(0)
1423 .lower();
1424
1425 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1426 .legalFor({S32})
1427 .minScalar(0, S32)
1429 .scalarize(0)
1430 .lower();
1431 }
1432
1433 getActionDefinitionsBuilder(G_INTTOPTR)
1434 // List the common cases
1435 .legalForCartesianProduct(AddrSpaces64, {S64})
1436 .legalForCartesianProduct(AddrSpaces32, {S32})
1437 .scalarize(0)
1438 // Accept any address space as long as the size matches
1439 .legalIf(sameSize(0, 1))
1441 [](const LegalityQuery &Query) {
1442 return std::pair(
1443 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1444 })
1445 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1446 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1447 });
1448
1449 getActionDefinitionsBuilder(G_PTRTOINT)
1450 // List the common cases
1451 .legalForCartesianProduct(AddrSpaces64, {S64})
1452 .legalForCartesianProduct(AddrSpaces32, {S32})
1453 .scalarize(0)
1454 // Accept any address space as long as the size matches
1455 .legalIf(sameSize(0, 1))
1457 [](const LegalityQuery &Query) {
1458 return std::pair(
1459 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1460 })
1461 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1462 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1463 });
1464
1465 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1466 .scalarize(0)
1467 .custom();
1468
1469 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1470 bool IsLoad) -> bool {
1471 const LLT DstTy = Query.Types[0];
1472
1473 // Split vector extloads.
1474 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1475
1476 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1477 return true;
1478
1479 const LLT PtrTy = Query.Types[1];
1480 unsigned AS = PtrTy.getAddressSpace();
1481 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1482 Query.MMODescrs[0].Ordering !=
 1483 AtomicOrdering::NotAtomic))
 1484 return true;
1485
1486 // Catch weird sized loads that don't evenly divide into the access sizes
1487 // TODO: May be able to widen depending on alignment etc.
1488 unsigned NumRegs = (MemSize + 31) / 32;
1489 if (NumRegs == 3) {
1490 if (!ST.hasDwordx3LoadStores())
1491 return true;
1492 } else {
1493 // If the alignment allows, these should have been widened.
1494 if (!isPowerOf2_32(NumRegs))
1495 return true;
1496 }
1497
1498 return false;
1499 };
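 // e.g. a 96-bit access is split unless the subtarget has dwordx3 load/stores,
 // and any access wider than the address-space limit is split.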
1500
1501 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1502 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1503 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1504
1505 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1506 // LDS
1507 // TODO: Unsupported flat for SI.
1508
1509 for (unsigned Op : {G_LOAD, G_STORE}) {
1510 const bool IsStore = Op == G_STORE;
1511
1512 auto &Actions = getActionDefinitionsBuilder(Op);
1513 // Explicitly list some common cases.
1514 // TODO: Does this help compile time at all?
1515 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1516 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1517 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1518 {S64, GlobalPtr, S64, GlobalAlign32},
1519 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1520 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1521 {S32, GlobalPtr, S8, GlobalAlign8},
1522 {S32, GlobalPtr, S16, GlobalAlign16},
1523
1524 {S32, LocalPtr, S32, 32},
1525 {S64, LocalPtr, S64, 32},
1526 {V2S32, LocalPtr, V2S32, 32},
1527 {S32, LocalPtr, S8, 8},
1528 {S32, LocalPtr, S16, 16},
1529 {V2S16, LocalPtr, S32, 32},
1530
1531 {S32, PrivatePtr, S32, 32},
1532 {S32, PrivatePtr, S8, 8},
1533 {S32, PrivatePtr, S16, 16},
1534 {V2S16, PrivatePtr, S32, 32},
1535
1536 {S32, ConstantPtr, S32, GlobalAlign32},
1537 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1538 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1539 {S64, ConstantPtr, S64, GlobalAlign32},
1540 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1541 Actions.legalIf(
1542 [=](const LegalityQuery &Query) -> bool {
1543 return isLoadStoreLegal(ST, Query);
1544 });
1545
1546 // The custom pointers (fat pointers, buffer resources) don't work with load
1547 // and store at this level. Fat pointers should have been lowered to
1548 // intrinsics before the translation to MIR.
1549 Actions.unsupportedIf(
1550 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1551
1552 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1553 // ptrtoint. This is needed to account for the fact that we can't have i128
1554 // as a register class for SelectionDAG reasons.
1555 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1556 return hasBufferRsrcWorkaround(Query.Types[0]);
1557 });
1558
1559 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1560 // 64-bits.
1561 //
1562 // TODO: Should generalize bitcast action into coerce, which will also cover
1563 // inserting addrspacecasts.
1564 Actions.customIf(typeIs(1, Constant32Ptr));
1565
1566 // Turn any illegal element vectors into something easier to deal
1567 // with. These will ultimately produce 32-bit scalar shifts to extract the
1568 // parts anyway.
1569 //
1570 // For odd 16-bit element vectors, prefer to split those into pieces with
1571 // 16-bit vector parts.
1572 Actions.bitcastIf(
1573 [=](const LegalityQuery &Query) -> bool {
1574 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1575 Query.MMODescrs[0].MemoryTy);
1576 }, bitcastToRegisterType(0));
1577
1578 if (!IsStore) {
1579 // Widen suitably aligned loads by loading extra bytes. The standard
1580 // legalization actions can't properly express widening memory operands.
1581 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1582 return shouldWidenLoad(ST, Query, G_LOAD);
1583 });
1584 }
1585
1586 // FIXME: load/store narrowing should be moved to lower action
1587 Actions
1588 .narrowScalarIf(
1589 [=](const LegalityQuery &Query) -> bool {
1590 return !Query.Types[0].isVector() &&
1591 needToSplitMemOp(Query, Op == G_LOAD);
1592 },
1593 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1594 const LLT DstTy = Query.Types[0];
1595 const LLT PtrTy = Query.Types[1];
1596
1597 const unsigned DstSize = DstTy.getSizeInBits();
1598 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1599
1600 // Split extloads.
1601 if (DstSize > MemSize)
1602 return std::pair(0, LLT::scalar(MemSize));
1603
1604 unsigned MaxSize = maxSizeForAddrSpace(
1605 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1606 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1607 if (MemSize > MaxSize)
1608 return std::pair(0, LLT::scalar(MaxSize));
1609
1610 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1611 return std::pair(0, LLT::scalar(Align));
1612 })
1613 .fewerElementsIf(
1614 [=](const LegalityQuery &Query) -> bool {
1615 return Query.Types[0].isVector() &&
1616 needToSplitMemOp(Query, Op == G_LOAD);
1617 },
1618 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1619 const LLT DstTy = Query.Types[0];
1620 const LLT PtrTy = Query.Types[1];
1621
1622 LLT EltTy = DstTy.getElementType();
1623 unsigned MaxSize = maxSizeForAddrSpace(
1624 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1625 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1626
1627 // FIXME: Handle widened to power of 2 results better. This ends
1628 // up scalarizing.
1629 // FIXME: 3 element stores scalarized on SI
1630
1631 // Split if it's too large for the address space.
1632 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1633 if (MemSize > MaxSize) {
1634 unsigned NumElts = DstTy.getNumElements();
1635 unsigned EltSize = EltTy.getSizeInBits();
1636
1637 if (MaxSize % EltSize == 0) {
1638 return std::pair(
1640 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1641 }
1642
1643 unsigned NumPieces = MemSize / MaxSize;
1644
1645 // FIXME: Refine when odd breakdowns handled
1646 // The scalars will need to be re-legalized.
1647 if (NumPieces == 1 || NumPieces >= NumElts ||
1648 NumElts % NumPieces != 0)
1649 return std::pair(0, EltTy);
1650
1651 return std::pair(0,
1652 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1653 }
1654
1655 // FIXME: We could probably handle weird extending loads better.
1656 if (DstTy.getSizeInBits() > MemSize)
1657 return std::pair(0, EltTy);
1658
1659 unsigned EltSize = EltTy.getSizeInBits();
1660 unsigned DstSize = DstTy.getSizeInBits();
1661 if (!isPowerOf2_32(DstSize)) {
1662 // We're probably decomposing an odd sized store. Try to split
1663 // to the widest type. TODO: Account for alignment. As-is it
1664 // should be OK, since the new parts will be further legalized.
1665 unsigned FloorSize = llvm::bit_floor(DstSize);
1666 return std::pair(
1668 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1669 }
1670
1671 // May need relegalization for the scalars.
1672 return std::pair(0, EltTy);
1673 })
1674 .minScalar(0, S32)
1675 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1677 .widenScalarToNextPow2(0)
1678 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1679 .lower();
1680 }
1681
1682 // FIXME: Unaligned accesses not lowered.
1683 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1684 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1685 {S32, GlobalPtr, S16, 2 * 8},
1686 {S32, LocalPtr, S8, 8},
1687 {S32, LocalPtr, S16, 16},
1688 {S32, PrivatePtr, S8, 8},
1689 {S32, PrivatePtr, S16, 16},
1690 {S32, ConstantPtr, S8, 8},
1691 {S32, ConstantPtr, S16, 2 * 8}})
1692 .legalIf(
1693 [=](const LegalityQuery &Query) -> bool {
1694 return isLoadStoreLegal(ST, Query);
1695 });
1696
1697 if (ST.hasFlatAddressSpace()) {
1698 ExtLoads.legalForTypesWithMemDesc(
1699 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1700 }
1701
1702 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1703 // 64-bits.
1704 //
1705 // TODO: Should generalize bitcast action into coerce, which will also cover
1706 // inserting addrspacecasts.
1707 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1708
1709 ExtLoads.clampScalar(0, S32, S32)
1711 .lower();
1712
1713 auto &Atomics = getActionDefinitionsBuilder(
1714 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1715 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1716 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1717 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1718 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1719 {S64, GlobalPtr}, {S64, LocalPtr},
1720 {S32, RegionPtr}, {S64, RegionPtr}});
1721 if (ST.hasFlatAddressSpace()) {
1722 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1723 }
1724
1725 auto &Atomics32 =
1726 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1727 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1728 if (ST.hasFlatAddressSpace()) {
1729 Atomics32.legalFor({{S32, FlatPtr}});
1730 }
1731
1732 // TODO: v2bf16 operations, and fat buffer pointer support.
1733 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1734 if (ST.hasLDSFPAtomicAddF32()) {
1735 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1736 if (ST.hasLdsAtomicAddF64())
1737 Atomic.legalFor({{S64, LocalPtr}});
1738 if (ST.hasAtomicDsPkAdd16Insts())
1739 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1740 }
1741 if (ST.hasAtomicFaddInsts())
1742 Atomic.legalFor({{S32, GlobalPtr}});
1743 if (ST.hasFlatAtomicFaddF32Inst())
1744 Atomic.legalFor({{S32, FlatPtr}});
1745
1746 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1747 // These are legal with some caveats, and should have undergone expansion in
1748 // the IR in most situations
1749 // TODO: Move atomic expansion into legalizer
1750 Atomic.legalFor({
1751 {S32, GlobalPtr},
1752 {S64, GlobalPtr},
1753 {S64, FlatPtr}
1754 });
1755 }
1756
1757 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1758 ST.hasAtomicBufferGlobalPkAddF16Insts())
1759 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1760 if (ST.hasAtomicGlobalPkAddBF16Inst())
1761 Atomic.legalFor({{V2BF16, GlobalPtr}});
1762 if (ST.hasAtomicFlatPkAdd16Insts())
1763 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1764
1765
1766 // Most of the legalization work here is done by AtomicExpand. We could
1767 // probably use a simpler legality rule that just assumes anything is OK.
1768 auto &AtomicFMinFMax =
1769 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1770 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1771
1772 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1773 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1774 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1775 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1776 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1777 AtomicFMinFMax.legalFor({F32, FlatPtr});
1778 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1779 AtomicFMinFMax.legalFor({F64, FlatPtr});
1780
1781 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1782 // demarshalling
1783 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1784 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1785 {S32, FlatPtr}, {S64, FlatPtr}})
1786 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1787 {S32, RegionPtr}, {S64, RegionPtr}});
1788 // TODO: Pointer types, any 32-bit or 64-bit vector
1789
1790 // Condition should be s32 for scalar, s1 for vector.
1793 LocalPtr, FlatPtr, PrivatePtr,
1794 LLT::fixed_vector(2, LocalPtr),
1795 LLT::fixed_vector(2, PrivatePtr)},
1796 {S1, S32})
1797 .clampScalar(0, S16, S64)
1798 .scalarize(1)
1801 .clampMaxNumElements(0, S32, 2)
1802 .clampMaxNumElements(0, LocalPtr, 2)
1803 .clampMaxNumElements(0, PrivatePtr, 2)
1804 .scalarize(0)
1806 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1807
1808 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1809 // be more flexible with the shift amount type.
1810 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1811 .legalFor({{S32, S32}, {S64, S32}});
1812 if (ST.has16BitInsts()) {
1813 if (ST.hasVOP3PInsts()) {
1814 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1815 .clampMaxNumElements(0, S16, 2);
1816 } else
1817 Shifts.legalFor({{S16, S16}});
1818
1819 // TODO: Support 16-bit shift amounts for all types
1820 Shifts.widenScalarIf(
1821 [=](const LegalityQuery &Query) {
1822 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1823 // 32-bit amount.
1824 const LLT ValTy = Query.Types[0];
1825 const LLT AmountTy = Query.Types[1];
1826 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1827 AmountTy.getSizeInBits() < 16;
1828 }, changeTo(1, S16));
1829 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1830 Shifts.clampScalar(1, S32, S32);
1831 Shifts.widenScalarToNextPow2(0, 16);
1832 Shifts.clampScalar(0, S16, S64);
1833
1834 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1835 .minScalar(0, S16)
1836 .scalarize(0)
1837 .lower();
1838 } else {
1839 // Make sure we legalize the shift amount type first, as the general
1840 // expansion for the shifted type will produce much worse code if it hasn't
1841 // been truncated already.
1842 Shifts.clampScalar(1, S32, S32);
1843 Shifts.widenScalarToNextPow2(0, 32);
1844 Shifts.clampScalar(0, S32, S64);
1845
1846 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1847 .minScalar(0, S32)
1848 .scalarize(0)
1849 .lower();
1850 }
1851 Shifts.scalarize(0);
1852
1853 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1854 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1855 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1856 unsigned IdxTypeIdx = 2;
1857
1858 getActionDefinitionsBuilder(Op)
1859 .customIf([=](const LegalityQuery &Query) {
1860 const LLT EltTy = Query.Types[EltTypeIdx];
1861 const LLT VecTy = Query.Types[VecTypeIdx];
1862 const LLT IdxTy = Query.Types[IdxTypeIdx];
1863 const unsigned EltSize = EltTy.getSizeInBits();
1864 const bool isLegalVecType =
1865 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1866 // Address space 8 pointers are 128-bit wide values, but the logic
1867 // below will try to bitcast them to 2N x s64, which will fail.
1868 // Therefore, as an intermediate step, ptrtoint the vector and scalar
1869 // arguments (and inttoptr the extraction result) so the operation becomes
1870 // one on a vector of integers, which the logic below can handle.
1872 if (EltTy.isPointer() && EltSize > 64)
1873 return true;
1874 return (EltSize == 32 || EltSize == 64) &&
1875 VecTy.getSizeInBits() % 32 == 0 &&
1876 VecTy.getSizeInBits() <= MaxRegisterSize &&
1877 IdxTy.getSizeInBits() == 32 &&
1878 isLegalVecType;
1879 })
1880 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1881 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1882 bitcastToVectorElement32(VecTypeIdx))
1883 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1884 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1885 scalarOrEltWiderThan(VecTypeIdx, 64)),
1886 [=](const LegalityQuery &Query) {
1887 // For > 64-bit element types, try to turn this into a
1888 // 64-bit element vector since we may be able to do better
1889 // indexing if this is scalar. If not, fall back to 32.
1890 const LLT EltTy = Query.Types[EltTypeIdx];
1891 const LLT VecTy = Query.Types[VecTypeIdx];
1892 const unsigned DstEltSize = EltTy.getSizeInBits();
1893 const unsigned VecSize = VecTy.getSizeInBits();
1894
1895 const unsigned TargetEltSize =
1896 DstEltSize % 64 == 0 ? 64 : 32;
1897 return std::pair(VecTypeIdx,
1898 LLT::fixed_vector(VecSize / TargetEltSize,
1899 TargetEltSize));
1900 })
1901 .clampScalar(EltTypeIdx, S32, S64)
1902 .clampScalar(VecTypeIdx, S32, S64)
1903 .clampScalar(IdxTypeIdx, S32, S32)
1904 .clampMaxNumElements(VecTypeIdx, S32, 32)
1905 // TODO: Clamp elements for 64-bit vectors?
1906 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1908 // It should only be necessary with variable indexes.
1909 // As a last resort, lower to the stack
1910 .lower();
1911 }
1912
1913 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1914 .unsupportedIf([=](const LegalityQuery &Query) {
1915 const LLT &EltTy = Query.Types[1].getElementType();
1916 return Query.Types[0] != EltTy;
1917 });
1918
1919 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1920 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1921 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1922
1923 // FIXME: Doesn't handle extract of illegal sizes.
1924 getActionDefinitionsBuilder(Op)
1925 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1926 .lowerIf([=](const LegalityQuery &Query) {
1927 // Sub-vector (or single-element) insert and extract.
1928 // TODO: verify immediate offset here since lower only works with
1929 // whole elements.
1930 const LLT BigTy = Query.Types[BigTyIdx];
1931 return BigTy.isVector();
1932 })
1933 // FIXME: Multiples of 16 should not be legal.
1934 .legalIf([=](const LegalityQuery &Query) {
1935 const LLT BigTy = Query.Types[BigTyIdx];
1936 const LLT LitTy = Query.Types[LitTyIdx];
1937 return (BigTy.getSizeInBits() % 32 == 0) &&
1938 (LitTy.getSizeInBits() % 16 == 0);
1939 })
1940 .widenScalarIf(
1941 [=](const LegalityQuery &Query) {
1942 const LLT BigTy = Query.Types[BigTyIdx];
1943 return (BigTy.getScalarSizeInBits() < 16);
1944 },
1946 .widenScalarIf(
1947 [=](const LegalityQuery &Query) {
1948 const LLT LitTy = Query.Types[LitTyIdx];
1949 return (LitTy.getScalarSizeInBits() < 16);
1950 },
1952 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1953 .widenScalarToNextPow2(BigTyIdx, 32);
1954
1955 }
1956
1957 auto &BuildVector =
1958 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1960 .legalForCartesianProduct(AllS64Vectors, {S64})
1961 .clampNumElements(0, V16S32, V32S32)
1966
1967 if (ST.hasScalarPackInsts()) {
1968 BuildVector
1969 // FIXME: Should probably widen s1 vectors straight to s32
1970 .minScalarOrElt(0, S16)
1971 .minScalar(1, S16);
1972
1973 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1974 .legalFor({V2S16, S32})
1975 .lower();
1976 } else {
1977 BuildVector.customFor({V2S16, S16});
1978 BuildVector.minScalarOrElt(0, S32);
1979
1980 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1981 .customFor({V2S16, S32})
1982 .lower();
1983 }
1984
1985 BuildVector.legalIf(isRegisterType(ST, 0));
1986
1987 // FIXME: Clamp maximum size
1988 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1989 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1990 .clampMaxNumElements(0, S32, 32)
1991 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1992 .clampMaxNumElements(0, S16, 64);
1993
1994 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1995
1996 // Merge/Unmerge
1997 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1998 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1999 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2000
2001 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2002 const LLT Ty = Query.Types[TypeIdx];
2003 if (Ty.isVector()) {
2004 const LLT &EltTy = Ty.getElementType();
2005 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2006 return true;
2008 return true;
2009 }
2010 return false;
2011 };
2012
2013 auto &Builder =
2014 getActionDefinitionsBuilder(Op)
2015 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2016 .lowerFor({{S16, V2S16}})
2017 .lowerIf([=](const LegalityQuery &Query) {
2018 const LLT BigTy = Query.Types[BigTyIdx];
2019 return BigTy.getSizeInBits() == 32;
2020 })
2021 // Try to widen to s16 first for small types.
2022 // TODO: Only do this on targets with legal s16 shifts
2023 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2024 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2026 oneMoreElement(BigTyIdx))
2028 elementTypeIs(1, S16)),
2029 changeTo(1, V2S16))
2030 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2031 // not worth considering the multiples of 64 since 2*192 and 2*384
2032 // are not valid.
2033 .clampScalar(LitTyIdx, S32, S512)
2034 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2035 // Break up vectors with weird elements into scalars
2037 [=](const LegalityQuery &Query) {
2038 return notValidElt(Query, LitTyIdx);
2039 },
2040 scalarize(0))
2041 .fewerElementsIf(
2042 [=](const LegalityQuery &Query) {
2043 return notValidElt(Query, BigTyIdx);
2044 },
2045 scalarize(1))
2046 .clampScalar(BigTyIdx, S32, MaxScalar);
2047
2048 if (Op == G_MERGE_VALUES) {
2049 Builder.widenScalarIf(
2050 // TODO: Use 16-bit shifts if legal for 8-bit values?
2051 [=](const LegalityQuery &Query) {
2052 const LLT Ty = Query.Types[LitTyIdx];
2053 return Ty.getSizeInBits() < 32;
2054 },
2055 changeTo(LitTyIdx, S32));
2056 }
2057
2058 Builder.widenScalarIf(
2059 [=](const LegalityQuery &Query) {
2060 const LLT Ty = Query.Types[BigTyIdx];
2061 return Ty.getSizeInBits() % 16 != 0;
2062 },
2063 [=](const LegalityQuery &Query) {
2064 // Pick the next power of 2, or a multiple of 64 if over 128 bits,
2065 // whichever is smaller.
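// For example, a 300-bit type rounds to 512 as a power of 2 but to 320 as a
// multiple of 64, so 320 is used; a 100-bit type rounds to 128, which stays
// below the 256-bit threshold.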
2066 const LLT &Ty = Query.Types[BigTyIdx];
2067 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2068 if (NewSizeInBits >= 256) {
2069 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2070 if (RoundedTo < NewSizeInBits)
2071 NewSizeInBits = RoundedTo;
2072 }
2073 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2074 })
2075 // Any vectors left are the wrong size. Scalarize them.
2076 .scalarize(0)
2077 .scalarize(1);
2078 }
2079
2080 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2081 // RegBankSelect.
2082 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2083 .legalFor({{S32}, {S64}})
2084 .clampScalar(0, S32, S64);
2085
2086 if (ST.hasVOP3PInsts()) {
2087 SextInReg.lowerFor({{V2S16}})
2088 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2089 // get more vector shift opportunities, since we'll get those when
2090 // expanded.
2091 .clampMaxNumElementsStrict(0, S16, 2);
2092 } else if (ST.has16BitInsts()) {
2093 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2094 } else {
2095 // Prefer to promote to s32 before lowering if we don't have 16-bit
2096 // shifts. This avoids a lot of intermediate truncate and extend operations.
2097 SextInReg.lowerFor({{S32}, {S64}});
2098 }
2099
2100 SextInReg
2101 .scalarize(0)
2102 .clampScalar(0, S32, S64)
2103 .lower();
2104
2105 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2106 .scalarize(0)
2107 .lower();
2108
2109 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2110 FSHRActionDefs.legalFor({{S32, S32}})
2111 .clampMaxNumElementsStrict(0, S16, 2);
2112 if (ST.hasVOP3PInsts())
2113 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2114 FSHRActionDefs.scalarize(0).lower();
2115
2116 if (ST.hasVOP3PInsts()) {
2117 getActionDefinitionsBuilder(G_FSHL)
2118 .lowerFor({{V2S16, V2S16}})
2119 .clampMaxNumElementsStrict(0, S16, 2)
2120 .scalarize(0)
2121 .lower();
2122 } else {
2123 getActionDefinitionsBuilder(G_FSHL)
2124 .scalarize(0)
2125 .lower();
2126 }
2127
2128 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2129 .legalFor({S64});
2130
2131 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2132
2134 .alwaysLegal();
2135
2136 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2137 .scalarize(0)
2138 .minScalar(0, S32)
2139 .lower();
2140
2141 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2142 .legalFor({{S32, S32}, {S64, S32}})
2143 .clampScalar(1, S32, S32)
2144 .clampScalar(0, S32, S64)
2146 .scalarize(0);
2147
2149 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2150 G_FCOPYSIGN,
2151
2152 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2153 G_READ_REGISTER, G_WRITE_REGISTER,
2154
2155 G_SADDO, G_SSUBO})
2156 .lower();
2157
2158 if (ST.hasIEEEMinimumMaximumInsts()) {
2159 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2160 .legalFor(FPTypesPK16)
2161 .clampMaxNumElements(0, S16, 2)
2162 .scalarize(0);
2163 } else if (ST.hasVOP3PInsts()) {
2164 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2165 .lowerFor({V2S16})
2166 .clampMaxNumElementsStrict(0, S16, 2)
2167 .scalarize(0)
2168 .lower();
2169 } else {
2170 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2171 .scalarize(0)
2172 .clampScalar(0, S32, S64)
2173 .lower();
2174 }
2175
2176 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2177 .lower();
2178
2179 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2180
2181 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2182 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2183 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2184 .unsupported();
2185
2187
2189 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2190 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2191 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2192 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2193 .legalFor(AllVectors)
2194 .scalarize(1)
2195 .lower();
2196
2197 getLegacyLegalizerInfo().computeTables();
2198 verify(*ST.getInstrInfo());
2199}
2200
2201bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
2202 MachineInstr &MI,
2203 LostDebugLocObserver &LocObserver) const {
2204 MachineIRBuilder &B = Helper.MIRBuilder;
2205 MachineRegisterInfo &MRI = *B.getMRI();
2206
2207 switch (MI.getOpcode()) {
2208 case TargetOpcode::G_ADDRSPACE_CAST:
2209 return legalizeAddrSpaceCast(MI, MRI, B);
2210 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2211 return legalizeFroundeven(MI, MRI, B);
2212 case TargetOpcode::G_FCEIL:
2213 return legalizeFceil(MI, MRI, B);
2214 case TargetOpcode::G_FREM:
2215 return legalizeFrem(MI, MRI, B);
2216 case TargetOpcode::G_INTRINSIC_TRUNC:
2217 return legalizeIntrinsicTrunc(MI, MRI, B);
2218 case TargetOpcode::G_SITOFP:
2219 return legalizeITOFP(MI, MRI, B, true);
2220 case TargetOpcode::G_UITOFP:
2221 return legalizeITOFP(MI, MRI, B, false);
2222 case TargetOpcode::G_FPTOSI:
2223 return legalizeFPTOI(MI, MRI, B, true);
2224 case TargetOpcode::G_FPTOUI:
2225 return legalizeFPTOI(MI, MRI, B, false);
2226 case TargetOpcode::G_FMINNUM:
2227 case TargetOpcode::G_FMAXNUM:
2228 case TargetOpcode::G_FMINIMUMNUM:
2229 case TargetOpcode::G_FMAXIMUMNUM:
2230 return legalizeMinNumMaxNum(Helper, MI);
2231 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2232 return legalizeExtractVectorElt(MI, MRI, B);
2233 case TargetOpcode::G_INSERT_VECTOR_ELT:
2234 return legalizeInsertVectorElt(MI, MRI, B);
2235 case TargetOpcode::G_FSIN:
2236 case TargetOpcode::G_FCOS:
2237 return legalizeSinCos(MI, MRI, B);
2238 case TargetOpcode::G_GLOBAL_VALUE:
2239 return legalizeGlobalValue(MI, MRI, B);
2240 case TargetOpcode::G_LOAD:
2241 case TargetOpcode::G_SEXTLOAD:
2242 case TargetOpcode::G_ZEXTLOAD:
2243 return legalizeLoad(Helper, MI);
2244 case TargetOpcode::G_STORE:
2245 return legalizeStore(Helper, MI);
2246 case TargetOpcode::G_FMAD:
2247 return legalizeFMad(MI, MRI, B);
2248 case TargetOpcode::G_FDIV:
2249 return legalizeFDIV(MI, MRI, B);
2250 case TargetOpcode::G_FFREXP:
2251 return legalizeFFREXP(MI, MRI, B);
2252 case TargetOpcode::G_FSQRT:
2253 return legalizeFSQRT(MI, MRI, B);
2254 case TargetOpcode::G_UDIV:
2255 case TargetOpcode::G_UREM:
2256 case TargetOpcode::G_UDIVREM:
2257 return legalizeUnsignedDIV_REM(MI, MRI, B);
2258 case TargetOpcode::G_SDIV:
2259 case TargetOpcode::G_SREM:
2260 case TargetOpcode::G_SDIVREM:
2261 return legalizeSignedDIV_REM(MI, MRI, B);
2262 case TargetOpcode::G_ATOMIC_CMPXCHG:
2263 return legalizeAtomicCmpXChg(MI, MRI, B);
2264 case TargetOpcode::G_FLOG2:
2265 return legalizeFlog2(MI, B);
2266 case TargetOpcode::G_FLOG:
2267 case TargetOpcode::G_FLOG10:
2268 return legalizeFlogCommon(MI, B);
2269 case TargetOpcode::G_FEXP2:
2270 return legalizeFExp2(MI, B);
2271 case TargetOpcode::G_FEXP:
2272 case TargetOpcode::G_FEXP10:
2273 return legalizeFExp(MI, B);
2274 case TargetOpcode::G_FPOW:
2275 return legalizeFPow(MI, B);
2276 case TargetOpcode::G_FFLOOR:
2277 return legalizeFFloor(MI, MRI, B);
2278 case TargetOpcode::G_BUILD_VECTOR:
2279 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2280 return legalizeBuildVector(MI, MRI, B);
2281 case TargetOpcode::G_MUL:
2282 return legalizeMul(Helper, MI);
2283 case TargetOpcode::G_CTLZ:
2284 case TargetOpcode::G_CTTZ:
2285 return legalizeCTLZ_CTTZ(MI, MRI, B);
2286 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2287 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2288 case TargetOpcode::G_STACKSAVE:
2289 return legalizeStackSave(MI, B);
2290 case TargetOpcode::G_GET_FPENV:
2291 return legalizeGetFPEnv(MI, MRI, B);
2292 case TargetOpcode::G_SET_FPENV:
2293 return legalizeSetFPEnv(MI, MRI, B);
2294 case TargetOpcode::G_TRAP:
2295 return legalizeTrap(MI, MRI, B);
2296 case TargetOpcode::G_DEBUGTRAP:
2297 return legalizeDebugTrap(MI, MRI, B);
2298 default:
2299 return false;
2300 }
2301
2302 llvm_unreachable("expected switch to return");
2303}
2304
2306 unsigned AS,
2308 MachineIRBuilder &B) const {
2309 MachineFunction &MF = B.getMF();
2310 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2311 const LLT S32 = LLT::scalar(32);
2312 const LLT S64 = LLT::scalar(64);
2313
2315
2316 if (ST.hasApertureRegs()) {
2317 // Note: this register is somewhat broken. When used as a 32-bit operand,
2318 // it only returns zeroes. The real value is in the upper 32 bits.
2319 // Thus, we must extract the high 32 bits.
2320 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2321 ? AMDGPU::SRC_SHARED_BASE
2322 : AMDGPU::SRC_PRIVATE_BASE;
2323 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2324 !ST.hasGloballyAddressableScratch()) &&
2325 "Cannot use src_private_base with globally addressable scratch!");
2326 Register Dst = MRI.createGenericVirtualRegister(S64);
2327 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2328 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2329 return B.buildUnmerge(S32, Dst).getReg(1);
2330 }
2331
2332 Register LoadAddr = MRI.createGenericVirtualRegister(
2334 // For code object version 5, private_base and shared_base are passed through
2335 // implicit kernargs.
2339
2344 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2345
2346 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2348
2349 if (!loadInputValue(KernargPtrReg, B,
2351 return Register();
2352
2354 PtrInfo.getWithOffset(Offset),
2358
2359 // Pointer address
2360 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2361 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2362 // Load address
2363 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2364 }
2365
2366 Register QueuePtr = MRI.createGenericVirtualRegister(
2368
2370 return Register();
2371
2372 // TODO: Use custom PseudoSourceValue
2374
2375 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2376 // private_segment_aperture_base_hi.
2377 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2378
2380 PtrInfo,
2383 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2384
2385 B.buildObjectPtrOffset(
2386 LoadAddr, QueuePtr,
2387 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2388 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2389}
2390
2391/// Return true if the value is a known valid address, such that a null check is
2392/// not necessary.
2394 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2395 MachineInstr *Def = MRI.getVRegDef(Val);
2396 switch (Def->getOpcode()) {
2397 case AMDGPU::G_FRAME_INDEX:
2398 case AMDGPU::G_GLOBAL_VALUE:
2399 case AMDGPU::G_BLOCK_ADDR:
2400 return true;
2401 case AMDGPU::G_CONSTANT: {
2402 const ConstantInt *CI = Def->getOperand(1).getCImm();
2403 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2404 }
2405 default:
2406 return false;
2407 }
2408
2409 return false;
2410}
2411
2414 MachineIRBuilder &B) const {
2415 MachineFunction &MF = B.getMF();
2416
2417 // MI can either be a G_ADDRSPACE_CAST or a
2418 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2419 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2420 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2421 Intrinsic::amdgcn_addrspacecast_nonnull));
2422
2423 const LLT S32 = LLT::scalar(32);
2424 Register Dst = MI.getOperand(0).getReg();
2425 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2426 : MI.getOperand(1).getReg();
2427 LLT DstTy = MRI.getType(Dst);
2428 LLT SrcTy = MRI.getType(Src);
2429 unsigned DestAS = DstTy.getAddressSpace();
2430 unsigned SrcAS = SrcTy.getAddressSpace();
2431
2432 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2433 // vector element.
2434 assert(!DstTy.isVector());
2435
2436 const AMDGPUTargetMachine &TM
2437 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2438
2439 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2440 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2441 return true;
2442 }
2443
2444 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2445 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2446 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2447 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2448 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2449 ST.hasGloballyAddressableScratch()) {
2450 // flat -> private with globally addressable scratch: subtract
2451 // src_flat_scratch_base_lo.
2452 const LLT S32 = LLT::scalar(32);
2453 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2454 Register FlatScratchBaseLo =
2455 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2456 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2457 .getReg(0);
2458 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2459 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2460 return B.buildIntToPtr(Dst, Sub).getReg(0);
2461 }
2462
2463 // Extract low 32-bits of the pointer.
2464 return B.buildExtract(Dst, Src, 0).getReg(0);
2465 };
2466
2467 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2468 // G_ADDRSPACE_CAST we need to guess.
2469 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2470 castFlatToLocalOrPrivate(Dst);
2471 MI.eraseFromParent();
2472 return true;
2473 }
2474
2475 unsigned NullVal = TM.getNullPointerValue(DestAS);
2476
2477 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2478 auto FlatNull = B.buildConstant(SrcTy, 0);
2479
2480 // Extract low 32-bits of the pointer.
2481 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2482
2483 auto CmpRes =
2484 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2485 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2486
2487 MI.eraseFromParent();
2488 return true;
2489 }
2490
2491 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2492 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2493 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2494 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2495 // Coerce the type of the low half of the result so we can use
2496 // merge_values.
2497 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2498
2499 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2500 ST.hasGloballyAddressableScratch()) {
2501 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2502 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2503 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2504 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2505 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2506 .addUse(AllOnes)
2507 .addUse(ThreadID)
2508 .getReg(0);
2509 if (ST.isWave64()) {
2510 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2511 .addUse(AllOnes)
2512 .addUse(ThreadID)
2513 .getReg(0);
2514 }
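// The TID field sits at bit 57 - log2(wavesize) of the full 64-bit address;
// the shift below is applied only to the high 32-bit word, hence the extra
// -32 in the shift amount.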
2515 Register ShAmt =
2516 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2517 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2518 Register CvtPtr =
2519 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2520 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2521 // 64-bit hi:lo value.
2522 Register FlatScratchBase =
2523 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2524 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2525 .getReg(0);
2526 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2527 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2528 }
2529
2530 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2531 if (!ApertureReg.isValid())
2532 return false;
2533
2534 // TODO: Should we allow mismatched types but matching sizes in merges to
2535 // avoid the ptrtoint?
2536 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2537 };
2538
2539 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2540 // G_ADDRSPACE_CAST we need to guess.
2541 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2542 castLocalOrPrivateToFlat(Dst);
2543 MI.eraseFromParent();
2544 return true;
2545 }
2546
2547 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2548
2549 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2550 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2551
2552 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2553 SegmentNull.getReg(0));
2554
2555 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2556
2557 MI.eraseFromParent();
2558 return true;
2559 }
2560
2561 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2562 SrcTy.getSizeInBits() == 64) {
2563 // Truncate.
2564 B.buildExtract(Dst, Src, 0);
2565 MI.eraseFromParent();
2566 return true;
2567 }
2568
2569 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2570 DstTy.getSizeInBits() == 64) {
2572 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2573 auto PtrLo = B.buildPtrToInt(S32, Src);
2574 if (AddrHiVal == 0) {
2575 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2576 B.buildIntToPtr(Dst, Zext);
2577 } else {
2578 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2579 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2580 }
2581
2582 MI.eraseFromParent();
2583 return true;
2584 }
2585
2586 // Invalid casts are poison.
2587 // TODO: Should return poison
2588 B.buildUndef(Dst);
2589 MI.eraseFromParent();
2590 return true;
2591}
2592
2595 MachineIRBuilder &B) const {
2596 Register Src = MI.getOperand(1).getReg();
2597 LLT Ty = MRI.getType(Src);
2598 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2599
2600 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2601 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2602
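// Adding and then subtracting copysign(2^52, Src) rounds Src to an integer in
// the default round-to-nearest-even mode, since doubles with magnitude >= 2^52
// have no fractional bits. Inputs already at least that large (|Src| > C2Val)
// are returned unchanged by the select below.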
2603 auto C1 = B.buildFConstant(Ty, C1Val);
2604 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2605
2606 // TODO: Should this propagate fast-math-flags?
2607 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2608 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2609
2610 auto C2 = B.buildFConstant(Ty, C2Val);
2611 auto Fabs = B.buildFAbs(Ty, Src);
2612
2613 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2614 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2615 MI.eraseFromParent();
2616 return true;
2617}
2618
2621 MachineIRBuilder &B) const {
2622
2623 const LLT S1 = LLT::scalar(1);
2624 const LLT S64 = LLT::scalar(64);
2625
2626 Register Src = MI.getOperand(1).getReg();
2627 assert(MRI.getType(Src) == S64);
2628
2629 // result = trunc(src)
2630 // if (src > 0.0 && src != result)
2631 // result += 1.0
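// For example, src = 2.3: trunc = 2.0, and since 2.3 > 0.0 and 2.3 != 2.0 the
// result is 3.0. For src = -2.3: trunc = -2.0 is already the ceiling, so the
// select adds 0.0.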
2632
2633 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2634
2635 const auto Zero = B.buildFConstant(S64, 0.0);
2636 const auto One = B.buildFConstant(S64, 1.0);
2637 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2638 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2639 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2640 auto Add = B.buildSelect(S64, And, One, Zero);
2641
2642 // TODO: Should this propagate fast-math-flags?
2643 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2644 MI.eraseFromParent();
2645 return true;
2646}
2647
2650 MachineIRBuilder &B) const {
2651 Register DstReg = MI.getOperand(0).getReg();
2652 Register Src0Reg = MI.getOperand(1).getReg();
2653 Register Src1Reg = MI.getOperand(2).getReg();
2654 auto Flags = MI.getFlags();
2655 LLT Ty = MRI.getType(DstReg);
2656
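// frem(x, y) == x - trunc(x / y) * y, computed below as fma(-trunc(x / y), y, x).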
2657 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2658 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2659 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2660 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2661 MI.eraseFromParent();
2662 return true;
2663}
2664
2667 const unsigned FractBits = 52;
2668 const unsigned ExpBits = 11;
2669 LLT S32 = LLT::scalar(32);
2670
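// The biased exponent occupies ExpBits = 11 bits starting at bit 20 of the
// high word (bit 52 of the full double), so ubfe(Hi, 20, 11) extracts it;
// subtracting the IEEE-754 bias of 1023 gives the unbiased exponent.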
2671 auto Const0 = B.buildConstant(S32, FractBits - 32);
2672 auto Const1 = B.buildConstant(S32, ExpBits);
2673
2674 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2675 .addUse(Hi)
2676 .addUse(Const0.getReg(0))
2677 .addUse(Const1.getReg(0));
2678
2679 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2680}
2681
2684 MachineIRBuilder &B) const {
2685 const LLT S1 = LLT::scalar(1);
2686 const LLT S32 = LLT::scalar(32);
2687 const LLT S64 = LLT::scalar(64);
2688
2689 Register Src = MI.getOperand(1).getReg();
2690 assert(MRI.getType(Src) == S64);
2691
2692 // TODO: Should this use extract since the low half is unused?
2693 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2694 Register Hi = Unmerge.getReg(1);
2695
2696 // Extract the upper half, since this is where we will find the sign and
2697 // exponent.
2698 auto Exp = extractF64Exponent(Hi, B);
2699
2700 const unsigned FractBits = 52;
2701
2702 // Extract the sign bit.
2703 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2704 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2705
2706 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2707
2708 const auto Zero32 = B.buildConstant(S32, 0);
2709
2710 // Extend back to 64-bits.
2711 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2712
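// Arithmetic-shifting FractMask right by Exp leaves a mask of the bits that
// hold the fractional part for this exponent; clearing those bits in Src
// truncates the value toward zero. An exponent below 0 means |Src| < 1, so the
// result is a signed zero, and an exponent above 51 means Src is already an
// integer.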
2713 auto Shr = B.buildAShr(S64, FractMask, Exp);
2714 auto Not = B.buildNot(S64, Shr);
2715 auto Tmp0 = B.buildAnd(S64, Src, Not);
2716 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2717
2718 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2719 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2720
2721 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2722 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2723 MI.eraseFromParent();
2724 return true;
2725}
2726
2729 MachineIRBuilder &B, bool Signed) const {
2730
2731 Register Dst = MI.getOperand(0).getReg();
2732 Register Src = MI.getOperand(1).getReg();
2733
2734 const LLT S64 = LLT::scalar(64);
2735 const LLT S32 = LLT::scalar(32);
2736
2737 assert(MRI.getType(Src) == S64);
2738
2739 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2740 auto ThirtyTwo = B.buildConstant(S32, 32);
2741
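// A 64-bit integer is converted as two 32-bit halves:
//   fp(hi:lo) = ldexp(itofp(hi), 32) + uitofp(lo)
// where only the high half carries the requested signedness.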
2742 if (MRI.getType(Dst) == S64) {
2743 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2744 : B.buildUITOFP(S64, Unmerge.getReg(1));
2745
2746 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2747 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2748
2749 // TODO: Should this propagate fast-math-flags?
2750 B.buildFAdd(Dst, LdExp, CvtLo);
2751 MI.eraseFromParent();
2752 return true;
2753 }
2754
2755 assert(MRI.getType(Dst) == S32);
2756
2757 auto One = B.buildConstant(S32, 1);
2758
2759 MachineInstrBuilder ShAmt;
2760 if (Signed) {
2761 auto ThirtyOne = B.buildConstant(S32, 31);
2762 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2763 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2764 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2765 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2766 .addUse(Unmerge.getReg(1));
2767 auto LS2 = B.buildSub(S32, LS, One);
2768 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2769 } else
2770 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2771 auto Norm = B.buildShl(S64, Src, ShAmt);
2772 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2773 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2774 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2775 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2776 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2777 B.buildFLdexp(Dst, FVal, Scale);
2778 MI.eraseFromParent();
2779 return true;
2780}
2781
2782// TODO: Copied from DAG implementation. Verify logic and document how this
2783// actually works.
2787 bool Signed) const {
2788
2789 Register Dst = MI.getOperand(0).getReg();
2790 Register Src = MI.getOperand(1).getReg();
2791
2792 const LLT S64 = LLT::scalar(64);
2793 const LLT S32 = LLT::scalar(32);
2794
2795 const LLT SrcLT = MRI.getType(Src);
2796 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2797
2798 unsigned Flags = MI.getFlags();
2799
2800 // The basic idea of converting a floating point number into a pair of 32-bit
2801 // integers is illustrated as follows:
2802 //
2803 // tf := trunc(val);
2804 // hif := floor(tf * 2^-32);
2805 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2806 // hi := fptoi(hif);
2807 // lo := fptoi(lof);
2808 //
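// For example, with val = 2^40 + 5: hif = floor(val * 2^-32) = 256 and
// lof = val - 256 * 2^32 = 5, so hi = 256 and lo = 5 reassemble the original
// 64-bit value.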
2809 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2810 MachineInstrBuilder Sign;
2811 if (Signed && SrcLT == S32) {
2812 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2813 // is not enough to hold all the significant bits of `lof` if val is
2814 // negative. To avoid the loss of precision, we take the absolute value
2815 // after truncating and flip the result back based on the original
2816 // signedness.
2817 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2818 Trunc = B.buildFAbs(S32, Trunc, Flags);
2819 }
2820 MachineInstrBuilder K0, K1;
2821 if (SrcLT == S64) {
2822 K0 = B.buildFConstant(
2823 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2824 K1 = B.buildFConstant(
2825 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2826 } else {
2827 K0 = B.buildFConstant(
2828 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2829 K1 = B.buildFConstant(
2830 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2831 }
2832
2833 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2834 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2835 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2836
2837 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2838 : B.buildFPTOUI(S32, FloorMul);
2839 auto Lo = B.buildFPTOUI(S32, Fma);
2840
2841 if (Signed && SrcLT == S32) {
2842 // Flip the result based on the signedness, which is either all 0s or 1s.
2843 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2844 // r := xor({lo, hi}, sign) - sign;
2845 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2846 Sign);
2847 } else
2848 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2849 MI.eraseFromParent();
2850
2851 return true;
2852}
2853
2855 MachineInstr &MI) const {
2856 MachineFunction &MF = Helper.MIRBuilder.getMF();
2858
2859 // With ieee_mode disabled, the instructions have the correct behavior.
2860 if (!MFI->getMode().IEEE)
2861 return true;
2862
2864}
2865
2868 MachineIRBuilder &B) const {
2869 // TODO: Should move some of this into LegalizerHelper.
2870
2871 // TODO: Promote dynamic indexing of s16 to s32
2872
2873 Register Dst = MI.getOperand(0).getReg();
2874 Register Vec = MI.getOperand(1).getReg();
2875
2876 LLT VecTy = MRI.getType(Vec);
2877 LLT EltTy = VecTy.getElementType();
2878 assert(EltTy == MRI.getType(Dst));
2879
2880 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2881 // but we can't go directly to that logic because you can't bitcast a vector
2882 // of pointers to a vector of integers. Therefore, introduce an intermediate
2883 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2884 // drive the legalization forward.
2885 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2886 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2887 LLT IntVecTy = VecTy.changeElementType(IntTy);
2888
2889 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2890 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2891 B.buildIntToPtr(Dst, IntElt);
2892
2893 MI.eraseFromParent();
2894 return true;
2895 }
2896
2897 // FIXME: Artifact combiner probably should have replaced the truncated
2898 // constant before this, so we shouldn't need
2899 // getIConstantVRegValWithLookThrough.
2900 std::optional<ValueAndVReg> MaybeIdxVal =
2901 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2902 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2903 return true;
2904 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2905
2906 if (IdxVal < VecTy.getNumElements()) {
2907 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2908 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2909 } else {
2910 B.buildUndef(Dst);
2911 }
2912
2913 MI.eraseFromParent();
2914 return true;
2915}
2916
2919 MachineIRBuilder &B) const {
2920 // TODO: Should move some of this into LegalizerHelper.
2921
2922 // TODO: Promote dynamic indexing of s16 to s32
2923
2924 Register Dst = MI.getOperand(0).getReg();
2925 Register Vec = MI.getOperand(1).getReg();
2926 Register Ins = MI.getOperand(2).getReg();
2927
2928 LLT VecTy = MRI.getType(Vec);
2929 LLT EltTy = VecTy.getElementType();
2930 assert(EltTy == MRI.getType(Ins));
2931
2932 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2933 // but we can't go directly to that logic because you can't bitcast a vector
2934 // of pointers to a vector of integers. Therefore, make the pointer vector
2935 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2936 // new value, and then inttoptr the result vector back. This will then allow
2937 // the rest of legalization to take over.
2938 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2939 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2940 LLT IntVecTy = VecTy.changeElementType(IntTy);
2941
2942 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2943 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2944 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2945 MI.getOperand(3));
2946 B.buildIntToPtr(Dst, IntVecDest);
2947 MI.eraseFromParent();
2948 return true;
2949 }
2950
2951 // FIXME: Artifact combiner probably should have replaced the truncated
2952 // constant before this, so we shouldn't need
2953 // getIConstantVRegValWithLookThrough.
2954 std::optional<ValueAndVReg> MaybeIdxVal =
2955 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2956 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2957 return true;
2958
2959 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2960
2961 unsigned NumElts = VecTy.getNumElements();
2962 if (IdxVal < NumElts) {
2964 for (unsigned i = 0; i < NumElts; ++i)
2965 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2966 B.buildUnmerge(SrcRegs, Vec);
2967
2968 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2969 B.buildMergeLikeInstr(Dst, SrcRegs);
2970 } else {
2971 B.buildUndef(Dst);
2972 }
2973
2974 MI.eraseFromParent();
2975 return true;
2976}
2977
2980 MachineIRBuilder &B) const {
2981
2982 Register DstReg = MI.getOperand(0).getReg();
2983 Register SrcReg = MI.getOperand(1).getReg();
2984 LLT Ty = MRI.getType(DstReg);
2985 unsigned Flags = MI.getFlags();
2986
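// The hardware sin/cos intrinsics expect an input pre-scaled by 1/(2*pi); on
// subtargets with a reduced valid input range, amdgcn.fract is applied first
// to bring the scaled operand into [0, 1).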
2987 Register TrigVal;
2988 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2989 if (ST.hasTrigReducedRange()) {
2990 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2991 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2992 .addUse(MulVal.getReg(0))
2993 .setMIFlags(Flags)
2994 .getReg(0);
2995 } else
2996 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2997
2998 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2999 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3000 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3001 .addUse(TrigVal)
3002 .setMIFlags(Flags);
3003 MI.eraseFromParent();
3004 return true;
3005}
3006
3009 const GlobalValue *GV,
3010 int64_t Offset,
3011 unsigned GAFlags) const {
3012 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3013 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3014 // to the following code sequence:
3015 //
3016 // For constant address space:
3017 // s_getpc_b64 s[0:1]
3018 // s_add_u32 s0, s0, $symbol
3019 // s_addc_u32 s1, s1, 0
3020 //
3021 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3022 // a fixup or relocation is emitted to replace $symbol with a literal
3023 // constant, which is a pc-relative offset from the encoding of the $symbol
3024 // operand to the global variable.
3025 //
3026 // For global address space:
3027 // s_getpc_b64 s[0:1]
3028 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3029 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3030 //
3031 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3032 // fixups or relocations are emitted to replace $symbol@*@lo and
3033 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3034 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3035 // operand to the global variable.
3036
3038
3039 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3040 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3041
3042 if (ST.has64BitLiterals()) {
3043 assert(GAFlags != SIInstrInfo::MO_NONE);
3044
3046 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3047 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3048 } else {
3050 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3051
3052 MIB.addGlobalAddress(GV, Offset, GAFlags);
3053 if (GAFlags == SIInstrInfo::MO_NONE)
3054 MIB.addImm(0);
3055 else
3056 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3057 }
3058
3059 if (!B.getMRI()->getRegClassOrNull(PCReg))
3060 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3061
3062 if (PtrTy.getSizeInBits() == 32)
3063 B.buildExtract(DstReg, PCReg, 0);
3064 return true;
3065}
3066
3067// Emit an ABS32_LO / ABS32_HI relocation stub.
3069 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3070 MachineRegisterInfo &MRI) const {
3071 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3072
3073 if (RequiresHighHalf && ST.has64BitLiterals()) {
3074 if (!MRI.getRegClassOrNull(DstReg))
3075 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3076 B.buildInstr(AMDGPU::S_MOV_B64)
3077 .addDef(DstReg)
3078 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3079 return;
3080 }
3081
3082 LLT S32 = LLT::scalar(32);
3083
3084 // Use the destination directly if and only if we only store the lower
3085 // address part and no register class has been set.
3086 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3087 ? DstReg
3088 : MRI.createGenericVirtualRegister(S32);
3089
3090 if (!MRI.getRegClassOrNull(AddrLo))
3091 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3092
3093 // Write the lower half.
3094 B.buildInstr(AMDGPU::S_MOV_B32)
3095 .addDef(AddrLo)
3096 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3097
3098 // If required, write the upper half as well.
3099 if (RequiresHighHalf) {
3100 assert(PtrTy.getSizeInBits() == 64 &&
3101 "Must provide a 64-bit pointer type!");
3102
3103 Register AddrHi = MRI.createGenericVirtualRegister(S32);
3104 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3105
3106 B.buildInstr(AMDGPU::S_MOV_B32)
3107 .addDef(AddrHi)
3108 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3109
3110 // Use the destination directly if and only if no register class has
3111 // been set.
3112 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3113 ? DstReg
3114 : MRI.createGenericVirtualRegister(LLT::scalar(64));
3115
3116 if (!MRI.getRegClassOrNull(AddrDst))
3117 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3118
3119 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3120
3121 // If we created a new register for the destination, cast the result into
3122 // the final output.
3123 if (AddrDst != DstReg)
3124 B.buildCast(DstReg, AddrDst);
3125 } else if (AddrLo != DstReg) {
3126 // If we created a new register for the destination, cast the result into
3127 // the final output.
3128 B.buildCast(DstReg, AddrLo);
3129 }
3130}
3131
3134 MachineIRBuilder &B) const {
3135 Register DstReg = MI.getOperand(0).getReg();
3136 LLT Ty = MRI.getType(DstReg);
3137 unsigned AS = Ty.getAddressSpace();
3138
3139 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3140 MachineFunction &MF = B.getMF();
3142
3144 if (!MFI->isModuleEntryFunction() &&
3145 GV->getName() != "llvm.amdgcn.module.lds" &&
3147 const Function &Fn = MF.getFunction();
3149 Fn, "local memory global used by non-kernel function",
3150 MI.getDebugLoc(), DS_Warning));
3151
3152 // We currently don't have a way to correctly allocate LDS objects that
3153 // aren't directly associated with a kernel. We do force inlining of
3154 // functions that use local objects. However, if these dead functions are
3155 // not eliminated, we don't want a compile time error. Just emit a warning
3156 // and a trap, since there should be no callable path here.
3157 B.buildTrap();
3158 B.buildUndef(DstReg);
3159 MI.eraseFromParent();
3160 return true;
3161 }
3162
3163 // TODO: We could emit code to handle the initialization somewhere.
3164 // We ignore the initializer for now and legalize it to allow selection.
3165 // The initializer will be rejected during assembly emission anyway.
3166 const SITargetLowering *TLI = ST.getTargetLowering();
3167 if (!TLI->shouldUseLDSConstAddress(GV)) {
3168 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3169 return true; // Leave in place;
3170 }
3171
3172 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3173 Type *Ty = GV->getValueType();
3174 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3175 // zero-sized type in other languages to declare the dynamic shared
3176 // memory whose size is not known at compile time. It will be
3177 // allocated by the runtime and placed directly after the statically
3178 // allocated ones. They all share the same offset.
3179 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3180 // Adjust alignment for that dynamic shared memory array.
3182 LLT S32 = LLT::scalar(32);
3183 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3184 B.buildIntToPtr(DstReg, Sz);
3185 MI.eraseFromParent();
3186 return true;
3187 }
3188 }
3189
3190 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3191 *cast<GlobalVariable>(GV)));
3192 MI.eraseFromParent();
3193 return true;
3194 }
3195
3196 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3197 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3198 MI.eraseFromParent();
3199 return true;
3200 }
3201
3202 const SITargetLowering *TLI = ST.getTargetLowering();
3203
3204 if (TLI->shouldEmitFixup(GV)) {
3205 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3206 MI.eraseFromParent();
3207 return true;
3208 }
3209
3210 if (TLI->shouldEmitPCReloc(GV)) {
3211 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3212 MI.eraseFromParent();
3213 return true;
3214 }
3215
3217 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3218
3219 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3224 LoadTy, Align(8));
3225
3226 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3227
3228 if (Ty.getSizeInBits() == 32) {
3229 // Truncate if this is a 32-bit constant address.
3230 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3231 B.buildExtract(DstReg, Load, 0);
3232 } else
3233 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3234
3235 MI.eraseFromParent();
3236 return true;
3237}
3238
3240 if (Ty.isVector())
3241 return Ty.changeElementCount(
3242 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3243 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3244}
3245
3247 MachineInstr &MI) const {
3248 MachineIRBuilder &B = Helper.MIRBuilder;
3249 MachineRegisterInfo &MRI = *B.getMRI();
3250 GISelChangeObserver &Observer = Helper.Observer;
3251
3252 Register PtrReg = MI.getOperand(1).getReg();
3253 LLT PtrTy = MRI.getType(PtrReg);
3254 unsigned AddrSpace = PtrTy.getAddressSpace();
3255
3256 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3258 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3259 Observer.changingInstr(MI);
3260 MI.getOperand(1).setReg(Cast.getReg(0));
3261 Observer.changedInstr(MI);
3262 return true;
3263 }
3264
3265 if (MI.getOpcode() != AMDGPU::G_LOAD)
3266 return false;
3267
3268 Register ValReg = MI.getOperand(0).getReg();
3269 LLT ValTy = MRI.getType(ValReg);
3270
3271 if (hasBufferRsrcWorkaround(ValTy)) {
3272 Observer.changingInstr(MI);
3274 Observer.changedInstr(MI);
3275 return true;
3276 }
3277
3278 MachineMemOperand *MMO = *MI.memoperands_begin();
3279 const unsigned ValSize = ValTy.getSizeInBits();
3280 const LLT MemTy = MMO->getMemoryType();
3281 const Align MemAlign = MMO->getAlign();
3282 const unsigned MemSize = MemTy.getSizeInBits();
3283 const uint64_t AlignInBits = 8 * MemAlign.value();
3284
3285 // Widen non-power-of-2 loads to the alignment if needed
3286 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3287 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3288
3289 // This was already the correct extending load result type, so just adjust
3290 // the memory type.
3291 if (WideMemSize == ValSize) {
3292 MachineFunction &MF = B.getMF();
3293
3294 MachineMemOperand *WideMMO =
3295 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3296 Observer.changingInstr(MI);
3297 MI.setMemRefs(MF, {WideMMO});
3298 Observer.changedInstr(MI);
3299 return true;
3300 }
3301
3302 // Don't bother handling an edge case that should probably never be produced.
3303 if (ValSize > WideMemSize)
3304 return false;
3305
3306 LLT WideTy = widenToNextPowerOf2(ValTy);
3307
3308 Register WideLoad;
3309 if (!WideTy.isVector()) {
3310 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3311 B.buildTrunc(ValReg, WideLoad).getReg(0);
3312 } else {
3313 // Extract the subvector.
3314
3315 if (isRegisterType(ST, ValTy)) {
3316 // If this is a case where G_EXTRACT is legal, use it.
3317 // (e.g. <3 x s32> -> <4 x s32>)
3318 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3319 B.buildExtract(ValReg, WideLoad, 0);
3320 } else {
3321 // For cases where the widened type isn't a nice register value, unmerge
3322 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3323 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3324 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3325 }
3326 }
3327
3328 MI.eraseFromParent();
3329 return true;
3330 }
3331
3332 return false;
3333}
3334
3336 MachineInstr &MI) const {
3337 MachineIRBuilder &B = Helper.MIRBuilder;
3338 MachineRegisterInfo &MRI = *B.getMRI();
3339 GISelChangeObserver &Observer = Helper.Observer;
3340
3341 Register DataReg = MI.getOperand(0).getReg();
3342 LLT DataTy = MRI.getType(DataReg);
3343
3344 if (hasBufferRsrcWorkaround(DataTy)) {
3345 Observer.changingInstr(MI);
3347 Observer.changedInstr(MI);
3348 return true;
3349 }
3350 return false;
3351}
3352
3355 MachineIRBuilder &B) const {
3356 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3357 assert(Ty.isScalar());
3358
3359 MachineFunction &MF = B.getMF();
3361
3362 // TODO: Always legal with future ftz flag.
3363 // FIXME: Do we need just output?
3364 if (Ty == LLT::float32() &&
3366 return true;
3367 if (Ty == LLT::float16() &&
3369 return true;
3370
3371 MachineIRBuilder HelperBuilder(MI);
3372 GISelObserverWrapper DummyObserver;
3373 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3374 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3375}
3376
3379 Register DstReg = MI.getOperand(0).getReg();
3380 Register PtrReg = MI.getOperand(1).getReg();
3381 Register CmpVal = MI.getOperand(2).getReg();
3382 Register NewVal = MI.getOperand(3).getReg();
3383
3384 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3385 "this should not have been custom lowered");
3386
3387 LLT ValTy = MRI.getType(CmpVal);
3388 LLT VecTy = LLT::fixed_vector(2, ValTy);
3389
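// Pack the new value (low element) and the compare value (high element) into
// the single wide data operand expected by the target cmpxchg instruction.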
3390 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3391
3392 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3393 .addDef(DstReg)
3394 .addUse(PtrReg)
3395 .addUse(PackedVal)
3396 .setMemRefs(MI.memoperands());
3397
3398 MI.eraseFromParent();
3399 return true;
3400}
3401
3402/// Return true if it's known that \p Src can never be an f32 denormal value.
3404 Register Src) {
3405 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3406 switch (DefMI->getOpcode()) {
3407 case TargetOpcode::G_INTRINSIC: {
3409 case Intrinsic::amdgcn_frexp_mant:
3410 case Intrinsic::amdgcn_log:
3411 case Intrinsic::amdgcn_log_clamp:
3412 case Intrinsic::amdgcn_exp2:
3413 case Intrinsic::amdgcn_sqrt:
3414 return true;
3415 default:
3416 break;
3417 }
3418
3419 break;
3420 }
3421 case TargetOpcode::G_FSQRT:
3422 return true;
3423 case TargetOpcode::G_FFREXP: {
3424 if (DefMI->getOperand(0).getReg() == Src)
3425 return true;
3426 break;
3427 }
3428 case TargetOpcode::G_FPEXT: {
3429 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3430 }
3431 default:
3432 return false;
3433 }
3434
3435 return false;
3436}
3437
3438static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3439 return Flags & MachineInstr::FmAfn;
3440}
3441
3443 unsigned Flags) {
3444 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3447}
3448
3449std::pair<Register, Register>
3451 unsigned Flags) const {
3452 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3453 return {};
3454
3455 const LLT F32 = LLT::scalar(32);
3456 auto SmallestNormal = B.buildFConstant(
3458 auto IsLtSmallestNormal =
3459 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3460
3461 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3462 auto One = B.buildFConstant(F32, 1.0);
3463 auto ScaleFactor =
3464 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3465 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3466
3467 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3468}
3469
3471 MachineIRBuilder &B) const {
3472 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3473 // If we have to handle denormals, scale up the input and adjust the result.
3474
3475 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3476 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
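// Since log2(x * 2^32) == log2(x) + 32, subtracting 32.0 below undoes the
// scaling applied to denormal inputs.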
3477
3478 Register Dst = MI.getOperand(0).getReg();
3479 Register Src = MI.getOperand(1).getReg();
3480 LLT Ty = B.getMRI()->getType(Dst);
3481 unsigned Flags = MI.getFlags();
3482
3483 if (Ty == LLT::scalar(16)) {
3484 const LLT F32 = LLT::scalar(32);
3485 // Nothing in half is a denormal when promoted to f32.
3486 auto Ext = B.buildFPExt(F32, Src, Flags);
3487 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3488 .addUse(Ext.getReg(0))
3489 .setMIFlags(Flags);
3490 B.buildFPTrunc(Dst, Log2, Flags);
3491 MI.eraseFromParent();
3492 return true;
3493 }
3494
3495 assert(Ty == LLT::scalar(32));
3496
3497 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3498 if (!ScaledInput) {
3499 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3500 .addUse(Src)
3501 .setMIFlags(Flags);
3502 MI.eraseFromParent();
3503 return true;
3504 }
3505
3506 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3507 .addUse(ScaledInput)
3508 .setMIFlags(Flags);
3509
3510 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3511 auto Zero = B.buildFConstant(Ty, 0.0);
3512 auto ResultOffset =
3513 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3514 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3515
3516 MI.eraseFromParent();
3517 return true;
3518}
3519
3521 Register Z, unsigned Flags) {
3522 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3523 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3524}
3525
3527 MachineIRBuilder &B) const {
3528 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3529 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3530
3531 MachineRegisterInfo &MRI = *B.getMRI();
3532 Register Dst = MI.getOperand(0).getReg();
3533 Register X = MI.getOperand(1).getReg();
3534 unsigned Flags = MI.getFlags();
3535 const LLT Ty = MRI.getType(X);
3536 MachineFunction &MF = B.getMF();
3537
3538 const LLT F32 = LLT::scalar(32);
3539 const LLT F16 = LLT::scalar(16);
3540
3541 const AMDGPUTargetMachine &TM =
3542 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3543
3544 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3545 if (Ty == F16 && !ST.has16BitInsts()) {
3546 Register LogVal = MRI.createGenericVirtualRegister(F32);
3547 auto PromoteSrc = B.buildFPExt(F32, X);
3548 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3549 B.buildFPTrunc(Dst, LogVal);
3550 } else {
3551 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3552 }
3553
3554 MI.eraseFromParent();
3555 return true;
3556 }
3557
3558 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3559 if (ScaledInput)
3560 X = ScaledInput;
3561
3562 auto Y =
3563 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3564
3565 Register R;
3566 if (ST.hasFastFMAF32()) {
3567 // c+cc are ln(2)/ln(10) to more than 49 bits
3568 const float c_log10 = 0x1.344134p-2f;
3569 const float cc_log10 = 0x1.09f79ep-26f;
3570
3571 // c + cc is ln(2) to more than 49 bits
3572 const float c_log = 0x1.62e42ep-1f;
3573 const float cc_log = 0x1.efa39ep-25f;
3574
3575 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3576 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
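// The product Y * (c + cc) is evaluated in double-float style below: R holds
// the rounded head Y*c, FMA0 recovers the rounding error of that product, and
// FMA1 folds in the Y*cc tail before the final add back into R.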
3577 // This adds correction terms for which contraction may lead to an increase
3578 // in the error of the approximation, so disable it.
3579 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3580 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3581 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3582 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3583 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3584 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3585 } else {
3586 // ch+ct is ln(2)/ln(10) to more than 36 bits
3587 const float ch_log10 = 0x1.344000p-2f;
3588 const float ct_log10 = 0x1.3509f6p-18f;
3589
3590 // ch + ct is ln(2) to more than 36 bits
3591 const float ch_log = 0x1.62e000p-1f;
3592 const float ct_log = 0x1.0bfbe8p-15f;
3593
3594 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3595 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3596
3597 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3598 auto YH = B.buildAnd(Ty, Y, MaskConst);
3599 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3600 // This adds correction terms for which contraction may lead to an increase
3601 // in the error of the approximation, so disable it.
3602 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3603 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3604
3605 Register Mad0 =
3606 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3607 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3608 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3609 }
3610
3611 const bool IsFiniteOnly =
3612 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3613 MI.getFlag(MachineInstr::FmNoInfs);
3614
3615 if (!IsFiniteOnly) {
3616 // Expand isfinite(x) => fabs(x) < inf
3617 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3618 auto Fabs = B.buildFAbs(Ty, Y);
3619 auto IsFinite =
3620 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3621 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3622 }
3623
3624 if (ScaledInput) {
3625 auto Zero = B.buildFConstant(Ty, 0.0);
3626 auto ShiftK =
3627 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3628 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3629 B.buildFSub(Dst, R, Shift, Flags);
3630 } else {
3631 B.buildCopy(Dst, R);
3632 }
3633
3634 MI.eraseFromParent();
3635 return true;
3636}
3637
3638bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3639 Register Src, bool IsLog10,
3640 unsigned Flags) const {
3641 const double Log2BaseInverted =
3642 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3643
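// (Log2BaseInverted is ln(2)/ln(b): log_b(x) == log2(x) * ln(2)/ln(b), so the
// factor is ln(2)/ln(10) for log10 and ln(2) for the natural log.)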
3644 LLT Ty = B.getMRI()->getType(Dst);
3645
3646 if (Ty == LLT::scalar(32)) {
3647 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3648 if (ScaledInput) {
3649 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3650 .addUse(Src)
3651 .setMIFlags(Flags);
3652 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3653 auto Zero = B.buildFConstant(Ty, 0.0);
3654 auto ResultOffset =
3655 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3656 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3657
3658 if (ST.hasFastFMAF32())
3659 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3660 else {
3661 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3662 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3663 }
3664
3665 return true;
3666 }
3667 }
3668
3669 auto Log2Operand = Ty == LLT::scalar(16)
3670 ? B.buildFLog2(Ty, Src, Flags)
3671 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3672 .addUse(Src)
3673 .setMIFlags(Flags);
3674 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3675 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3676 return true;
3677}
3678
3679bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3680 MachineIRBuilder &B) const {
3681 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3682 // If we have to handle denormals, scale up the input and adjust the result.
3683
3684 Register Dst = MI.getOperand(0).getReg();
3685 Register Src = MI.getOperand(1).getReg();
3686 unsigned Flags = MI.getFlags();
3687 LLT Ty = B.getMRI()->getType(Dst);
3688 const LLT F16 = LLT::scalar(16);
3689 const LLT F32 = LLT::scalar(32);
3690
3691 if (Ty == F16) {
3692 // Nothing in half is a denormal when promoted to f32.
3693 auto Ext = B.buildFPExt(F32, Src, Flags);
3694 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3695 .addUse(Ext.getReg(0))
3696 .setMIFlags(Flags);
3697 B.buildFPTrunc(Dst, Log2, Flags);
3698 MI.eraseFromParent();
3699 return true;
3700 }
3701
3702 assert(Ty == F32);
3703
3704 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3705 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3706 .addUse(Src)
3707 .setMIFlags(Flags);
3708 MI.eraseFromParent();
3709 return true;
3710 }
3711
3712 // bool needs_scaling = x < -0x1.f80000p+6f;
3713 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
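// Since exp2(x + 64) == exp2(x) * 2^64, pre-adding 64 for very negative inputs
// keeps the hardware exp2 result out of the denormal range, and the final
// multiply by 0x1.0p-64 undoes that scaling.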
3714
3715 // -nextafter(128.0, -1)
3716 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3717 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3718 RangeCheckConst, Flags);
3719
3720 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3721 auto Zero = B.buildFConstant(Ty, 0.0);
3722 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3723 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3724
3725 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3726 .addUse(AddInput.getReg(0))
3727 .setMIFlags(Flags);
3728
3729 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3730 auto One = B.buildFConstant(Ty, 1.0);
3731 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3732 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3733 MI.eraseFromParent();
3734 return true;
3735}
3736
3737static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
3738 const SrcOp &Src, unsigned Flags) {
3739 LLT Ty = Dst.getLLTTy(*B.getMRI());
3740
3741 if (Ty == LLT::scalar(32)) {
3742 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3743 .addUse(Src.getReg())
3744 .setMIFlags(Flags);
3745 }
3746 return B.buildFExp2(Dst, Src, Flags);
3747}
3748
3749bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
3750 Register Dst, Register X,
3751 unsigned Flags,
3752 bool IsExp10) const {
3753 LLT Ty = B.getMRI()->getType(X);
3754
3755 // exp(x) -> exp2(M_LOG2E_F * x);
3756 // exp10(x) -> exp2(log2(10) * x);
3757 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3758 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3759 buildExp(B, Dst, Mul, Flags);
3760 return true;
3761}
3762
3763bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3764 Register X, unsigned Flags) const {
3765 LLT Ty = B.getMRI()->getType(Dst);
3766 LLT F32 = LLT::scalar(32);
3767
3768 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3769 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3770 }
3771
3772 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3773 auto NeedsScaling =
3774 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3775 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3776 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3777 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3778
3779 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3780 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3781
3782 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3783 .addUse(ExpInput.getReg(0))
3784 .setMIFlags(Flags);
3785
3786 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3787 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3788 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3789 return true;
3790}
3791
3792bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
3793 Register Dst, Register X,
3794 unsigned Flags) const {
3795 LLT Ty = B.getMRI()->getType(Dst);
3796 LLT F32 = LLT::scalar(32);
3797
3798 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3799 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3800 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3801 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3802
3803 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3804 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3805 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3806 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3807 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3808 return true;
3809 }
3810
3811 // bool s = x < -0x1.2f7030p+5f;
3812 // x += s ? 0x1.0p+5f : 0.0f;
3813 // exp10 = exp2(x * 0x1.a92000p+1f) *
3814 // exp2(x * 0x1.4f0978p-11f) *
3815 // (s ? 0x1.9f623ep-107f : 1.0f);
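// (0x1.a92000p+1f + 0x1.4f0978p-11f is log2(10) split into a head and a tail,
// so exp2(x * K0) * exp2(x * K1) == exp2(x * log2(10)) == 10^x up to rounding.)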
3816
3817 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3818 auto NeedsScaling =
3819 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3820
3821 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3822 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3823 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3824
3825 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3826 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3827
3828 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3829 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3830 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3831 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3832
3833 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3834 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
3835 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3836
3837 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3838 return true;
3839}
3840
3841bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3842 MachineIRBuilder &B) const {
3843 Register Dst = MI.getOperand(0).getReg();
3844 Register X = MI.getOperand(1).getReg();
3845 const unsigned Flags = MI.getFlags();
3846 MachineFunction &MF = B.getMF();
3847 MachineRegisterInfo &MRI = *B.getMRI();
3848 LLT Ty = MRI.getType(Dst);
3849 const LLT F16 = LLT::scalar(16);
3850 const LLT F32 = LLT::scalar(32);
3851 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3852
3853 if (Ty == F16) {
3854 // v_exp_f16 (fmul x, log2e)
3855 if (allowApproxFunc(MF, Flags)) {
3856 // TODO: Does this really require fast?
3857 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
3858 : legalizeFExpUnsafe(B, Dst, X, Flags);
3859 MI.eraseFromParent();
3860 return true;
3861 }
3862
3863 // Nothing in half is a denormal when promoted to f32.
3864 //
3865 // exp(f16 x) ->
3866 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3867 //
3868 // exp10(f16 x) ->
3869 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3870 auto Ext = B.buildFPExt(F32, X, Flags);
3871 Register Lowered = MRI.createGenericVirtualRegister(F32);
3872 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
3873 B.buildFPTrunc(Dst, Lowered, Flags);
3874 MI.eraseFromParent();
3875 return true;
3876 }
3877
3878 assert(Ty == F32);
3879
3880 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3881 // library behavior. Also, is known-not-daz source sufficient?
3882 if (allowApproxFunc(MF, Flags)) {
3883 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
3884 : legalizeFExpUnsafe(B, Dst, X, Flags);
3885 MI.eraseFromParent();
3886 return true;
3887 }
3888
3889 // Algorithm:
3890 //
3891 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3892 //
3893 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3894 // n = 64*m + j, 0 <= j < 64
3895 //
3896 // e^x = 2^((64*m + j + f)/64)
3897 // = (2^m) * (2^(j/64)) * 2^(f/64)
3898 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3899 //
3900 // f = x*(64/ln(2)) - n
3901 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3902 //
3903 // e^x = (2^m) * (2^(j/64)) * e^r
3904 //
3905 // (2^(j/64)) is precomputed
3906 //
3907 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3908 // e^r = 1 + q
3909 //
3910 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3911 //
3912 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
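// In the lowering below, PH + PL approximates x * log2(e) (or x * log2(10)) in
// extended precision, E = roundeven(PH) supplies the integer exponent, the
// remaining fraction (PH - E) + PL is fed to v_exp_f32, and ldexp applies the
// 2^E scaling.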
3913 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3914 Register PH, PL;
3915
3916 if (ST.hasFastFMAF32()) {
3917 const float c_exp = numbers::log2ef;
3918 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3919 const float c_exp10 = 0x1.a934f0p+1f;
3920 const float cc_exp10 = 0x1.2f346ep-24f;
3921
3922 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3923 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3924 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3925 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3926
3927 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3928 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3929 } else {
3930 const float ch_exp = 0x1.714000p+0f;
3931 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3932
3933 const float ch_exp10 = 0x1.a92000p+1f;
3934 const float cl_exp10 = 0x1.4f0978p-11f;
3935
3936 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3937 auto XH = B.buildAnd(Ty, X, MaskConst);
3938 auto XL = B.buildFSub(Ty, X, XH, Flags);
3939
3940 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3941 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3942
3943 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3944 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3945
3946 Register Mad0 =
3947 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3948 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3949 }
3950
3951 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3952
3953 // It is unsafe to contract this fsub into the PH multiply.
3954 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3955 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3956 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3957
3958 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3959 .addUse(A.getReg(0))
3960 .setMIFlags(Flags);
3961 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3962
3963 auto UnderflowCheckConst =
3964 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3965 auto Zero = B.buildFConstant(Ty, 0.0);
3966 auto Underflow =
3967 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3968
3969 R = B.buildSelect(Ty, Underflow, Zero, R);
3970
3971 if (!(Flags & MachineInstr::FmNoInfs)) {
3972 auto OverflowCheckConst =
3973 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3974
3975 auto Overflow =
3976 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3977 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3978 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3979 }
3980
3981 B.buildCopy(Dst, R);
3982 MI.eraseFromParent();
3983 return true;
3984}
3985
3986bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3987 MachineIRBuilder &B) const {
3988 Register Dst = MI.getOperand(0).getReg();
3989 Register Src0 = MI.getOperand(1).getReg();
3990 Register Src1 = MI.getOperand(2).getReg();
3991 unsigned Flags = MI.getFlags();
3992 LLT Ty = B.getMRI()->getType(Dst);
3993 const LLT F16 = LLT::float16();
3994 const LLT F32 = LLT::float32();
3995
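// G_FPOW is expanded as exp2(y * log2(x)). The legacy multiply treats 0 * x as
// 0 even for infinite or NaN x, which is presumably why amdgcn_fmul_legacy is
// used for the y * log2(x) product here.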
3996 if (Ty == F32) {
3997 auto Log = B.buildFLog2(F32, Src0, Flags);
3998 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3999 .addUse(Log.getReg(0))
4000 .addUse(Src1)
4001 .setMIFlags(Flags);
4002 B.buildFExp2(Dst, Mul, Flags);
4003 } else if (Ty == F16) {
4004 // There's no f16 fmul_legacy, so we need to convert for it.
4005 auto Log = B.buildFLog2(F16, Src0, Flags);
4006 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4007 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4008 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4009 .addUse(Ext0.getReg(0))
4010 .addUse(Ext1.getReg(0))
4011 .setMIFlags(Flags);
4012 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4013 } else
4014 return false;
4015
4016 MI.eraseFromParent();
4017 return true;
4018}
4019
4020// Find a source register, ignoring any possible source modifiers.
4021static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
4022 Register ModSrc = OrigSrc;
4023 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4024 ModSrc = SrcFNeg->getOperand(1).getReg();
4025 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4026 ModSrc = SrcFAbs->getOperand(1).getReg();
4027 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4028 ModSrc = SrcFAbs->getOperand(1).getReg();
4029 return ModSrc;
4030}
4031
4032bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
4033 MachineRegisterInfo &MRI,
4034 MachineIRBuilder &B) const {
4035
4036 const LLT S1 = LLT::scalar(1);
4037 const LLT F64 = LLT::float64();
4038 Register Dst = MI.getOperand(0).getReg();
4039 Register OrigSrc = MI.getOperand(1).getReg();
4040 unsigned Flags = MI.getFlags();
4041 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4042 "this should not have been custom lowered");
4043
4044 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4045 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4046 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4047 // V_FRACT bug is:
4048 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4049 //
4050 // Convert floor(x) to (x - fract(x))
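// (0x3fefffffffffffff is the largest double strictly less than 1.0, i.e. the
// 0.99999999999999999 clamp value mentioned above.)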
4051
4052 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4053 .addUse(OrigSrc)
4054 .setMIFlags(Flags);
4055
4056 // Give source modifier matching some assistance before obscuring a foldable
4057 // pattern.
4058
4059 // TODO: We can avoid the neg on the fract? The input sign to fract
4060 // shouldn't matter?
4061 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4062
4063 auto Const =
4064 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4065
4066 Register Min = MRI.createGenericVirtualRegister(F64);
4067
4068 // We don't need to concern ourselves with the snan handling difference, so
4069 // use the one which will directly select.
4070 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4071 if (MFI->getMode().IEEE)
4072 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4073 else
4074 B.buildFMinNum(Min, Fract, Const, Flags);
4075
4076 Register CorrectedFract = Min;
4077 if (!MI.getFlag(MachineInstr::FmNoNans)) {
4078 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
4079 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4080 }
4081
4082 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4083 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4084
4085 MI.eraseFromParent();
4086 return true;
4087}
4088
4089// Turn an illegal packed v2s16 build vector into bit operations.
4090// TODO: This should probably be a bitcast action in LegalizerHelper.
4091bool AMDGPULegalizerInfo::legalizeBuildVector(
4092 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4093 Register Dst = MI.getOperand(0).getReg();
4094 const LLT S32 = LLT::scalar(32);
4095 const LLT S16 = LLT::scalar(16);
4096 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4097
4098 Register Src0 = MI.getOperand(1).getReg();
4099 Register Src1 = MI.getOperand(2).getReg();
4100
4101 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4102 assert(MRI.getType(Src0) == S32);
4103 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4104 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4105 }
4106
4107 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4108 B.buildBitcast(Dst, Merge);
4109
4110 MI.eraseFromParent();
4111 return true;
4112}
4113
4114// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4115//
4116// Source and accumulation registers must all be 32-bits.
4117//
4118// TODO: When the multiply is uniform, we should produce a code sequence
4119// that is better suited to instruction selection on the SALU. Instead of
4120// the outer loop going over parts of the result, the outer loop should go
4121// over parts of one of the factors. This should result in instruction
4122// selection that makes full use of S_ADDC_U32 instructions.
4123void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
4124 MutableArrayRef<Register> Accum,
4125 ArrayRef<Register> Src0,
4126 ArrayRef<Register> Src1,
4127 bool UsePartialMad64_32,
4128 bool SeparateOddAlignedProducts) const {
4129 // Use (possibly empty) vectors of S1 registers to represent the set of
4130 // carries from one pair of positions to the next.
4131 using Carry = SmallVector<Register, 2>;
4132
4133 MachineIRBuilder &B = Helper.MIRBuilder;
4134 GISelValueTracking &VT = *Helper.getValueTracking();
4135
4136 const LLT S1 = LLT::scalar(1);
4137 const LLT S32 = LLT::scalar(32);
4138 const LLT S64 = LLT::scalar(64);
4139
4140 Register Zero32;
4141 Register Zero64;
4142
4143 auto getZero32 = [&]() -> Register {
4144 if (!Zero32)
4145 Zero32 = B.buildConstant(S32, 0).getReg(0);
4146 return Zero32;
4147 };
4148 auto getZero64 = [&]() -> Register {
4149 if (!Zero64)
4150 Zero64 = B.buildConstant(S64, 0).getReg(0);
4151 return Zero64;
4152 };
4153
4154 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4155 for (unsigned i = 0; i < Src0.size(); ++i) {
4156 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4157 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4158 }
4159
4160 // Merge the given carries into the 32-bit LocalAccum, which is modified
4161 // in-place.
4162 //
4163 // Returns the carry-out, which is a single S1 register or null.
4164 auto mergeCarry =
4165 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4166 if (CarryIn.empty())
4167 return Register();
4168
4169 bool HaveCarryOut = true;
4170 Register CarryAccum;
4171 if (CarryIn.size() == 1) {
4172 if (!LocalAccum) {
4173 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4174 return Register();
4175 }
4176
4177 CarryAccum = getZero32();
4178 } else {
4179 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4180 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4181 CarryAccum =
4182 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4183 .getReg(0);
4184 }
4185
4186 if (!LocalAccum) {
4187 LocalAccum = getZero32();
4188 HaveCarryOut = false;
4189 }
4190 }
4191
4192 auto Add =
4193 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4194 LocalAccum = Add.getReg(0);
4195 return HaveCarryOut ? Add.getReg(1) : Register();
4196 };
4197
4198 // Build a multiply-add chain to compute
4199 //
4200 // LocalAccum + (partial products at DstIndex)
4201 // + (opportunistic subset of CarryIn)
4202 //
4203 // LocalAccum is an array of one or two 32-bit registers that are updated
4204 // in-place. The incoming registers may be null.
4205 //
4206 // In some edge cases, carry-ins can be consumed "for free". In that case,
4207 // the consumed carry bits are removed from CarryIn in-place.
4208 auto buildMadChain =
4209 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4210 -> Carry {
4211 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4212 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4213
4214 Carry CarryOut;
4215 unsigned j0 = 0;
4216
4217 // Use plain 32-bit multiplication for the most significant part of the
4218 // result by default.
4219 if (LocalAccum.size() == 1 &&
4220 (!UsePartialMad64_32 || !CarryIn.empty())) {
4221 do {
4222 // Skip multiplication if one of the operands is 0
4223 unsigned j1 = DstIndex - j0;
4224 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4225 ++j0;
4226 continue;
4227 }
4228 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4229 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4230 LocalAccum[0] = Mul.getReg(0);
4231 } else {
4232 if (CarryIn.empty()) {
4233 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4234 } else {
4235 LocalAccum[0] =
4236 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4237 .getReg(0);
4238 CarryIn.pop_back();
4239 }
4240 }
4241 ++j0;
4242 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4243 }
4244
4245 // Build full 64-bit multiplies.
4246 if (j0 <= DstIndex) {
4247 bool HaveSmallAccum = false;
4248 Register Tmp;
4249
4250 if (LocalAccum[0]) {
4251 if (LocalAccum.size() == 1) {
4252 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4253 HaveSmallAccum = true;
4254 } else if (LocalAccum[1]) {
4255 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4256 HaveSmallAccum = false;
4257 } else {
4258 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4259 HaveSmallAccum = true;
4260 }
4261 } else {
4262 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4263 Tmp = getZero64();
4264 HaveSmallAccum = true;
4265 }
4266
4267 do {
4268 unsigned j1 = DstIndex - j0;
4269 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4270 ++j0;
4271 continue;
4272 }
4273 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4274 {Src0[j0], Src1[j1], Tmp});
4275 Tmp = Mad.getReg(0);
4276 if (!HaveSmallAccum)
4277 CarryOut.push_back(Mad.getReg(1));
4278 HaveSmallAccum = false;
4279
4280 ++j0;
4281 } while (j0 <= DstIndex);
4282
4283 auto Unmerge = B.buildUnmerge(S32, Tmp);
4284 LocalAccum[0] = Unmerge.getReg(0);
4285 if (LocalAccum.size() > 1)
4286 LocalAccum[1] = Unmerge.getReg(1);
4287 }
4288
4289 return CarryOut;
4290 };
4291
4292 // Outer multiply loop, iterating over destination parts from least
4293 // significant to most significant parts.
4294 //
4295 // The columns of the following diagram correspond to the destination parts
4296 // affected by one iteration of the outer loop (ignoring boundary
4297 // conditions).
4298 //
4299 // Dest index relative to 2 * i: 1 0 -1
4300 // ------
4301 // Carries from previous iteration: e o
4302 // Even-aligned partial product sum: E E .
4303 // Odd-aligned partial product sum: O O
4304 //
4305 // 'o' is OddCarry, 'e' is EvenCarry.
4306 // EE and OO are computed from partial products via buildMadChain and use
4307 // accumulation where possible and appropriate.
4308 //
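// For example, destination part k accumulates the low halves of the partial
// products Src0[j] * Src1[k - j] plus the high halves of the products one
// column below; buildMadChain assembles exactly those per-column sums.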
4309 Register SeparateOddCarry;
4310 Carry EvenCarry;
4311 Carry OddCarry;
4312
4313 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4314 Carry OddCarryIn = std::move(OddCarry);
4315 Carry EvenCarryIn = std::move(EvenCarry);
4316 OddCarry.clear();
4317 EvenCarry.clear();
4318
4319 // Partial products at offset 2 * i.
4320 if (2 * i < Accum.size()) {
4321 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4322 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4323 }
4324
4325 // Partial products at offset 2 * i - 1.
4326 if (i > 0) {
4327 if (!SeparateOddAlignedProducts) {
4328 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4329 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4330 } else {
4331 bool IsHighest = 2 * i >= Accum.size();
4332 Register SeparateOddOut[2];
4333 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4334 .take_front(IsHighest ? 1 : 2);
4335 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4336
4337 MachineInstrBuilder Lo;
4338
4339 if (i == 1) {
4340 if (!IsHighest)
4341 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4342 else
4343 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4344 } else {
4345 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4346 SeparateOddCarry);
4347 }
4348 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4349
4350 if (!IsHighest) {
4351 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4352 Lo->getOperand(1).getReg());
4353 Accum[2 * i] = Hi.getReg(0);
4354 SeparateOddCarry = Hi.getReg(1);
4355 }
4356 }
4357 }
4358
4359 // Add in the carries from the previous iteration
4360 if (i > 0) {
4361 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4362 EvenCarryIn.push_back(CarryOut);
4363
4364 if (2 * i < Accum.size()) {
4365 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4366 OddCarry.push_back(CarryOut);
4367 }
4368 }
4369 }
4370}
4371
4372// Custom narrowing of wide multiplies using wide multiply-add instructions.
4373//
4374// TODO: If the multiply is followed by an addition, we should attempt to
4375// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4376bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4377 MachineInstr &MI) const {
4378 assert(ST.hasMad64_32());
4379 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4380
4381 MachineIRBuilder &B = Helper.MIRBuilder;
4382 MachineRegisterInfo &MRI = *B.getMRI();
4383
4384 Register DstReg = MI.getOperand(0).getReg();
4385 Register Src0 = MI.getOperand(1).getReg();
4386 Register Src1 = MI.getOperand(2).getReg();
4387
4388 LLT Ty = MRI.getType(DstReg);
4389 assert(Ty.isScalar());
4390
4391 unsigned Size = Ty.getSizeInBits();
4392 if (ST.hasVectorMulU64() && Size == 64)
4393 return true;
4394
4395 unsigned NumParts = Size / 32;
4396 assert((Size % 32) == 0);
4397 assert(NumParts >= 2);
4398
4399 // Whether to use MAD_64_32 for partial products whose high half is
4400 // discarded. This avoids some ADD instructions but risks false dependency
4401 // stalls on some subtargets in some cases.
4402 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4403
4404 // Whether to compute odd-aligned partial products separately. This is
4405 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4406 // in an even-aligned VGPR.
4407 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4408
4409 LLT S32 = LLT::scalar(32);
4410 SmallVector<Register, 2> Src0Parts, Src1Parts;
4411 for (unsigned i = 0; i < NumParts; ++i) {
4412 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4413 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4414 }
4415 B.buildUnmerge(Src0Parts, Src0);
4416 B.buildUnmerge(Src1Parts, Src1);
4417
4418 SmallVector<Register, 2> AccumRegs(NumParts);
4419 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4420 SeparateOddAlignedProducts);
4421
4422 B.buildMergeLikeInstr(DstReg, AccumRegs);
4423 MI.eraseFromParent();
4424 return true;
4425}
4426
4427// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4428// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4429// case with a single min instruction instead of a compare+select.
4430bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4431 MachineRegisterInfo &MRI,
4432 MachineIRBuilder &B) const {
4433 Register Dst = MI.getOperand(0).getReg();
4434 Register Src = MI.getOperand(1).getReg();
4435 LLT DstTy = MRI.getType(Dst);
4436 LLT SrcTy = MRI.getType(Src);
4437
4438 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4439 ? AMDGPU::G_AMDGPU_FFBH_U32
4440 : AMDGPU::G_AMDGPU_FFBL_B32;
4441 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
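// FFBH/FFBL return -1 for a zero input, so the unsigned min against the source
// bit width below yields the defined G_CTLZ/G_CTTZ result (e.g. ctlz(0:s32) is
// 32).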
4442 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4443
4444 MI.eraseFromParent();
4445 return true;
4446}
4447
4448bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4449 MachineRegisterInfo &MRI,
4450 MachineIRBuilder &B) const {
4451 Register Dst = MI.getOperand(0).getReg();
4452 Register Src = MI.getOperand(1).getReg();
4453 LLT SrcTy = MRI.getType(Src);
4454 TypeSize NumBits = SrcTy.getSizeInBits();
4455
4456 assert(NumBits < 32u);
4457
4458 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4459 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4460 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4461 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4462 B.buildTrunc(Dst, Ctlz);
4463 MI.eraseFromParent();
4464 return true;
4465}
4466
4467// Check that this is a G_XOR x, -1
4468static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4469 if (MI.getOpcode() != TargetOpcode::G_XOR)
4470 return false;
4471 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4472 return ConstVal == -1;
4473}
4474
4475// Return the use branch instruction, otherwise null if the usage is invalid.
4476static MachineInstr *
4477verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4478 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4479 Register CondDef = MI.getOperand(0).getReg();
4480 if (!MRI.hasOneNonDBGUse(CondDef))
4481 return nullptr;
4482
4483 MachineBasicBlock *Parent = MI.getParent();
4484 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4485
4486 if (isNot(MRI, *UseMI)) {
4487 Register NegatedCond = UseMI->getOperand(0).getReg();
4488 if (!MRI.hasOneNonDBGUse(NegatedCond))
4489 return nullptr;
4490
4491 // We're deleting the def of this value, so we need to remove it.
4492 eraseInstr(*UseMI, MRI);
4493
4494 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4495 Negated = true;
4496 }
4497
4498 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4499 return nullptr;
4500
4501 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4502 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4503 if (Next == Parent->end()) {
4504 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4505 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4506 return nullptr;
4507 UncondBrTarget = &*NextMBB;
4508 } else {
4509 if (Next->getOpcode() != AMDGPU::G_BR)
4510 return nullptr;
4511 Br = &*Next;
4512 UncondBrTarget = Br->getOperand(0).getMBB();
4513 }
4514
4515 return UseMI;
4516}
4517
4518void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4519 MachineIRBuilder &B,
4520 const ArgDescriptor *Arg,
4521 const TargetRegisterClass *ArgRC,
4522 LLT ArgTy) const {
4523 MCRegister SrcReg = Arg->getRegister();
4524 assert(SrcReg.isPhysical() && "Physical register expected");
4525 assert(DstReg.isVirtual() && "Virtual register expected");
4526
4527 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4528 *ArgRC, B.getDebugLoc(), ArgTy);
4529 if (Arg->isMasked()) {
4530 // TODO: Should we try to emit this once in the entry block?
4531 const LLT S32 = LLT::scalar(32);
4532 const unsigned Mask = Arg->getMask();
4533 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4534
4535 Register AndMaskSrc = LiveIn;
4536
4537 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4538 // 0.
4539 if (Shift != 0) {
4540 auto ShiftAmt = B.buildConstant(S32, Shift);
4541 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4542 }
4543
4544 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4545 } else {
4546 B.buildCopy(DstReg, LiveIn);
4547 }
4548}
4549
4554 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4555 Register DstReg = MI.getOperand(0).getReg();
4556 if (!ST.hasClusters()) {
4557 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4558 return false;
4559 MI.eraseFromParent();
4560 return true;
4561 }
4562
4563 // Clusters are supported. Return the global position in the grid. If clusters
4564 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4565
4566 // WorkGroupIdXYZ = ClusterId == 0 ?
4567 // ClusterIdXYZ :
4568 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4569 MachineRegisterInfo &MRI = *B.getMRI();
4570 const LLT S32 = LLT::scalar(32);
4571 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4572 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4573 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4574 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4575 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4576 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4577 return false;
4578
4579 auto One = B.buildConstant(S32, 1);
4580 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4581 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4582 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4583
4584 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4585
4586 switch (MFI->getClusterDims().getKind()) {
4589 B.buildCopy(DstReg, GlobalIdXYZ);
4590 MI.eraseFromParent();
4591 return true;
4592 }
4594 B.buildCopy(DstReg, ClusterIdXYZ);
4595 MI.eraseFromParent();
4596 return true;
4597 }
4599 using namespace AMDGPU::Hwreg;
4600 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4601 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4602 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4603 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4604 .addDef(ClusterId)
4605 .addImm(ClusterIdField);
4606 auto Zero = B.buildConstant(S32, 0);
4607 auto NoClusters =
4608 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4609 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4610 MI.eraseFromParent();
4611 return true;
4612 }
4613 }
4614
4615 llvm_unreachable("nothing should reach here");
4616}
4617
4618bool AMDGPULegalizerInfo::loadInputValue(
4619 Register DstReg, MachineIRBuilder &B,
4620 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4621 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4622 const ArgDescriptor *Arg = nullptr;
4623 const TargetRegisterClass *ArgRC;
4624 LLT ArgTy;
4625
4626 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4627 const ArgDescriptor WorkGroupIDX =
4628 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4629 // If GridZ is not programmed in an entry function then the hardware will set
4630 // it to all zeros, so there is no need to mask the GridY value in the low
4631 // order bits.
4632 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4633 AMDGPU::TTMP7,
4634 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4635 const ArgDescriptor WorkGroupIDZ =
4636 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4637 const ArgDescriptor ClusterWorkGroupIDX =
4638 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4639 const ArgDescriptor ClusterWorkGroupIDY =
4640 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4641 const ArgDescriptor ClusterWorkGroupIDZ =
4642 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4643 const ArgDescriptor ClusterWorkGroupMaxIDX =
4644 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4645 const ArgDescriptor ClusterWorkGroupMaxIDY =
4646 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4647 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4648 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4649 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4650 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4651
4652 auto LoadConstant = [&](unsigned N) {
4653 B.buildConstant(DstReg, N);
4654 return true;
4655 };
4656
4657 if (ST.hasArchitectedSGPRs() &&
4659 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4660 bool HasFixedDims = ClusterDims.isFixedDims();
4661
4662 switch (ArgType) {
4663 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4664 Arg = &WorkGroupIDX;
4665 ArgRC = &AMDGPU::SReg_32RegClass;
4666 ArgTy = LLT::scalar(32);
4667 break;
4668 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4669 Arg = &WorkGroupIDY;
4670 ArgRC = &AMDGPU::SReg_32RegClass;
4671 ArgTy = LLT::scalar(32);
4672 break;
4673 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4674 Arg = &WorkGroupIDZ;
4675 ArgRC = &AMDGPU::SReg_32RegClass;
4676 ArgTy = LLT::scalar(32);
4677 break;
4679 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4680 return LoadConstant(0);
4681 Arg = &ClusterWorkGroupIDX;
4682 ArgRC = &AMDGPU::SReg_32RegClass;
4683 ArgTy = LLT::scalar(32);
4684 break;
4686 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4687 return LoadConstant(0);
4688 Arg = &ClusterWorkGroupIDY;
4689 ArgRC = &AMDGPU::SReg_32RegClass;
4690 ArgTy = LLT::scalar(32);
4691 break;
4693 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4694 return LoadConstant(0);
4695 Arg = &ClusterWorkGroupIDZ;
4696 ArgRC = &AMDGPU::SReg_32RegClass;
4697 ArgTy = LLT::scalar(32);
4698 break;
4700 if (HasFixedDims)
4701 return LoadConstant(ClusterDims.getDims()[0] - 1);
4702 Arg = &ClusterWorkGroupMaxIDX;
4703 ArgRC = &AMDGPU::SReg_32RegClass;
4704 ArgTy = LLT::scalar(32);
4705 break;
4707 if (HasFixedDims)
4708 return LoadConstant(ClusterDims.getDims()[1] - 1);
4709 Arg = &ClusterWorkGroupMaxIDY;
4710 ArgRC = &AMDGPU::SReg_32RegClass;
4711 ArgTy = LLT::scalar(32);
4712 break;
4714 if (HasFixedDims)
4715 return LoadConstant(ClusterDims.getDims()[2] - 1);
4716 Arg = &ClusterWorkGroupMaxIDZ;
4717 ArgRC = &AMDGPU::SReg_32RegClass;
4718 ArgTy = LLT::scalar(32);
4719 break;
4721 Arg = &ClusterWorkGroupMaxFlatID;
4722 ArgRC = &AMDGPU::SReg_32RegClass;
4723 ArgTy = LLT::scalar(32);
4724 break;
4725 default:
4726 break;
4727 }
4728 }
4729
4730 if (!Arg)
4731 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4732
4733 if (!Arg) {
4734 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4735 // The intrinsic may appear when we have a 0 sized kernarg segment, in
4736 // which case the pointer argument may be missing and we use null.
4737 return LoadConstant(0);
4738 }
4739
4740 // It's undefined behavior if a function marked with the amdgpu-no-*
4741 // attributes uses the corresponding intrinsic.
4742 B.buildUndef(DstReg);
4743 return true;
4744 }
4745
4746 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4747 return false; // TODO: Handle these
4748 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4749 return true;
4750}
4751
4752bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4753 MachineInstr &MI, MachineIRBuilder &B,
4754 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4755 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4756 return false;
4757
4758 MI.eraseFromParent();
4759 return true;
4760}
4761
4762static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4763 int64_t C) {
4764 B.buildConstant(MI.getOperand(0).getReg(), C);
4765 MI.eraseFromParent();
4766 return true;
4767}
4768
4769bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4770 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4771 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4772 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4773 if (MaxID == 0)
4774 return replaceWithConstant(B, MI, 0);
4775
4776 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4777 const ArgDescriptor *Arg;
4778 const TargetRegisterClass *ArgRC;
4779 LLT ArgTy;
4780 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4781
4782 Register DstReg = MI.getOperand(0).getReg();
4783 if (!Arg) {
4784 // It's undefined behavior if a function marked with the amdgpu-no-*
4785 // attributes uses the corresponding intrinsic.
4786 B.buildUndef(DstReg);
4787 MI.eraseFromParent();
4788 return true;
4789 }
4790
4791 if (Arg->isMasked()) {
4792 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4793 // masking operations anyway.
4794 //
4795 // TODO: We could assert the top bit is 0 for the source copy.
4796 if (!loadInputValue(DstReg, B, ArgType))
4797 return false;
4798 } else {
4799 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4800 if (!loadInputValue(TmpReg, B, ArgType))
4801 return false;
4802 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4803 }
4804
4805 MI.eraseFromParent();
4806 return true;
4807}
4808
4811 // This isn't really a constant pool but close enough.
4814 return PtrInfo;
4815}
4816
4817Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4818 int64_t Offset) const {
4819 const LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4820 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4821
4822 // TODO: If we passed in the base kernel offset we could have a better
4823 // alignment than 4, but we don't really need it.
4824 if (!loadInputValue(KernArgReg, B,
4825 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4826 llvm_unreachable("failed to find kernarg segment ptr");
4827
4828 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4829 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
4830}
4831
4832/// Legalize a value that's loaded from kernel arguments. This is only used by
4833/// legacy intrinsics.
4834bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4835 MachineIRBuilder &B,
4836 uint64_t Offset,
4837 Align Alignment) const {
4838 Register DstReg = MI.getOperand(0).getReg();
4839
4840 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4841 "unexpected kernarg parameter type");
4842
4843 Register Ptr = getKernargParameterPtr(B, Offset);
4845 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
4846 MachineMemOperand::MODereferenceable |
4847 MachineMemOperand::MOInvariant);
4848 MI.eraseFromParent();
4849 return true;
4850}
4851
4854 MachineIRBuilder &B) const {
4855 Register Dst = MI.getOperand(0).getReg();
4856 LLT DstTy = MRI.getType(Dst);
4857 LLT S16 = LLT::scalar(16);
4858 LLT S32 = LLT::scalar(32);
4859 LLT S64 = LLT::scalar(64);
4860
4861 if (DstTy == S16)
4862 return legalizeFDIV16(MI, MRI, B);
4863 if (DstTy == S32)
4864 return legalizeFDIV32(MI, MRI, B);
4865 if (DstTy == S64)
4866 return legalizeFDIV64(MI, MRI, B);
4867
4868 return false;
4869}
4870
4871void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4872 Register DstDivReg,
4873 Register DstRemReg,
4874 Register X,
4875 Register Y) const {
4876 const LLT S1 = LLT::scalar(1);
4877 const LLT S32 = LLT::scalar(32);
4878
4879 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4880 // algorithm used here.
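// In short: compute z ~= 2^32 / y from the hardware reciprocal, refine it with
// one Newton-Raphson step done in integer arithmetic, take q = umulh(x, z) as
// the quotient estimate, then correct q and r with at most two conditional
// adjustments.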
4881
4882 // Initial estimate of inv(y).
4883 auto FloatY = B.buildUITOFP(S32, Y);
4884 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4885 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4886 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4887 auto Z = B.buildFPTOUI(S32, ScaledY);
4888
4889 // One round of UNR.
4890 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4891 auto NegYZ = B.buildMul(S32, NegY, Z);
4892 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4893
4894 // Quotient/remainder estimate.
4895 auto Q = B.buildUMulH(S32, X, Z);
4896 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4897
4898 // First quotient/remainder refinement.
4899 auto One = B.buildConstant(S32, 1);
4900 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4901 if (DstDivReg)
4902 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4903 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4904
4905 // Second quotient/remainder refinement.
4906 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4907 if (DstDivReg)
4908 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4909
4910 if (DstRemReg)
4911 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4912}
4913
4914// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4915//
4916// Return lo, hi of result
4917//
4918// %cvt.lo = G_UITOFP Val.lo
4919// %cvt.hi = G_UITOFP Val.hi
4920// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4921// %rcp = G_AMDGPU_RCP_IFLAG %mad
4922// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4923// %mul2 = G_FMUL %mul1, 2**(-32)
4924// %trunc = G_INTRINSIC_TRUNC %mul2
4925// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4926// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
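// The net effect is roughly a 2^64 / Val estimate split into 32-bit halves
// (hi from the truncated high product, lo from the remaining fraction), which
// seeds the Newton-Raphson refinement in the 64-bit division expansion below.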
4927static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4928 Register Val) {
4929 const LLT S32 = LLT::scalar(32);
4930 auto Unmerge = B.buildUnmerge(S32, Val);
4931
4932 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4933 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4934
4935 auto Mad = B.buildFMAD(
4936 S32, CvtHi, // 2**32
4937 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4938
4939 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4940 auto Mul1 = B.buildFMul(
4941 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4942
4943 // 2**(-32)
4944 auto Mul2 = B.buildFMul(
4945 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4946 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4947
4948 // -(2**32)
4949 auto Mad2 = B.buildFMAD(
4950 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4951 Mul1);
4952
4953 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4954 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4955
4956 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4957}
4958
4959void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4960 Register DstDivReg,
4961 Register DstRemReg,
4962 Register Numer,
4963 Register Denom) const {
4964 const LLT S32 = LLT::scalar(32);
4965 const LLT S64 = LLT::scalar(64);
4966 const LLT S1 = LLT::scalar(1);
4967 Register RcpLo, RcpHi;
4968
4969 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4970
4971 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4972
4973 auto Zero64 = B.buildConstant(S64, 0);
4974 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4975
4976 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4977 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4978
4979 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4980 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4981 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4982
4983 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4984 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4985 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4986
4987 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4988 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4989 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4990 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4991 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4992
4993 auto Zero32 = B.buildConstant(S32, 0);
4994 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4995 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4996 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4997
4998 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4999 Register NumerLo = UnmergeNumer.getReg(0);
5000 Register NumerHi = UnmergeNumer.getReg(1);
5001
5002 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5003 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5004 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5005 Register Mul3_Lo = UnmergeMul3.getReg(0);
5006 Register Mul3_Hi = UnmergeMul3.getReg(1);
5007 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5008 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5009 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5010 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5011
5012 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5013 Register DenomLo = UnmergeDenom.getReg(0);
5014 Register DenomHi = UnmergeDenom.getReg(1);
5015
5016 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5017 auto C1 = B.buildSExt(S32, CmpHi);
5018
5019 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5020 auto C2 = B.buildSExt(S32, CmpLo);
5021
5022 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5023 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5024
5025 // TODO: Here and below portions of the code can be enclosed into if/endif.
5026 // Currently control flow is unconditional and we have 4 selects after
5027 // potential endif to substitute PHIs.
5028
5029 // if C3 != 0 ...
5030 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5031 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5032 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5033 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5034
5035 auto One64 = B.buildConstant(S64, 1);
5036 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5037
5038 auto C4 =
5039 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5040 auto C5 =
5041 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5042 auto C6 = B.buildSelect(
5043 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5044
5045 // if (C6 != 0)
5046 auto Add4 = B.buildAdd(S64, Add3, One64);
5047 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5048
5049 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5050 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5051 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5052
5053 // endif C6
5054 // endif C3
5055
5056 if (DstDivReg) {
5057 auto Sel1 = B.buildSelect(
5058 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5059 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5060 Sel1, MulHi3);
5061 }
5062
5063 if (DstRemReg) {
5064 auto Sel2 = B.buildSelect(
5065 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5066 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5067 Sel2, Sub1);
5068 }
5069}
5070
5071bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
5072 MachineRegisterInfo &MRI,
5073 MachineIRBuilder &B) const {
5074 Register DstDivReg, DstRemReg;
5075 switch (MI.getOpcode()) {
5076 default:
5077 llvm_unreachable("Unexpected opcode!");
5078 case AMDGPU::G_UDIV: {
5079 DstDivReg = MI.getOperand(0).getReg();
5080 break;
5081 }
5082 case AMDGPU::G_UREM: {
5083 DstRemReg = MI.getOperand(0).getReg();
5084 break;
5085 }
5086 case AMDGPU::G_UDIVREM: {
5087 DstDivReg = MI.getOperand(0).getReg();
5088 DstRemReg = MI.getOperand(1).getReg();
5089 break;
5090 }
5091 }
5092
5093 const LLT S64 = LLT::scalar(64);
5094 const LLT S32 = LLT::scalar(32);
5095 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5096 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5097 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5098 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5099
5100 if (Ty == S32)
5101 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5102 else if (Ty == S64)
5103 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5104 else
5105 return false;
5106
5107 MI.eraseFromParent();
5108 return true;
5109}
5110
5111bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
5112 MachineRegisterInfo &MRI,
5113 MachineIRBuilder &B) const {
5114 const LLT S64 = LLT::scalar(64);
5115 const LLT S32 = LLT::scalar(32);
5116
5117 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5118 if (Ty != S32 && Ty != S64)
5119 return false;
5120
5121 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5122 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5123 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5124
5125 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5126 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5127 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5128
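// LHSign/RHSign are 0 for non-negative values and -1 otherwise, so the
// (x + sign) ^ sign sequence below computes |x| without branches; the true
// signs are re-applied to the unsigned results further down.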
5129 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5130 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5131
5132 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5133 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5134
5135 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5136 switch (MI.getOpcode()) {
5137 default:
5138 llvm_unreachable("Unexpected opcode!");
5139 case AMDGPU::G_SDIV: {
5140 DstDivReg = MI.getOperand(0).getReg();
5141 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5142 break;
5143 }
5144 case AMDGPU::G_SREM: {
5145 DstRemReg = MI.getOperand(0).getReg();
5146 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5147 break;
5148 }
5149 case AMDGPU::G_SDIVREM: {
5150 DstDivReg = MI.getOperand(0).getReg();
5151 DstRemReg = MI.getOperand(1).getReg();
5152 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5153 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5154 break;
5155 }
5156 }
5157
5158 if (Ty == S32)
5159 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5160 else
5161 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5162
5163 if (DstDivReg) {
5164 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5165 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5166 B.buildSub(DstDivReg, SignXor, Sign);
5167 }
5168
5169 if (DstRemReg) {
5170 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5171 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5172 B.buildSub(DstRemReg, SignXor, Sign);
5173 }
5174
5175 MI.eraseFromParent();
5176 return true;
5177}
5178
5179bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
5180 MachineRegisterInfo &MRI,
5181 MachineIRBuilder &B) const {
5182 Register Res = MI.getOperand(0).getReg();
5183 Register LHS = MI.getOperand(1).getReg();
5184 Register RHS = MI.getOperand(2).getReg();
5185 uint16_t Flags = MI.getFlags();
5186 LLT ResTy = MRI.getType(Res);
5187
5188 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5189
5190 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5191 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5192 return false;
5193
5194 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5195 // the CI documentation has a worst case error of 1 ulp.
5196 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5197 // use it as long as we aren't trying to use denormals.
5198 //
5199 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5200
5201 // 1 / x -> RCP(x)
5202 if (CLHS->isExactlyValue(1.0)) {
5203 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5204 .addUse(RHS)
5205 .setMIFlags(Flags);
5206
5207 MI.eraseFromParent();
5208 return true;
5209 }
5210
5211 // -1 / x -> RCP( FNEG(x) )
5212 if (CLHS->isExactlyValue(-1.0)) {
5213 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5214 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5215 .addUse(FNeg.getReg(0))
5216 .setMIFlags(Flags);
5217
5218 MI.eraseFromParent();
5219 return true;
5220 }
5221 }
5222
5223 // For f16 require afn or arcp.
5224 // For f32 require afn.
5225 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5226 !MI.getFlag(MachineInstr::FmArcp)))
5227 return false;
5228
5229 // x / y -> x * (1.0 / y)
5230 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5231 .addUse(RHS)
5232 .setMIFlags(Flags);
5233 B.buildFMul(Res, LHS, RCP, Flags);
5234
5235 MI.eraseFromParent();
5236 return true;
5237}
5238
5239bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
5240 MachineRegisterInfo &MRI,
5241 MachineIRBuilder &B) const {
5242 Register Res = MI.getOperand(0).getReg();
5243 Register X = MI.getOperand(1).getReg();
5244 Register Y = MI.getOperand(2).getReg();
5245 uint16_t Flags = MI.getFlags();
5246 LLT ResTy = MRI.getType(Res);
5247
5248 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5249
5250 if (!AllowInaccurateRcp)
5251 return false;
5252
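// Two Newton-Raphson refinements of r ~= 1/y (r' = r + r * (1 - y * r)),
// followed by one residual correction of the quotient: ret + r * (x - y * ret).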
5253 auto NegY = B.buildFNeg(ResTy, Y);
5254 auto One = B.buildFConstant(ResTy, 1.0);
5255
5256 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5257 .addUse(Y)
5258 .setMIFlags(Flags);
5259
5260 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5261 R = B.buildFMA(ResTy, Tmp0, R, R);
5262
5263 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5264 R = B.buildFMA(ResTy, Tmp1, R, R);
5265
5266 auto Ret = B.buildFMul(ResTy, X, R);
5267 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5268
5269 B.buildFMA(Res, Tmp2, R, Ret);
5270 MI.eraseFromParent();
5271 return true;
5272}
5273
5274 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
5275 MachineRegisterInfo &MRI,
5276 MachineIRBuilder &B) const {
5277 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5278 return true;
5279
5280 Register Res = MI.getOperand(0).getReg();
5281 Register LHS = MI.getOperand(1).getReg();
5282 Register RHS = MI.getOperand(2).getReg();
5283
5284 uint16_t Flags = MI.getFlags();
5285
5286 LLT S16 = LLT::scalar(16);
5287 LLT S32 = LLT::scalar(32);
5288
5289 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5290 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5291 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5292 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5293 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5294 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5295 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5296 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5297 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5298 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5299 // q16.u = opx(V_CVT_F16_F32, q32.u);
5300 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5301
5302 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5303 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5304 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5305 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5306 .addUse(RHSExt.getReg(0))
5307 .setMIFlags(Flags);
5308 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5309 MachineInstrBuilder Err;
5310 if (ST.hasMadMacF32Insts()) {
5311 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5312 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5313 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5314 } else {
5315 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5316 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5317 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5318 }
5319 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5320 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5321 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5322 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5323 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5324 .addUse(RDst.getReg(0))
5325 .addUse(RHS)
5326 .addUse(LHS)
5327 .setMIFlags(Flags);
5328
5329 MI.eraseFromParent();
5330 return true;
5331}
5332
5333 static constexpr unsigned SPDenormModeBitField =
5334 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
5335
5336// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5337// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5338 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
5339 const GCNSubtarget &ST,
5340 SIModeRegisterDefaults Mode) {
5341 // Set SP denorm mode to this value.
5342 unsigned SPDenormMode =
5343 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5344
5345 if (ST.hasDenormModeInst()) {
5346 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5347 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5348
5349 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5350 B.buildInstr(AMDGPU::S_DENORM_MODE)
5351 .addImm(NewDenormModeValue);
5352
5353 } else {
5354 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5355 .addImm(SPDenormMode)
5356 .addImm(SPDenormModeBitField);
5357 }
5358}
5359
5360 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5361 MachineRegisterInfo &MRI,
5362 MachineIRBuilder &B) const {
5363 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5364 return true;
5365
5366 Register Res = MI.getOperand(0).getReg();
5367 Register LHS = MI.getOperand(1).getReg();
5368 Register RHS = MI.getOperand(2).getReg();
5369 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5370 SIModeRegisterDefaults Mode = MFI->getMode();
5371
5372 uint16_t Flags = MI.getFlags();
5373
5374 LLT S32 = LLT::scalar(32);
5375 LLT S1 = LLT::scalar(1);
5376
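 // Scale the operands with amdgcn_div_scale, refine an rcp of the scaled
 // denominator with a chain of FMAs, then combine the pieces with
 // amdgcn_div_fmas and amdgcn_div_fixup to produce the final quotient.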
5377 auto One = B.buildFConstant(S32, 1.0f);
5378
5379 auto DenominatorScaled =
5380 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5381 .addUse(LHS)
5382 .addUse(RHS)
5383 .addImm(0)
5384 .setMIFlags(Flags);
5385 auto NumeratorScaled =
5386 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5387 .addUse(LHS)
5388 .addUse(RHS)
5389 .addImm(1)
5390 .setMIFlags(Flags);
5391
5392 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5393 .addUse(DenominatorScaled.getReg(0))
5394 .setMIFlags(Flags);
5395 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5396
5397 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5398 const bool HasDynamicDenormals =
5399 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5400 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5401
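 // The FMA-based refinement below assumes FP32 denormals are not flushed.
 // If the function does not already enable them, temporarily switch the SP
 // denorm mode on around the core sequence; with a dynamic denormal mode,
 // save the current MODE register value and restore it afterwards.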
5402 Register SavedSPDenormMode;
5403 if (!PreservesDenormals) {
5404 if (HasDynamicDenormals) {
5405 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5406 B.buildInstr(AMDGPU::S_GETREG_B32)
5407 .addDef(SavedSPDenormMode)
5408 .addImm(SPDenormModeBitField);
5409 }
5410 toggleSPDenormMode(true, B, ST, Mode);
5411 }
5412
5413 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5414 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5415 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5416 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5417 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5418 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5419
5420 if (!PreservesDenormals) {
5421 if (HasDynamicDenormals) {
5422 assert(SavedSPDenormMode);
5423 B.buildInstr(AMDGPU::S_SETREG_B32)
5424 .addReg(SavedSPDenormMode)
5425 .addImm(SPDenormModeBitField);
5426 } else
5427 toggleSPDenormMode(false, B, ST, Mode);
5428 }
5429
5430 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5431 .addUse(Fma4.getReg(0))
5432 .addUse(Fma1.getReg(0))
5433 .addUse(Fma3.getReg(0))
5434 .addUse(NumeratorScaled.getReg(1))
5435 .setMIFlags(Flags);
5436
5437 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5438 .addUse(Fmas.getReg(0))
5439 .addUse(RHS)
5440 .addUse(LHS)
5441 .setMIFlags(Flags);
5442
5443 MI.eraseFromParent();
5444 return true;
5445}
5446
5447 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5448 MachineRegisterInfo &MRI,
5449 MachineIRBuilder &B) const {
5450 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5451 return true;
5452
5453 Register Res = MI.getOperand(0).getReg();
5454 Register LHS = MI.getOperand(1).getReg();
5455 Register RHS = MI.getOperand(2).getReg();
5456
5457 uint16_t Flags = MI.getFlags();
5458
5459 LLT S64 = LLT::scalar(64);
5460 LLT S1 = LLT::scalar(1);
5461
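 // Same structure as the f32 path: scale with amdgcn_div_scale, refine the
 // reciprocal with FMAs, then finish with amdgcn_div_fmas and
 // amdgcn_div_fixup.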
5462 auto One = B.buildFConstant(S64, 1.0);
5463
5464 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5465 .addUse(LHS)
5466 .addUse(RHS)
5467 .addImm(0)
5468 .setMIFlags(Flags);
5469
5470 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5471
5472 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5473 .addUse(DivScale0.getReg(0))
5474 .setMIFlags(Flags);
5475
5476 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5477 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5478 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5479
5480 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5481 .addUse(LHS)
5482 .addUse(RHS)
5483 .addImm(1)
5484 .setMIFlags(Flags);
5485
5486 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5487 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5488 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5489
5490 Register Scale;
5491 if (!ST.hasUsableDivScaleConditionOutput()) {
5492 // Work around a hardware bug on SI where the condition output from div_scale
5493 // is not usable.
5494
5495 LLT S32 = LLT::scalar(32);
5496
5497 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5498 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5499 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5500 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5501
5502 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5503 Scale1Unmerge.getReg(1));
5504 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5505 Scale0Unmerge.getReg(1));
5506 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5507 } else {
5508 Scale = DivScale1.getReg(1);
5509 }
5510
5511 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5512 .addUse(Fma4.getReg(0))
5513 .addUse(Fma3.getReg(0))
5514 .addUse(Mul.getReg(0))
5515 .addUse(Scale)
5516 .setMIFlags(Flags);
5517
5518 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5519 .addUse(Fmas.getReg(0))
5520 .addUse(RHS)
5521 .addUse(LHS)
5522 .setMIFlags(Flags);
5523
5524 MI.eraseFromParent();
5525 return true;
5526}
5527
5528 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5529 MachineRegisterInfo &MRI,
5530 MachineIRBuilder &B) const {
5531 Register Res0 = MI.getOperand(0).getReg();
5532 Register Res1 = MI.getOperand(1).getReg();
5533 Register Val = MI.getOperand(2).getReg();
5534 uint16_t Flags = MI.getFlags();
5535
5536 LLT Ty = MRI.getType(Res0);
5537 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5538
5539 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5540 .addUse(Val)
5541 .setMIFlags(Flags);
5542 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5543 .addUse(Val)
5544 .setMIFlags(Flags);
5545
5546 if (ST.hasFractBug()) {
5547 auto Fabs = B.buildFAbs(Ty, Val);
5548 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5549 auto IsFinite =
5550 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5551 auto Zero = B.buildConstant(InstrExpTy, 0);
5552 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5553 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5554 }
5555
5556 B.buildCopy(Res0, Mant);
5557 B.buildSExtOrTrunc(Res1, Exp);
5558
5559 MI.eraseFromParent();
5560 return true;
5561}
5562
5563 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5564 MachineRegisterInfo &MRI,
5565 MachineIRBuilder &B) const {
5566 Register Res = MI.getOperand(0).getReg();
5567 Register LHS = MI.getOperand(2).getReg();
5568 Register RHS = MI.getOperand(3).getReg();
5569 uint16_t Flags = MI.getFlags();
5570
5571 LLT S32 = LLT::scalar(32);
5572 LLT S1 = LLT::scalar(1);
5573
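 // Scale the denominator into a range where rcp is accurate: if |RHS| is
 // larger than 2^96, pre-multiply it by 2^-32 and compensate by multiplying
 // the final product by the same scale factor.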
5574 auto Abs = B.buildFAbs(S32, RHS, Flags);
5575 const APFloat C0Val(1.0f);
5576
5577 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5578 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5579 auto C2 = B.buildFConstant(S32, 1.0f);
5580
5581 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5582 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5583
5584 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5585
5586 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5587 .addUse(Mul0.getReg(0))
5588 .setMIFlags(Flags);
5589
5590 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5591
5592 B.buildFMul(Res, Sel, Mul1, Flags);
5593
5594 MI.eraseFromParent();
5595 return true;
5596}
5597
5598 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5599 MachineRegisterInfo &MRI,
5600 MachineIRBuilder &B) const {
5601 // Bypass the correct expansion that a standard promotion through G_FSQRT
5602 // would get. The f32 op is accurate enough for the f16 case.
5603 unsigned Flags = MI.getFlags();
5604 assert(!ST.has16BitInsts());
5605 const LLT F32 = LLT::scalar(32);
5606 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5607 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5608 .addUse(Ext.getReg(0))
5609 .setMIFlags(Flags);
5610 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5611 MI.eraseFromParent();
5612 return true;
5613}
5614
5615 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5616 MachineRegisterInfo &MRI,
5617 MachineIRBuilder &B) const {
5618 MachineFunction &MF = B.getMF();
5619 Register Dst = MI.getOperand(0).getReg();
5620 Register X = MI.getOperand(1).getReg();
5621 const unsigned Flags = MI.getFlags();
5622 const LLT S1 = LLT::scalar(1);
5623 const LLT F32 = LLT::scalar(32);
5624 const LLT I32 = LLT::scalar(32);
5625
5626 if (allowApproxFunc(MF, Flags)) {
5627 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5628 .addUse(X)
5629 .setMIFlags(Flags);
5630 MI.eraseFromParent();
5631 return true;
5632 }
5633
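 // Inputs below 2^-96 are scaled up by 2^32 before the sqrt so the hardware
 // instruction stays accurate, and the result is scaled back down by 2^-16
 // afterwards. With denormal handling, the raw sqrt result is then nudged by
 // one ulp in the direction that minimizes the FMA residual; otherwise an
 // rsq-based refinement is used.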
5634 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5635 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5636 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5637 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5638 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5639
5640 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5641 if (needsDenormHandlingF32(MF, X, Flags)) {
5642 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5643 .addUse(SqrtX.getReg(0))
5644 .setMIFlags(Flags);
5645
5646 auto NegOne = B.buildConstant(I32, -1);
5647 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5648
5649 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5650 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5651
5652 auto PosOne = B.buildConstant(I32, 1);
5653 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5654
5655 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5656 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5657
5658 auto Zero = B.buildFConstant(F32, 0.0f);
5659 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5660
5661 SqrtS =
5662 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5663
5664 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5665 SqrtS =
5666 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5667 } else {
5668 auto SqrtR =
5669 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5670 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5671
5672 auto Half = B.buildFConstant(F32, 0.5f);
5673 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5674 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5675 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5676 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5677 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5678 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5679 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5680 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5681 }
5682
5683 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5684
5685 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5686
5687 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5688
5689 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5690 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5691
5692 MI.eraseFromParent();
5693 return true;
5694}
5695
5696 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5697 MachineRegisterInfo &MRI,
5698 MachineIRBuilder &B) const {
5699 // For the double type, the SQRT and RSQ instructions don't have the required
5700 // precision, so we apply Goldschmidt's algorithm to improve the result:
5701 //
5702 // y0 = rsq(x)
5703 // g0 = x * y0
5704 // h0 = 0.5 * y0
5705 //
5706 // r0 = 0.5 - h0 * g0
5707 // g1 = g0 * r0 + g0
5708 // h1 = h0 * r0 + h0
5709 //
5710 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5711 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5712 // h2 = h1 * r1 + h1
5713 //
5714 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5715 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5716 //
5717 // sqrt(x) = g3
5718
5719 const LLT S1 = LLT::scalar(1);
5720 const LLT S32 = LLT::scalar(32);
5721 const LLT F64 = LLT::scalar(64);
5722
5723 Register Dst = MI.getOperand(0).getReg();
5724 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5725
5726 Register X = MI.getOperand(1).getReg();
5727 unsigned Flags = MI.getFlags();
5728
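 // Inputs smaller than 2^-767 are scaled up by 2^256 (ldexp by +256) so rsq
 // stays accurate; since sqrt halves the exponent, the result is scaled back
 // down with an ldexp by -128.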
5729 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5730
5731 auto ZeroInt = B.buildConstant(S32, 0);
5732 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5733
5734 // Scale up input if it is too small.
5735 auto ScaleUpFactor = B.buildConstant(S32, 256);
5736 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5737 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5738
5739 auto SqrtY =
5740 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5741
5742 auto Half = B.buildFConstant(F64, 0.5);
5743 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5744 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5745
5746 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5747 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5748
5749 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5750 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5751
5752 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5753 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5754
5755 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5756
5757 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5758 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5759
5760 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5761
5762 // Scale down the result.
5763 auto ScaleDownFactor = B.buildConstant(S32, -128);
5764 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5765 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5766
5767 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5768 // with finite only or nsz because rsq(+/-0) = +/-inf
5769
5770 // TODO: Check for DAZ and expand to subnormals
5771 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5772
5773 // If x is +INF, +0, or -0, use its original value
5774 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5775
5776 MI.eraseFromParent();
5777 return true;
5778}
5779
5780 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5781 MachineRegisterInfo &MRI,
5782 MachineIRBuilder &B) const {
5783 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5784 if (Ty == LLT::scalar(32))
5785 return legalizeFSQRTF32(MI, MRI, B);
5786 if (Ty == LLT::scalar(64))
5787 return legalizeFSQRTF64(MI, MRI, B);
5788 if (Ty == LLT::scalar(16))
5789 return legalizeFSQRTF16(MI, MRI, B);
5790 return false;
5791}
5792
5793// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5794// FIXME: Why do we handle this one but not other removed instructions?
5795//
5796// Reciprocal square root. The clamp prevents infinite results, clamping
5797// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5798// +-max_float.
5799 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5800 MachineRegisterInfo &MRI,
5801 MachineIRBuilder &B) const {
5802 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5803 return true;
5804
5805 Register Dst = MI.getOperand(0).getReg();
5806 Register Src = MI.getOperand(2).getReg();
5807 auto Flags = MI.getFlags();
5808
5809 LLT Ty = MRI.getType(Dst);
5810
5811 const fltSemantics *FltSemantics;
5812 if (Ty == LLT::scalar(32))
5813 FltSemantics = &APFloat::IEEEsingle();
5814 else if (Ty == LLT::scalar(64))
5815 FltSemantics = &APFloat::IEEEdouble();
5816 else
5817 return false;
5818
5819 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5820 .addUse(Src)
5821 .setMIFlags(Flags);
5822
5823 // We don't need to concern ourselves with the snan handling difference, since
5824 // the rsq already quieted it (or not), so use the one which will directly select.
5825 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5826 const bool UseIEEE = MFI->getMode().IEEE;
5827
5828 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5829 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5830 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5831
5832 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5833
5834 if (UseIEEE)
5835 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5836 else
5837 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5838 MI.eraseFromParent();
5839 return true;
5840}
5841
5842// TODO: Fix pointer type handling
5843 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5844 MachineInstr &MI,
5845 Intrinsic::ID IID) const {
5846
5847 MachineIRBuilder &B = Helper.MIRBuilder;
5848 MachineRegisterInfo &MRI = *B.getMRI();
5849
5850 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5851 IID == Intrinsic::amdgcn_permlanex16;
5852 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5853 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5854
5855 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5856 Register Src2, LLT VT) -> Register {
5857 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5858 switch (IID) {
5859 case Intrinsic::amdgcn_readfirstlane:
5860 case Intrinsic::amdgcn_permlane64:
5861 return LaneOp.getReg(0);
5862 case Intrinsic::amdgcn_readlane:
5863 case Intrinsic::amdgcn_set_inactive:
5864 case Intrinsic::amdgcn_set_inactive_chain_arg:
5865 return LaneOp.addUse(Src1).getReg(0);
5866 case Intrinsic::amdgcn_writelane:
5867 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5868 case Intrinsic::amdgcn_permlane16:
5869 case Intrinsic::amdgcn_permlanex16: {
5870 Register Src3 = MI.getOperand(5).getReg();
5871 int64_t Src4 = MI.getOperand(6).getImm();
5872 int64_t Src5 = MI.getOperand(7).getImm();
5873 return LaneOp.addUse(Src1)
5874 .addUse(Src2)
5875 .addUse(Src3)
5876 .addImm(Src4)
5877 .addImm(Src5)
5878 .getReg(0);
5879 }
5880 case Intrinsic::amdgcn_mov_dpp8:
5881 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5882 case Intrinsic::amdgcn_update_dpp:
5883 return LaneOp.addUse(Src1)
5884 .addImm(MI.getOperand(4).getImm())
5885 .addImm(MI.getOperand(5).getImm())
5886 .addImm(MI.getOperand(6).getImm())
5887 .addImm(MI.getOperand(7).getImm())
5888 .getReg(0);
5889 default:
5890 llvm_unreachable("unhandled lane op");
5891 }
5892 };
5893
5894 Register DstReg = MI.getOperand(0).getReg();
5895 Register Src0 = MI.getOperand(2).getReg();
5896 Register Src1, Src2;
5897 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5898 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5899 Src1 = MI.getOperand(3).getReg();
5900 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5901 Src2 = MI.getOperand(4).getReg();
5902 }
5903 }
5904
5905 LLT Ty = MRI.getType(DstReg);
5906 unsigned Size = Ty.getSizeInBits();
5907
5908 unsigned SplitSize = 32;
5909 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5910 ST.hasDPALU_DPP() &&
5911 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
5912 SplitSize = 64;
5913
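 // Legalization strategy: values narrower than 32 bits are any-extended to
 // 32 bits and truncated back after the lane op; wider values are unmerged
 // into 32-bit pieces (64-bit when DPALU DPP can be used for update_dpp),
 // the lane op is applied per piece, and the results are merged back.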
5914 if (Size == SplitSize) {
5915 // Already legal
5916 return true;
5917 }
5918
5919 if (Size < 32) {
5920 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5921
5922 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5923 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5924
5925 if (IID == Intrinsic::amdgcn_writelane)
5926 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5927
5928 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5929 B.buildTrunc(DstReg, LaneOpDst);
5930 MI.eraseFromParent();
5931 return true;
5932 }
5933
5934 if (Size % SplitSize != 0)
5935 return false;
5936
5937 LLT PartialResTy = LLT::scalar(SplitSize);
5938 bool NeedsBitcast = false;
5939 if (Ty.isVector()) {
5940 LLT EltTy = Ty.getElementType();
5941 unsigned EltSize = EltTy.getSizeInBits();
5942 if (EltSize == SplitSize) {
5943 PartialResTy = EltTy;
5944 } else if (EltSize == 16 || EltSize == 32) {
5945 unsigned NElem = SplitSize / EltSize;
5946 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5947 } else {
5948 // Handle all other cases via S32/S64 pieces
5949 NeedsBitcast = true;
5950 }
5951 }
5952
5953 SmallVector<Register, 4> PartialRes;
5954 unsigned NumParts = Size / SplitSize;
5955 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5956 MachineInstrBuilder Src1Parts, Src2Parts;
5957
5958 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5959 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5960
5961 if (IID == Intrinsic::amdgcn_writelane)
5962 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5963
5964 for (unsigned i = 0; i < NumParts; ++i) {
5965 Src0 = Src0Parts.getReg(i);
5966
5967 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5968 Src1 = Src1Parts.getReg(i);
5969
5970 if (IID == Intrinsic::amdgcn_writelane)
5971 Src2 = Src2Parts.getReg(i);
5972
5973 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5974 }
5975
5976 if (NeedsBitcast)
5977 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
5978 LLT::scalar(Ty.getSizeInBits()), PartialRes));
5979 else
5980 B.buildMergeLikeInstr(DstReg, PartialRes);
5981
5982 MI.eraseFromParent();
5983 return true;
5984}
5985
5986 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5987 MachineRegisterInfo &MRI,
5988 MachineIRBuilder &B) const {
5989 uint64_t Offset =
5990 ST.getTargetLowering()->getImplicitParameterOffset(
5991 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5992 LLT DstTy = MRI.getType(DstReg);
5993 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5994
5995 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5996 if (!loadInputValue(KernargPtrReg, B,
5997 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5998 return false;
5999
6000 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6001 B.buildConstant(IdxTy, Offset).getReg(0));
6002 return true;
6003}
6004
6005/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6006/// bits of the pointer and replace them with the stride argument, then
6007/// merge_values everything together. In the common case of a raw buffer (the
6008/// stride component is 0), we can just AND off the upper half.
6011 Register Result = MI.getOperand(0).getReg();
6012 Register Pointer = MI.getOperand(2).getReg();
6013 Register Stride = MI.getOperand(3).getReg();
6014 Register NumRecords = MI.getOperand(4).getReg();
6015 Register Flags = MI.getOperand(5).getReg();
6016
6017 LLT S32 = LLT::scalar(32);
6018 LLT S64 = LLT::scalar(64);
6019
6020 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6021
6022 auto ExtStride = B.buildAnyExt(S32, Stride);
6023
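 // Two descriptor layouts are produced here: with 45-bit num_records support
 // the base, num_records, stride/scale and flags are packed into two 64-bit
 // halves; otherwise the classic v4i32 descriptor is built by splicing the
 // stride into the upper 16 bits of the 64-bit base pointer.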
6024 if (ST.has45BitNumRecordsBufferResource()) {
6025 Register Zero = B.buildConstant(S32, 0).getReg(0);
6026 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6027 // num_records.
6028 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6029 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6030 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6031 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6032 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6033
6034 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6035 // 6 zero bits (omitted), the 16-bit stride and scale, and the 4-bit flags.
6036 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6037 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6038 auto ExtShiftedStride =
6039 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6040 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6041 auto ExtShiftedFlags =
6042 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6043 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6044 Register HighHalf =
6045 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6046 B.buildMergeValues(Result, {LowHalf, HighHalf});
6047 } else {
6048 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6049 auto Unmerge = B.buildUnmerge(S32, Pointer);
6050 auto LowHalf = Unmerge.getReg(0);
6051 auto HighHalf = Unmerge.getReg(1);
6052
6053 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6054 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6055 auto ShiftConst = B.buildConstant(S32, 16);
6056 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6057 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6058 Register NewHighHalfReg = NewHighHalf.getReg(0);
6059 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6060 }
6061
6062 MI.eraseFromParent();
6063 return true;
6064}
6065
6066 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
6067 MachineRegisterInfo &MRI,
6068 MachineIRBuilder &B) const {
6069 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6070 if (!MFI->isEntryFunction()) {
6071 return legalizePreloadedArgIntrin(MI, MRI, B,
6072 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6073 }
6074
6075 Register DstReg = MI.getOperand(0).getReg();
6076 if (!getImplicitArgPtr(DstReg, MRI, B))
6077 return false;
6078
6079 MI.eraseFromParent();
6080 return true;
6081}
6082
6083 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
6084 MachineRegisterInfo &MRI,
6085 MachineIRBuilder &B) const {
6086 Function &F = B.getMF().getFunction();
6087 std::optional<uint32_t> KnownSize =
6088 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
6089 if (KnownSize.has_value())
6090 B.buildConstant(DstReg, *KnownSize);
6091 return false;
6092}
6093
6094 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
6095 MachineRegisterInfo &MRI,
6096 MachineIRBuilder &B) const {
6097
6098 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6099 if (!MFI->isEntryFunction()) {
6100 return legalizePreloadedArgIntrin(MI, MRI, B,
6101 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6102 }
6103
6104 Register DstReg = MI.getOperand(0).getReg();
6105 if (!getLDSKernelId(DstReg, MRI, B))
6106 return false;
6107
6108 MI.eraseFromParent();
6109 return true;
6110}
6111
6112 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
6113 MachineRegisterInfo &MRI,
6114 MachineIRBuilder &B,
6115 unsigned AddrSpace) const {
6116 const LLT S32 = LLT::scalar(32);
6117 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6118 Register Hi32 = Unmerge.getReg(1);
6119
6120 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6121 ST.hasGloballyAddressableScratch()) {
6122 Register FlatScratchBaseHi =
6123 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6124 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6125 .getReg(0);
6126 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6127 // Test bits 63..58 against the aperture address.
6128 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6129 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6130 B.buildConstant(S32, 1u << 26));
6131 } else {
6132 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6133 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6134 }
6135 MI.eraseFromParent();
6136 return true;
6137}
6138
6139// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6140// offset (the offset that is included in bounds checking and swizzling, to be
6141// split between the instruction's voffset and immoffset fields) and soffset
6142// (the offset that is excluded from bounds checking and swizzling, to go in
6143// the instruction's soffset field). This function takes the first kind of
6144// offset and figures out how to split it between voffset and immoffset.
6145std::pair<Register, unsigned>
6146 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
6147 Register OrigOffset) const {
6148 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6149 Register BaseReg;
6150 unsigned ImmOffset;
6151 const LLT S32 = LLT::scalar(32);
6152 MachineRegisterInfo &MRI = *B.getMRI();
6153
6154 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6155 // being added, so we can only safely match a 32-bit addition with no unsigned
6156 // overflow.
6157 bool CheckNUW = AMDGPU::isGFX1250(ST);
6158 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6159 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6160
6161 // If BaseReg is a pointer, convert it to int.
6162 if (MRI.getType(BaseReg).isPointer())
6163 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6164
6165 // If the immediate value is too big for the immoffset field, put only bits
6166 // that would normally fit in the immoffset field. The remaining value that
6167 // is copied/added for the voffset field is a large power of 2, and it
6168 // stands a better chance of being CSEd with the copy/add for another similar
6169 // load/store.
6170 // However, do not do that rounding down if that is a negative
6171 // number, as it appears to be illegal to have a negative offset in the
6172 // vgpr, even if adding the immediate offset makes it positive.
6173 unsigned Overflow = ImmOffset & ~MaxImm;
6174 ImmOffset -= Overflow;
6175 if ((int32_t)Overflow < 0) {
6176 Overflow += ImmOffset;
6177 ImmOffset = 0;
6178 }
6179
6180 if (Overflow != 0) {
6181 if (!BaseReg) {
6182 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6183 } else {
6184 auto OverflowVal = B.buildConstant(S32, Overflow);
6185 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6186 }
6187 }
6188
6189 if (!BaseReg)
6190 BaseReg = B.buildConstant(S32, 0).getReg(0);
6191
6192 return std::pair(BaseReg, ImmOffset);
6193}
6194
6195/// Handle register layout difference for f16 images for some subtargets.
6196 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
6197 MachineRegisterInfo &MRI,
6198 Register Reg,
6199 bool ImageStore) const {
6200 const LLT S16 = LLT::scalar(16);
6201 const LLT S32 = LLT::scalar(32);
6202 LLT StoreVT = MRI.getType(Reg);
6203 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6204
6205 if (ST.hasUnpackedD16VMem()) {
6206 auto Unmerge = B.buildUnmerge(S16, Reg);
6207
6208 SmallVector<Register, 4> WideRegs;
6209 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6210 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6211
6212 int NumElts = StoreVT.getNumElements();
6213
6214 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6215 .getReg(0);
6216 }
6217
6218 if (ImageStore && ST.hasImageStoreD16Bug()) {
6219 if (StoreVT.getNumElements() == 2) {
6220 SmallVector<Register, 4> PackedRegs;
6221 Reg = B.buildBitcast(S32, Reg).getReg(0);
6222 PackedRegs.push_back(Reg);
6223 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6224 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6225 .getReg(0);
6226 }
6227
6228 if (StoreVT.getNumElements() == 3) {
6229 SmallVector<Register, 4> PackedRegs;
6230 auto Unmerge = B.buildUnmerge(S16, Reg);
6231 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6232 PackedRegs.push_back(Unmerge.getReg(I));
6233 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6234 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6235 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6236 }
6237
6238 if (StoreVT.getNumElements() == 4) {
6239 SmallVector<Register, 4> PackedRegs;
6240 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6241 auto Unmerge = B.buildUnmerge(S32, Reg);
6242 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6243 PackedRegs.push_back(Unmerge.getReg(I));
6244 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6245 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6246 .getReg(0);
6247 }
6248
6249 llvm_unreachable("invalid data type");
6250 }
6251
6252 if (StoreVT == LLT::fixed_vector(3, S16)) {
6253 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6254 .getReg(0);
6255 }
6256 return Reg;
6257}
6258
6259 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
6260 Register VData, LLT MemTy,
6261 bool IsFormat) const {
6262 MachineRegisterInfo *MRI = B.getMRI();
6263 LLT Ty = MRI->getType(VData);
6264
6265 const LLT S16 = LLT::scalar(16);
6266
6267 // Fixup buffer resources themselves needing to be v4i128.
6269 return castBufferRsrcToV4I32(VData, B);
6270
6271 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6272 Ty = getBitcastRegisterType(Ty);
6273 VData = B.buildBitcast(Ty, VData).getReg(0);
6274 }
6275 // Fixup illegal register types for i8 stores.
6276 if (Ty == LLT::scalar(8) || Ty == S16) {
6277 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6278 return AnyExt;
6279 }
6280
6281 if (Ty.isVector()) {
6282 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6283 if (IsFormat)
6284 return handleD16VData(B, *MRI, VData);
6285 }
6286 }
6287
6288 return VData;
6289}
6290
6291 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
6292 LegalizerHelper &Helper,
6293 bool IsTyped,
6294 bool IsFormat) const {
6295 MachineIRBuilder &B = Helper.MIRBuilder;
6296 MachineRegisterInfo &MRI = *B.getMRI();
6297
6298 Register VData = MI.getOperand(1).getReg();
6299 LLT Ty = MRI.getType(VData);
6300 LLT EltTy = Ty.getScalarType();
6301 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6302 const LLT S32 = LLT::scalar(32);
6303
6304 MachineMemOperand *MMO = *MI.memoperands_begin();
6305 const int MemSize = MMO->getSize().getValue();
6306 LLT MemTy = MMO->getMemoryType();
6307
6308 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6309
6311 Register RSrc = MI.getOperand(2).getReg();
6312
6313 unsigned ImmOffset;
6314
6315 // The typed intrinsics add an immediate after the registers.
6316 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6317
6318 // The struct intrinsic variants add one additional operand over raw.
6319 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6320 Register VIndex;
6321 int OpOffset = 0;
6322 if (HasVIndex) {
6323 VIndex = MI.getOperand(3).getReg();
6324 OpOffset = 1;
6325 } else {
6326 VIndex = B.buildConstant(S32, 0).getReg(0);
6327 }
6328
6329 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6330 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6331
6332 unsigned Format = 0;
6333 if (IsTyped) {
6334 Format = MI.getOperand(5 + OpOffset).getImm();
6335 ++OpOffset;
6336 }
6337
6338 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6339
6340 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6341
6342 unsigned Opc;
6343 if (IsTyped) {
6344 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6345 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6346 } else if (IsFormat) {
6347 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6348 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6349 } else {
6350 switch (MemSize) {
6351 case 1:
6352 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6353 break;
6354 case 2:
6355 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6356 break;
6357 default:
6358 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6359 break;
6360 }
6361 }
6362
6363 auto MIB = B.buildInstr(Opc)
6364 .addUse(VData) // vdata
6365 .addUse(RSrc) // rsrc
6366 .addUse(VIndex) // vindex
6367 .addUse(VOffset) // voffset
6368 .addUse(SOffset) // soffset
6369 .addImm(ImmOffset); // offset(imm)
6370
6371 if (IsTyped)
6372 MIB.addImm(Format);
6373
6374 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6375 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6376 .addMemOperand(MMO);
6377
6378 MI.eraseFromParent();
6379 return true;
6380}
6381
6382static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6383 Register VIndex, Register VOffset, Register SOffset,
6384 unsigned ImmOffset, unsigned Format,
6385 unsigned AuxiliaryData, MachineMemOperand *MMO,
6386 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6387 auto MIB = B.buildInstr(Opc)
6388 .addDef(LoadDstReg) // vdata
6389 .addUse(RSrc) // rsrc
6390 .addUse(VIndex) // vindex
6391 .addUse(VOffset) // voffset
6392 .addUse(SOffset) // soffset
6393 .addImm(ImmOffset); // offset(imm)
6394
6395 if (IsTyped)
6396 MIB.addImm(Format);
6397
6398 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6399 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6400 .addMemOperand(MMO);
6401}
6402
6403 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
6404 LegalizerHelper &Helper,
6405 bool IsFormat,
6406 bool IsTyped) const {
6407 MachineIRBuilder &B = Helper.MIRBuilder;
6408 MachineRegisterInfo &MRI = *B.getMRI();
6409 GISelChangeObserver &Observer = Helper.Observer;
6410
6411 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6412 MachineMemOperand *MMO = *MI.memoperands_begin();
6413 const LLT MemTy = MMO->getMemoryType();
6414 const LLT S32 = LLT::scalar(32);
6415
6416 Register Dst = MI.getOperand(0).getReg();
6417
6418 Register StatusDst;
6419 int OpOffset = 0;
6420 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6421 bool IsTFE = MI.getNumExplicitDefs() == 2;
6422 if (IsTFE) {
6423 StatusDst = MI.getOperand(1).getReg();
6424 ++OpOffset;
6425 }
6426
6427 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6428 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6429
6430 // The typed intrinsics add an immediate after the registers.
6431 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6432
6433 // The struct intrinsic variants add one additional operand over raw.
6434 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6435 Register VIndex;
6436 if (HasVIndex) {
6437 VIndex = MI.getOperand(3 + OpOffset).getReg();
6438 ++OpOffset;
6439 } else {
6440 VIndex = B.buildConstant(S32, 0).getReg(0);
6441 }
6442
6443 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6444 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6445
6446 unsigned Format = 0;
6447 if (IsTyped) {
6448 Format = MI.getOperand(5 + OpOffset).getImm();
6449 ++OpOffset;
6450 }
6451
6452 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6453 unsigned ImmOffset;
6454
6455 LLT Ty = MRI.getType(Dst);
6456 // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
6457 // logic doesn't have to handle that case.
6458 if (hasBufferRsrcWorkaround(Ty)) {
6459 Observer.changingInstr(MI);
6460 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6461 Observer.changedInstr(MI);
6462 Dst = MI.getOperand(0).getReg();
6463 B.setInsertPt(B.getMBB(), MI);
6464 }
6465 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6466 Ty = getBitcastRegisterType(Ty);
6467 Observer.changingInstr(MI);
6468 Helper.bitcastDst(MI, Ty, 0);
6469 Observer.changedInstr(MI);
6470 Dst = MI.getOperand(0).getReg();
6471 B.setInsertPt(B.getMBB(), MI);
6472 }
6473
6474 LLT EltTy = Ty.getScalarType();
6475 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6476 const bool Unpacked = ST.hasUnpackedD16VMem();
6477
6478 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6479
6480 unsigned Opc;
6481
6482 // TODO: Support TFE for typed and narrow loads.
6483 if (IsTyped) {
6484 if (IsTFE)
6485 return false;
6486 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6487 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6488 } else if (IsFormat) {
6489 if (IsD16) {
6490 if (IsTFE)
6491 return false;
6492 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6493 } else {
6494 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6495 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6496 }
6497 } else {
6498 switch (MemTy.getSizeInBits()) {
6499 case 8:
6500 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6501 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6502 break;
6503 case 16:
6504 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6505 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6506 break;
6507 default:
6508 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6509 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6510 break;
6511 }
6512 }
6513
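 // TFE loads return an extra status dword alongside the data, so the load is
 // emitted into a wider temporary vector and the status word is unmerged from
 // the value afterwards; sub-dword results additionally go through a trunc.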
6514 if (IsTFE) {
6515 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6516 unsigned NumLoadDWords = NumValueDWords + 1;
6517 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6518 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6519 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6520 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6521 if (MemTy.getSizeInBits() < 32) {
6522 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6523 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6524 B.buildTrunc(Dst, ExtDst);
6525 } else if (NumValueDWords == 1) {
6526 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6527 } else {
6528 SmallVector<Register, 5> LoadElts;
6529 for (unsigned I = 0; I != NumValueDWords; ++I)
6530 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6531 LoadElts.push_back(StatusDst);
6532 B.buildUnmerge(LoadElts, LoadDstReg);
6533 LoadElts.truncate(NumValueDWords);
6534 B.buildMergeLikeInstr(Dst, LoadElts);
6535 }
6536 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6537 (IsD16 && !Ty.isVector())) {
6538 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6539 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6540 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6541 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6542 B.buildTrunc(Dst, LoadDstReg);
6543 } else if (Unpacked && IsD16 && Ty.isVector()) {
6544 LLT UnpackedTy = Ty.changeElementSize(32);
6545 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6546 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6547 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6548 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6549 // FIXME: G_TRUNC should work, but legalization currently fails
6550 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6551 SmallVector<Register, 4> Repack;
6552 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6553 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6554 B.buildMergeLikeInstr(Dst, Repack);
6555 } else {
6556 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6557 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6558 }
6559
6560 MI.eraseFromParent();
6561 return true;
6562}
6563
6564static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6565 switch (IntrID) {
6566 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6567 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6568 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6569 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6570 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6571 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6572 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6573 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6574 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6575 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6576 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6577 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6578 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6579 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6580 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6581 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6582 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6583 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6584 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6585 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6586 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6587 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6588 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6589 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6590 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6591 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6592 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6593 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6594 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6595 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6596 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6597 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6598 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6599 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6600 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6601 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6602 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6603 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6605 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6606 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6607 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6608 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6609 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6610 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6611 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6612 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6613 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6614 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6615 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6616 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6617 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6618 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6619 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6620 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6621 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6622 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6623 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6624 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6625 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6626 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6627 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6628 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6629 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6630 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6631 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6632 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6633 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6634 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6635 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6636 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6637 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6638 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6639 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6640 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6641 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6642 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6643 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6644 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6645 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6646 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6647 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6648 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6649 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6650 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6651 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6653 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6654 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6655 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6656 default:
6657 llvm_unreachable("unhandled atomic opcode");
6658 }
6659}
6660
6661 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6662 MachineIRBuilder &B,
6663 Intrinsic::ID IID) const {
6664 const bool IsCmpSwap =
6665 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6666 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6667 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6668 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6669
6670 Register Dst = MI.getOperand(0).getReg();
6671 // Since we don't have 128-bit atomics, we don't need to handle the case of
6672 // p8 arguments to the atomic itself.
6673 Register VData = MI.getOperand(2).getReg();
6674
6675 Register CmpVal;
6676 int OpOffset = 0;
6677
6678 if (IsCmpSwap) {
6679 CmpVal = MI.getOperand(3).getReg();
6680 ++OpOffset;
6681 }
6682
6683 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6684 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6685 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6686
6687 // The struct intrinsic variants add one additional operand over raw.
6688 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6689 Register VIndex;
6690 if (HasVIndex) {
6691 VIndex = MI.getOperand(4 + OpOffset).getReg();
6692 ++OpOffset;
6693 } else {
6694 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6695 }
6696
6697 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6698 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6699 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6700
6701 MachineMemOperand *MMO = *MI.memoperands_begin();
6702
6703 unsigned ImmOffset;
6704 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6705
6706 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6707 .addDef(Dst)
6708 .addUse(VData); // vdata
6709
6710 if (IsCmpSwap)
6711 MIB.addReg(CmpVal);
6712
6713 MIB.addUse(RSrc) // rsrc
6714 .addUse(VIndex) // vindex
6715 .addUse(VOffset) // voffset
6716 .addUse(SOffset) // soffset
6717 .addImm(ImmOffset) // offset(imm)
6718 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6719 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6720 .addMemOperand(MMO);
6721
6722 MI.eraseFromParent();
6723 return true;
6724}
6725
6726/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6727/// vector with s16 typed elements.
6728 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6729 SmallVectorImpl<Register> &PackedAddrs,
6730 unsigned ArgOffset,
6731 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6732 bool IsA16, bool IsG16) {
6733 const LLT S16 = LLT::scalar(16);
6734 const LLT V2S16 = LLT::fixed_vector(2, 16);
6735 auto EndIdx = Intr->VAddrEnd;
6736
6737 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6738 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6739 if (!SrcOp.isReg())
6740 continue; // _L to _LZ may have eliminated this.
6741
6742 Register AddrReg = SrcOp.getReg();
6743
6744 if ((I < Intr->GradientStart) ||
6745 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6746 (I >= Intr->CoordStart && !IsA16)) {
6747 if ((I < Intr->GradientStart) && IsA16 &&
6748 (B.getMRI()->getType(AddrReg) == S16)) {
6749 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6750 // Special handling of bias when A16 is on. Bias is of type half but
6751 // occupies a full 32 bits.
6752 PackedAddrs.push_back(
6753 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6754 .getReg(0));
6755 } else {
6756 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6757 "Bias needs to be converted to 16 bit in A16 mode");
6758 // Handle any gradient or coordinate operands that should not be packed
6759 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6760 PackedAddrs.push_back(AddrReg);
6761 }
6762 } else {
6763 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6764 // derivatives dx/dh and dx/dv are packed with undef.
6765 if (((I + 1) >= EndIdx) ||
6766 ((Intr->NumGradients / 2) % 2 == 1 &&
6767 (I == static_cast<unsigned>(Intr->GradientStart +
6768 (Intr->NumGradients / 2) - 1) ||
6769 I == static_cast<unsigned>(Intr->GradientStart +
6770 Intr->NumGradients - 1))) ||
6771 // Check for _L to _LZ optimization
6772 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6773 PackedAddrs.push_back(
6774 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6775 .getReg(0));
6776 } else {
6777 PackedAddrs.push_back(
6778 B.buildBuildVector(
6779 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6780 .getReg(0));
6781 ++I;
6782 }
6783 }
6784 }
6785}
6786
6787/// Convert from separate vaddr components to a single vector address register,
6788/// and replace the remaining operands with $noreg.
6789 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6790 int DimIdx, int NumVAddrs) {
6791 const LLT S32 = LLT::scalar(32);
6792 (void)S32;
6793 SmallVector<Register, 8> AddrRegs;
6794 for (int I = 0; I != NumVAddrs; ++I) {
6795 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6796 if (SrcOp.isReg()) {
6797 AddrRegs.push_back(SrcOp.getReg());
6798 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6799 }
6800 }
6801
6802 int NumAddrRegs = AddrRegs.size();
6803 if (NumAddrRegs != 1) {
6804 auto VAddr =
6805 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6806 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6807 }
6808
6809 for (int I = 1; I != NumVAddrs; ++I) {
6810 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6811 if (SrcOp.isReg())
6812 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6813 }
6814}
6815
6816/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6817///
6818/// Depending on the subtarget, load/store with 16-bit element data need to be
6819/// rewritten to use the low half of 32-bit registers, or directly use a packed
6820/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6821/// registers.
6822///
6823/// We don't want to directly select image instructions just yet, but also want
6824 /// to expose all register repacking to the legalizer/combiners. We also don't
6825/// want a selected instruction entering RegBankSelect. In order to avoid
6826/// defining a multitude of intermediate image instructions, directly hack on
6827/// the intrinsic's arguments. In cases like a16 addresses, this requires
6828/// padding now unnecessary arguments with $noreg.
6829 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6830 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6831 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6832
6833 const MachineFunction &MF = *MI.getMF();
6834 const unsigned NumDefs = MI.getNumExplicitDefs();
6835 const unsigned ArgOffset = NumDefs + 1;
6836 bool IsTFE = NumDefs == 2;
6837 // We are only processing the operands of d16 image operations on subtargets
6838 // that use the unpacked register layout, or need to repack the TFE result.
6839
6840 // TODO: Do we need to guard against already legalized intrinsics?
6841 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6842 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6843
6844 MachineRegisterInfo *MRI = B.getMRI();
6845 const LLT S32 = LLT::scalar(32);
6846 const LLT S16 = LLT::scalar(16);
6847 const LLT V2S16 = LLT::fixed_vector(2, 16);
6848
6849 unsigned DMask = 0;
6850 Register VData;
6851 LLT Ty;
6852
6853 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6854 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6855 Ty = MRI->getType(VData);
6856 }
6857
6858 const bool IsAtomicPacked16Bit =
6859 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6860 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6861
6862 // Check for 16 bit addresses and pack if true.
6863 LLT GradTy =
6864 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6865 LLT AddrTy =
6866 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6867 const bool IsG16 =
6868 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6869 const bool IsA16 = AddrTy == S16;
6870 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6871
6872 int DMaskLanes = 0;
6873 if (!BaseOpcode->Atomic) {
6874 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6875 if (BaseOpcode->Gather4) {
6876 DMaskLanes = 4;
6877 } else if (DMask != 0) {
6878 DMaskLanes = llvm::popcount(DMask);
6879 } else if (!IsTFE && !BaseOpcode->Store) {
6880 // If dmask is 0, this is a no-op load. This can be eliminated.
6881 B.buildUndef(MI.getOperand(0));
6882 MI.eraseFromParent();
6883 return true;
6884 }
6885 }
6886
6887 Observer.changingInstr(MI);
6888 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
6889
6890 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6891 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6892 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6893 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6894 unsigned NewOpcode = LoadOpcode;
6895 if (BaseOpcode->Store)
6896 NewOpcode = StoreOpcode;
6897 else if (BaseOpcode->NoReturn)
6898 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6899
6900 // Track that we legalized this
6901 MI.setDesc(B.getTII().get(NewOpcode));
6902
6903 // We expect to get an error flag since TFE is on and dmask is 0. Force
6904 // dmask to be at least 1, otherwise the instruction will fail.
6905 if (IsTFE && DMask == 0) {
6906 DMask = 0x1;
6907 DMaskLanes = 1;
6908 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6909 }
6910
6911 if (BaseOpcode->Atomic) {
6912 Register VData0 = MI.getOperand(2).getReg();
6913 LLT Ty = MRI->getType(VData0);
6914
6915 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6916 if (Ty.isVector() && !IsAtomicPacked16Bit)
6917 return false;
6918
6919 if (BaseOpcode->AtomicX2) {
6920 Register VData1 = MI.getOperand(3).getReg();
6921 // The two values are packed in one register.
6922 LLT PackedTy = LLT::fixed_vector(2, Ty);
6923 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6924 MI.getOperand(2).setReg(Concat.getReg(0));
6925 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6926 }
6927 }
6928
6929 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6930
6931 // Rewrite the addressing register layout before doing anything else.
6932 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6933 // 16-bit gradients are supported, but are tied to the A16 control,
6934 // so both gradients and addresses must be 16-bit.
6935 return false;
6936 }
6937
6938 if (IsA16 && !ST.hasA16()) {
6939 // A16 not supported
6940 return false;
6941 }
6942
6943 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6944 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
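// NSA (non-sequential address) encodings let each address operand live in an
// independently allocated VGPR instead of one contiguous register tuple. With
// partial NSA, only the addresses from NSAMaxSize - 1 onwards need to be packed
// into the final contiguous vaddr operand.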
6945
6946 if (IsA16 || IsG16) {
6947 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6948 // instructions expect VGPR_32
6949 SmallVector<Register, 4> PackedRegs;
6950
6951 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6952
6953 // See also below in the non-a16 branch
6954 const bool UseNSA = ST.hasNSAEncoding() &&
6955 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6956 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6957 const bool UsePartialNSA =
6958 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6959
6960 if (UsePartialNSA) {
6961 // Pack registers that would go over NSAMaxSize into the last VAddr register.
6962 LLT PackedAddrTy =
6963 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6964 auto Concat = B.buildConcatVectors(
6965 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6966 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6967 PackedRegs.resize(NSAMaxSize);
6968 } else if (!UseNSA && PackedRegs.size() > 1) {
6969 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6970 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6971 PackedRegs[0] = Concat.getReg(0);
6972 PackedRegs.resize(1);
6973 }
6974
6975 const unsigned NumPacked = PackedRegs.size();
6976 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6977 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6978 if (!SrcOp.isReg()) {
6979 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6980 continue;
6981 }
6982
6983 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6984
6985 if (I - Intr->VAddrStart < NumPacked)
6986 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6987 else
6988 SrcOp.setReg(AMDGPU::NoRegister);
6989 }
6990 } else {
6991 // If the register allocator cannot place the address registers contiguously
6992 // without introducing moves, then using the non-sequential address encoding
6993 // is always preferable, since it saves VALU instructions and is usually a
6994 // wash in terms of code size or even better.
6995 //
6996 // However, we currently have no way of hinting to the register allocator
6997 // that MIMG addresses should be placed contiguously when it is possible to
6998 // do so, so force non-NSA for the common 2-address case as a heuristic.
6999 //
7000 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7001 // allocation when possible.
7002 //
7003 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7004 // set of the remaining addresses.
7005 const bool UseNSA = ST.hasNSAEncoding() &&
7006 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7007 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7008 const bool UsePartialNSA =
7009 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7010
7011 if (UsePartialNSA) {
7012 convertImageAddrToPacked(B, MI,
7013 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7014 Intr->NumVAddrs - NSAMaxSize + 1);
7015 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7016 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7017 Intr->NumVAddrs);
7018 }
7019 }
7020
7021 int Flags = 0;
7022 if (IsA16)
7023 Flags |= 1;
7024 if (IsG16)
7025 Flags |= 2;
7026 MI.addOperand(MachineOperand::CreateImm(Flags));
7027
7028 if (BaseOpcode->NoReturn) { // No TFE for stores?
7029 // TODO: Handle dmask trim
7030 if (!Ty.isVector() || !IsD16)
7031 return true;
7032
7033 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7034 if (RepackedReg != VData) {
7035 MI.getOperand(1).setReg(RepackedReg);
7036 }
7037
7038 return true;
7039 }
7040
7041 Register DstReg = MI.getOperand(0).getReg();
7042 const LLT EltTy = Ty.getScalarType();
7043 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7044
7045 // Confirm that the return type is large enough for the dmask specified
7046 if (NumElts < DMaskLanes)
7047 return false;
7048
7049 if (NumElts > 4 || DMaskLanes > 4)
7050 return false;
7051
7052 // Image atomic instructions use DMask to specify how many bits of
7053 // input/output data they have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
7054 // DMaskLanes defaults to '0' for image atomics.
7055 // We must make sure that atomic variants (especially packed ones) are not
7056 // truncated from v2s16 or v4s16 to s16.
7057 //
7058 // changeElementCount will be needed for image loads, where Ty is always scalar.
7059 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7060 const LLT AdjustedTy =
7061 DMaskLanes == 0
7062 ? Ty
7063 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7064
7065 // The raw dword-aligned data component of the load. The only legal cases
7066 // where this matters should be when using the packed D16 format, for
7067 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
7068 LLT RoundedTy;
7069
7070 // S32 vector to cover all data, plus TFE result element.
7071 LLT TFETy;
7072
7073 // Register type to use for each loaded component. Will be S32 or V2S16.
7074 LLT RegTy;
7075
7076 if (IsD16 && ST.hasUnpackedD16VMem()) {
7077 RoundedTy =
7078 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7079 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7080 RegTy = S32;
7081 } else {
7082 unsigned EltSize = EltTy.getSizeInBits();
7083 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7084 unsigned RoundedSize = 32 * RoundedElts;
7085 RoundedTy = LLT::scalarOrVector(
7086 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7087 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7088 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7089 }
7090
7091 // The return type does not need adjustment.
7092 // TODO: Should we change s16 case to s32 or <2 x s16>?
7093 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7094 return true;
7095
7096 Register Dst1Reg;
7097
7098 // Insert after the instruction.
7099 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7100
7101 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7102 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7103 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7104 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7105
7106 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7107
7108 MI.getOperand(0).setReg(NewResultReg);
7109
7110 // In the IR, TFE is supposed to be used with a 2 element struct return
7111 // type. The instruction really returns these two values in one contiguous
7112 // register, with one additional dword beyond the loaded data. Rewrite the
7113 // return type to use a single register result.
7114
7115 if (IsTFE) {
7116 Dst1Reg = MI.getOperand(1).getReg();
7117 if (MRI->getType(Dst1Reg) != S32)
7118 return false;
7119
7120 // TODO: Make sure the TFE operand bit is set.
7121 MI.removeOperand(1);
7122
7123 // Handle the easy case that requires no repack instructions.
7124 if (Ty == S32) {
7125 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7126 return true;
7127 }
7128 }
7129
7130 // Now figure out how to copy the new result register back into the old
7131 // result.
7132 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7133
7134 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7135
7136 if (ResultNumRegs == 1) {
7137 assert(!IsTFE);
7138 ResultRegs[0] = NewResultReg;
7139 } else {
7140 // We have to repack into a new vector of some kind.
7141 for (int I = 0; I != NumDataRegs; ++I)
7142 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7143 B.buildUnmerge(ResultRegs, NewResultReg);
7144
7145 // Drop the final TFE element to get the data part. The TFE result is
7146 // directly written to the right place already.
7147 if (IsTFE)
7148 ResultRegs.resize(NumDataRegs);
7149 }
7150
7151 // For an s16 scalar result, we form an s32 result with a truncate regardless
7152 // of packed vs. unpacked.
7153 if (IsD16 && !Ty.isVector()) {
7154 B.buildTrunc(DstReg, ResultRegs[0]);
7155 return true;
7156 }
7157
7158 // Avoid a build/concat_vector of 1 entry.
7159 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7160 B.buildBitcast(DstReg, ResultRegs[0]);
7161 return true;
7162 }
7163
7164 assert(Ty.isVector());
7165
7166 if (IsD16) {
7167 // For packed D16 results with TFE enabled, all the data components are
7168 // S32. Cast back to the expected type.
7169 //
7170 // TODO: We don't really need to load s32 elements. We would only need one
7171 // cast for the TFE result if a multiple of v2s16 were used.
7172 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7173 for (Register &Reg : ResultRegs)
7174 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7175 } else if (ST.hasUnpackedD16VMem()) {
7176 for (Register &Reg : ResultRegs)
7177 Reg = B.buildTrunc(S16, Reg).getReg(0);
7178 }
7179 }
7180
7181 auto padWithUndef = [&](LLT Ty, int NumElts) {
7182 if (NumElts == 0)
7183 return;
7184 Register Undef = B.buildUndef(Ty).getReg(0);
7185 for (int I = 0; I != NumElts; ++I)
7186 ResultRegs.push_back(Undef);
7187 };
7188
7189 // Pad out any elements eliminated due to the dmask.
7190 LLT ResTy = MRI->getType(ResultRegs[0]);
7191 if (!ResTy.isVector()) {
7192 padWithUndef(ResTy, NumElts - ResultRegs.size());
7193 B.buildBuildVector(DstReg, ResultRegs);
7194 return true;
7195 }
7196
7197 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7198 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7199
7200 // Deal with the one annoying legal case.
7201 const LLT V3S16 = LLT::fixed_vector(3, 16);
7202 if (Ty == V3S16) {
7203 if (IsTFE) {
7204 if (ResultRegs.size() == 1) {
7205 NewResultReg = ResultRegs[0];
7206 } else if (ResultRegs.size() == 2) {
7207 LLT V4S16 = LLT::fixed_vector(4, 16);
7208 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7209 } else {
7210 return false;
7211 }
7212 }
7213
7214 if (MRI->getType(DstReg).getNumElements() <
7215 MRI->getType(NewResultReg).getNumElements()) {
7216 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7217 } else {
7218 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7219 }
7220 return true;
7221 }
7222
7223 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7224 B.buildConcatVectors(DstReg, ResultRegs);
7225 return true;
7226}
7227
7228 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7229 MachineInstr &MI) const {
7230 MachineIRBuilder &B = Helper.MIRBuilder;
7231 GISelChangeObserver &Observer = Helper.Observer;
7232
7233 Register OrigDst = MI.getOperand(0).getReg();
7234 Register Dst;
7235 LLT Ty = B.getMRI()->getType(OrigDst);
7236 unsigned Size = Ty.getSizeInBits();
7237 MachineFunction &MF = B.getMF();
7238 unsigned Opc = 0;
7239 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7240 assert(Size == 8 || Size == 16);
7241 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7242 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7243 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
7244 // destination register.
7245 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7246 } else {
7247 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7248 Dst = OrigDst;
7249 }
7250
7251 Observer.changingInstr(MI);
7252
7253 // Handle needing to s.buffer.load() a p8 value.
7254 if (hasBufferRsrcWorkaround(Ty)) {
7255 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7256 B.setInsertPt(B.getMBB(), MI);
7257 }
7258 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
7259 Ty = getBitcastRegisterType(Ty);
7260 Helper.bitcastDst(MI, Ty, 0);
7261 B.setInsertPt(B.getMBB(), MI);
7262 }
7263
7264 // FIXME: We don't really need this intermediate instruction. The intrinsic
7265 // should be fixed to have a memory operand. Since it's readnone, we're not
7266 // allowed to add one.
7267 MI.setDesc(B.getTII().get(Opc));
7268 MI.removeOperand(1); // Remove intrinsic ID
7269
7270 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7271 const unsigned MemSize = (Size + 7) / 8;
7272 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7278 MemSize, MemAlign);
7279 MI.addMemOperand(MF, MMO);
7280 if (Dst != OrigDst) {
7281 MI.getOperand(0).setReg(Dst);
7282 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7283 B.buildTrunc(OrigDst, Dst);
7284 }
7285
7286 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7287 // always be legal. We may need to restore this to a 96-bit result if it turns
7288 // out this needs to be converted to a vector load during RegBankSelect.
7289 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7290 if (Ty.isVector())
7291 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
7292 else
7293 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7294 }
7295
7296 Observer.changedInstr(MI);
7297 return true;
7298}
7299
7300 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7301 MachineInstr &MI) const {
7302 MachineIRBuilder &B = Helper.MIRBuilder;
7303 GISelChangeObserver &Observer = Helper.Observer;
7304 Observer.changingInstr(MI);
7305 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7306 MI.removeOperand(0); // Remove intrinsic ID
7308 Observer.changedInstr(MI);
7309 return true;
7310}
7311
7312// TODO: Move to selection
7313 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
7314 MachineRegisterInfo &MRI,
7315 MachineIRBuilder &B) const {
7316 if (!ST.isTrapHandlerEnabled() ||
7317 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7318 return legalizeTrapEndpgm(MI, MRI, B);
7319
7320 return ST.supportsGetDoorbellID() ?
7322}
7323
7324 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
7325 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7326 const DebugLoc &DL = MI.getDebugLoc();
7327 MachineBasicBlock &BB = B.getMBB();
7328 MachineFunction *MF = BB.getParent();
7329
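// If the trap is already the final instruction of a block with no successors,
// it can simply be turned into S_ENDPGM in place.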
7330 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7331 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7332 .addImm(0);
7333 MI.eraseFromParent();
7334 return true;
7335 }
7336
7337 // We need a block split to make the real endpgm a terminator. We also don't
7338 // want to break phis in successor blocks, so we can't just delete to the
7339 // end of the block.
7340 BB.splitAt(MI, false /*UpdateLiveIns*/);
7341 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7342 MF->push_back(TrapBB);
7343 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7344 .addImm(0);
7345 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7346 .addMBB(TrapBB);
7347
7348 BB.addSuccessor(TrapBB);
7349 MI.eraseFromParent();
7350 return true;
7351}
7352
7353 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
7354 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
7355 MachineFunction &MF = B.getMF();
7356 const LLT S64 = LLT::scalar(64);
7357
7358 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7359 // For code object version 5, queue_ptr is passed through implicit kernarg.
7365 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7366
7367 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7369
7370 if (!loadInputValue(KernargPtrReg, B,
7372 return false;
7373
7374 // TODO: can we be smarter about machine pointer info?
7377 PtrInfo.getWithOffset(Offset),
7381
7382 // Pointer address
7383 Register LoadAddr = MRI.createGenericVirtualRegister(
7385 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7386 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7387 // Load address
7388 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7389 B.buildCopy(SGPR01, Temp);
7390 B.buildInstr(AMDGPU::S_TRAP)
7391 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7392 .addReg(SGPR01, RegState::Implicit);
7393 MI.eraseFromParent();
7394 return true;
7395 }
7396
7397 // Pass queue pointer to trap handler as input, and insert trap instruction
7398 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7399 Register LiveIn =
7400 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7402 return false;
7403
7404 B.buildCopy(SGPR01, LiveIn);
7405 B.buildInstr(AMDGPU::S_TRAP)
7406 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7407 .addReg(SGPR01, RegState::Implicit);
7408
7409 MI.eraseFromParent();
7410 return true;
7411}
7412
7413 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7414 MachineRegisterInfo &MRI,
7415 MachineIRBuilder &B) const {
7416 // We need to simulate the 's_trap 2' instruction on targets that run in
7417 // PRIV=1 (where it is treated as a nop).
7418 if (ST.hasPrivEnabledTrap2NopBug()) {
7419 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7420 MI.getDebugLoc());
7421 MI.eraseFromParent();
7422 return true;
7423 }
7424
7425 B.buildInstr(AMDGPU::S_TRAP)
7426 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7427 MI.eraseFromParent();
7428 return true;
7429}
7430
7431 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7432 MachineRegisterInfo &MRI,
7433 MachineIRBuilder &B) const {
7434 // If this is a non-HSA path or the trap handler is disabled, report a
7435 // warning accordingly.
7436 if (!ST.isTrapHandlerEnabled() ||
7437 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7438 Function &Fn = B.getMF().getFunction();
7439 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7440 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7441 } else {
7442 // Insert debug-trap instruction
7443 B.buildInstr(AMDGPU::S_TRAP)
7444 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7445 }
7446
7447 MI.eraseFromParent();
7448 return true;
7449}
7450
7451 bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7452 MachineInstr &MI, MachineIRBuilder &B) const {
7453 MachineRegisterInfo &MRI = *B.getMRI();
7454 const LLT S16 = LLT::scalar(16);
7455 const LLT S32 = LLT::scalar(32);
7456 const LLT V2S16 = LLT::fixed_vector(2, 16);
7457 const LLT V3S32 = LLT::fixed_vector(3, 32);
7458
7459 Register DstReg = MI.getOperand(0).getReg();
7460 Register NodePtr = MI.getOperand(2).getReg();
7461 Register RayExtent = MI.getOperand(3).getReg();
7462 Register RayOrigin = MI.getOperand(4).getReg();
7463 Register RayDir = MI.getOperand(5).getReg();
7464 Register RayInvDir = MI.getOperand(6).getReg();
7465 Register TDescr = MI.getOperand(7).getReg();
7466
7467 if (!ST.hasGFX10_AEncoding()) {
7468 Function &Fn = B.getMF().getFunction();
7469 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7470 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7471 return false;
7472 }
7473
7474 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7475 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7476 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7477 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7478 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7479 const unsigned NumVDataDwords = 4;
7480 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
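// Address dword count: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
// ray_dir and ray_inv_dir (3 dwords total when a16, otherwise 3 each).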
7481 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7482 const bool UseNSA =
7483 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7484
7485 const unsigned BaseOpcodes[2][2] = {
7486 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7487 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7488 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7489 int Opcode;
7490 if (UseNSA) {
7491 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7492 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7493 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7494 : AMDGPU::MIMGEncGfx10NSA,
7495 NumVDataDwords, NumVAddrDwords);
7496 } else {
7497 assert(!IsGFX12Plus);
7498 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7499 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7500 : AMDGPU::MIMGEncGfx10Default,
7501 NumVDataDwords, NumVAddrDwords);
7502 }
7503 assert(Opcode != -1);
7504
7506 if (UseNSA && IsGFX11Plus) {
7507 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7508 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7509 auto Merged = B.buildMergeLikeInstr(
7510 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7511 Ops.push_back(Merged.getReg(0));
7512 };
7513
7514 Ops.push_back(NodePtr);
7515 Ops.push_back(RayExtent);
7516 packLanes(RayOrigin);
7517
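// For a16, each of the three remaining address dwords pairs one 16-bit
// ray_dir component with the matching 16-bit ray_inv_dir component.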
7518 if (IsA16) {
7519 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7520 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7521 auto MergedDir = B.buildMergeLikeInstr(
7522 V3S32,
7523 {B.buildBitcast(
7524 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7525 UnmergeRayDir.getReg(0)}))
7526 .getReg(0),
7527 B.buildBitcast(
7528 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7529 UnmergeRayDir.getReg(1)}))
7530 .getReg(0),
7531 B.buildBitcast(
7532 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7533 UnmergeRayDir.getReg(2)}))
7534 .getReg(0)});
7535 Ops.push_back(MergedDir.getReg(0));
7536 } else {
7537 packLanes(RayDir);
7538 packLanes(RayInvDir);
7539 }
7540 } else {
7541 if (Is64) {
7542 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7543 Ops.push_back(Unmerge.getReg(0));
7544 Ops.push_back(Unmerge.getReg(1));
7545 } else {
7546 Ops.push_back(NodePtr);
7547 }
7548 Ops.push_back(RayExtent);
7549
7550 auto packLanes = [&Ops, &S32, &B](Register Src) {
7551 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7552 Ops.push_back(Unmerge.getReg(0));
7553 Ops.push_back(Unmerge.getReg(1));
7554 Ops.push_back(Unmerge.getReg(2));
7555 };
7556
7557 packLanes(RayOrigin);
7558 if (IsA16) {
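// Without the gfx11 operand layout, the six 16-bit dir/inv_dir components
// are packed consecutively into three 32-bit registers.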
7559 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7560 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7561 Register R1 = MRI.createGenericVirtualRegister(S32);
7562 Register R2 = MRI.createGenericVirtualRegister(S32);
7563 Register R3 = MRI.createGenericVirtualRegister(S32);
7564 B.buildMergeLikeInstr(R1,
7565 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7566 B.buildMergeLikeInstr(
7567 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7568 B.buildMergeLikeInstr(
7569 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7570 Ops.push_back(R1);
7571 Ops.push_back(R2);
7572 Ops.push_back(R3);
7573 } else {
7574 packLanes(RayDir);
7575 packLanes(RayInvDir);
7576 }
7577 }
7578
7579 if (!UseNSA) {
7580 // Build a single vector containing all the operands prepared so far.
7581 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7582 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7583 Ops.clear();
7584 Ops.push_back(MergedOps);
7585 }
7586
7587 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7588 .addDef(DstReg)
7589 .addImm(Opcode);
7590
7591 for (Register R : Ops) {
7592 MIB.addUse(R);
7593 }
7594
7595 MIB.addUse(TDescr)
7596 .addImm(IsA16 ? 1 : 0)
7597 .cloneMemRefs(MI);
7598
7599 MI.eraseFromParent();
7600 return true;
7601}
7602
7603 bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7604 MachineInstr &MI, MachineIRBuilder &B) const {
7605 const LLT S32 = LLT::scalar(32);
7606 const LLT V2S32 = LLT::fixed_vector(2, 32);
7607
7608 Register DstReg = MI.getOperand(0).getReg();
7609 Register DstOrigin = MI.getOperand(1).getReg();
7610 Register DstDir = MI.getOperand(2).getReg();
7611 Register NodePtr = MI.getOperand(4).getReg();
7612 Register RayExtent = MI.getOperand(5).getReg();
7613 Register InstanceMask = MI.getOperand(6).getReg();
7614 Register RayOrigin = MI.getOperand(7).getReg();
7615 Register RayDir = MI.getOperand(8).getReg();
7616 Register Offsets = MI.getOperand(9).getReg();
7617 Register TDescr = MI.getOperand(10).getReg();
7618
7619 if (!ST.hasBVHDualAndBVH8Insts()) {
7620 Function &Fn = B.getMF().getFunction();
7621 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7622 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7623 return false;
7624 }
7625
7626 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7627 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7628 const unsigned NumVDataDwords = 10;
7629 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7630 int Opcode = AMDGPU::getMIMGOpcode(
7631 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7632 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7633 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7634 assert(Opcode != -1);
7635
7636 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7637 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7638
7639 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7640 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7641 .addDef(DstReg)
7642 .addDef(DstOrigin)
7643 .addDef(DstDir)
7644 .addImm(Opcode)
7645 .addUse(NodePtr)
7646 .addUse(RayExtentInstanceMaskVec.getReg(0))
7647 .addUse(RayOrigin)
7648 .addUse(RayDir)
7649 .addUse(Offsets)
7650 .addUse(TDescr)
7651 .cloneMemRefs(MI);
7652
7653 MI.eraseFromParent();
7654 return true;
7655}
7656
7658 MachineIRBuilder &B) const {
7659 const SITargetLowering *TLI = ST.getTargetLowering();
7660 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7661 Register DstReg = MI.getOperand(0).getReg();
7662 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7663 MI.eraseFromParent();
7664 return true;
7665}
7666
7667 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7668 MachineIRBuilder &B) const {
7669 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7670 if (!ST.hasArchitectedSGPRs())
7671 return false;
7672 LLT S32 = LLT::scalar(32);
7673 Register DstReg = MI.getOperand(0).getReg();
7674 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7675 auto LSB = B.buildConstant(S32, 25);
7676 auto Width = B.buildConstant(S32, 5);
7677 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7678 MI.eraseFromParent();
7679 return true;
7680}
7681
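// Lower a read of a constant hardware register to S_GETREG_B32_const with the
// (register id, low bit, width) field encoded as a single immediate.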
7684 AMDGPU::Hwreg::Id HwReg,
7685 unsigned LowBit,
7686 unsigned Width) const {
7687 MachineRegisterInfo &MRI = *B.getMRI();
7688 Register DstReg = MI.getOperand(0).getReg();
7689 if (!MRI.getRegClassOrNull(DstReg))
7690 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7691 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7692 .addDef(DstReg)
7693 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7694 MI.eraseFromParent();
7695 return true;
7696}
7697
7698static constexpr unsigned FPEnvModeBitField =
7700
7701static constexpr unsigned FPEnvTrapBitField =
7703
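// The FP environment is modeled as a 64-bit value: the low 32 bits hold the
// MODE register's FP mode field and the high 32 bits hold its trap-control
// field, read and written with s_getreg/s_setreg below.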
7704 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7705 MachineRegisterInfo &MRI,
7706 MachineIRBuilder &B) const {
7707 Register Src = MI.getOperand(0).getReg();
7708 if (MRI.getType(Src) != S64)
7709 return false;
7710
7711 auto ModeReg =
7712 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7713 /*HasSideEffects=*/true, /*isConvergent=*/false)
7714 .addImm(FPEnvModeBitField);
7715 auto TrapReg =
7716 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7717 /*HasSideEffects=*/true, /*isConvergent=*/false)
7718 .addImm(FPEnvTrapBitField);
7719 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7720 MI.eraseFromParent();
7721 return true;
7722}
7723
7724 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7725 MachineRegisterInfo &MRI,
7726 MachineIRBuilder &B) const {
7727 Register Src = MI.getOperand(0).getReg();
7728 if (MRI.getType(Src) != S64)
7729 return false;
7730
7731 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7732 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7733 /*HasSideEffects=*/true, /*isConvergent=*/false)
7734 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7735 .addReg(Unmerge.getReg(0));
7736 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7737 /*HasSideEffects=*/true, /*isConvergent=*/false)
7738 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7739 .addReg(Unmerge.getReg(1));
7740 MI.eraseFromParent();
7741 return true;
7742}
7743
7744 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7745 MachineInstr &MI) const {
7746 MachineIRBuilder &B = Helper.MIRBuilder;
7747 MachineRegisterInfo &MRI = *B.getMRI();
7748
7749 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
7750 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7751 switch (IntrID) {
7752 case Intrinsic::amdgcn_if:
7753 case Intrinsic::amdgcn_else: {
7754 MachineInstr *Br = nullptr;
7755 MachineBasicBlock *UncondBrTarget = nullptr;
7756 bool Negated = false;
7757 if (MachineInstr *BrCond =
7758 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7759 const SIRegisterInfo *TRI
7760 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7761
7762 Register Def = MI.getOperand(1).getReg();
7763 Register Use = MI.getOperand(3).getReg();
7764
7765 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7766
7767 if (Negated)
7768 std::swap(CondBrTarget, UncondBrTarget);
7769
7770 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7771 if (IntrID == Intrinsic::amdgcn_if) {
7772 B.buildInstr(AMDGPU::SI_IF)
7773 .addDef(Def)
7774 .addUse(Use)
7775 .addMBB(UncondBrTarget);
7776 } else {
7777 B.buildInstr(AMDGPU::SI_ELSE)
7778 .addDef(Def)
7779 .addUse(Use)
7780 .addMBB(UncondBrTarget);
7781 }
7782
7783 if (Br) {
7784 Br->getOperand(0).setMBB(CondBrTarget);
7785 } else {
7786 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7787 // since we're swapping branch targets it needs to be reinserted.
7788 // FIXME: IRTranslator should probably not do this
7789 B.buildBr(*CondBrTarget);
7790 }
7791
7792 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7793 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7794 MI.eraseFromParent();
7795 BrCond->eraseFromParent();
7796 return true;
7797 }
7798
7799 return false;
7800 }
7801 case Intrinsic::amdgcn_loop: {
7802 MachineInstr *Br = nullptr;
7803 MachineBasicBlock *UncondBrTarget = nullptr;
7804 bool Negated = false;
7805 if (MachineInstr *BrCond =
7806 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7807 const SIRegisterInfo *TRI
7808 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7809
7810 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7811 Register Reg = MI.getOperand(2).getReg();
7812
7813 if (Negated)
7814 std::swap(CondBrTarget, UncondBrTarget);
7815
7816 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7817 B.buildInstr(AMDGPU::SI_LOOP)
7818 .addUse(Reg)
7819 .addMBB(UncondBrTarget);
7820
7821 if (Br)
7822 Br->getOperand(0).setMBB(CondBrTarget);
7823 else
7824 B.buildBr(*CondBrTarget);
7825
7826 MI.eraseFromParent();
7827 BrCond->eraseFromParent();
7828 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7829 return true;
7830 }
7831
7832 return false;
7833 }
7834 case Intrinsic::amdgcn_addrspacecast_nonnull:
7835 return legalizeAddrSpaceCast(MI, MRI, B);
7836 case Intrinsic::amdgcn_make_buffer_rsrc:
7838 case Intrinsic::amdgcn_kernarg_segment_ptr:
7839 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
7840 // This only makes sense to call in a kernel, so just lower to null.
7841 B.buildConstant(MI.getOperand(0).getReg(), 0);
7842 MI.eraseFromParent();
7843 return true;
7844 }
7845
7848 case Intrinsic::amdgcn_implicitarg_ptr:
7849 return legalizeImplicitArgPtr(MI, MRI, B);
7850 case Intrinsic::amdgcn_workitem_id_x:
7851 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7853 case Intrinsic::amdgcn_workitem_id_y:
7854 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7856 case Intrinsic::amdgcn_workitem_id_z:
7857 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7859 case Intrinsic::amdgcn_workgroup_id_x:
7860 return legalizeWorkGroupId(
7864 case Intrinsic::amdgcn_workgroup_id_y:
7865 return legalizeWorkGroupId(
7869 case Intrinsic::amdgcn_workgroup_id_z:
7870 return legalizeWorkGroupId(
7874 case Intrinsic::amdgcn_cluster_id_x:
7875 return ST.hasClusters() &&
7878 case Intrinsic::amdgcn_cluster_id_y:
7879 return ST.hasClusters() &&
7882 case Intrinsic::amdgcn_cluster_id_z:
7883 return ST.hasClusters() &&
7886 case Intrinsic::amdgcn_cluster_workgroup_id_x:
7887 return ST.hasClusters() &&
7890 case Intrinsic::amdgcn_cluster_workgroup_id_y:
7891 return ST.hasClusters() &&
7894 case Intrinsic::amdgcn_cluster_workgroup_id_z:
7895 return ST.hasClusters() &&
7898 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7899 return ST.hasClusters() &&
7901 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7902 return ST.hasClusters() &&
7905 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7906 return ST.hasClusters() &&
7909 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7910 return ST.hasClusters() &&
7913 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7914 return ST.hasClusters() &&
7916 MI, MRI, B,
7918 case Intrinsic::amdgcn_wave_id:
7919 return legalizeWaveID(MI, B);
7920 case Intrinsic::amdgcn_lds_kernel_id:
7923 case Intrinsic::amdgcn_dispatch_ptr:
7926 case Intrinsic::amdgcn_queue_ptr:
7929 case Intrinsic::amdgcn_implicit_buffer_ptr:
7932 case Intrinsic::amdgcn_dispatch_id:
7935 case Intrinsic::r600_read_ngroups_x:
7936 // TODO: Emit error for hsa
7939 case Intrinsic::r600_read_ngroups_y:
7942 case Intrinsic::r600_read_ngroups_z:
7945 case Intrinsic::r600_read_local_size_x:
7946 // TODO: Could insert G_ASSERT_ZEXT from s16
7948 case Intrinsic::r600_read_local_size_y:
7949 // TODO: Could insert G_ASSERT_ZEXT from s16
7951 // TODO: Could insert G_ASSERT_ZEXT from s16
7952 case Intrinsic::r600_read_local_size_z:
7955 case Intrinsic::amdgcn_fdiv_fast:
7956 return legalizeFDIVFastIntrin(MI, MRI, B);
7957 case Intrinsic::amdgcn_is_shared:
7959 case Intrinsic::amdgcn_is_private:
7961 case Intrinsic::amdgcn_wavefrontsize: {
7962 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7963 MI.eraseFromParent();
7964 return true;
7965 }
7966 case Intrinsic::amdgcn_s_buffer_load:
7967 return legalizeSBufferLoad(Helper, MI);
7968 case Intrinsic::amdgcn_raw_buffer_store:
7969 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7970 case Intrinsic::amdgcn_struct_buffer_store:
7971 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7972 return legalizeBufferStore(MI, Helper, false, false);
7973 case Intrinsic::amdgcn_raw_buffer_store_format:
7974 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7975 case Intrinsic::amdgcn_struct_buffer_store_format:
7976 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7977 return legalizeBufferStore(MI, Helper, false, true);
7978 case Intrinsic::amdgcn_raw_tbuffer_store:
7979 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7980 case Intrinsic::amdgcn_struct_tbuffer_store:
7981 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7982 return legalizeBufferStore(MI, Helper, true, true);
7983 case Intrinsic::amdgcn_raw_buffer_load:
7984 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7985 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7986 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7987 case Intrinsic::amdgcn_struct_buffer_load:
7988 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7989 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7990 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7991 return legalizeBufferLoad(MI, Helper, false, false);
7992 case Intrinsic::amdgcn_raw_buffer_load_format:
7993 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7994 case Intrinsic::amdgcn_struct_buffer_load_format:
7995 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7996 return legalizeBufferLoad(MI, Helper, true, false);
7997 case Intrinsic::amdgcn_raw_tbuffer_load:
7998 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7999 case Intrinsic::amdgcn_struct_tbuffer_load:
8000 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8001 return legalizeBufferLoad(MI, Helper, true, true);
8002 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8003 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8004 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8005 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8006 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8007 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8008 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8009 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8010 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8011 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8012 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8013 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8014 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8015 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8016 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8017 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8018 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8019 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8020 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8021 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8022 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8023 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8024 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8025 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8026 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8027 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8028 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8029 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8030 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8031 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8032 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8033 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8034 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8035 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8036 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8037 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8038 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8039 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8040 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8041 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8042 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8043 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8044 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8045 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8046 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8047 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8048 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8049 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8050 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8051 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8052 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8053 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8054 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8055 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8056 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8057 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8058 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8059 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8060 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8061 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8062 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8063 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8064 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8065 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8066 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8067 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8068 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8069 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8070 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8072 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8074 return legalizeBufferAtomic(MI, B, IntrID);
8075 case Intrinsic::amdgcn_rsq_clamp:
8077 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8079 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8080 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8082 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8083 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8084 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8085 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8086 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8087 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8088 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8089 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
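// These SWMMAC variants expect a 64-bit index operand; widen a narrower
// index with G_ANYEXT so the type matches.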
8090 Register Index = MI.getOperand(5).getReg();
8091 LLT S64 = LLT::scalar(64);
8092 if (MRI.getType(Index) != S64)
8093 MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
8094 return true;
8095 }
8096 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8097 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8098 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8099 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8100 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8101 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8102 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8103 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8104 Register Index = MI.getOperand(5).getReg();
8105 LLT S32 = LLT::scalar(32);
8106 if (MRI.getType(Index) != S32)
8107 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8108 return true;
8109 }
8110 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8111 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8112 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8113 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8114 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8115 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8116 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8117 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8118 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8119 Register Index = MI.getOperand(7).getReg();
8120 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8121 ? LLT::scalar(64)
8122 : LLT::scalar(32);
8123 if (MRI.getType(Index) != IdxTy)
8124 MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));
8125 return true;
8126 }
8127
8128 case Intrinsic::amdgcn_fmed3: {
8129 GISelChangeObserver &Observer = Helper.Observer;
8130
8131 // FIXME: This is to work around the inability of tablegen match combiners to
8132 // match intrinsics in patterns.
8133 Observer.changingInstr(MI);
8134 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8135 MI.removeOperand(1);
8136 Observer.changedInstr(MI);
8137 return true;
8138 }
8139 case Intrinsic::amdgcn_readlane:
8140 case Intrinsic::amdgcn_writelane:
8141 case Intrinsic::amdgcn_readfirstlane:
8142 case Intrinsic::amdgcn_permlane16:
8143 case Intrinsic::amdgcn_permlanex16:
8144 case Intrinsic::amdgcn_permlane64:
8145 case Intrinsic::amdgcn_set_inactive:
8146 case Intrinsic::amdgcn_set_inactive_chain_arg:
8147 case Intrinsic::amdgcn_mov_dpp8:
8148 case Intrinsic::amdgcn_update_dpp:
8149 return legalizeLaneOp(Helper, MI, IntrID);
8150 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8151 return legalizeSBufferPrefetch(Helper, MI);
8152 case Intrinsic::amdgcn_dead: {
8153 // TODO: Use poison instead of undef
8154 for (const MachineOperand &Def : MI.defs())
8155 B.buildUndef(Def);
8156 MI.eraseFromParent();
8157 return true;
8158 }
8159 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8160 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8161 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8162 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8163 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8164 MI.eraseFromParent();
8165 return true;
8166 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8167 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8168 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8169 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8170 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8171 MI.eraseFromParent();
8172 return true;
8173 default: {
8174 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8176 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8177 return true;
8178 }
8179 }
8180
8181 return true;
8182}
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1140
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1120
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
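A minimal usage sketch (not code from this file; the function name is illustrative) showing how the APFloat factories listed above are commonly combined:

#include "llvm/ADT/APFloat.h"

using namespace llvm;

// Materialize a few special single-precision constants.
void buildSpecialFloats() {
  const fltSemantics &Sem = APFloat::IEEEsingle();
  APFloat PosInf = APFloat::getInf(Sem);                    // +infinity
  APFloat NegInf = APFloat::getInf(Sem, /*Negative=*/true); // -infinity
  APFloat Largest = APFloat::getLargest(Sem);               // largest finite value
  APFloat SmallestNormal = APFloat::getSmallestNormalized(Sem);
  (void)PosInf; (void)NegInf; (void)Largest; (void)SmallestNormal;
}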
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
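A hedged sketch (not code from this file) of how these CmpInst predicates are fed to MachineIRBuilder::buildICmp/buildFCmp when an operation is expanded into explicit compares; the helper names are illustrative, the builder calls are standard MachineIRBuilder API:

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Emit a signed integer "X < Y" compare producing an s1 result.
static Register emitSignedLess(MachineIRBuilder &B, Register X, Register Y) {
  return B.buildICmp(CmpInst::ICMP_SLT, LLT::scalar(1), X, Y).getReg(0);
}

// Emit an ordered floating-point "X < Y" compare producing an s1 result.
static Register emitOrderedLess(MachineIRBuilder &B, Register X, Register Y) {
  return B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Y).getReg(0);
}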
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the type of this constant.
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
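A hedged sketch (illustrative only, not code from this file) of how the LLT helpers above compose:

#include "llvm/CodeGenTypes/LowLevelType.h"
#include <cassert>

using namespace llvm;

void lltExamples() {
  LLT S16 = LLT::scalar(16);            // 16-bit scalar
  LLT V3S16 = LLT::fixed_vector(3, 16); // <3 x s16>, 48 bits total

  // Pad to an even element count: <3 x s16> -> <4 x s16>.
  LLT V4S16 = V3S16.changeElementCount(ElementCount::getFixed(4));

  // Widen each element instead: <3 x s16> -> <3 x s32>.
  LLT V3S32 = V3S16.changeElementSize(32);

  assert(S16.getSizeInBits() == 16);
  assert(V4S16.getSizeInBits() == 64);
  assert(V3S32.getScalarSizeInBits() == 32 && V3S32.getNumElements() == 3);
}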
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & lowerIf(LegalityPredicate Predicate)
The instruction is lowered if predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1).
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most MaxTy's number of elements.
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with additional vector elements and extracting the result elements.
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from CastTy.
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's type to WideTy using the specified TruncOpcode for the conversion.
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
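A hedged sketch of the usual shape of these rule-building calls in a target's LegalizerInfo constructor; the opcode and the concrete bounds are invented for illustration, and only LegalizeRuleSet methods documented above are used:

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

struct ExampleLegalizerInfo : public LegalizerInfo {
  ExampleLegalizerInfo() {
    const LLT S16 = LLT::scalar(16);
    const LLT S32 = LLT::scalar(32);
    const LLT S64 = LLT::scalar(64);

    getActionDefinitionsBuilder(TargetOpcode::G_ADD)
        .legalFor({S32, S64})         // directly selectable widths
        .clampScalar(0, S16, S64)     // keep type index 0 within [s16, s64]
        .widenScalarToNextPow2(0, 32) // round odd widths up, to at least s32
        .scalarize(0);                // break up any remaining vectors

    getLegacyLegalizerInfo().computeTables();
  }
};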
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock. Use this instead of 'new MachineBasicBlock'.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition ArrayRef.h:298
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:387
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
unsigned getPointerSizeInBits(unsigned AS) const
TargetOptions Options
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command line.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
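A hedged sketch pairing these address-space enumerators with LLT::pointer; the bit widths reflect the AMDGPU data layout (64-bit flat/global pointers, 32-bit local and private pointers), and AMDGPUAS comes from the backend headers:

#include "llvm/CodeGenTypes/LowLevelType.h"
// AMDGPUAS is declared in the AMDGPU backend headers (AMDGPU.h / AMDGPUAddrSpace.h).

using namespace llvm;

void amdgpuPointerTypes() {
  LLT FlatPtr    = LLT::pointer(AMDGPUAS::FLAT_ADDRESS, 64);    // p0
  LLT GlobalPtr  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);  // p1
  LLT LocalPtr   = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);   // p3
  LLT PrivatePtr = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32); // p5
  (void)FlatPtr; (void)GlobalPtr; (void)LocalPtr; (void)PrivatePtr;
}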
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX1250(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
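A hedged sketch (illustrative condition, not a rule from this file) of how the LegalityPredicates and LegalizeMutations combinators above are paired inside a rule:

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"

using namespace llvm;
using namespace LegalityPredicates;
using namespace LegalizeMutations;

static void addExampleRule(LegalizeRuleSet &Rules) {
  // If type index 0 is a scalar narrower than 32 bits, widen it to s32.
  Rules.widenScalarIf(
      all(isScalar(0), scalarNarrowerThan(0, 32)),
      changeTo(0, LLT::scalar(32)));
}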
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:922
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:2041
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:654
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:462
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition Utils.cpp:315
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1726
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
Definition Utils.cpp:434
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
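A short sketch giving concrete values for the MathExtras/bit helpers listed above, in particular the ">= " versus "strictly greater" difference between PowerOf2Ceil and NextPowerOf2:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

using namespace llvm;

void bitMathExamples() {
  assert(Log2_32_Ceil(5) == 3);   // ceil(log2(5))
  assert(PowerOf2Ceil(8) == 8);   // ">= the given value" semantics
  assert(NextPowerOf2(8) == 16);  // strictly greater than the given value
  assert(divideCeil(7, 3) == 3);  // integer ceiling division
  assert(isPowerOf2_32(64) && !isPowerOf2_32(0));
  assert(countr_zero(8u) == 3);   // trailing zero bits
  assert(bit_floor(10u) == 8u);   // largest power of two <= 10
}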
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
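A short sketch with concrete values for the Align struct and the alignTo/Log2/commonAlignment helpers listed earlier in this index:

#include "llvm/Support/Alignment.h"
#include <cassert>

using namespace llvm;

void alignmentExamples() {
  Align A(16);
  assert(A.value() == 16);
  assert(Log2(A) == 4);                             // log2 of the alignment
  assert(alignTo(10, Align(8)) == 16);              // round 10 up to a multiple of 8
  assert(commonAlignment(Align(8), 4) == Align(4)); // alignment known after a +4 offset
}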
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
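A hedged sketch (illustrative only) of the DenormalMode factory helpers above:

#include "llvm/ADT/FloatingPointMode.h"

using namespace llvm;

void denormalModeExamples() {
  DenormalMode IEEE = DenormalMode::getIEEE();         // denormal inputs/outputs preserved
  DenormalMode PS = DenormalMode::getPreserveSign();   // flushed to zero, sign preserved
  bool FlushesInputs = (PS.Input == DenormalMode::PreserveSign);
  (void)IEEE; (void)FlushesInputs;
}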
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.