LLVM 23.0.0git
AMDGPULegalizerInfo.cpp
Go to the documentation of this file.
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(false),
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 ElementCount::getFixed(NewNumElts), EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
170 };
171}
172
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
240}
241
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up to the maximum register size, and
266// multiples of v2s16.
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::scalar(32); // TODO: Expected float32
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::scalar(64); // TODO: Expected float64
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
322// TODO: Expected LLT::fixed_vector(2, LLT::float16())
324constexpr LLT V2BF16 = V2F16; // FIXME
325
326constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
327constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
328constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
329constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
330constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
331constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
332constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
333constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
334constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
335constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
336constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
337constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
338constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
339
340constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
341constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
342constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
343constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
344constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
345constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
346constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
347constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
348
349constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
350constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
351
352constexpr std::initializer_list<LLT> AllScalarTypes = {
354
355constexpr std::initializer_list<LLT> AllS16Vectors{
357
358constexpr std::initializer_list<LLT> AllS32Vectors = {
361
362constexpr std::initializer_list<LLT> AllS64Vectors = {
364
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
375
378 (ST.useRealTrue16Insts() && Ty == S16) ||
380}
381
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32-bits, we need to reduce to a 32-bit type.
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32-bits and mem location is a power of 2
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
438
439static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
440 const LegalityQuery &Query) {
441 const LLT Ty = Query.Types[0];
442
443 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
444 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
445
446 unsigned RegSize = Ty.getSizeInBits();
447 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
448 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
449 unsigned AS = Query.Types[1].getAddressSpace();
450
451 // All of these need to be custom lowered to cast the pointer operand.
453 return false;
454
455 // Do not handle extending vector loads.
456 if (Ty.isVector() && MemSize != RegSize)
457 return false;
458
459 // TODO: We should be able to widen loads if the alignment is high enough, but
460 // we also need to modify the memory access size.
461#if 0
462 // Accept widening loads based on alignment.
463 if (IsLoad && MemSize < Size)
464 MemSize = std::max(MemSize, Align);
465#endif
466
467 // Only 1-byte and 2-byte to 32-bit extloads are valid.
468 if (MemSize != RegSize && RegSize != 32)
469 return false;
470
471 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
472 Query.MMODescrs[0].Ordering !=
474 return false;
475
476 switch (MemSize) {
477 case 8:
478 case 16:
479 case 32:
480 case 64:
481 case 128:
482 break;
483 case 96:
484 if (!ST.hasDwordx3LoadStores())
485 return false;
486 break;
487 case 256:
488 case 512:
489 // These may contextually need to be broken down.
490 break;
491 default:
492 return false;
493 }
494
495 assert(RegSize >= MemSize);
496
497 if (AlignBits < MemSize) {
498 const SITargetLowering *TLI = ST.getTargetLowering();
499 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
500 Align(AlignBits / 8)))
501 return false;
502 }
503
504 return true;
505}
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510// arguments. Therefore, we convert resource pointers - or vectors of them
511// to integer values here.
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
523// workaround this. Eventually it should ignore the type for loads and only care
524// about the size. Return true in cases where we will workaround this for now by
525// bitcasting.
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(Ty.getElementType());
567}
568
569/// Return true if we should legalize a load by widening an odd sized memory
570/// access up to the alignment. Note this is the case when the memory access itself
571/// changes, not the size of the result register.
572static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
573 uint64_t AlignInBits, unsigned AddrSpace,
574 unsigned Opcode) {
575 unsigned SizeInBits = MemoryTy.getSizeInBits();
576 // We don't want to widen cases that are naturally legal.
577 if (isPowerOf2_32(SizeInBits))
578 return false;
579
580 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
581 // end up widening these for a scalar load during RegBankSelect, if we don't
582 // have 96-bit scalar loads.
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
584 return false;
585
586 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
587 return false;
588
589 // A load is known dereferenceable up to the alignment, so it's legal to widen
590 // to it.
591 //
592 // TODO: Could check dereferenceable for less aligned cases.
593 unsigned RoundedSize = NextPowerOf2(SizeInBits);
594 if (AlignInBits < RoundedSize)
595 return false;
596
597 // Do not widen if it would introduce a slow unaligned load.
598 const SITargetLowering *TLI = ST.getTargetLowering();
599 unsigned Fast = 0;
601 RoundedSize, AddrSpace, Align(AlignInBits / 8),
603 Fast;
604}
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
612 Query.MMODescrs[0].AlignInBits,
613 Query.Types[1].getAddressSpace(), Opcode);
614}
615
616/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
617/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
618/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
620 MachineRegisterInfo &MRI, unsigned Idx) {
621 MachineOperand &MO = MI.getOperand(Idx);
622
623 const LLT PointerTy = MRI.getType(MO.getReg());
624
625 // Paranoidly prevent us from doing this multiple times.
627 return PointerTy;
628
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 if (!PointerTy.isVector()) {
632 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
633 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
634 const LLT S32 = LLT::scalar(32);
635
636 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
639 for (unsigned I = 0; I < NumParts; ++I)
640 VectorElems[I] =
641 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
643 MO.setReg(VectorReg);
644 return VectorTy;
645 }
646 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
647 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
648 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
650 MO.setReg(BitcastReg);
651
652 return VectorTy;
653}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Unmerged.getReg(I));
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
674 }
675 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
677}
678
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
686 return;
688}
689
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
736
738
739 // s1 for VCC branches, s32 for SCC branches.
741
742 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
743 // elements for v3s16
746 .legalFor(AllS32Vectors)
748 .legalFor(AddrSpaces64)
749 .legalFor(AddrSpaces32)
750 .legalFor(AddrSpaces128)
751 .legalIf(isPointer(0))
752 .clampScalar(0, S16, S256)
754 .clampMaxNumElements(0, S32, 16)
756 .scalarize(0);
757
758 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
759 // Full set of gfx9 features.
760 if (ST.hasScalarAddSub64()) {
761 getActionDefinitionsBuilder({G_ADD, G_SUB})
762 .legalFor({S64, S32, S16, V2S16})
763 .clampMaxNumElementsStrict(0, S16, 2)
764 .scalarize(0)
765 .minScalar(0, S16)
767 .maxScalar(0, S32);
768 } else {
769 getActionDefinitionsBuilder({G_ADD, G_SUB})
770 .legalFor({S32, S16, V2S16})
771 .clampMaxNumElementsStrict(0, S16, 2)
772 .scalarize(0)
773 .minScalar(0, S16)
775 .maxScalar(0, S32);
776 }
777
778 if (ST.hasScalarSMulU64()) {
780 .legalFor({S64, S32, S16, V2S16})
781 .clampMaxNumElementsStrict(0, S16, 2)
782 .scalarize(0)
783 .minScalar(0, S16)
785 .custom();
786 } else {
788 .legalFor({S32, S16, V2S16})
789 .clampMaxNumElementsStrict(0, S16, 2)
790 .scalarize(0)
791 .minScalar(0, S16)
793 .custom();
794 }
795 assert(ST.hasMad64_32());
796
797 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
798 .legalFor({S32, S16, V2S16}) // Clamp modifier
799 .minScalarOrElt(0, S16)
801 .scalarize(0)
803 .lower();
804 } else if (ST.has16BitInsts()) {
805 getActionDefinitionsBuilder({G_ADD, G_SUB})
806 .legalFor({S32, S16})
807 .minScalar(0, S16)
809 .maxScalar(0, S32)
810 .scalarize(0);
811
813 .legalFor({S32, S16})
814 .scalarize(0)
815 .minScalar(0, S16)
817 .custom();
818 assert(ST.hasMad64_32());
819
820 // Technically the saturating operations require clamp bit support, but this
821 // was introduced at the same time as 16-bit operations.
822 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
823 .legalFor({S32, S16}) // Clamp modifier
824 .minScalar(0, S16)
825 .scalarize(0)
827 .lower();
828
829 // We're just lowering this, but it helps get a better result to try to
830 // coerce to the desired type first.
831 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
832 .minScalar(0, S16)
833 .scalarize(0)
834 .lower();
835 } else {
836 getActionDefinitionsBuilder({G_ADD, G_SUB})
837 .legalFor({S32})
838 .widenScalarToNextMultipleOf(0, 32)
839 .clampScalar(0, S32, S32)
840 .scalarize(0);
841
842 auto &Mul = getActionDefinitionsBuilder(G_MUL)
843 .legalFor({S32})
844 .scalarize(0)
845 .minScalar(0, S32)
847
848 if (ST.hasMad64_32())
849 Mul.custom();
850 else
851 Mul.maxScalar(0, S32);
852
853 if (ST.hasIntClamp()) {
854 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
855 .legalFor({S32}) // Clamp modifier.
856 .scalarize(0)
858 .lower();
859 } else {
860 // Clamp bit support was added in VI, along with 16-bit operations.
861 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
862 .minScalar(0, S32)
863 .scalarize(0)
864 .lower();
865 }
866
867 // FIXME: DAG expansion gets better results. The widening uses the smaller
868 // range values and goes for the min/max lowering directly.
869 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
870 .minScalar(0, S32)
871 .scalarize(0)
872 .lower();
873 }
874
876 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
877 .customFor({S32, S64})
878 .clampScalar(0, S32, S64)
880 .scalarize(0);
881
882 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
883 .legalFor({S32})
884 .maxScalar(0, S32);
885
886 if (ST.hasVOP3PInsts()) {
887 Mulh
888 .clampMaxNumElements(0, S8, 2)
889 .lowerFor({V2S8});
890 }
891
892 Mulh
893 .scalarize(0)
894 .lower();
895
896 // Report legal for any types we can handle anywhere. For the cases only legal
897 // on the SALU, RegBankSelect will be able to re-legalize.
898 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
899 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
900 .clampScalar(0, S32, S64)
906 .scalarize(0);
907
909 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
910 .legalFor({{S32, S1}, {S32, S32}})
911 .clampScalar(0, S32, S32)
912 .scalarize(0);
913
915 // Don't worry about the size constraint.
917 .lower();
918
920 .legalFor({S1, S32, S64, S16, GlobalPtr,
921 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
922 .legalIf(isPointer(0))
923 .clampScalar(0, S32, S64)
925
926 getActionDefinitionsBuilder(G_FCONSTANT)
927 .legalFor({S32, S64, S16})
928 .clampScalar(0, S16, S64);
929
930 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
931 .legalIf(isRegisterClassType(ST, 0))
932 // s1 and s16 are special cases because they have legal operations on
933 // them, but don't really occupy registers in the normal way.
934 .legalFor({S1, S16})
935 .clampNumElements(0, V16S32, V32S32)
939 .clampMaxNumElements(0, S32, 16);
940
941 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
942
943 // If the amount is divergent, we have to do a wave reduction to get the
944 // maximum value, so this is expanded during RegBankSelect.
945 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
946 .legalFor({{PrivatePtr, S32}});
947
948 getActionDefinitionsBuilder(G_STACKSAVE)
949 .customFor({PrivatePtr});
950 getActionDefinitionsBuilder(G_STACKRESTORE)
951 .legalFor({PrivatePtr});
952
953 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
954
955 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
956 .customIf(typeIsNot(0, PrivatePtr));
957
958 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
959
960 auto &FPOpActions = getActionDefinitionsBuilder(
961 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
962 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
963 .legalFor({S32, S64});
964 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
965 .customFor({S32, S64});
966 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
967 .customFor({S32, S64});
968
969 if (ST.has16BitInsts()) {
970 if (ST.hasVOP3PInsts())
971 FPOpActions.legalFor({S16, V2S16});
972 else
973 FPOpActions.legalFor({S16});
974
975 TrigActions.customFor({S16});
976 FDIVActions.customFor({S16});
977 }
978
979 if (ST.hasPackedFP32Ops()) {
980 FPOpActions.legalFor({V2S32});
981 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
982 }
983
984 auto &MinNumMaxNumIeee =
985 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
986
987 if (ST.hasVOP3PInsts()) {
988 MinNumMaxNumIeee.legalFor(FPTypesPK16)
989 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
990 .clampMaxNumElements(0, S16, 2)
991 .clampScalar(0, S16, S64)
992 .scalarize(0);
993 } else if (ST.has16BitInsts()) {
994 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
995 } else {
996 MinNumMaxNumIeee.legalFor(FPTypesBase)
997 .clampScalar(0, S32, S64)
998 .scalarize(0);
999 }
1000
1001 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1002 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1003
1004 if (ST.hasVOP3PInsts()) {
1005 MinNumMaxNum.customFor(FPTypesPK16)
1006 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1007 .clampMaxNumElements(0, S16, 2)
1008 .clampScalar(0, S16, S64)
1009 .scalarize(0);
1010 } else if (ST.has16BitInsts()) {
1011 MinNumMaxNum.customFor(FPTypes16)
1012 .clampScalar(0, S16, S64)
1013 .scalarize(0);
1014 } else {
1015 MinNumMaxNum.customFor(FPTypesBase)
1016 .clampScalar(0, S32, S64)
1017 .scalarize(0);
1018 }
1019
1020 if (ST.hasVOP3PInsts())
1021 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1022
1023 FPOpActions
1024 .scalarize(0)
1025 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1026
1027 TrigActions
1028 .scalarize(0)
1029 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1030
1031 FDIVActions
1032 .scalarize(0)
1033 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1034
1035 auto &FNegAbs = getActionDefinitionsBuilder({G_FNEG, G_FABS});
1036 FNegAbs.legalFor(FPTypesPK16)
1037 .legalFor(ST.hasPackedFP32Ops(), {V2S32})
1039 if (ST.hasPackedFP32Ops())
1040 FNegAbs.clampMaxNumElementsStrict(0, S32, 2);
1041 FNegAbs.scalarize(0).clampScalar(0, S16, S64);
1042
1043 if (ST.has16BitInsts()) {
1045 .legalFor({S16})
1046 .customFor({S32, S64})
1047 .scalarize(0)
1048 .unsupported();
1050 .legalFor({S32, S64, S16})
1051 .scalarize(0)
1052 .clampScalar(0, S16, S64);
1053
1054 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1055 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1056 .scalarize(0)
1057 .maxScalarIf(typeIs(0, S16), 1, S16)
1058 .clampScalar(1, S32, S32)
1059 .lower();
1060
1062 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1063 .scalarize(0)
1064 .lower();
1065
1067 .lowerFor({S16, S32, S64})
1068 .scalarize(0)
1069 .lower();
1070 } else {
1072 .customFor({S32, S64, S16})
1073 .scalarize(0)
1074 .unsupported();
1075
1076
1077 if (ST.hasFractBug()) {
1079 .customFor({S64})
1080 .legalFor({S32, S64})
1081 .scalarize(0)
1082 .clampScalar(0, S32, S64);
1083 } else {
1085 .legalFor({S32, S64})
1086 .scalarize(0)
1087 .clampScalar(0, S32, S64);
1088 }
1089
1090 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1091 .legalFor({{S32, S32}, {S64, S32}})
1092 .scalarize(0)
1093 .clampScalar(0, S32, S64)
1094 .clampScalar(1, S32, S32)
1095 .lower();
1096
1098 .customFor({{S32, S32}, {S64, S32}})
1099 .scalarize(0)
1100 .minScalar(0, S32)
1101 .clampScalar(1, S32, S32)
1102 .lower();
1103
1105 .lowerFor({S32, S64})
1106 .scalarize(0)
1107 .lower();
1108 }
1109
1110 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1111 if (ST.hasCvtPkF16F32Inst()) {
1112 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1113 .clampMaxNumElements(0, S16, 2);
1114 } else {
1115 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1116 }
1117 FPTruncActions.scalarize(0).lower();
1118
1120 .legalFor({{S64, S32}, {S32, S16}})
1121 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1122 .scalarize(0);
1123
1124 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1125 if (ST.has16BitInsts()) {
1126 FSubActions
1127 // Use actual fsub instruction
1128 .legalFor({S32, S16})
1129 // Must use fadd + fneg
1130 .lowerFor({S64, V2S16});
1131 } else {
1132 FSubActions
1133 // Use actual fsub instruction
1134 .legalFor({S32})
1135 // Must use fadd + fneg
1136 .lowerFor({S64, S16, V2S16});
1137 }
1138
1139 if (ST.hasPackedFP32Ops())
1140 FSubActions.lowerFor({V2S32}).clampMaxNumElements(0, S32, 2);
1141
1142 FSubActions
1143 .clampMaxNumElements(0, S16, 2)
1144 .scalarize(0)
1145 .clampScalar(0, S32, S64);
1146
1147 // Whether this is legal depends on the floating point mode for the function.
1148 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1149 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1150 FMad.customFor({S32, S16});
1151 else if (ST.hasMadMacF32Insts())
1152 FMad.customFor({S32});
1153 else if (ST.hasMadF16())
1154 FMad.customFor({S16});
1155 FMad.scalarize(0)
1156 .lower();
1157
1158 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1159 if (ST.has16BitInsts()) {
1160 FRem.customFor({S16, S32, S64});
1161 } else {
1162 FRem.minScalar(0, S32)
1163 .customFor({S32, S64});
1164 }
1165 FRem.scalarize(0);
1166
1167 // TODO: Do we need to clamp maximum bitwidth?
1169 .legalIf(isScalar(0))
1170 .legalFor({{V2S16, V2S32}})
1171 .clampMaxNumElements(0, S16, 2)
1172 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1173 // situations (like an invalid implicit use), we don't want to infinite loop
1174 // in the legalizer.
1176 .alwaysLegal();
1177
1178 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1179 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1180 {S32, S1}, {S64, S1}, {S16, S1}})
1181 .scalarize(0)
1182 .clampScalar(0, S32, S64)
1183 .widenScalarToNextPow2(1, 32);
1184
1185 // TODO: Split s1->s64 during regbankselect for VALU.
1186 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1187 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1188 .lowerIf(typeIs(1, S1))
1189 .customFor({{S32, S64}, {S64, S64}});
1190 if (ST.has16BitInsts())
1191 IToFP.legalFor({{S16, S16}});
1192 IToFP.clampScalar(1, S32, S64)
1193 .minScalar(0, S32)
1194 .scalarize(0)
1196
1197 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1198 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1199 .customFor({{S64, S32}, {S64, S64}})
1200 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1201 if (ST.has16BitInsts())
1202 FPToI.legalFor({{S16, S16}});
1203 else
1204 FPToI.minScalar(1, S32);
1205
1206 FPToI.minScalar(0, S32)
1207 .widenScalarToNextPow2(0, 32)
1208 .scalarize(0)
1209 .lower();
1210
1211 // clang-format off
1212 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1213 .legalFor({{S32, S32}, {S32, S64}})
1214 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1215 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1216
1217 // If available, widen width <16 to i16, instead of i32 so v_cvt_i16/u16_f16 can be used.
1218 if (ST.has16BitInsts())
1219 FPToISat.minScalarIf(typeIs(1, S16), 0, S16);
1220
1221 FPToISat.minScalar(1, S32);
1222 FPToISat.minScalar(0, S32)
1223 .widenScalarToNextPow2(0, 32)
1224 .scalarize(0)
1225 .lower();
1226 // clang-format on
1227
1228 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1229 .clampScalar(0, S16, S64)
1230 .scalarize(0)
1231 .lower();
1232
1233 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1234 .legalFor({S16, S32})
1235 .scalarize(0)
1236 .lower();
1237
1238 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1239 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1240 .scalarize(0)
1241 .lower();
1242
1243 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1244 .clampScalar(0, S16, S64)
1245 .scalarize(0)
1246 .lower();
1247
1248 if (ST.has16BitInsts()) {
1249 getActionDefinitionsBuilder(
1250 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1251 .legalFor({S16, S32, S64})
1252 .clampScalar(0, S16, S64)
1253 .scalarize(0);
1254 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1255 getActionDefinitionsBuilder(
1256 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1257 .legalFor({S32, S64})
1258 .clampScalar(0, S32, S64)
1259 .scalarize(0);
1260 } else {
1261 getActionDefinitionsBuilder(
1262 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1263 .legalFor({S32})
1264 .customFor({S64})
1265 .clampScalar(0, S32, S64)
1266 .scalarize(0);
1267 }
1268
1269 getActionDefinitionsBuilder(G_PTR_ADD)
1270 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1271 .legalIf(all(isPointer(0), sameSize(0, 1)))
1272 .scalarize(0)
1273 .scalarSameSizeAs(1, 0);
1274
1275 getActionDefinitionsBuilder(G_PTRMASK)
1276 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1277 .scalarSameSizeAs(1, 0)
1278 .scalarize(0);
1279
1280 auto &CmpBuilder =
1281 getActionDefinitionsBuilder(G_ICMP)
1282 // The compare output type differs based on the register bank of the output,
1283 // so make both s1 and s32 legal.
1284 //
1285 // Scalar compares producing output in scc will be promoted to s32, as that
1286 // is the allocatable register type that will be needed for the copy from
1287 // scc. This will be promoted during RegBankSelect, and we assume something
1288 // before that won't try to use s32 result types.
1289 //
1290 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1291 // bank.
1293 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1294 .legalForCartesianProduct(
1295 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1296 if (ST.has16BitInsts()) {
1297 CmpBuilder.legalFor({{S1, S16}});
1298 }
1299
1300 CmpBuilder
1302 .clampScalar(1, S32, S64)
1303 .scalarize(0)
1304 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1305
1306 auto &FCmpBuilder =
1307 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1308 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1309
1310 if (ST.hasSALUFloatInsts())
1311 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1312
1313 FCmpBuilder
1315 .clampScalar(1, S32, S64)
1316 .scalarize(0);
1317
1318 // FIXME: fpow has a selection pattern that should move to custom lowering.
1319 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1320 if (ST.has16BitInsts())
1321 ExpOps.customFor({{S32}, {S16}});
1322 else
1323 ExpOps.customFor({S32});
1324 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1325 .scalarize(0);
1326
1327 getActionDefinitionsBuilder(G_FPOWI)
1328 .clampScalar(0, MinScalarFPTy, S32)
1329 .lower();
1330
1331 getActionDefinitionsBuilder(G_FLOG2)
1332 .legalFor(ST.has16BitInsts(), {S16})
1333 .customFor({S32, S16})
1334 .scalarize(0)
1335 .lower();
1336
1337 getActionDefinitionsBuilder(G_FEXP2)
1338 .legalFor(ST.has16BitInsts(), {S16})
1339 .customFor({S32, S64, S16})
1340 .scalarize(0)
1341 .lower();
1342
1343 auto &LogOps =
1344 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1345 LogOps.customFor({S32, S16, S64});
1346 LogOps.clampScalar(0, MinScalarFPTy, S32)
1347 .scalarize(0);
1348
1349 // The 64-bit versions produce 32-bit results, but only on the SALU.
1350 getActionDefinitionsBuilder(G_CTPOP)
1351 .legalFor({{S32, S32}, {S32, S64}})
1352 .clampScalar(0, S32, S32)
1353 .widenScalarToNextPow2(1, 32)
1354 .clampScalar(1, S32, S64)
1355 .scalarize(0)
1356 .widenScalarToNextPow2(0, 32);
1357
1358 // If no 16 bit instr is available, lower into different instructions.
1359 if (ST.has16BitInsts())
1360 getActionDefinitionsBuilder(G_IS_FPCLASS)
1361 .legalForCartesianProduct({S1}, FPTypes16)
1362 .widenScalarToNextPow2(1)
1363 .scalarize(0)
1364 .lower();
1365 else
1366 getActionDefinitionsBuilder(G_IS_FPCLASS)
1367 .legalForCartesianProduct({S1}, FPTypesBase)
1368 .lowerFor({S1, S16})
1369 .widenScalarToNextPow2(1)
1370 .scalarize(0)
1371 .lower();
1372
1373 // The hardware instructions return a different result on 0 than the generic
1374 // instructions expect. The hardware produces -1, but these produce the
1375 // bitwidth.
1376 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1377 .scalarize(0)
1378 .clampScalar(0, S32, S32)
1379 .clampScalar(1, S32, S64)
1380 .widenScalarToNextPow2(0, 32)
1381 .widenScalarToNextPow2(1, 32)
1382 .custom();
1383
1384 // The 64-bit versions produce 32-bit results, but only on the SALU.
1385 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1386 .legalFor({{S32, S32}, {S32, S64}})
1387 .customIf(scalarNarrowerThan(1, 32))
1388 .clampScalar(0, S32, S32)
1389 .clampScalar(1, S32, S64)
1390 .scalarize(0)
1391 .widenScalarToNextPow2(0, 32)
1392 .widenScalarToNextPow2(1, 32);
1393
1394 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1395 .legalFor({{S32, S32}, {S32, S64}})
1396 .clampScalar(0, S32, S32)
1397 .clampScalar(1, S32, S64)
1398 .scalarize(0)
1399 .widenScalarToNextPow2(0, 32)
1400 .widenScalarToNextPow2(1, 32);
1401
1402 getActionDefinitionsBuilder(G_CTLS)
1403 .customFor({{S32, S32}})
1404 .scalarize(0)
1405 .clampScalar(0, S32, S32)
1406 .clampScalar(1, S32, S32);
1407
1408 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1409 // RegBankSelect.
1410 getActionDefinitionsBuilder(G_BITREVERSE)
1411 .legalFor({S32, S64})
1412 .clampScalar(0, S32, S64)
1413 .scalarize(0)
1414 .widenScalarToNextPow2(0);
1415
1416 if (ST.has16BitInsts()) {
1417 getActionDefinitionsBuilder(G_BSWAP)
1418 .legalFor({S16, S32, V2S16})
1419 .clampMaxNumElementsStrict(0, S16, 2)
1420 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1421 // narrowScalar limitation.
1422 .widenScalarToNextPow2(0)
1423 .clampScalar(0, S16, S32)
1424 .scalarize(0);
1425
1426 if (ST.hasVOP3PInsts()) {
1427 getActionDefinitionsBuilder(G_ABS)
1428 .legalFor({S32, S16, V2S16})
1429 .clampMaxNumElements(0, S16, 2)
1430 .minScalar(0, S16)
1431 .widenScalarToNextPow2(0)
1432 .scalarize(0)
1433 .lower();
1434 if (ST.hasIntMinMax64()) {
1435 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1436 .legalFor({S32, S16, S64, V2S16})
1437 .clampMaxNumElements(0, S16, 2)
1438 .minScalar(0, S16)
1439 .widenScalarToNextPow2(0)
1440 .scalarize(0)
1441 .lower();
1442 } else {
1443 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1444 .legalFor({S32, S16, V2S16})
1445 .clampMaxNumElements(0, S16, 2)
1446 .minScalar(0, S16)
1447 .widenScalarToNextPow2(0)
1448 .scalarize(0)
1449 .lower();
1450 }
1451 } else {
1452 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1453 .legalFor({S32, S16})
1454 .widenScalarToNextPow2(0)
1455 .minScalar(0, S16)
1456 .scalarize(0)
1457 .lower();
1458 }
1459 } else {
1460 // TODO: Should have same legality without v_perm_b32
1461 getActionDefinitionsBuilder(G_BSWAP)
1462 .legalFor({S32})
1463 .lowerIf(scalarNarrowerThan(0, 32))
1464 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1465 // narrowScalar limitation.
1466 .widenScalarToNextPow2(0)
1467 .maxScalar(0, S32)
1468 .scalarize(0)
1469 .lower();
1470
1471 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1472 .legalFor({S32})
1473 .minScalar(0, S32)
1474 .widenScalarToNextPow2(0)
1475 .scalarize(0)
1476 .lower();
1477 }
1478
1479 getActionDefinitionsBuilder(G_INTTOPTR)
1480 // List the common cases
1481 .legalForCartesianProduct(AddrSpaces64, {S64})
1482 .legalForCartesianProduct(AddrSpaces32, {S32})
1483 .scalarize(0)
1484 // Accept any address space as long as the size matches
1485 .legalIf(sameSize(0, 1))
1486 .widenScalarIf(smallerThan(1, 0),
1487 [](const LegalityQuery &Query) {
1488 return std::pair(
1489 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1490 })
1491 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1492 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1493 });
1494
1495 getActionDefinitionsBuilder(G_PTRTOINT)
1496 // List the common cases
1497 .legalForCartesianProduct(AddrSpaces64, {S64})
1498 .legalForCartesianProduct(AddrSpaces32, {S32})
1499 .scalarize(0)
1500 // Accept any address space as long as the size matches
1501 .legalIf(sameSize(0, 1))
1502 .widenScalarIf(smallerThan(0, 1),
1503 [](const LegalityQuery &Query) {
1504 return std::pair(
1505 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1506 })
1507 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1508 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1509 });
1510
1511 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1512 .scalarize(0)
1513 .custom();
1514
1515 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1516 bool IsLoad) -> bool {
1517 const LLT DstTy = Query.Types[0];
1518
1519 // Split vector extloads.
1520 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1521
1522 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1523 return true;
1524
1525 const LLT PtrTy = Query.Types[1];
1526 unsigned AS = PtrTy.getAddressSpace();
1527 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1528 Query.MMODescrs[0].Ordering !=
1530 return true;
1531
1532 // Catch weird sized loads that don't evenly divide into the access sizes
1533 // TODO: May be able to widen depending on alignment etc.
1534 unsigned NumRegs = (MemSize + 31) / 32;
1535 if (NumRegs == 3) {
1536 if (!ST.hasDwordx3LoadStores())
1537 return true;
1538 } else {
1539 // If the alignment allows, these should have been widened.
1540 if (!isPowerOf2_32(NumRegs))
1541 return true;
1542 }
1543
1544 return false;
1545 };
1546
1547 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1548 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1549 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1550
1551 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1552 // LDS
1553 // TODO: Unsupported flat for SI.
1554
1555 for (unsigned Op : {G_LOAD, G_STORE}) {
1556 const bool IsStore = Op == G_STORE;
1557
1558 auto &Actions = getActionDefinitionsBuilder(Op);
1559 // Explicitly list some common cases.
1560 // TODO: Does this help compile time at all?
1561 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1562 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1563 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1564 {S64, GlobalPtr, S64, GlobalAlign32},
1565 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1566 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1567 {S32, GlobalPtr, S8, GlobalAlign8},
1568 {S32, GlobalPtr, S16, GlobalAlign16},
1569
1570 {S32, LocalPtr, S32, 32},
1571 {S64, LocalPtr, S64, 32},
1572 {V2S32, LocalPtr, V2S32, 32},
1573 {S32, LocalPtr, S8, 8},
1574 {S32, LocalPtr, S16, 16},
1575 {V2S16, LocalPtr, S32, 32},
1576
1577 {S32, PrivatePtr, S32, 32},
1578 {S32, PrivatePtr, S8, 8},
1579 {S32, PrivatePtr, S16, 16},
1580 {V2S16, PrivatePtr, S32, 32},
1581
1582 {S32, ConstantPtr, S32, GlobalAlign32},
1583 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1584 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1585 {S64, ConstantPtr, S64, GlobalAlign32},
1586 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1587
1588 Actions.legalForTypesWithMemDesc(ST.useRealTrue16Insts(), /* Pred */
1589 {{S16, GlobalPtr, S8, GlobalAlign8},
1590 {S16, GlobalPtr, S16, GlobalAlign16},
1591 {S16, LocalPtr, S8, 8},
1592 {S16, LocalPtr, S16, 16},
1593 {S16, PrivatePtr, S8, 8},
1594 {S16, PrivatePtr, S16, 16}});
1595
1596 Actions.legalIf(
1597 [=](const LegalityQuery &Query) -> bool {
1598 return isLoadStoreLegal(ST, Query);
1599 });
1600
1601 // The custom pointers (fat pointers, buffer resources) don't work with load
1602 // and store at this level. Fat pointers should have been lowered to
1603 // intrinsics before the translation to MIR.
1604 Actions.unsupportedIf(
1605 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1606
1607 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1608 // ptrtoint. This is needed to account for the fact that we can't have i128
1609 // as a register class for SelectionDAG reasons.
1610 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1611 return hasBufferRsrcWorkaround(Query.Types[0]);
1612 });
1613
1614 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1615 // 64-bits.
1616 //
1617 // TODO: Should generalize bitcast action into coerce, which will also cover
1618 // inserting addrspacecasts.
1619 Actions.customIf(typeIs(1, Constant32Ptr));
1620
1621 // Turn any illegal element vectors into something easier to deal
1622 // with. These will ultimately produce 32-bit scalar shifts to extract the
1623 // parts anyway.
1624 //
1625 // For odd 16-bit element vectors, prefer to split those into pieces with
1626 // 16-bit vector parts.
1627 Actions.bitcastIf(
1628 [=](const LegalityQuery &Query) -> bool {
1629 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1630 Query.MMODescrs[0].MemoryTy);
1631 }, bitcastToRegisterType(0));
1632
1633 if (!IsStore) {
1634 // Widen suitably aligned loads by loading extra bytes. The standard
1635 // legalization actions can't properly express widening memory operands.
1636 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1637 return shouldWidenLoad(ST, Query, G_LOAD);
1638 });
1639 }
1640
1641 // FIXME: load/store narrowing should be moved to lower action
1642 Actions
1643 .narrowScalarIf(
1644 [=](const LegalityQuery &Query) -> bool {
1645 return !Query.Types[0].isVector() &&
1646 needToSplitMemOp(Query, Op == G_LOAD);
1647 },
1648 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1649 const LLT DstTy = Query.Types[0];
1650 const LLT PtrTy = Query.Types[1];
1651
1652 const unsigned DstSize = DstTy.getSizeInBits();
1653 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1654
1655 // Split extloads.
1656 if (DstSize > MemSize)
1657 return std::pair(0, LLT::scalar(MemSize));
1658
1659 unsigned MaxSize = maxSizeForAddrSpace(
1660 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1661 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1662 if (MemSize > MaxSize)
1663 return std::pair(0, LLT::scalar(MaxSize));
1664
1665 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1666 return std::pair(0, LLT::scalar(Align));
1667 })
1668 .fewerElementsIf(
1669 [=](const LegalityQuery &Query) -> bool {
1670 return Query.Types[0].isVector() &&
1671 needToSplitMemOp(Query, Op == G_LOAD);
1672 },
1673 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1674 const LLT DstTy = Query.Types[0];
1675 const LLT PtrTy = Query.Types[1];
1676
1677 LLT EltTy = DstTy.getElementType();
1678 unsigned MaxSize = maxSizeForAddrSpace(
1679 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1680 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1681
1682 // FIXME: Handle widened to power of 2 results better. This ends
1683 // up scalarizing.
1684 // FIXME: 3 element stores scalarized on SI
1685
1686 // Split if it's too large for the address space.
1687 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1688 if (MemSize > MaxSize) {
1689 unsigned NumElts = DstTy.getNumElements();
1690 unsigned EltSize = EltTy.getSizeInBits();
1691
1692 if (MaxSize % EltSize == 0) {
1693 return std::pair(
1695 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1696 }
1697
1698 unsigned NumPieces = MemSize / MaxSize;
1699
1700 // FIXME: Refine when odd breakdowns handled
1701 // The scalars will need to be re-legalized.
1702 if (NumPieces == 1 || NumPieces >= NumElts ||
1703 NumElts % NumPieces != 0)
1704 return std::pair(0, EltTy);
1705
1706 return std::pair(0,
1707 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1708 }
1709
1710 // FIXME: We could probably handle weird extending loads better.
1711 if (DstTy.getSizeInBits() > MemSize)
1712 return std::pair(0, EltTy);
1713
1714 unsigned EltSize = EltTy.getSizeInBits();
1715 unsigned DstSize = DstTy.getSizeInBits();
1716 if (!isPowerOf2_32(DstSize)) {
1717 // We're probably decomposing an odd sized store. Try to split
1718 // to the widest type. TODO: Account for alignment. As-is it
1719 // should be OK, since the new parts will be further legalized.
1720 unsigned FloorSize = llvm::bit_floor(DstSize);
1721 return std::pair(
1723 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1724 }
1725
1726 // May need relegalization for the scalars.
1727 return std::pair(0, EltTy);
1728 })
1729 .minScalar(0, S32)
1730 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1732 .widenScalarToNextPow2(0)
1733 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1734 .lower();
1735 }
1736
1737 // FIXME: Unaligned accesses not lowered.
1738 auto &ExtLoads =
1739 getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1740 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1741 {S32, GlobalPtr, S16, 2 * 8},
1742 {S32, LocalPtr, S8, 8},
1743 {S32, LocalPtr, S16, 16},
1744 {S32, PrivatePtr, S8, 8},
1745 {S32, PrivatePtr, S16, 16},
1746 {S32, ConstantPtr, S8, 8},
1747 {S32, ConstantPtr, S16, 2 * 8}})
1748 .legalForTypesWithMemDesc(ST.useRealTrue16Insts(),
1749 {{S16, GlobalPtr, S8, GlobalAlign8},
1750 {S16, LocalPtr, S8, GlobalAlign8},
1751 {S16, PrivatePtr, S8, GlobalAlign8},
1752 {S16, ConstantPtr, S8, GlobalAlign8}})
1753 .legalIf([=](const LegalityQuery &Query) -> bool {
1754 return isLoadStoreLegal(ST, Query);
1755 });
1756
1757 if (ST.hasFlatAddressSpace()) {
1758 ExtLoads.legalForTypesWithMemDesc(
1759 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1760
1761 ExtLoads.legalForTypesWithMemDesc(ST.useRealTrue16Insts(),
1762 {{S16, FlatPtr, S8, GlobalAlign8}});
1763 }
1764
1765 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1766 // 64-bits.
1767 //
1768 // TODO: Should generalize bitcast action into coerce, which will also cover
1769 // inserting addrspacecasts.
1770 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1771
1772 ExtLoads.narrowScalarIf(
1773 [](const LegalityQuery &Query) {
1774 LLT MemTy = Query.MMODescrs[0].MemoryTy;
1775 return MemTy.isAnyScalar() && MemTy.getSizeInBits() > 32 &&
1776 Query.Types[0].getSizeInBits() > MemTy.getSizeInBits();
1777 }, // For large MemSize, narrowscalar to MemSize (load MemSize + ext)
1779 ExtLoads.clampScalar(0, S32, S32)
1780 .widenScalarToNextPow2(0)
1781 .lower();
1782
1783 auto &Atomics = getActionDefinitionsBuilder(
1784 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1785 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1786 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1787 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1788 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1789 {S64, GlobalPtr}, {S64, LocalPtr},
1790 {S32, RegionPtr}, {S64, RegionPtr}});
1791 if (ST.hasFlatAddressSpace()) {
1792 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1793 }
1794
1795 auto &Atomics32 =
1796 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1797 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1798 if (ST.hasFlatAddressSpace()) {
1799 Atomics32.legalFor({{S32, FlatPtr}});
1800 }
1801
1802 // TODO: v2bf16 operations, and fat buffer pointer support.
1803 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1804 if (ST.hasLDSFPAtomicAddF32()) {
1805 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1806 if (ST.hasLdsAtomicAddF64())
1807 Atomic.legalFor({{S64, LocalPtr}});
1808 if (ST.hasAtomicDsPkAdd16Insts())
1809 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1810 }
1811 if (ST.hasAtomicFaddInsts())
1812 Atomic.legalFor({{S32, GlobalPtr}});
1813 if (ST.hasFlatAtomicFaddF32Inst())
1814 Atomic.legalFor({{S32, FlatPtr}});
1815
1816 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1817 // These are legal with some caveats, and should have undergone expansion in
1818 // the IR in most situations
1819 // TODO: Move atomic expansion into legalizer
1820 Atomic.legalFor({
1821 {S32, GlobalPtr},
1822 {S64, GlobalPtr},
1823 {S64, FlatPtr}
1824 });
1825 }
1826
1827 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1828 ST.hasAtomicBufferGlobalPkAddF16Insts())
1829 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1830 if (ST.hasAtomicGlobalPkAddBF16Inst())
1831 Atomic.legalFor({{V2BF16, GlobalPtr}});
1832 if (ST.hasAtomicFlatPkAdd16Insts())
1833 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1834
1835
1836 // Most of the legalization work here is done by AtomicExpand. We could
1837 // probably use a simpler legality rule that just assumes anything is OK.
1838 auto &AtomicFMinFMax =
1839 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1840 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1841
1842 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1843 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1844 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1845 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1846 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1847 AtomicFMinFMax.legalFor({F32, FlatPtr});
1848 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1849 AtomicFMinFMax.legalFor({F64, FlatPtr});
1850
1851 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1852 // demarshalling
1853 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1854 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1855 {S32, FlatPtr}, {S64, FlatPtr}})
1856 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1857 {S32, RegionPtr}, {S64, RegionPtr}});
1858 // TODO: Pointer types, any 32-bit or 64-bit vector
1859
1860 // Condition should be s32 for scalar, s1 for vector.
1861 getActionDefinitionsBuilder(G_SELECT)
1862 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1863 LocalPtr, FlatPtr, PrivatePtr,
1864 LLT::fixed_vector(2, LocalPtr),
1865 LLT::fixed_vector(2, PrivatePtr)},
1866 {S1, S32})
1867 .clampScalar(0, S16, S64)
1868 .scalarize(1)
1869 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1870 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1871 .clampMaxNumElements(0, S32, 2)
1872 .clampMaxNumElements(0, LocalPtr, 2)
1873 .clampMaxNumElements(0, PrivatePtr, 2)
1874 .scalarize(0)
1875 .widenScalarToNextPow2(0)
1876 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1877
1878 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1879 // be more flexible with the shift amount type.
1880 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1881 .legalFor({{S32, S32}, {S64, S32}});
1882 if (ST.has16BitInsts()) {
1883 if (ST.hasVOP3PInsts()) {
1884 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1885 .clampMaxNumElements(0, S16, 2);
1886 } else
1887 Shifts.legalFor({{S16, S16}});
1888
1889 // TODO: Support 16-bit shift amounts for all types
1890 Shifts.widenScalarIf(
1891 [=](const LegalityQuery &Query) {
1892 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1893 // 32-bit amount.
1894 const LLT ValTy = Query.Types[0];
1895 const LLT AmountTy = Query.Types[1];
1896 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1897 AmountTy.getSizeInBits() < 16;
1898 }, changeTo(1, S16));
1899 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1900 Shifts.clampScalar(1, S32, S32);
1901 Shifts.widenScalarToNextPow2(0, 16);
1902 Shifts.clampScalar(0, S16, S64);
1903
1904 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1905 .minScalar(0, S16)
1906 .scalarize(0)
1907 .lower();
1908 } else {
1909 // Make sure we legalize the shift amount type first, as the general
1910 // expansion for the shifted type will produce much worse code if it hasn't
1911 // been truncated already.
1912 Shifts.clampScalar(1, S32, S32);
1913 Shifts.widenScalarToNextPow2(0, 32);
1914 Shifts.clampScalar(0, S32, S64);
1915
1916 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1917 .minScalar(0, S32)
1918 .scalarize(0)
1919 .lower();
1920 }
1921 Shifts.scalarize(0);
1922
1923 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1924 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1925 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1926 unsigned IdxTypeIdx = 2;
1927
1928 getActionDefinitionsBuilder(Op)
1929 .customIf([=](const LegalityQuery &Query) {
1930 const LLT EltTy = Query.Types[EltTypeIdx];
1931 const LLT VecTy = Query.Types[VecTypeIdx];
1932 const LLT IdxTy = Query.Types[IdxTypeIdx];
1933 const unsigned EltSize = EltTy.getSizeInBits();
1934 const bool isLegalVecType =
1936 // Address space 8 pointers are 128-bit wide values, but the logic
1937 // below will try to bitcast them to 2N x s64, which will fail.
1938 // Therefore, as an intermediate step, wrap extracts/insertions by
1939 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1940 // extraction result) in order to produce a vector operation that can
1941 // be handled by the logic below.
1942 if (EltTy.isPointer() && EltSize > 64)
1943 return true;
1944 return (EltSize == 32 || EltSize == 64) &&
1945 VecTy.getSizeInBits() % 32 == 0 &&
1946 VecTy.getSizeInBits() <= MaxRegisterSize &&
1947 IdxTy.getSizeInBits() == 32 &&
1948 isLegalVecType;
1949 })
1950 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1951 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1952 bitcastToVectorElement32(VecTypeIdx))
1953 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1954 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1955 scalarOrEltWiderThan(VecTypeIdx, 64)),
1956 [=](const LegalityQuery &Query) {
1957 // For > 64-bit element types, try to turn this into a
1958 // 64-bit element vector since we may be able to do better
1959 // indexing if this is scalar. If not, fall back to 32.
1960 const LLT EltTy = Query.Types[EltTypeIdx];
1961 const LLT VecTy = Query.Types[VecTypeIdx];
1962 const unsigned DstEltSize = EltTy.getSizeInBits();
1963 const unsigned VecSize = VecTy.getSizeInBits();
1964
1965 const unsigned TargetEltSize =
1966 DstEltSize % 64 == 0 ? 64 : 32;
1967 return std::pair(VecTypeIdx,
1968 LLT::fixed_vector(VecSize / TargetEltSize,
1969 TargetEltSize));
1970 })
1971 .clampScalar(EltTypeIdx, S32, S64)
1972 .clampScalar(VecTypeIdx, S32, S64)
1973 .clampScalar(IdxTypeIdx, S32, S32)
1974 .clampMaxNumElements(VecTypeIdx, S32, 32)
1975 // TODO: Clamp elements for 64-bit vectors?
1976 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1978 // It should only be necessary with variable indexes.
1979 // As a last resort, lower to the stack
1980 .lower();
1981 }
1982
1983 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1984 .unsupportedIf([=](const LegalityQuery &Query) {
1985 const LLT &EltTy = Query.Types[1].getElementType();
1986 return Query.Types[0] != EltTy;
1987 });
1988
1989 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1990 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1991 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1992 getActionDefinitionsBuilder(Op)
1993 .widenScalarIf(
1994 [=](const LegalityQuery &Query) {
1995 const LLT BigTy = Query.Types[BigTyIdx];
1996 return (BigTy.getScalarSizeInBits() < 16);
1997 },
1999 .widenScalarIf(
2000 [=](const LegalityQuery &Query) {
2001 const LLT LitTy = Query.Types[LitTyIdx];
2002 return (LitTy.getScalarSizeInBits() < 16);
2003 },
2005 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
2006 .widenScalarToNextPow2(BigTyIdx, 32)
2007 .customIf([=](const LegalityQuery &Query) {
2008 // Generic lower operates on the full-width value, producing
2009 // shift+trunc/mask sequences. For simple cases where extract/insert
2010 // values are 32-bit aligned, we can instead unmerge/merge and work on
2011 // the 32-bit components. However, we can't check the offset here so
2012 // custom lower function will have to call generic lowering if offset
2013 // is not 32-bit aligned.
2014 const LLT BigTy = Query.Types[BigTyIdx];
2015 const LLT LitTy = Query.Types[LitTyIdx];
2016 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
2017 LitTy.getSizeInBits() % 32 == 0;
2018 })
2019 .lower();
2020 }
2021
2022 auto &BuildVector =
2023 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2024 .legalForCartesianProduct(AllS32Vectors, {S32})
2025 .legalForCartesianProduct(AllS64Vectors, {S64})
2026 .clampNumElements(0, V16S32, V32S32)
2027 .clampNumElements(0, V2S64, V16S64)
2028 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
2029 .moreElementsIf(isIllegalRegisterType(ST, 0),
2031
2032 if (ST.hasScalarPackInsts()) {
2033 BuildVector
2034 // FIXME: Should probably widen s1 vectors straight to s32
2035 .minScalarOrElt(0, S16)
2036 .minScalar(1, S16);
2037
2038 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2039 .legalFor({V2S16, S32})
2040 .lower();
2041 } else {
2042 BuildVector.customFor({V2S16, S16});
2043 BuildVector.minScalarOrElt(0, S32);
2044
2045 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2046 .customFor({V2S16, S32})
2047 .lower();
2048 }
2049
2050 BuildVector.legalIf(isRegisterType(ST, 0));
2051
2052 // FIXME: Clamp maximum size
2053 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2054 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2055 .clampMaxNumElements(0, S32, 32)
2056 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2057 .clampMaxNumElements(0, S16, 64);
2058
2059 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2060
2061 // Merge/Unmerge
2062 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2063 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2064 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2065
2066 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2067 const LLT Ty = Query.Types[TypeIdx];
2068 if (Ty.isVector()) {
2069 const LLT &EltTy = Ty.getElementType();
2070 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2071 return true;
2073 return true;
2074 }
2075 return false;
2076 };
2077
2078 auto &Builder =
2079 getActionDefinitionsBuilder(Op)
2080 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2081 .lowerFor({{S16, V2S16}})
2082 .lowerIf([=](const LegalityQuery &Query) {
2083 const LLT BigTy = Query.Types[BigTyIdx];
2084 return BigTy.getSizeInBits() == 32;
2085 })
2086 // Try to widen to s16 first for small types.
2087 // TODO: Only do this on targets with legal s16 shifts
2088 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2089 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2090 .moreElementsIf(isSmallOddVector(BigTyIdx),
2091 oneMoreElement(BigTyIdx))
2092 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
2093 elementTypeIs(1, S16)),
2094 changeTo(1, V2S16))
2095 // Clamp the little scalar to s32-s512 and make it a power of 2. It's
2096 // not worth considering the multiples of 64 since 2*192 and 2*384
2097 // are not valid.
2098 .clampScalar(LitTyIdx, S32, S512)
2099 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2100 // Break up vectors with weird elements into scalars
2101 .fewerElementsIf(
2102 [=](const LegalityQuery &Query) {
2103 return notValidElt(Query, LitTyIdx);
2104 },
2105 scalarize(0))
2106 .fewerElementsIf(
2107 [=](const LegalityQuery &Query) {
2108 return notValidElt(Query, BigTyIdx);
2109 },
2110 scalarize(1))
2111 .clampScalar(BigTyIdx, S32, MaxScalar);
2112
2113 if (Op == G_MERGE_VALUES) {
2114 Builder.widenScalarIf(
2115 // TODO: Use 16-bit shifts if legal for 8-bit values?
2116 [=](const LegalityQuery &Query) {
2117 const LLT Ty = Query.Types[LitTyIdx];
2118 return Ty.getSizeInBits() < 32;
2119 },
2120 changeTo(LitTyIdx, S32));
2121 }
2122
2123 Builder.widenScalarIf(
2124 [=](const LegalityQuery &Query) {
2125 const LLT Ty = Query.Types[BigTyIdx];
2126 return Ty.getSizeInBits() % 16 != 0;
2127 },
2128 [=](const LegalityQuery &Query) {
2129 // Pick the next power of 2, or a multiple of 64 over 128.
2130 // Whichever is smaller.
2131 const LLT &Ty = Query.Types[BigTyIdx];
2132 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2133 if (NewSizeInBits >= 256) {
2134 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2135 if (RoundedTo < NewSizeInBits)
2136 NewSizeInBits = RoundedTo;
2137 }
2138 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2139 })
2140 // Any vectors left are the wrong size. Scalarize them.
2141 .scalarize(0)
2142 .scalarize(1);
2143 }
2144
2145 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2146 // RegBankSelect.
2147 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2148 .legalFor({{S32}, {S64}})
2149 .clampScalar(0, S32, S64);
2150
2151 if (ST.hasVOP3PInsts()) {
2152 SextInReg.lowerFor({{V2S16}})
2153 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2154 // get more vector shift opportunities, since we'll get those when
2155 // expanded.
2156 .clampMaxNumElementsStrict(0, S16, 2);
2157 } else if (ST.has16BitInsts()) {
2158 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2159 } else {
2160 // Prefer to promote to s32 before lowering if we don't have 16-bit
2161 // shifts. This avoids a lot of intermediate truncate and extend operations.
2162 SextInReg.lowerFor({{S32}, {S64}});
2163 }
2164
2165 SextInReg
2166 .scalarize(0)
2167 .clampScalar(0, S32, S64)
2168 .lower();
2169
2170 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2171 .scalarize(0)
2172 .lower();
2173
2174 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2175 FSHRActionDefs.legalFor({{S32, S32}})
2176 .clampMaxNumElementsStrict(0, S16, 2);
2177 if (ST.hasVOP3PInsts())
2178 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2179 FSHRActionDefs.scalarize(0).lower();
2180
2181 if (ST.hasVOP3PInsts()) {
2182 getActionDefinitionsBuilder(G_FSHL)
2183 .lowerFor({{V2S16, V2S16}})
2184 .clampMaxNumElementsStrict(0, S16, 2)
2185 .scalarize(0)
2186 .lower();
2187 } else {
2188 getActionDefinitionsBuilder(G_FSHL)
2189 .scalarize(0)
2190 .lower();
2191 }
2192
2193 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2194 .legalFor({S64});
2195
2196 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2197
2198 getActionDefinitionsBuilder(G_FENCE)
2199 .alwaysLegal();
2200
2201 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2202 .scalarize(0)
2203 .minScalar(0, S32)
2204 .lower();
2205
2206 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2207 .legalFor({{S32, S32}, {S64, S32}})
2208 .clampScalar(1, S32, S32)
2209 .clampScalar(0, S32, S64)
2210 .widenScalarToNextPow2(0)
2211 .scalarize(0);
2212
2213 getActionDefinitionsBuilder(
2214 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2215 G_FCOPYSIGN,
2216
2217 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2218 G_READ_REGISTER, G_WRITE_REGISTER,
2219
2220 G_SADDO, G_SSUBO})
2221 .lower();
2222
2223 if (ST.hasIEEEMinimumMaximumInsts()) {
2224 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2225 .legalFor(FPTypesPK16)
2226 .clampMaxNumElements(0, S16, 2)
2227 .scalarize(0);
2228 } else if (ST.hasVOP3PInsts()) {
2229 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2230 .lowerFor({V2S16})
2231 .clampMaxNumElementsStrict(0, S16, 2)
2232 .scalarize(0)
2233 .lower();
2234 } else {
2235 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2236 .scalarize(0)
2237 .clampScalar(0, S32, S64)
2238 .lower();
2239 }
2240
2241 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2242 .lower();
2243
2244 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2245
2246 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2247 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2248 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2249 .unsupported();
2250
2251 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2252
2253 getActionDefinitionsBuilder(
2254 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2255 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2256 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2257 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2258 .legalFor(AllVectors)
2259 .scalarize(1)
2260 .lower();
2261
2262 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2263 G_INTRINSIC_CONVERGENT,
2264 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2265 .alwaysLegal();
2266
2267 getLegacyLegalizerInfo().computeTables();
2268 verify(*ST.getInstrInfo());
2269}
2270
2273 LostDebugLocObserver &LocObserver) const {
// Custom-legalization dispatcher: switches on MI's opcode, forwards to the
// matching legalize* helper and returns that helper's bool result. Opcodes
// without a case here return false.
// NOTE(review): the leading lines of this signature are missing from this
// listing, so the function name and the declarations of Helper and MI are not
// visible here.
2274 MachineIRBuilder &B = Helper.MIRBuilder;
2275 MachineRegisterInfo &MRI = *B.getMRI();
2276
2277 switch (MI.getOpcode()) {
2278 case TargetOpcode::G_ADDRSPACE_CAST:
2279 return legalizeAddrSpaceCast(MI, MRI, B);
2280 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2281 return legalizeFroundeven(MI, MRI, B);
2282 case TargetOpcode::G_FCEIL:
2283 return legalizeFceil(MI, MRI, B);
2284 case TargetOpcode::G_FREM:
2285 return legalizeFrem(MI, MRI, B);
2286 case TargetOpcode::G_INTRINSIC_TRUNC:
2287 return legalizeIntrinsicTrunc(MI, MRI, B);
// Integer <-> FP conversions: the trailing flag is true for the signed opcode
// and false for the unsigned one.
2288 case TargetOpcode::G_SITOFP:
2289 return legalizeITOFP(MI, MRI, B, true);
2290 case TargetOpcode::G_UITOFP:
2291 return legalizeITOFP(MI, MRI, B, false);
2292 case TargetOpcode::G_FPTOSI:
2293 return legalizeFPTOI(MI, MRI, B, true);
2294 case TargetOpcode::G_FPTOUI:
2295 return legalizeFPTOI(MI, MRI, B, false);
// All four min/max-num flavours share one handler.
2296 case TargetOpcode::G_FMINNUM:
2297 case TargetOpcode::G_FMAXNUM:
2298 case TargetOpcode::G_FMINIMUMNUM:
2299 case TargetOpcode::G_FMAXIMUMNUM:
2300 return legalizeMinNumMaxNum(Helper, MI);
2301 case TargetOpcode::G_EXTRACT:
2302 return legalizeExtract(Helper, MI);
2303 case TargetOpcode::G_INSERT:
2304 return legalizeInsert(Helper, MI);
2305 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2306 return legalizeExtractVectorElt(MI, MRI, B);
2307 case TargetOpcode::G_INSERT_VECTOR_ELT:
2308 return legalizeInsertVectorElt(MI, MRI, B);
2309 case TargetOpcode::G_FSIN:
2310 case TargetOpcode::G_FCOS:
2311 return legalizeSinCos(MI, MRI, B);
2312 case TargetOpcode::G_GLOBAL_VALUE:
2313 return legalizeGlobalValue(MI, MRI, B);
// Plain and extending loads share one handler.
2314 case TargetOpcode::G_LOAD:
2315 case TargetOpcode::G_SEXTLOAD:
2316 case TargetOpcode::G_ZEXTLOAD:
2317 return legalizeLoad(Helper, MI);
2318 case TargetOpcode::G_STORE:
2319 return legalizeStore(Helper, MI);
2320 case TargetOpcode::G_FMAD:
2321 return legalizeFMad(MI, MRI, B);
2322 case TargetOpcode::G_FDIV:
2323 return legalizeFDIV(MI, MRI, B);
2324 case TargetOpcode::G_FFREXP:
2325 return legalizeFFREXP(MI, MRI, B);
2326 case TargetOpcode::G_FSQRT:
2327 return legalizeFSQRT(MI, MRI, B);
// Division, remainder and combined div/rem share one handler per signedness.
2328 case TargetOpcode::G_UDIV:
2329 case TargetOpcode::G_UREM:
2330 case TargetOpcode::G_UDIVREM:
2331 return legalizeUnsignedDIV_REM(MI, MRI, B);
2332 case TargetOpcode::G_SDIV:
2333 case TargetOpcode::G_SREM:
2334 case TargetOpcode::G_SDIVREM:
2335 return legalizeSignedDIV_REM(MI, MRI, B);
2336 case TargetOpcode::G_ATOMIC_CMPXCHG:
2337 return legalizeAtomicCmpXChg(MI, MRI, B);
2338 case TargetOpcode::G_FLOG2:
2339 return legalizeFlog2(MI, B);
2340 case TargetOpcode::G_FLOG:
2341 case TargetOpcode::G_FLOG10:
2342 return legalizeFlogCommon(MI, B);
2343 case TargetOpcode::G_FEXP2:
2344 return legalizeFExp2(MI, B);
2345 case TargetOpcode::G_FEXP:
2346 case TargetOpcode::G_FEXP10:
2347 return legalizeFExp(MI, B);
2348 case TargetOpcode::G_FPOW:
2349 return legalizeFPow(MI, B);
2350 case TargetOpcode::G_FFLOOR:
2351 return legalizeFFloor(MI, MRI, B);
2352 case TargetOpcode::G_BUILD_VECTOR:
2353 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2354 return legalizeBuildVector(MI, MRI, B);
2355 case TargetOpcode::G_MUL:
2356 return legalizeMul(Helper, MI);
2357 case TargetOpcode::G_CTLZ:
2358 case TargetOpcode::G_CTTZ:
2359 return legalizeCTLZ_CTTZ(MI, MRI, B);
2360 case TargetOpcode::G_CTLS:
2361 return legalizeCTLS(MI, MRI, B);
2362 case TargetOpcode::G_CTLZ_ZERO_POISON:
2363 return legalizeCTLZ_ZERO_POISON(MI, MRI, B);
2364 case TargetOpcode::G_STACKSAVE:
2365 return legalizeStackSave(MI, B);
2366 case TargetOpcode::G_GET_FPENV:
2367 return legalizeGetFPEnv(MI, MRI, B);
2368 case TargetOpcode::G_SET_FPENV:
2369 return legalizeSetFPEnv(MI, MRI, B);
2370 case TargetOpcode::G_TRAP:
2371 return legalizeTrap(MI, MRI, B);
2372 case TargetOpcode::G_DEBUGTRAP:
2373 return legalizeDebugTrap(MI, MRI, B);
// Any other opcode has no handler in this switch.
2374 default:
2375 return false;
2376 }
2377
// Every case above returns, so control never reaches this point.
2378 llvm_unreachable("expected switch to return");
2379}
2380
2382 unsigned AS,
2384 MachineIRBuilder &B) const {
// Produces a 32-bit register for address space AS, taken from one of the
// paths below: the high half of an aperture register, or a 32-bit load.
// An invalid Register() is returned where an input value cannot be loaded.
// NOTE(review): many lines of this function are missing from this listing
// (the embedded line numbers skip), including the conditions guarding the
// second and third paths and the declarations of Dst, Offset, PtrInfo, MMO,
// LoadAddr and QueuePtr. The function name is inferred from a call site.
2385 MachineFunction &MF = B.getMF();
2386 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2387 const LLT S32 = LLT::scalar(32);
2388 const LLT S64 = LLT::scalar(64);
2389
2391
2392 if (ST.hasApertureRegs()) {
2393 // Note: this register is somewhat broken. When used as a 32-bit operand,
2394 // it only returns zeroes. The real value is in the upper 32 bits.
2395 // Thus, we must extract the high 32 bits.
2396 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2397 ? AMDGPU::SRC_SHARED_BASE
2398 : AMDGPU::SRC_PRIVATE_BASE;
2399 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2400 !ST.hasGloballyAddressableScratch()) &&
2401 "Cannot use src_private_base with globally addressable scratch!");
2403 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2404 B.buildCopy({Dst}, {Register(ApertureRegNo)});
// Result 1 of the unmerge is the upper 32-bit piece described in the note
// above.
2405 return B.buildUnmerge(S32, Dst).getReg(1);
2406 }
2407
2410 // For code object version 5, private_base and shared_base are passed through
2411 // implicit kernargs.
2415
2420 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2421
2422 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2424
2425 if (!loadInputValue(KernargPtrReg, B,
2427 return Register();
2428
2430 PtrInfo.getWithOffset(Offset),
2434
2435 // Pointer address
2436 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2437 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2438 // Load address
2439 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2440 }
2441
2444
2446 return Register();
2447
2448 // TODO: Use custom PseudoSourceValue
2450
2451 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2452 // private_segment_aperture_base_hi.
2453 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2454
2456 PtrInfo,
2459 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2460
// Final path: load the 32-bit field at QueuePtr + StructOffset.
2461 B.buildObjectPtrOffset(
2462 LoadAddr, QueuePtr,
2463 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2464 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2465}
2466
2467/// Return true if the value is a known valid address, such that a null check is
2468/// not necessary.
2470 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
// The decision is based only on the opcode of the instruction defining Val.
// NOTE(review): Def is dereferenced without a null check - confirm that Val
// always has a defining instruction at this point.
2471 MachineInstr *Def = MRI.getVRegDef(Val);
2472 switch (Def->getOpcode()) {
// Values produced by these opcodes are treated as never null.
2473 case AMDGPU::G_FRAME_INDEX:
2474 case AMDGPU::G_GLOBAL_VALUE:
2475 case AMDGPU::G_BLOCK_ADDR:
2476 return true;
// A constant is non-null when its sign-extended value differs from the null
// pointer value of AddrSpace.
2477 case AMDGPU::G_CONSTANT: {
2478 const ConstantInt *CI = Def->getOperand(1).getCImm();
2479 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2480 }
// Any other producer: report "not known", so the caller keeps its null check.
2481 default:
2482 return false;
2483 }
2484
// Not reached: every path through the switch above returns.
2485 return false;
2486}
2487
2490 MachineIRBuilder &B) const {
// Expands an address-space cast between scalar pointers. The cases handled, in
// order: no-op casts (retagged as G_BITCAST), flat -> local/private,
// local/private -> flat, 64-bit -> 32-bit constant address space, and 32-bit
// constant address space -> 64-bit. Anything else becomes an undef value.
// Returns true on every path visible in this function body.
// NOTE(review): the leading lines of this signature are missing from this
// listing; the name is inferred from the dispatcher's call site.
2491 MachineFunction &MF = B.getMF();
2492
2493 // MI can either be a G_ADDRSPACE_CAST or a
2494 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2495 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2496 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2497 Intrinsic::amdgcn_addrspacecast_nonnull));
2498
2499 const LLT S32 = LLT::scalar(32);
2500 Register Dst = MI.getOperand(0).getReg();
// For the intrinsic form the pointer is operand 2; for G_ADDRSPACE_CAST it is
// operand 1.
2501 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2502 : MI.getOperand(1).getReg();
2503 LLT DstTy = MRI.getType(Dst);
2504 LLT SrcTy = MRI.getType(Src);
2505 unsigned DestAS = DstTy.getAddressSpace();
2506 unsigned SrcAS = SrcTy.getAddressSpace();
2507
2508 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2509 // vector element.
2510 assert(!DstTy.isVector());
2511
2512 const AMDGPUTargetMachine &TM
2513 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2514
// No-op cast: retag the existing instruction in place instead of building a
// replacement.
2515 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2516 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2517 return true;
2518 }
2519
2520 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2521 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2522 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
// Builds the raw flat -> segment conversion without any null handling. The
// parameter Dst shadows the function-level Dst so the same code can write
// either the real destination register or a fresh value of type DstTy.
2523 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2524 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2525 ST.hasGloballyAddressableScratch()) {
2526 // flat -> private with globally addressable scratch: subtract
2527 // src_flat_scratch_base_lo.
// NOTE(review): this S32 shadows the identical S32 declared at function scope.
2528 const LLT S32 = LLT::scalar(32);
2529 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2530 Register FlatScratchBaseLo =
2531 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2532 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2533 .getReg(0);
2534 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2535 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2536 return B.buildIntToPtr(Dst, Sub).getReg(0);
2537 }
2538
2539 // Extract low 32-bits of the pointer.
2540 return B.buildExtract(Dst, Src, 0).getReg(0);
2541 };
2542
2543 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2544 // G_ADDRSPACE_CAST we need to guess.
2545 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2546 castFlatToLocalOrPrivate(Dst);
2547 MI.eraseFromParent();
2548 return true;
2549 }
2550
// Possibly-null source: select between the converted pointer and the
// destination address space's null value, keyed on Src != 0.
2551 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2552
2553 auto SegmentNull = B.buildConstant(DstTy, NullVal);
// The flat null pointer is written as the literal 0 here.
2554 auto FlatNull = B.buildConstant(SrcTy, 0);
2555
2556 // Extract low 32-bits of the pointer.
2557 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2558
2559 auto CmpRes =
2560 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2561 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2562
2563 MI.eraseFromParent();
2564 return true;
2565 }
2566
2567 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2568 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2569 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
// Builds the raw segment -> flat conversion without any null handling. As
// above, the parameter Dst shadows the function-level Dst.
2570 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2571 // Coerce the type of the low half of the result so we can use
2572 // merge_values.
2573 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2574
2575 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2576 ST.hasGloballyAddressableScratch()) {
2577 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2578 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
// ThreadID starts as the constant 0 and is passed as the second operand of
// mbcnt_lo; on wave64 that result is passed on to mbcnt_hi.
2579 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2580 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2581 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2582 .addUse(AllOnes)
2583 .addUse(ThreadID)
2584 .getReg(0);
2585 if (ST.isWave64()) {
2586 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2587 .addUse(AllOnes)
2588 .addUse(ThreadID)
2589 .getReg(0);
2590 }
// The shift is applied to the high 32-bit half of the address, hence the
// extra "- 32". 57 - log2(wave size) yields the 52 / 51 from the formulas
// above, presuming getWavefrontSizeLog2() is 5 for wave32 and 6 for wave64.
2591 Register ShAmt =
2592 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2593 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2594 Register CvtPtr =
2595 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2596 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2597 // 64-bit hi:lo value.
2598 Register FlatScratchBase =
2599 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2600 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2601 .getReg(0);
2602 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2603 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2604 }
2605
2606 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
// NOTE(review): this lambda is declared "-> Register", so "return false"
// yields a Register converted from false rather than making
// legalizeAddrSpaceCast return false. Both callers below use the result
// without checking it - confirm this is intended.
2607 if (!ApertureReg.isValid())
2608 return false;
2609
2610 // TODO: Should we allow mismatched types but matching sizes in merges to
2611 // avoid the ptrtoint?
2612 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2613 };
2614
2615 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2616 // G_ADDRSPACE_CAST we need to guess.
2617 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2618 castLocalOrPrivateToFlat(Dst);
2619 MI.eraseFromParent();
2620 return true;
2621 }
2622
// Possibly-null source: select between the converted pointer and the flat
// null value, keyed on Src != the source address space's null value.
2623 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2624
2625 auto SegmentNull =
2626 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2627 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2628
2629 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2630 SegmentNull.getReg(0));
2631
2632 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2633
2634 MI.eraseFromParent();
2635 return true;
2636 }
2637
2638 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2639 SrcTy.getSizeInBits() == 64) {
2640 // Truncate.
2641 B.buildExtract(Dst, Src, 0);
2642 MI.eraseFromParent();
2643 return true;
2644 }
2645
2646 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2647 DstTy.getSizeInBits() == 64) {
// NOTE(review): the declaration of Info is on a line missing from this
// listing.
2649 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2650 auto PtrLo = B.buildPtrToInt(S32, Src);
// With zero high bits a zero-extend is enough; otherwise the configured high
// bits are merged in as the upper half.
2651 if (AddrHiVal == 0) {
2652 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2653 B.buildIntToPtr(Dst, Zext);
2654 } else {
2655 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2656 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2657 }
2658
2659 MI.eraseFromParent();
2660 return true;
2661 }
2662
2663 // Invalid casts are poison.
2664 // TODO: Should return poison
2665 B.buildUndef(Dst);
2666 MI.eraseFromParent();
2667 return true;
2668}
2669
2672 MachineIRBuilder &B) const {
// Expands a round-to-even on a 64-bit scalar (asserted below) into
// add/subtract of a sign-matched 2^52 plus a select, then erases MI.
// Always returns true.
// NOTE(review): the leading lines of this signature are missing from this
// listing; the name is inferred from the dispatcher's call site.
2673 Register Src = MI.getOperand(1).getReg();
2674 LLT Ty = MRI.getType(Src);
2675 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2676
// C1 = 2^52. C2 = 2^52 - 0.5, the largest double below 2^52, so the
// "|Src| > C2" test further down means |Src| >= 2^52.
2677 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2678 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2679
2680 auto C1 = B.buildFConstant(Ty, C1Val);
2681 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2682
// Tmp2 = (Src + copysign(2^52, Src)) - copysign(2^52, Src). The sum has a
// magnitude of at least 2^52, where doubles carry no fractional bits, so the
// FP add itself rounds the fraction away - presumably to nearest-even; the
// rounding mode is not set in this function.
2683 // TODO: Should this propagate fast-math-flags?
2684 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2685 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2686
2687 auto C2 = B.buildFConstant(Ty, C2Val);
2688 auto Fabs = B.buildFAbs(Ty, Src);
2689
// Inputs with |Src| > C2 are passed through unchanged. The compare is
// ordered, so a NaN input selects Tmp2.
2690 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2691 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2692 MI.eraseFromParent();
2693 return true;
2694}
2695
2698 MachineIRBuilder &B) const {
// Expands a ceil on a 64-bit scalar (asserted below) as trunc(src) plus a
// conditional 1.0, then erases MI. Always returns true.
// NOTE(review): the leading lines of this signature are missing from this
// listing; the name is inferred from the dispatcher's call site.
2699
2700 const LLT S1 = LLT::scalar(1);
2701 const LLT S64 = LLT::scalar(64);
2702
2703 Register Src = MI.getOperand(1).getReg();
2704 assert(MRI.getType(Src) == S64);
2705
2706 // result = trunc(src)
2707 // if (src > 0.0 && src != result)
2708 // result += 1.0
2709
2710 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2711
2712 const auto Zero = B.buildFConstant(S64, 0.0);
2713 const auto One = B.buildFConstant(S64, 1.0);
// NOTE(review): despite its name, Lt0 holds "Src > 0" (FCMP_OGT), matching
// the pseudo-code above.
2714 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2715 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2716 auto And = B.buildAnd(S1, Lt0, NeTrunc);
// The adjustment is materialised as a select of 1.0 or 0.0, so a single
// unconditional add finishes the job.
2717 auto Add = B.buildSelect(S64, And, One, Zero);
2718
2719 // TODO: Should this propagate fast-math-flags?
2720 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2721 MI.eraseFromParent();
2722 return true;
2723}
2724
2727 MachineIRBuilder &B) const {
// Expands a floating-point remainder as
//   Dst = fma(-trunc(Src0 / Src1), Src1, Src0)
// i.e. Src0 - trunc(Src0 / Src1) * Src1, then erases MI. MI's flags are
// copied onto every instruction built here. Always returns true.
// NOTE(review): the leading lines of this signature are missing from this
// listing; the name is inferred from the dispatcher's call site.
2728 Register DstReg = MI.getOperand(0).getReg();
2729 Register Src0Reg = MI.getOperand(1).getReg();
2730 Register Src1Reg = MI.getOperand(2).getReg();
2731 auto Flags = MI.getFlags();
2732 LLT Ty = MRI.getType(DstReg);
2733
2734 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2735 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2736 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2737 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2738 MI.eraseFromParent();
2739 return true;
2740}
2741
// Builds instructions computing ubfe(Hi, FractBits - 32, ExpBits) - 1023 as a
// 32-bit value and returns the final subtract.
// NOTE(review): the signature lines are missing from this listing. The name
// and the meaning of Hi (the high 32 bits of an f64) are inferred from the
// caller that passes the second unmerge result of a 64-bit source.
2744 const unsigned FractBits = 52;
2745 const unsigned ExpBits = 11;
2746 LLT S32 = LLT::scalar(32);
2747
// FractBits - 32 = 20 is the position of the exponent field inside the high
// half; ExpBits = 11 is its width.
2748 auto Const0 = B.buildConstant(S32, FractBits - 32);
2749 auto Const1 = B.buildConstant(S32, ExpBits);
2750
// Operand order is presumably (source, bit offset, field width) - confirm
// against the amdgcn.ubfe intrinsic definition.
2751 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2752 .addUse(Hi)
2753 .addUse(Const0.getReg(0))
2754 .addUse(Const1.getReg(0));
2755
// Subtract 1023, the IEEE-754 double-precision exponent bias.
2756 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2757}
2758
2761 MachineIRBuilder &B) const {
// Expands a trunc on a 64-bit scalar (asserted below) with integer bit
// operations: fraction bits below the binary point are masked off, with two
// special cases selected by the unbiased exponent. Erases MI and always
// returns true.
// NOTE(review): the leading lines of this signature are missing from this
// listing; the name is inferred from the dispatcher's call site.
2762 const LLT S1 = LLT::scalar(1);
2763 const LLT S32 = LLT::scalar(32);
2764 const LLT S64 = LLT::scalar(64);
2765
2766 Register Src = MI.getOperand(1).getReg();
2767 assert(MRI.getType(Src) == S64);
2768
2769 // TODO: Should this use extract since the low half is unused?
2770 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2771 Register Hi = Unmerge.getReg(1);
2772
2773 // Extract the upper half, since this is where we will find the sign and
2774 // exponent.
2775 auto Exp = extractF64Exponent(Hi, B);
2776
2777 const unsigned FractBits = 52;
2778
2779 // Extract the sign bit.
2780 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2781 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2782
2783 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2784
2785 const auto Zero32 = B.buildConstant(S32, 0);
2786
2787 // Extend back to 64-bits.
2788 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2789
// For Exp in [0, 51], (FractMask >> Exp) has exactly the low (52 - Exp)
// fraction bits set, i.e. the bits whose weight is below 1. Clearing them in
// Src gives Tmp0.
2790 auto Shr = B.buildAShr(S64, FractMask, Exp);
2791 auto Not = B.buildNot(S64, Shr);
2792 auto Tmp0 = B.buildAnd(S64, Src, Not);
2793 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2794
2795 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2796 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2797
// Exp < 0: the result keeps only Src's sign bit (a signed zero).
// Exp > 51: Src is passed through unchanged.
2798 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2799 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2800 MI.eraseFromParent();
2801 return true;
2802}
2803
2806 MachineIRBuilder &B, bool Signed) const {
// Expands a conversion from a 64-bit integer (asserted below) to a 64-bit or
// 32-bit floating-point result using 32-bit pieces. Signed selects signed
// versus unsigned treatment of the source. Erases MI and always returns true.
// NOTE(review): the leading lines of this signature are missing from this
// listing; the name is inferred from the dispatcher's call site.
2807
2808 Register Dst = MI.getOperand(0).getReg();
2809 Register Src = MI.getOperand(1).getReg();
2810
2811 const LLT S64 = LLT::scalar(64);
2812 const LLT S32 = LLT::scalar(32);
2813
2814 assert(MRI.getType(Src) == S64);
2815
2816 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2817 auto ThirtyTwo = B.buildConstant(S32, 32);
2818
// 64-bit result: Dst = ldexp(cvt(hi), 32) + uitofp(lo). Only the high half
// is converted according to Signed; the low half is always unsigned.
2819 if (MRI.getType(Dst) == S64) {
2820 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2821 : B.buildUITOFP(S64, Unmerge.getReg(1));
2822
2823 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2824 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2825
2826 // TODO: Should this propagate fast-math-flags?
2827 B.buildFAdd(Dst, LdExp, CvtLo);
2828 MI.eraseFromParent();
2829 return true;
2830 }
2831
// 32-bit result: shift Src left by ShAmt, convert the high 32 bits of the
// shifted value, then rescale with ldexp(32 - ShAmt).
2832 assert(MRI.getType(Dst) == S32);
2833
2834 auto One = B.buildConstant(S32, 1);
2835
2836 MachineInstrBuilder ShAmt;
2837 if (Signed) {
// OppositeSign is all-ones when the top bits of the two halves differ and 0
// otherwise, so MaxShAmt is 31 or 32. ShAmt = umin(sffbh(hi) - 1, MaxShAmt).
2838 auto ThirtyOne = B.buildConstant(S32, 31);
2839 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2840 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2841 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2842 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2843 .addUse(Unmerge.getReg(1));
2844 auto LS2 = B.buildSub(S32, LS, One);
2845 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2846 } else
2847 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2848 auto Norm = B.buildShl(S64, Src, ShAmt);
2849 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
// Adjust = umin(1, low half) is 1 when any bit of the shifted low half is
// set. It is OR-ed into bit 0 of the high half, presumably so the 32-bit
// conversion's rounding can see that lower bits were discarded.
2850 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2851 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2852 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2853 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2854 B.buildFLdexp(Dst, FVal, Scale);
2855 MI.eraseFromParent();
2856 return true;
2857}
2858
2859// TODO: Copied from DAG implementation. Verify logic and document how this
2860// actually works.
2864 bool Signed) const {
// Expands a conversion from a 32-bit or 64-bit floating-point source to a
// 64-bit integer (both asserted below) by producing the high and low 32-bit
// halves separately. Signed selects the signed variant. MI's flags are
// copied onto the floating-point instructions built here. Erases MI and
// always returns true.
// NOTE(review): the leading lines of this signature are missing from this
// listing; the name is inferred from the dispatcher's call site.
2865
2866 Register Dst = MI.getOperand(0).getReg();
2867 Register Src = MI.getOperand(1).getReg();
2868
2869 const LLT S64 = LLT::scalar(64);
2870 const LLT S32 = LLT::scalar(32);
2871
2872 const LLT SrcLT = MRI.getType(Src);
2873 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2874
2875 unsigned Flags = MI.getFlags();
2876
2877 // The basic idea of converting a floating point number into a pair of 32-bit
2878 // integers is illustrated as follows:
2879 //
2880 // tf := trunc(val);
2881 // hif := floor(tf * 2^-32);
2882 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2883 // hi := fptoi(hif);
2884 // lo := fptoi(lof);
2885 //
2886 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
// NOTE(review): the declaration of Sign is on a line missing from this
// listing.
2888 if (Signed && SrcLT == S32) {
2889 // However, a 32-bit floating point number has only 23 bits mantissa and
2890 // it's not enough to hold all the significant bits of `lof` if val is
2891 // negative. To avoid the loss of precision, We need to take the absolute
2892 // value after truncating and flip the result back based on the original
2893 // signedness.
// The arithmetic shift by 31 is applied to the raw 32-bit source value.
2894 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2895 Trunc = B.buildFAbs(S32, Trunc, Flags);
2896 }
// K0 and K1 are 2^-32 and -2^32 in the source's floating-point width,
// written as raw bit patterns.
2897 MachineInstrBuilder K0, K1;
2898 if (SrcLT == S64) {
2899 K0 = B.buildFConstant(
2900 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2901 K1 = B.buildFConstant(
2902 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2903 } else {
2904 K0 = B.buildFConstant(
2905 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2906 K1 = B.buildFConstant(
2907 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2908 }
2909
// FloorMul is "hif" and Fma is "lof" from the outline above:
// lof = hif * (-2^32) + tf.
2910 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2911 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2912 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2913
// The high half is converted as signed only for a signed 64-bit source. The
// signed 32-bit path works on the absolute value and restores the sign
// below.
2914 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2915 : B.buildFPTOUI(S32, FloorMul);
2916 auto Lo = B.buildFPTOUI(S32, Fma);
2917
2918 if (Signed && SrcLT == S32) {
2919 // Flip the result based on the signedness, which is either all 0s or 1s.
2920 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2921 // r := xor({lo, hi}, sign) - sign;
2922 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2923 Sign);
2924 } else
2925 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2926 MI.eraseFromParent();
2927
2928 return true;
2929}
2930
2932 MachineInstr &MI) const {
// Leaves MI untouched and returns true when the function's mode has IEEE
// disabled.
// NOTE(review): the leading signature line, the declaration of MFI and the
// final statement (the IEEE-enabled path) are on lines missing from this
// listing; the name is inferred from the dispatcher's call site.
2933 MachineFunction &MF = Helper.MIRBuilder.getMF();
2935
2936 // With ieee_mode disabled, the instructions have the correct behavior.
2937 if (!MFI->getMode().IEEE)
2938 return true;
2939
2941}
2942
2944 MachineInstr &MI) const {
// Custom lowering for an extract at a non-zero, 32-bit-aligned offset: the
// source is split into 32-bit pieces and the destination is rebuilt from the
// pieces starting at Offset / 32. Other offsets are handed to the generic
// lowering. Returns true on success; on the generic path, returns whether
// that lowering reported Legalized.
// NOTE(review): the leading lines of this signature are missing from this
// listing; the name is inferred from the dispatcher's call site.
2945 MachineIRBuilder &B = Helper.MIRBuilder;
2946 MachineRegisterInfo &MRI = *B.getMRI();
2947 Register DstReg = MI.getOperand(0).getReg();
2948 Register SrcReg = MI.getOperand(1).getReg();
2949 uint64_t Offset = MI.getOperand(2).getImm();
2950
2951 // Fall back to generic lowering for offset 0 (trivial trunc) and
2952 // non-32-bit-aligned cases which require shift+trunc sequences
2953 // that generic code handles correctly.
2954 if (Offset == 0 || Offset % 32 != 0)
2955 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2956
// NOTE(review): nothing here checks that the source or destination size is a
// multiple of 32, or that StartIdx + DstCount stays within the unmerge
// results - presumably guaranteed by the rule that routes instructions here;
// confirm.
2957 const LLT DstTy = MRI.getType(DstReg);
2958 unsigned StartIdx = Offset / 32;
2959 unsigned DstCount = DstTy.getSizeInBits() / 32;
2960 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2961
// A single 32-bit piece needs no merge: a pointer destination gets an
// inttoptr of the piece, anything else simply has its uses redirected to the
// piece.
2962 if (DstCount == 1) {
2963 if (DstTy.isPointer())
2964 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2965 else
2966 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2967 } else {
// Several pieces: gather the consecutive 32-bit parts and merge them into the
// destination.
2968 SmallVector<Register, 8> MergeVec;
2969 for (unsigned I = 0; I < DstCount; ++I)
2970 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
2971 B.buildMergeLikeInstr(DstReg, MergeVec);
2972 }
2973
2974 MI.eraseFromParent();
2975 return true;
2976}
2977
// Custom lowering for an insert at a constant bit offset: operand 0 is the
// destination, operand 1 the value being inserted into, operand 2 the value
// to insert and operand 3 the immediate bit offset.
// When the offset, destination size and inserted size are all multiples of
// 32, the result is rebuilt from 32-bit pieces: the original pieces below the
// insertion point, then the inserted pieces, then the remaining original
// pieces. Everything else is delegated to Helper.lowerInsert.
// Returns true on success; the original instruction is erased on the
// 32-bit-aligned path.
// NOTE(review): the first line of the signature is missing from this listing.
2979 MachineInstr &MI) const {
2980 MachineIRBuilder &B = Helper.MIRBuilder;
2981 MachineRegisterInfo &MRI = *B.getMRI();
2982 Register DstReg = MI.getOperand(0).getReg();
2983 Register SrcReg = MI.getOperand(1).getReg();
2984 Register InsertSrc = MI.getOperand(2).getReg();
2985 uint64_t Offset = MI.getOperand(3).getImm();
2986
2987 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
2988 const LLT InsertTy = MRI.getType(InsertSrc);
2989 unsigned InsertSize = InsertTy.getSizeInBits();
2990
2991 // Fall back to generic lowering for non-32-bit-aligned cases which
2992 // require shift+mask sequences that generic code handles correctly.
2993 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2994 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
2995
2996 const LLT S32 = LLT::scalar(32);
// All quantities below are counted in 32-bit pieces.
2997 unsigned DstCount = DstSize / 32;
2998 unsigned InsertCount = InsertSize / 32;
2999 unsigned StartIdx = Offset / 32;
3000
3001 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
3002
// Original pieces that lie below the insertion point are kept unchanged.
3003 SmallVector<Register, 8> MergeVec;
3004 for (unsigned I = 0; I < StartIdx; ++I)
3005 MergeVec.push_back(SrcUnmerge.getReg(I));
3006
3007 if (InsertCount == 1) {
3008 // Merge-like instructions require same source types. Convert pointer
3009 // to scalar when inserting a pointer value into a scalar.
3010 if (InsertTy.isPointer())
3011 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
3012 MergeVec.push_back(InsertSrc);
3013 } else {
// A wider inserted value is itself split into 32-bit pieces first.
3014 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
3015 for (unsigned I = 0; I < InsertCount; ++I)
3016 MergeVec.push_back(InsertUnmerge.getReg(I));
3017 }
3018
// Original pieces that lie above the inserted range are kept unchanged.
3019 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
3020 MergeVec.push_back(SrcUnmerge.getReg(I));
3021
3022 B.buildMergeLikeInstr(DstReg, MergeVec);
3023
3024 MI.eraseFromParent();
3025 return true;
3026}
3027
// Custom legalization of a vector element extract: operand 0 is the result,
// operand 1 the vector and operand 2 the index.
//  - Pointer elements wider than 64 bits are round-tripped through an integer
//    vector (ptrtoint / extract / inttoptr).
//  - A constant in-range index becomes an unmerge of the vector plus a copy
//    of the selected element.
//  - A constant out-of-range index produces an undef result.
//  - A non-constant index leaves the instruction in place.
// Always returns true.
// NOTE(review): the first lines of the signature are missing from this
// listing.
3030 MachineIRBuilder &B) const {
3031 // TODO: Should move some of this into LegalizerHelper.
3032
3033 // TODO: Promote dynamic indexing of s16 to s32
3034
3035 Register Dst = MI.getOperand(0).getReg();
3036 Register Vec = MI.getOperand(1).getReg();
3037
3038 LLT VecTy = MRI.getType(Vec);
3039 LLT EltTy = VecTy.getElementType();
3040 assert(EltTy == MRI.getType(Dst));
3041
3042 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3043 // but we can't go directly to that logic because you can't bitcast a vector
3044 // of pointers to a vector of integers. Therefore, introduce an intermediate
3045 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3046 // drive the legalization forward.
3047 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3048 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3049 LLT IntVecTy = VecTy.changeElementType(IntTy);
3050
3051 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3052 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3053 B.buildIntToPtr(Dst, IntElt);
3054
3055 MI.eraseFromParent();
3056 return true;
3057 }
3058
3059 // FIXME: Artifact combiner probably should have replaced the truncated
3060 // constant before this, so we shouldn't need
3061 // getIConstantVRegValWithLookThrough.
3062 std::optional<ValueAndVReg> MaybeIdxVal =
3063 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3064 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3065 return true;
3066 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3067
// Constant index: pick the element directly, or produce undef when the index
// is past the end of the vector.
3068 if (IdxVal < VecTy.getNumElements()) {
3069 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3070 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3071 } else {
3072 B.buildUndef(Dst);
3073 }
3074
3075 MI.eraseFromParent();
3076 return true;
3077}
3078
// Custom legalization of a vector element insert: operand 0 is the result,
// operand 1 the source vector, operand 2 the element to insert and operand 3
// the index.
//  - Pointer elements wider than 64 bits are round-tripped through an integer
//    vector (ptrtoint / insert / inttoptr).
//  - A constant in-range index becomes an unmerge of the vector followed by a
//    merge in which the selected element is replaced.
//  - A constant out-of-range index produces an undef result.
//  - A non-constant index leaves the instruction in place.
// Always returns true.
// NOTE(review): the first lines of the signature and the declaration of
// SrcRegs are missing from this listing.
3081 MachineIRBuilder &B) const {
3082 // TODO: Should move some of this into LegalizerHelper.
3083
3084 // TODO: Promote dynamic indexing of s16 to s32
3085
3086 Register Dst = MI.getOperand(0).getReg();
3087 Register Vec = MI.getOperand(1).getReg();
3088 Register Ins = MI.getOperand(2).getReg();
3089
3090 LLT VecTy = MRI.getType(Vec);
3091 LLT EltTy = VecTy.getElementType();
3092 assert(EltTy == MRI.getType(Ins));
3093
3094 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3095 // but we can't go directly to that logic because you can't bitcast a vector
3096 // of pointers to a vector of integers. Therefore, make the pointer vector
3097 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3098 // new value, and then inttoptr the result vector back. This will then allow
3099 // the rest of legalization to take over.
3100 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3101 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3102 LLT IntVecTy = VecTy.changeElementType(IntTy);
3103
3104 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3105 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3106 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3107 MI.getOperand(3));
3108 B.buildIntToPtr(Dst, IntVecDest);
3109 MI.eraseFromParent();
3110 return true;
3111 }
3112
3113 // FIXME: Artifact combiner probably should have replaced the truncated
3114 // constant before this, so we shouldn't need
3115 // getIConstantVRegValWithLookThrough.
3116 std::optional<ValueAndVReg> MaybeIdxVal =
3117 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3118 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3119 return true;
3120
3121 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3122
3123 unsigned NumElts = VecTy.getNumElements();
// Constant index: split the vector into fresh per-element registers, replace
// the selected one with the inserted value and merge the elements back.
3124 if (IdxVal < NumElts) {
3126 for (unsigned i = 0; i < NumElts; ++i)
3127 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3128 B.buildUnmerge(SrcRegs, Vec);
3129
3130 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3131 B.buildMergeLikeInstr(Dst, SrcRegs);
3132 } else {
3133 B.buildUndef(Dst);
3134 }
3135
3136 MI.eraseFromParent();
3137 return true;
3138}
3139
// Lowers G_FSIN (and the other opcode routed here, which takes the
// amdgcn_cos branch) to the amdgcn_sin / amdgcn_cos intrinsics.
// The input is first multiplied by 1 / (2 * pi); on subtargets that report
// hasTrigReducedRange() the product is additionally passed through
// amdgcn_fract before being handed to the intrinsic.
// The instruction's MI flags are propagated to every emitted instruction.
// NOTE(review): the first lines of the signature are missing from this
// listing.
3142 MachineIRBuilder &B) const {
3143
3144 Register DstReg = MI.getOperand(0).getReg();
3145 Register SrcReg = MI.getOperand(1).getReg();
3146 LLT Ty = MRI.getType(DstReg);
3147 unsigned Flags = MI.getFlags();
3148
3149 Register TrigVal;
3150 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3151 if (ST.hasTrigReducedRange()) {
3152 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3153 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3154 .addUse(MulVal.getReg(0))
3155 .setMIFlags(Flags)
3156 .getReg(0);
3157 } else
3158 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3159
// The opcode selects which intrinsic is emitted.
3160 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3161 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3162 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3163 .addUse(TrigVal)
3164 .setMIFlags(Flags);
3165 MI.eraseFromParent();
3166 return true;
3167}
3168
// Materializes the pc-relative address of GV (+ Offset) into DstReg using
// SI_PC_ADD_REL_OFFSET, or SI_PC_ADD_REL_OFFSET64 on subtargets with 64-bit
// literals. GAFlags is the target flag attached to the global-address
// operand(s). Always returns true.
// NOTE(review): the first lines of the signature and the declarations of
// ConstPtrTy and MIB are missing from this listing.
3171 const GlobalValue *GV,
3172 int64_t Offset,
3173 unsigned GAFlags) const {
3174 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3175 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3176 // to the following code sequence:
3177 //
3178 // For constant address space:
3179 // s_getpc_b64 s[0:1]
3180 // s_add_u32 s0, s0, $symbol
3181 // s_addc_u32 s1, s1, 0
3182 //
3183 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3184 // a fixup or relocation is emitted to replace $symbol with a literal
3185 // constant, which is a pc-relative offset from the encoding of the $symbol
3186 // operand to the global variable.
3187 //
3188 // For global address space:
3189 // s_getpc_b64 s[0:1]
3190 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3191 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3192 //
3193 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3194 // fixups or relocations are emitted to replace $symbol@*@lo and
3195 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3196 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3197 // operand to the global variable.
3198
3200
// For a 32-bit pointer type the pseudo writes a fresh register of ConstPtrTy
// and the result is extracted from its low bits at the end; otherwise the
// pseudo defines DstReg directly.
3201 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3202 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3203
3204 if (ST.has64BitLiterals()) {
3205 assert(GAFlags != SIInstrInfo::MO_NONE);
3206
// NOTE(review): GAFlags + 2 presumably selects the 64-bit variant of the
// relocation flag -- confirm against the SIInstrInfo MO_* enum ordering.
3208 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3209 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3210 } else {
3212 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3213
// The second operand is an immediate 0 when no relocation flag is used,
// otherwise a second global-address operand with flag GAFlags + 1.
// NOTE(review): GAFlags + 1 presumably is the matching "@hi" flag -- confirm.
3214 MIB.addGlobalAddress(GV, Offset, GAFlags);
3215 if (GAFlags == SIInstrInfo::MO_NONE)
3216 MIB.addImm(0);
3217 else
3218 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3219 }
3220
// Give the result an SGPR-64 class unless a register class is already set.
3221 if (!B.getMRI()->getRegClassOrNull(PCReg))
3222 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3223
3224 if (PtrTy.getSizeInBits() == 32)
3225 B.buildExtract(DstReg, PCReg, 0);
3226 return true;
3227}
3228
3229// Emit an ABS32_LO / ABS32_HI relocation stub.
// For pointer types that are not 32 bits wide on subtargets with 64-bit
// literals, a single S_MOV_B64 with an MO_ABS64 operand is emitted instead.
// Otherwise the low half (and, when required, the high half) is written with
// S_MOV_B32 and the halves are merged into the destination.
// NOTE(review): the first line of the signature and the lines creating the
// temporary registers (alternatives for AddrLo / AddrDst and the definition
// of AddrHi) are missing from this listing.
3231 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3232 MachineRegisterInfo &MRI) const {
3233 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3234
3235 if (RequiresHighHalf && ST.has64BitLiterals()) {
3236 if (!MRI.getRegClassOrNull(DstReg))
3237 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3238 B.buildInstr(AMDGPU::S_MOV_B64)
3239 .addDef(DstReg)
3240 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3241 return;
3242 }
3243
3244 LLT S32 = LLT::scalar(32);
3245
3246 // Use the destination directly, if and only if we store the lower address
3247 // part only and we don't have a register class being set.
3248 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3249 ? DstReg
3251
3252 if (!MRI.getRegClassOrNull(AddrLo))
3253 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3254
3255 // Write the lower half.
3256 B.buildInstr(AMDGPU::S_MOV_B32)
3257 .addDef(AddrLo)
3258 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3259
3260 // If required, write the upper half as well.
3261 if (RequiresHighHalf) {
3262 assert(PtrTy.getSizeInBits() == 64 &&
3263 "Must provide a 64-bit pointer type!");
3264
3266 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3267
3268 B.buildInstr(AMDGPU::S_MOV_B32)
3269 .addDef(AddrHi)
3270 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3271
3272 // Use the destination directly, if and only if we don't have a register
3273 // class being set.
3274 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3275 ? DstReg
3277
3278 if (!MRI.getRegClassOrNull(AddrDst))
3279 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3280
3281 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3282
3283 // If we created a new register for the destination, cast the result into
3284 // the final output.
3285 if (AddrDst != DstReg)
3286 B.buildCast(DstReg, AddrDst);
3287 } else if (AddrLo != DstReg) {
3288 // If we created a new register for the destination, cast the result into
3289 // the final output.
3290 B.buildCast(DstReg, AddrLo);
3291 }
3292}
3293
// Custom legalization of a global-value address: operand 0 is the
// destination pointer and operand 1 the GlobalValue.
// Visible strategies, in order:
//  - a first section (its opening condition is missing here) that covers
//    local-memory globals: diagnose + trap for use in a non-kernel function,
//    leave in place with MO_ABS32_LO, use the group static size for
//    zero-sized externally linked LOCAL_ADDRESS globals, or use a constant
//    offset returned by allocateLDSGlobal;
//  - an absolute address (buildAbsGlobalAddress) on AMDPAL / Mesa3D;
//  - a pc-relative fixup or relocation (buildPCRelGlobalAddress);
//  - otherwise a load of the address from a GOT entry that is itself
//    addressed pc-relatively.
// NOTE(review): this listing is missing the first lines of the signature,
// the definition of MFI, the opening of the `if` that is closed at the
// line numbered 3355, parts of the diagnostic statement, and the
// definitions of PtrTy and GOTMMO. Confirm against the full source.
3296 MachineIRBuilder &B) const {
3297 Register DstReg = MI.getOperand(0).getReg();
3298 LLT Ty = MRI.getType(DstReg);
3299 unsigned AS = Ty.getAddressSpace();
3300
3301 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3302 MachineFunction &MF = B.getMF();
3304
3306 if (!MFI->isModuleEntryFunction() &&
3307 GV->getName() != "llvm.amdgcn.module.lds" &&
3309 const Function &Fn = MF.getFunction();
3311 Fn, "local memory global used by non-kernel function",
3312 MI.getDebugLoc(), DS_Warning));
3313
3314 // We currently don't have a way to correctly allocate LDS objects that
3315 // aren't directly associated with a kernel. We do force inlining of
3316 // functions that use local objects. However, if these dead functions are
3317 // not eliminated, we don't want a compile time error. Just emit a warning
3318 // and a trap, since there should be no callable path here.
3319 B.buildTrap();
3320 B.buildUndef(DstReg);
3321 MI.eraseFromParent();
3322 return true;
3323 }
3324
3325 // TODO: We could emit code to handle the initialization somewhere.
3326 // We ignore the initializer for now and legalize it to allow selection.
3327 // The initializer will anyway get errored out during assembly emission.
3328 const SITargetLowering *TLI = ST.getTargetLowering();
3329 if (!TLI->shouldUseLDSConstAddress(GV)) {
3330 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3331 return true; // Leave in place;
3332 }
3333
3334 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3335 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3336 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3337 // zero-sized type in other languages to declare the dynamic shared
3338 // memory which size is not known at the compile time. They will be
3339 // allocated by the runtime and placed directly after the static
3340 // allocated ones. They all share the same offset.
3341 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3342 // Adjust alignment for that dynamic shared memory array.
3343 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3344 LLT S32 = LLT::scalar(32);
3345 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3346 B.buildIntToPtr(DstReg, Sz);
3347 MI.eraseFromParent();
3348 return true;
3349 }
3350 }
3351
// Statically sized case: the address is the constant returned by
// allocateLDSGlobal for this variable.
3352 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3353 MI.eraseFromParent();
3354 return true;
3355 }
3356
3357 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3358 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3359 MI.eraseFromParent();
3360 return true;
3361 }
3362
3363 const SITargetLowering *TLI = ST.getTargetLowering();
3364
3365 if (TLI->shouldEmitFixup(GV)) {
3366 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3367 MI.eraseFromParent();
3368 return true;
3369 }
3370
3371 if (TLI->shouldEmitPCReloc(GV)) {
3372 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3373 MI.eraseFromParent();
3374 return true;
3375 }
3376
// Remaining case: compute the pc-relative address of a GOT slot and load the
// global's address from it.
3378 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3379
3380 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3385 LoadTy, Align(8));
3386
3387 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3388
3389 if (Ty.getSizeInBits() == 32) {
3390 // Truncate if this is a 32-bit constant address.
3391 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3392 B.buildExtract(DstReg, Load, 0);
3393 } else
3394 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3395
3396 MI.eraseFromParent();
3397 return true;
3398}
3399
// Rounds a type up to a power of two: vectors keep their element type and get
// a power-of-two element count, everything else becomes a scalar whose bit
// width is the next power of two of the original size.
// NOTE(review): the signature line is missing from this listing.
3401 if (Ty.isVector())
3402 return Ty.changeElementCount(
3403 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3404 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3405}
3406
// Custom legalization for load-like instructions (operand 0 = value,
// operand 1 = pointer). Visible cases:
//  - CONSTANT_ADDRESS_32BIT pointers are address-space-cast and the pointer
//    operand is rewritten in place;
//  - opcodes other than G_LOAD are not handled beyond that (returns false);
//  - value types for which hasBufferRsrcWorkaround() holds are handed to
//    castBufferRsrcFromV4I32;
//  - loads for which shouldWidenLoad() holds are widened to the next power of
//    two, either by enlarging the memory operand or by emitting a wider load
//    and narrowing its result.
// Returns true when the instruction was changed or replaced, false otherwise.
// NOTE(review): the first line of the signature and the definition of
// ConstPtr are missing from this listing.
3408 MachineInstr &MI) const {
3409 MachineIRBuilder &B = Helper.MIRBuilder;
3410 MachineRegisterInfo &MRI = *B.getMRI();
3411 GISelChangeObserver &Observer = Helper.Observer;
3412
3413 Register PtrReg = MI.getOperand(1).getReg();
3414 LLT PtrTy = MRI.getType(PtrReg);
3415 unsigned AddrSpace = PtrTy.getAddressSpace();
3416
3417 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3419 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3420 Observer.changingInstr(MI);
3421 MI.getOperand(1).setReg(Cast.getReg(0));
3422 Observer.changedInstr(MI);
3423 return true;
3424 }
3425
3426 if (MI.getOpcode() != AMDGPU::G_LOAD)
3427 return false;
3428
3429 Register ValReg = MI.getOperand(0).getReg();
3430 LLT ValTy = MRI.getType(ValReg);
3431
3432 if (hasBufferRsrcWorkaround(ValTy)) {
3433 Observer.changingInstr(MI);
3434 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3435 Observer.changedInstr(MI);
3436 return true;
3437 }
3438
// Only the first memory operand is consulted.
3439 MachineMemOperand *MMO = *MI.memoperands_begin();
3440 const unsigned ValSize = ValTy.getSizeInBits();
3441 const LLT MemTy = MMO->getMemoryType();
3442 const Align MemAlign = MMO->getAlign();
3443 const unsigned MemSize = MemTy.getSizeInBits();
3444 const uint64_t AlignInBits = 8 * MemAlign.value();
3445
3446 // Widen non-power-of-2 loads to the alignment if needed
3447 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3448 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3449
3450 // This was already the correct extending load result type, so just adjust
3451 // the memory type.
3452 if (WideMemSize == ValSize) {
3453 MachineFunction &MF = B.getMF();
3454
3455 MachineMemOperand *WideMMO =
3456 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3457 Observer.changingInstr(MI);
3458 MI.setMemRefs(MF, {WideMMO});
3459 Observer.changedInstr(MI);
3460 return true;
3461 }
3462
3463 // Don't bother handling edge case that should probably never be produced.
3464 if (ValSize > WideMemSize)
3465 return false;
3466
3467 LLT WideTy = widenToNextPowerOf2(ValTy);
3468
3469 Register WideLoad;
// Scalars: load the wider scalar and truncate it to the requested type.
3470 if (!WideTy.isVector()) {
3471 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3472 B.buildTrunc(ValReg, WideLoad).getReg(0);
3473 } else {
3474 // Extract the subvector.
3475
3476 if (isRegisterType(ST, ValTy)) {
3477 // If this a case where G_EXTRACT is legal, use it.
3478 // (e.g. <3 x s32> -> <4 x s32>)
3479 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3480 B.buildExtract(ValReg, WideLoad, 0);
3481 } else {
3482 // For cases where the widened type isn't a nice register value, unmerge
3483 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3484 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3485 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3486 }
3487 }
3488
3489 MI.eraseFromParent();
3490 return true;
3491 }
3492
3493 return false;
3494}
3495
// Custom legalization for a store whose data operand (operand 0) has a type
// for which hasBufferRsrcWorkaround() holds; the instruction is modified in
// place between the observer notifications. Returns false for every other
// data type.
// NOTE(review): the first line of the signature and the statement that
// actually rewrites the instruction (between changingInstr and changedInstr)
// are missing from this listing.
3497 MachineInstr &MI) const {
3498 MachineIRBuilder &B = Helper.MIRBuilder;
3499 MachineRegisterInfo &MRI = *B.getMRI();
3500 GISelChangeObserver &Observer = Helper.Observer;
3501
3502 Register DataReg = MI.getOperand(0).getReg();
3503 LLT DataTy = MRI.getType(DataReg);
3504
3505 if (hasBufferRsrcWorkaround(DataTy)) {
3506 Observer.changingInstr(MI);
3508 Observer.changedInstr(MI);
3509 return true;
3510 }
3511 return false;
3512}
3513
// Custom legalization for a scalar fused multiply-add style instruction.
// For 32-bit and 16-bit scalars the instruction is kept as-is when an
// additional condition holds; otherwise it is expanded through
// LegalizerHelper::lowerFMad using a locally constructed helper.
// NOTE(review): the first lines of the signature, the definition following
// MF, and the second half of both `if` conditions are missing from this
// listing (presumably a floating-point mode check -- confirm).
3516 MachineIRBuilder &B) const {
3517 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3518 assert(Ty.isScalar());
3519
3520 MachineFunction &MF = B.getMF();
3522
3523 // TODO: Always legal with future ftz flag.
3524 // TODO: Type is expected to be LLT::float32()/LLT::float16()
3525 // FIXME: Do we need just output?
3526 if (Ty == LLT::scalar(32) &&
3528 return true;
3529 if (Ty == LLT::scalar(16) &&
3531 return true;
3532
// Fall back to the generic expansion; a fresh builder positioned at MI and a
// dummy observer are used for it.
3533 MachineIRBuilder HelperBuilder(MI);
3534 GISelObserverWrapper DummyObserver;
3535 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3536 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3537}
3538
// Rewrites a compare-and-exchange (operands: result, pointer, compare value,
// new value) into G_AMDGPU_ATOMIC_CMPXCHG, which takes the new value and the
// compare value packed into one 2-element vector {NewVal, CmpVal}. The
// original memory operands are carried over and the original instruction is
// erased. Always returns true.
// NOTE(review): the signature lines and the first half of an assertion (its
// message is the only visible part) are missing from this listing.
3541 Register DstReg = MI.getOperand(0).getReg();
3542 Register PtrReg = MI.getOperand(1).getReg();
3543 Register CmpVal = MI.getOperand(2).getReg();
3544 Register NewVal = MI.getOperand(3).getReg();
3545
3547 "this should not have been custom lowered");
3548
3549 LLT ValTy = MRI.getType(CmpVal);
3550 LLT VecTy = LLT::fixed_vector(2, ValTy);
3551
// Element order matters: the new value comes first, the compare value second.
3552 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3553
3554 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3555 .addDef(DstReg)
3556 .addUse(PtrReg)
3557 .addUse(PackedVal)
3558 .setMemRefs(MI.memoperands());
3559
3560 MI.eraseFromParent();
3561 return true;
3562}
3563
3564/// Return true if it's known that \p Src can never be an f32 denormal value.
/// The decision is based only on the opcode (and, for intrinsics, the
/// intrinsic ID) of the instruction defining \p Src; anything not listed
/// below yields false.
// NOTE(review): the first line of the signature and the header of the inner
// switch over the intrinsic ID are missing from this listing.
3566 Register Src) {
3567 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3568 switch (DefMI->getOpcode()) {
3569 case TargetOpcode::G_INTRINSIC: {
3571 case Intrinsic::amdgcn_frexp_mant:
3572 case Intrinsic::amdgcn_log:
3573 case Intrinsic::amdgcn_log_clamp:
3574 case Intrinsic::amdgcn_exp2:
3575 case Intrinsic::amdgcn_sqrt:
3576 return true;
3577 default:
3578 break;
3579 }
3580
3581 break;
3582 }
3583 case TargetOpcode::G_FSQRT:
3584 return true;
3585 case TargetOpcode::G_FFREXP: {
// Only the first result (operand 0) of G_FFREXP qualifies.
3586 if (DefMI->getOperand(0).getReg() == Src)
3587 return true;
3588 break;
3589 }
3590 case TargetOpcode::G_FPEXT: {
// An extension qualifies only when its source is a 16-bit scalar.
3591 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3592 }
3593 default:
3594 return false;
3595 }
3596
3597 return false;
3598}
3599
// Reports whether the approximate-functions fast-math flag (FmAfn) is set in
// Flags. MF is accepted but not consulted.
3600static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3601 return (Flags & MachineInstr::FmAfn) != 0;
3602}
3603
// Decides whether an f32 operation on Src needs explicit denormal handling.
// The visible part of the condition requires that Src is not already known to
// be denormal-free (valueIsKnownNeverF32Denorm).
// NOTE(review): the first line of the signature and the remaining conjunct(s)
// of the returned expression are missing from this listing.
3605 unsigned Flags) {
3606 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3609}
3610
// Prepares the input of an f32 logarithm for denormal handling.
// Returns an empty pair when needsDenormHandlingF32() says no handling is
// required. Otherwise returns {ScaledInput, IsLtSmallestNormal}, where
// IsLtSmallestNormal is the i1 result of an ordered "Src < SmallestNormal"
// compare and ScaledInput is Src multiplied by 2^32 when that compare is
// true, or by 1.0 when it is false.
// NOTE(review): the signature's first line and the argument that defines the
// SmallestNormal constant are missing from this listing.
3611std::pair<Register, Register>
3613 unsigned Flags) const {
3614 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3615 return {};
3616
3617 const LLT F32 = LLT::scalar(32);
3618 auto SmallestNormal = B.buildFConstant(
3620 auto IsLtSmallestNormal =
3621 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3622
// The scale factor is selected per value so the common case multiplies by 1.
3623 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3624 auto One = B.buildFConstant(F32, 1.0);
3625 auto ScaleFactor =
3626 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3627 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3628
3629 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3630}
3631
// Lowers a log2 instruction (operand 0 = result, operand 1 = input) to the
// amdgcn_log intrinsic.
//  - 16-bit: extend to f32, apply the intrinsic, truncate back.
//  - 32-bit without denormal handling: apply the intrinsic directly.
//  - 32-bit with denormal handling: apply the intrinsic to the scaled input
//    from getScaledLogInput and subtract 32.0 from the result when the input
//    was scaled.
// Always returns true; the original instruction is erased.
// NOTE(review): the first line of the signature is missing from this listing.
3633 MachineIRBuilder &B) const {
3634 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3635 // If we have to handle denormals, scale up the input and adjust the result.
3636
3637 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3638 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3639
3640 Register Dst = MI.getOperand(0).getReg();
3641 Register Src = MI.getOperand(1).getReg();
3642 LLT Ty = B.getMRI()->getType(Dst);
3643 unsigned Flags = MI.getFlags();
3644
3645 if (Ty == LLT::scalar(16)) {
3646 const LLT F32 = LLT::scalar(32);
3647 // Nothing in half is a denormal when promoted to f32.
3648 auto Ext = B.buildFPExt(F32, Src, Flags);
3649 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3650 .addUse(Ext.getReg(0))
3651 .setMIFlags(Flags);
3652 B.buildFPTrunc(Dst, Log2, Flags);
3653 MI.eraseFromParent();
3654 return true;
3655 }
3656
3657 assert(Ty == LLT::scalar(32));
3658
// An empty (null) ScaledInput means no denormal handling is needed.
3659 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3660 if (!ScaledInput) {
3661 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3662 .addUse(Src)
3663 .setMIFlags(Flags);
3664 MI.eraseFromParent();
3665 return true;
3666 }
3667
3668 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3669 .addUse(ScaledInput)
3670 .setMIFlags(Flags);
3671
// Undo the 2^32 input scaling: log2(x * 2^32) = log2(x) + 32.
3672 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3673 auto Zero = B.buildFConstant(Ty, 0.0);
3674 auto ResultOffset =
3675 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3676 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3677
3678 MI.eraseFromParent();
3679 return true;
3680}
3681
// Builds X * Y + Z as a separate multiply followed by an add (not a fused
// operation) in type Ty, applying Flags to both instructions, and returns the
// register holding the sum.
// NOTE(review): the first line of the signature is missing from this listing.
3683 Register Z, unsigned Flags) {
3684 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3685 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3686}
3687
// Lowers G_FLOG / G_FLOG10 (operand 0 = result, operand 1 = input).
//  - 16-bit inputs, or any input carrying the FmAfn flag, use the cheaper
//    legalizeFlogUnsafe expansion (via f32 for 16-bit unless FmAfn is set
//    and the subtarget has 16-bit instructions).
//  - Otherwise: y = amdgcn_log(x) (optionally on a scaled input), and the
//    result is y times a constant represented as a two-float sum to gain
//    precision, using FMAs when the subtarget has fast f32 FMA and a manual
//    high/low split otherwise. Non-finite y is passed through unchanged
//    unless IsFiniteOnly holds, and the input scaling is compensated at the
//    end.
// Always returns true; the original instruction is erased.
// NOTE(review): the first line of the signature, the declaration of LogVal
// and the initializer of IsFiniteOnly are missing from this listing.
3689 MachineIRBuilder &B) const {
3690 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3691 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3692
3693 MachineRegisterInfo &MRI = *B.getMRI();
3694 Register Dst = MI.getOperand(0).getReg();
3695 Register X = MI.getOperand(1).getReg();
3696 unsigned Flags = MI.getFlags();
3697 const LLT Ty = MRI.getType(X);
3698
3699 const LLT F32 = LLT::scalar(32);
3700 const LLT F16 = LLT::scalar(16);
3701
3702 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3703 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3704 // depending on !fpmath metadata.
3705 bool PromoteToF32 =
3706 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3707 if (PromoteToF32) {
3709 auto PromoteSrc = B.buildFPExt(F32, X);
3710 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3711 B.buildFPTrunc(Dst, LogVal);
3712 } else {
3713 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3714 }
3715
3716 MI.eraseFromParent();
3717 return true;
3718 }
3719
// From here on X is the (possibly scaled) input; IsScaled records whether the
// scaling was applied to this value.
3720 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3721 if (ScaledInput)
3722 X = ScaledInput;
3723
3724 auto Y =
3725 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3726
3727 Register R;
3728 if (ST.hasFastFMAF32()) {
3729 // c+cc are ln(2)/ln(10) to more than 49 bits
3730 const float c_log10 = 0x1.344134p-2f;
3731 const float cc_log10 = 0x1.09f79ep-26f;
3732
3733 // c + cc is ln(2) to more than 49 bits
3734 const float c_log = 0x1.62e42ep-1f;
3735 const float cc_log = 0x1.efa39ep-25f;
3736
3737 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3738 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3739 // This adds correction terms for which contraction may lead to an increase
3740 // in the error of the approximation, so disable it.
3741 auto NewFlags = Flags & ~(MachineInstr::FmContract);
// R = Y*C, then add the rounding error of that product (fma(Y, C, -R)) and
// the contribution of the low constant part (Y*CC).
3742 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3743 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3744 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3745 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3746 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3747 } else {
3748 // ch+ct is ln(2)/ln(10) to more than 36 bits
3749 const float ch_log10 = 0x1.344000p-2f;
3750 const float ct_log10 = 0x1.3509f6p-18f;
3751
3752 // ch + ct is ln(2) to more than 36 bits
3753 const float ch_log = 0x1.62e000p-1f;
3754 const float ct_log = 0x1.0bfbe8p-15f;
3755
3756 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3757 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3758
// Split Y into a high part (low 12 bits of the encoding cleared) and the
// remaining tail YT = Y - YH.
3759 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3760 auto YH = B.buildAnd(Ty, Y, MaskConst);
3761 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3762 // This adds correction terms for which contraction may lead to an increase
3763 // in the error of the approximation, so disable it.
3764 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3765 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3766
// Accumulate YH*CH + (YT*CH + (YH*CT + YT*CT)), smallest terms first.
3767 Register Mad0 =
3768 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3769 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3770 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3771 }
3772
3773 const bool IsFiniteOnly =
3775
3776 if (!IsFiniteOnly) {
3777 // Expand isfinite(x) => fabs(x) < inf
3778 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3779 auto Fabs = B.buildFAbs(Ty, Y);
3780 auto IsFinite =
3781 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
// When Y is not finite, Y itself is used as the result instead of R.
3782 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3783 }
3784
3785 if (ScaledInput) {
// Subtract a constant from results whose input was scaled, compensating for
// the 2^32 scaling applied by getScaledLogInput.
3786 auto Zero = B.buildFConstant(Ty, 0.0);
3787 auto ShiftK =
3788 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3789 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3790 B.buildFSub(Dst, R, Shift, Flags);
3791 } else {
3792 B.buildCopy(Dst, R);
3793 }
3794
3795 MI.eraseFromParent();
3796 return true;
3797}
3798
// Approximate expansion of a natural / base-10 logarithm of Src into Dst in
// terms of a base-2 logarithm:  Dst = log2(Src) * Log2BaseInverted.
//  - 32-bit with denormal handling: the amdgcn_log intrinsic is applied to
//    the input scaled by getScaledLogInput (x * 2^32 for inputs below the
//    smallest normal), and -32 * Log2BaseInverted is added back for exactly
//    those inputs. A single FMA is used when the subtarget has fast f32 FMA,
//    otherwise a multiply followed by an add.
//  - Otherwise: log2 (generic G_FLOG2 for 16-bit, amdgcn_log for the rest)
//    multiplied by Log2BaseInverted.
// Always returns true. The caller owns erasing the original instruction.
// NOTE(review): the first line of the signature and the initializer of
// Log2BaseInverted are missing from this listing.
3800 Register Src, bool IsLog10,
3801 unsigned Flags) const {
3802 const double Log2BaseInverted =
3804
3805 LLT Ty = B.getMRI()->getType(Dst);
3806
3807 if (Ty == LLT::scalar(32)) {
3808 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3809 if (ScaledInput) {
// The logarithm must be taken of the scaled input: the offset selected below
// compensates for the 2^32 scaling, so using the unscaled Src here would
// both skip the denormal workaround and shift the result by the offset.
3810 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3811 .addUse(ScaledInput)
3812 .setMIFlags(Flags);
3813 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3814 auto Zero = B.buildFConstant(Ty, 0.0);
3815 auto ResultOffset =
3816 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3817 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3818
3819 if (ST.hasFastFMAF32())
3820 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3821 else {
3822 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3823 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3824 }
3825
3826 return true;
3827 }
3828 }
3829
// No scaling needed (or not a 32-bit type): plain log2 times the base factor.
3830 auto Log2Operand = Ty == LLT::scalar(16)
3831 ? B.buildFLog2(Ty, Src, Flags)
3832 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3833 .addUse(Src)
3834 .setMIFlags(Flags);
3835 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3836 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3837 return true;
3838}
3839
// Lowers an exp2 instruction (operand 0 = result, operand 1 = input).
//  - 64-bit: delegated to legalizeFEXPF64.
//  - 16-bit: extend to f32, apply amdgcn_exp2, truncate back.
//  - 32-bit without denormal handling: apply amdgcn_exp2 directly.
//  - 32-bit with denormal handling: for inputs below -0x1.f8p+6 add 64 to
//    the input and multiply the intrinsic's result by 2^-64.
// Returns true on the paths handled here; the original instruction is erased.
// NOTE(review): the first line of the signature is missing from this listing.
3841 MachineIRBuilder &B) const {
3842 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3843 // If we have to handle denormals, scale up the input and adjust the result.
3844
3845 Register Dst = MI.getOperand(0).getReg();
3846 Register Src = MI.getOperand(1).getReg();
3847 unsigned Flags = MI.getFlags();
3848 LLT Ty = B.getMRI()->getType(Dst);
3849 const LLT F16 = LLT::scalar(16);
3850 const LLT F32 = LLT::scalar(32);
3851 const LLT F64 = LLT::scalar(64);
3852
3853 if (Ty == F64)
3854 return legalizeFEXPF64(MI, B);
3855
3856 if (Ty == F16) {
3857 // Nothing in half is a denormal when promoted to f32.
3858 auto Ext = B.buildFPExt(F32, Src, Flags);
// Note: despite its name, Log2 holds the result of the exp2 intrinsic.
3859 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3860 .addUse(Ext.getReg(0))
3861 .setMIFlags(Flags);
3862 B.buildFPTrunc(Dst, Log2, Flags);
3863 MI.eraseFromParent();
3864 return true;
3865 }
3866
3867 assert(Ty == F32);
3868
3869 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3870 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3871 .addUse(Src)
3872 .setMIFlags(Flags);
3873 MI.eraseFromParent();
3874 return true;
3875 }
3876
3877 // bool needs_scaling = x < -0x1.f80000p+6f;
3878 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3879
3880 // -nextafter(128.0, -1)
3881 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3882 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3883 RangeCheckConst, Flags);
3884
// Shift small inputs up by 64 before the intrinsic ...
3885 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3886 auto Zero = B.buildFConstant(Ty, 0.0);
3887 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3888 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3889
3890 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3891 .addUse(AddInput.getReg(0))
3892 .setMIFlags(Flags);
3893
// ... and scale the result back down by 2^-64 for exactly those inputs.
3894 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3895 auto One = B.buildFConstant(Ty, 1.0);
3896 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3897 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3898 MI.eraseFromParent();
3899 return true;
3900}
3901
// Emits a base-2 exponential of Src into Dst: the amdgcn_exp2 intrinsic when
// the destination type is a 32-bit scalar, the generic exp2 instruction
// (buildFExp2) for every other type. Flags are applied to the emitted
// instruction, which is returned.
// NOTE(review): the first line of the signature is missing from this listing.
3903 const SrcOp &Src, unsigned Flags) {
3904 LLT Ty = Dst.getLLTTy(*B.getMRI());
3905
3906 if (Ty == LLT::scalar(32)) {
3907 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3908 .addUse(Src.getReg())
3909 .setMIFlags(Flags);
3910 }
3911 return B.buildFExp2(Dst, Src, Flags);
3912}
3913
// Approximate exp / exp10 of X into Dst as a single exp2 of X multiplied by a
// constant: log2(e) for exp, log2(10) (0x1.a934f0p+1) when IsExp10 is set.
// No denormal handling is done here. Always returns true.
// NOTE(review): the first line of the signature is missing from this listing.
3915 Register Dst, Register X,
3916 unsigned Flags,
3917 bool IsExp10) const {
3918 LLT Ty = B.getMRI()->getType(X);
3919
3920 // exp(x) -> exp2(M_LOG2E_F * x);
3921 // exp10(x) -> exp2(log2(10) * x);
3922 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3923 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3924 buildExp(B, Dst, Mul, Flags);
3925 return true;
3926}
3927
3929 Register X, unsigned Flags) const {
3930 LLT Ty = B.getMRI()->getType(Dst);
3931 LLT F32 = LLT::scalar(32);
3932
3933 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3934 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3935 }
3936
3937 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3938 auto NeedsScaling =
3939 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3940 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3941 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3942 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3943
3944 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3945 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3946
3947 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3948 .addUse(ExpInput.getReg(0))
3949 .setMIFlags(Flags);
3950
3951 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3952 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3953 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3954 return true;
3955}
3956
3958 Register Dst, Register X,
3959 unsigned Flags) const {
3960 LLT Ty = B.getMRI()->getType(Dst);
3961 LLT F32 = LLT::scalar(32);
3962
3963 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3964 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3965 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3966 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3967
3968 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3969 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3970 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3971 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3972 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3973 return true;
3974 }
3975
3976 // bool s = x < -0x1.2f7030p+5f;
3977 // x += s ? 0x1.0p+5f : 0.0f;
3978 // exp10 = exp2(x * 0x1.a92000p+1f) *
3979 // exp2(x * 0x1.4f0978p-11f) *
3980 // (s ? 0x1.9f623ep-107f : 1.0f);
3981
3982 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3983 auto NeedsScaling =
3984 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3985
3986 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3987 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3988 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3989
3990 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3991 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3992
3993 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3994 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3995 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3996 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3997
3998 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3999 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
4000 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
4001
4002 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4003 return true;
4004}
4005
// Expands a 64-bit G_FEXP, G_FEXP2 or G_FEXP10 (chosen from MI's opcode) into
// generic instructions: a range reduction that yields an integral part Dn and
// a reduced argument T, a polynomial in T evaluated with a chain of FMAs, a
// G_FLDEXP of the polynomial by Dn, and finally selects that clamp the result
// for out-of-range inputs. MI is erased and true is returned.
// NOTE(review): the first line of the signature is not visible in this chunk.
4006 // This expansion gives a result slightly better than 1ulp.
4008 MachineIRBuilder &B) const {
4009
4010 Register X = MI.getOperand(1).getReg();
4011 LLT S64 = LLT::scalar(64);
4012 LLT S32 = LLT::scalar(32);
4013 LLT S1 = LLT::scalar(1);
4014
4015 // TODO: Check if reassoc is safe. There is an output change in exp2 and
4016 // exp10, which slightly increases ulp.
4017 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
4018
// Dn and T are consumed after the if/else chain. F is an intermediate that is
// only assigned on the G_FEXP2 and G_FEXP10 paths.
4019 Register Dn, F, T;
4020
4021 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
4022 // Dn = rint(X)
4023 Dn = B.buildFRint(S64, X, Flags).getReg(0);
4024 // F = X - Dn
4025 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
4026 // T = F*C1 + F*C2
4027 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4028 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4029 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
4030 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
4031
4032 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
// Dn = rint(X * C1)
// F  = fma(-Dn, C2, fma(-Dn, C3, X))
// T  = fma(F, C4, F * C5)
4033 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
4034 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4035 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4036
4037 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4038 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
4039 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
4040 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4041 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4042
4043 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
4044 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
4045 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
4046 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
4047
4048 } else { // G_FEXP
// Dn = rint(X * C1)
// T  = fma(-Dn, C2, fma(-Dn, C3, X))
4049 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
4050 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4051 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4052
4053 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4054 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4055 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4056 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4057 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4058 }
4059
// Each step below computes P = T * P + <next coefficient>.
4060 // Polynomial chain for P
4061 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
4062 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
4063 Flags);
4064 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
4065 Flags);
4066 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
4067 Flags);
4068 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
4069 Flags);
4070 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
4071 Flags);
4072 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
4073 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
4074 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
4075 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
4076
// The last two steps both add 1.0: P = 1 + T * (1 + T * P).
4077 auto One = B.buildFConstant(S64, 1.0);
4078 P = B.buildFMA(S64, T, P, One, Flags);
4079 P = B.buildFMA(S64, T, P, One, Flags);
4080
4081 // Z = FLDEXP(P, (int)Dn)
4082 auto DnInt = B.buildFPTOSI(S32, Dn);
4083 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
4084
// The overflow select is skipped when the instruction carries the no-infs flag.
4085 if (!(Flags & MachineInstr::FmNoInfs)) {
4086 // Overflow guard: if X <= 1024.0 then Z else +inf
4087 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
4088 B.buildFConstant(S64, APFloat(1024.0)));
4089 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
4090 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
4091 }
4092
4093 // Underflow guard: if X >= -1075.0 then Z else 0.0
4094 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
4095 B.buildFConstant(S64, APFloat(-1075.0)));
4096 auto Zero = B.buildFConstant(S64, APFloat(0.0));
// The final select writes directly into MI's destination register.
4097 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4098
4099 MI.eraseFromParent();
4100 return true;
4101}
4102
// Custom lowering for an exponential instruction; IsExp10 below is derived
// from whether MI's opcode is G_FEXP10. Dispatch by result type:
//   - 64-bit: forwarded to legalizeFEXPF64.
//   - 16-bit: the unsafe expansion when approximate functions are allowed,
//     otherwise extend to 32 bits, use legalizeFExpUnsafeImpl and truncate.
//   - 32-bit: the unsafe expansion when approximate functions are allowed,
//     otherwise the extended-precision expansion described under "Algorithm".
// MI is erased on every path handled here, and true is returned.
// NOTE(review): the first line of the signature is not visible in this chunk.
4104 MachineIRBuilder &B) const {
4105 Register Dst = MI.getOperand(0).getReg();
4106 Register X = MI.getOperand(1).getReg();
4107 const unsigned Flags = MI.getFlags();
4108 MachineFunction &MF = B.getMF();
4109 MachineRegisterInfo &MRI = *B.getMRI();
4110 LLT Ty = MRI.getType(Dst);
4111
4112 const LLT F64 = LLT::scalar(64);
4113
4114 if (Ty == F64)
4115 return legalizeFEXPF64(MI, B);
4116
4117 const LLT F16 = LLT::scalar(16);
4118 const LLT F32 = LLT::scalar(32);
4119 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4120
4121 if (Ty == F16) {
4122 // v_exp_f16 (fmul x, log2e)
4123 if (allowApproxFunc(MF, Flags)) {
4124 // TODO: Does this really require fast?
4125 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4126 : legalizeFExpUnsafe(B, Dst, X, Flags);
4127 MI.eraseFromParent();
4128 return true;
4129 }
4130
4131 // Nothing in half is a denormal when promoted to f32.
4132 //
4133 // exp(f16 x) ->
4134 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4135 //
4136 // exp10(f16 x) ->
4137 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4138 auto Ext = B.buildFPExt(F32, X, Flags);
// NOTE(review): `Lowered` is used below but its definition is not visible in
// this chunk; presumably a fresh 32-bit virtual register, confirm.
4140 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4141 B.buildFPTrunc(Dst, Lowered, Flags);
4142 MI.eraseFromParent();
4143 return true;
4144 }
4145
4146 assert(Ty == F32);
4147
4148 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4149 // library behavior. Also, is known-not-daz source sufficient?
4150 if (allowApproxFunc(MF, Flags)) {
4151 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4152 : legalizeFExpUnsafe(B, Dst, X, Flags);
4153 MI.eraseFromParent();
4154 return true;
4155 }
4156
4157 // Algorithm:
4158 //
4159 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4160 //
4161 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4162 // n = 64*m + j, 0 <= j < 64
4163 //
4164 // e^x = 2^((64*m + j + f)/64)
4165 // = (2^m) * (2^(j/64)) * 2^(f/64)
4166 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4167 //
4168 // f = x*(64/ln(2)) - n
4169 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4170 //
4171 // e^x = (2^m) * (2^(j/64)) * e^r
4172 //
4173 // (2^(j/64)) is precomputed
4174 //
4175 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4176 // e^r = 1 + q
4177 //
4178 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4179 //
4180 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
4181 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
// PH and PL are the two parts of the product X * C built below: PH is the
// plain product and PL collects the remaining terms.
4182 Register PH, PL;
4183
4184 if (ST.hasFastFMAF32()) {
// FMA path: PH = X * C, PL = fma(X, CC, fma(X, C, -PH)).
4185 const float c_exp = numbers::log2ef;
4186 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4187 const float c_exp10 = 0x1.a934f0p+1f;
4188 const float cc_exp10 = 0x1.2f346ep-24f;
4189
4190 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4191 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4192 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4193 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4194
4195 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4196 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4197 } else {
// No fast FMA: split X into XH (low 12 bits masked off) and XL = X - XH, then
// PH = XH * CH and PL = XH * CL + (XL * CH + XL * CL) via getMad.
4198 const float ch_exp = 0x1.714000p+0f;
4199 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4200
4201 const float ch_exp10 = 0x1.a92000p+1f;
4202 const float cl_exp10 = 0x1.4f0978p-11f;
4203
4204 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4205 auto XH = B.buildAnd(Ty, X, MaskConst);
4206 auto XL = B.buildFSub(Ty, X, XH, Flags);
4207
4208 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4209 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4210
4211 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4212 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4213
4214 Register Mad0 =
4215 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4216 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4217 }
4218
// E is PH rounded to an integral value; the exp2 is taken of (PH - E) + PL and
// the result is then scaled by 2^E through G_FLDEXP.
4219 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4220
4221 // It is unsafe to contract this fsub into the PH multiply.
4222 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4223 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4224 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4225
4226 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4227 .addUse(A.getReg(0))
4228 .setMIFlags(Flags);
4229 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4230
// Inputs below the underflow constant produce 0.0.
4231 auto UnderflowCheckConst =
4232 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4233 auto Zero = B.buildFConstant(Ty, 0.0);
4234 auto Underflow =
4235 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4236
4237 R = B.buildSelect(Ty, Underflow, Zero, R);
4238
// Inputs above the overflow constant produce +inf, unless the instruction
// carries the no-infs flag, in which case the check is not emitted.
4239 if (!(Flags & MachineInstr::FmNoInfs)) {
4240 auto OverflowCheckConst =
4241 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4242
4243 auto Overflow =
4244 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4245 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4246 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4247 }
4248
4249 B.buildCopy(Dst, R);
4250 MI.eraseFromParent();
4251 return true;
4252}
4253
4255 MachineIRBuilder &B) const {
4256 Register Dst = MI.getOperand(0).getReg();
4257 Register Src0 = MI.getOperand(1).getReg();
4258 Register Src1 = MI.getOperand(2).getReg();
4259 unsigned Flags = MI.getFlags();
4260 LLT Ty = B.getMRI()->getType(Dst);
4261 const LLT F16 = LLT::scalar(16); // TODO: Expected LLT::float16()
4262 const LLT F32 = LLT::scalar(32); // TODO: Expected LLT::float32()
4263
4264 if (Ty == F32) {
4265 auto Log = B.buildFLog2(F32, Src0, Flags);
4266 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4267 .addUse(Log.getReg(0))
4268 .addUse(Src1)
4269 .setMIFlags(Flags);
4270 B.buildFExp2(Dst, Mul, Flags);
4271 } else if (Ty == F16) {
4272 // There's no f16 fmul_legacy, so we need to convert for it.
4273 auto Log = B.buildFLog2(F16, Src0, Flags);
4274 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4275 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4276 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4277 .addUse(Ext0.getReg(0))
4278 .addUse(Ext1.getReg(0))
4279 .setMIFlags(Flags);
4280 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4281 } else
4282 return false;
4283
4284 MI.eraseFromParent();
4285 return true;
4286}
4287
4288// Find a source register, ignoring any possible source modifiers.
4290 Register ModSrc = OrigSrc;
4291 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4292 ModSrc = SrcFNeg->getOperand(1).getReg();
4293 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4294 ModSrc = SrcFAbs->getOperand(1).getReg();
4295 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4296 ModSrc = SrcFAbs->getOperand(1).getReg();
4297 return ModSrc;
4298}
4299
4302 MachineIRBuilder &B) const {
4303
4304 const LLT S1 = LLT::scalar(1);
4305 const LLT F64 = LLT::scalar(64); // TODO: Expected float64
4306 Register Dst = MI.getOperand(0).getReg();
4307 Register OrigSrc = MI.getOperand(1).getReg();
4308 unsigned Flags = MI.getFlags();
4309 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4310 "this should not have been custom lowered");
4311
4312 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4313 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4314 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4315 // V_FRACT bug is:
4316 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4317 //
4318 // Convert floor(x) to (x - fract(x))
4319
4320 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4321 .addUse(OrigSrc)
4322 .setMIFlags(Flags);
4323
4324 // Give source modifier matching some assistance before obscuring a foldable
4325 // pattern.
4326
4327 // TODO: We can avoid the neg on the fract? The input sign to fract
4328 // shouldn't matter?
4329 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4330
4331 auto Const =
4332 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4333
4335
4336 // We don't need to concern ourselves with the snan handling difference, so
4337 // use the one which will directly select.
4338 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4339 if (MFI->getMode().IEEE)
4340 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4341 else
4342 B.buildFMinNum(Min, Fract, Const, Flags);
4343
4344 Register CorrectedFract = Min;
4345 if (!MI.getFlag(MachineInstr::FmNoNans)) {
4346 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
4347 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4348 }
4349
4350 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4351 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4352
4353 MI.eraseFromParent();
4354 return true;
4355}
4356
4357// Turn an illegal packed v2s16 build vector into bit operations.
4358// TODO: This should probably be a bitcast action in LegalizerHelper.
4361 Register Dst = MI.getOperand(0).getReg();
4362 const LLT S32 = LLT::scalar(32);
4363 const LLT S16 = LLT::scalar(16);
4364 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4365
4366 Register Src0 = MI.getOperand(1).getReg();
4367 Register Src1 = MI.getOperand(2).getReg();
4368
4369 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4370 assert(MRI.getType(Src0) == S32);
4371 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4372 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4373 }
4374
4375 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4376 B.buildBitcast(Dst, Merge);
4377
4378 MI.eraseFromParent();
4379 return true;
4380}
4381
4382// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4383//
4384// Source and accumulation registers must all be 32-bits.
4385//
// Accum is updated in place, one register per 32-bit part of the result.
// UsePartialMad64_32 and SeparateOddAlignedProducts select between code
// shapes in buildMadChain and in the outer loop below.
// NOTE(review): the first lines of the signature are not visible in this chunk.
//
4386// TODO: When the multiply is uniform, we should produce a code sequence
4387// that is better suited to instruction selection on the SALU. Instead of
4388// the outer loop going over parts of the result, the outer loop should go
4389// over parts of one of the factors. This should result in instruction
4390// selection that makes full use of S_ADDC_U32 instructions.
4393 ArrayRef<Register> Src0,
4394 ArrayRef<Register> Src1,
4395 bool UsePartialMad64_32,
4396 bool SeparateOddAlignedProducts) const {
4397 // Use (possibly empty) vectors of S1 registers to represent the set of
4398 // carries from one pair of positions to the next.
4399 using Carry = SmallVector<Register, 2>;
4400
4401 MachineIRBuilder &B = Helper.MIRBuilder;
4402 GISelValueTracking &VT = *Helper.getValueTracking();
4403
4404 const LLT S1 = LLT::scalar(1);
4405 const LLT S32 = LLT::scalar(32);
4406 const LLT S64 = LLT::scalar(64);
4407
// The zero constants are created lazily, at most once each, on first use.
4408 Register Zero32;
4409 Register Zero64;
4410
4411 auto getZero32 = [&]() -> Register {
4412 if (!Zero32)
4413 Zero32 = B.buildConstant(S32, 0).getReg(0);
4414 return Zero32;
4415 };
4416 auto getZero64 = [&]() -> Register {
4417 if (!Zero64)
4418 Zero64 = B.buildConstant(S64, 0).getReg(0);
4419 return Zero64;
4420 };
4421
// Record, per 32-bit part, whether value tracking proves the part is zero, so
// the matching partial products can be skipped in buildMadChain.
// NOTE(review): Src1 is indexed with Src0's bound; assumes both sources have
// the same number of parts, confirm.
4422 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4423 for (unsigned i = 0; i < Src0.size(); ++i) {
4424 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4425 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4426 }
4427
4428 // Merge the given carries into the 32-bit LocalAccum, which is modified
4429 // in-place.
4430 //
4431 // Returns the carry-out, which is a single S1 register or null.
4432 auto mergeCarry =
4433 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4434 if (CarryIn.empty())
4435 return Register();
4436
4437 bool HaveCarryOut = true;
4438 Register CarryAccum;
4439 if (CarryIn.size() == 1) {
// A single carry with no accumulator yet: the zero-extended carry becomes the
// accumulator and no carry-out is produced.
4440 if (!LocalAccum) {
4441 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4442 return Register();
4443 }
4444
4445 CarryAccum = getZero32();
4446 } else {
// Several carries: sum all but the last into CarryAccum; the last one is
// consumed as the carry-in of the final add below.
4447 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4448 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4449 CarryAccum =
4450 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4451 .getReg(0);
4452 }
4453
4454 if (!LocalAccum) {
4455 LocalAccum = getZero32();
4456 HaveCarryOut = false;
4457 }
4458 }
4459
4460 auto Add =
4461 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4462 LocalAccum = Add.getReg(0);
4463 return HaveCarryOut ? Add.getReg(1) : Register();
4464 };
4465
4466 // Build a multiply-add chain to compute
4467 //
4468 // LocalAccum + (partial products at DstIndex)
4469 // + (opportunistic subset of CarryIn)
4470 //
4471 // LocalAccum is an array of one or two 32-bit registers that are updated
4472 // in-place. The incoming registers may be null.
4473 //
4474 // In some edge cases, carry-ins can be consumed "for free". In that case,
4475 // the consumed carry bits are removed from CarryIn in-place.
4476 auto buildMadChain =
4477 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4478 -> Carry {
4479 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4480 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4481
4482 Carry CarryOut;
// j0 indexes Src0; the matching Src1 index is always DstIndex - j0.
4483 unsigned j0 = 0;
4484
4485 // Use plain 32-bit multiplication for the most significant part of the
4486 // result by default.
4487 if (LocalAccum.size() == 1 &&
4488 (!UsePartialMad64_32 || !CarryIn.empty())) {
4489 do {
4490 // Skip multiplication if one of the operands is 0
4491 unsigned j1 = DstIndex - j0;
4492 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4493 ++j0;
4494 continue;
4495 }
4496 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4497 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4498 LocalAccum[0] = Mul.getReg(0);
4499 } else {
4500 if (CarryIn.empty()) {
4501 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4502 } else {
// Fold one pending carry into this add and drop it from CarryIn.
4503 LocalAccum[0] =
4504 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4505 .getReg(0);
4506 CarryIn.pop_back();
4507 }
4508 }
4509 ++j0;
4510 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4511 }
4512
4513 // Build full 64-bit multiplies.
4514 if (j0 <= DstIndex) {
// HaveSmallAccum is set when the 64-bit accumulator Tmp was formed from at
// most one 32-bit register (or is zero). The first MAD fed by such an
// accumulator does not record its carry-out.
4515 bool HaveSmallAccum = false;
4516 Register Tmp;
4517
4518 if (LocalAccum[0]) {
4519 if (LocalAccum.size() == 1) {
4520 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4521 HaveSmallAccum = true;
4522 } else if (LocalAccum[1]) {
4523 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4524 HaveSmallAccum = false;
4525 } else {
4526 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4527 HaveSmallAccum = true;
4528 }
4529 } else {
4530 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4531 Tmp = getZero64();
4532 HaveSmallAccum = true;
4533 }
4534
4535 do {
4536 unsigned j1 = DstIndex - j0;
4537 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4538 ++j0;
4539 continue;
4540 }
4541 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4542 {Src0[j0], Src1[j1], Tmp});
4543 Tmp = Mad.getReg(0);
4544 if (!HaveSmallAccum)
4545 CarryOut.push_back(Mad.getReg(1));
4546 HaveSmallAccum = false;
4547
4548 ++j0;
4549 } while (j0 <= DstIndex);
4550
// Split the 64-bit sum back into the one or two 32-bit accumulator parts.
4551 auto Unmerge = B.buildUnmerge(S32, Tmp);
4552 LocalAccum[0] = Unmerge.getReg(0);
4553 if (LocalAccum.size() > 1)
4554 LocalAccum[1] = Unmerge.getReg(1);
4555 }
4556
4557 return CarryOut;
4558 };
4559
4560 // Outer multiply loop, iterating over destination parts from least
4561 // significant to most significant parts.
4562 //
4563 // The columns of the following diagram correspond to the destination parts
4564 // affected by one iteration of the outer loop (ignoring boundary
4565 // conditions).
4566 //
4567 // Dest index relative to 2 * i: 1 0 -1
4568 // ------
4569 // Carries from previous iteration: e o
4570 // Even-aligned partial product sum: E E .
4571 // Odd-aligned partial product sum: O O
4572 //
4573 // 'o' is OddCarry, 'e' is EvenCarry.
4574 // EE and OO are computed from partial products via buildMadChain and use
4575 // accumulation where possible and appropriate.
4576 //
4577 Register SeparateOddCarry;
4578 Carry EvenCarry;
4579 Carry OddCarry;
4580
4581 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
// Take over the carries produced by the previous iteration and start this
// iteration with empty outgoing carry sets.
4582 Carry OddCarryIn = std::move(OddCarry);
4583 Carry EvenCarryIn = std::move(EvenCarry);
4584 OddCarry.clear();
4585 EvenCarry.clear();
4586
4587 // Partial products at offset 2 * i.
4588 if (2 * i < Accum.size()) {
4589 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4590 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4591 }
4592
4593 // Partial products at offset 2 * i - 1.
4594 if (i > 0) {
4595 if (!SeparateOddAlignedProducts) {
4596 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4597 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4598 } else {
// Compute the odd-aligned products into temporaries first, then add them
// into Accum with an explicit carry chain (SeparateOddCarry).
4599 bool IsHighest = 2 * i >= Accum.size();
4600 Register SeparateOddOut[2];
4601 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4602 .take_front(IsHighest ? 1 : 2);
4603 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4604
// NOTE(review): `Lo` is assigned and dereferenced below, but its declaration
// is not visible in this chunk.
4606
4607 if (i == 1) {
4608 if (!IsHighest)
4609 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4610 else
4611 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4612 } else {
4613 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4614 SeparateOddCarry);
4615 }
4616 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4617
4618 if (!IsHighest) {
4619 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4620 Lo->getOperand(1).getReg());
4621 Accum[2 * i] = Hi.getReg(0);
4622 SeparateOddCarry = Hi.getReg(1);
4623 }
4624 }
4625 }
4626
4627 // Add in the carries from the previous iteration
4628 if (i > 0) {
4629 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4630 EvenCarryIn.push_back(CarryOut);
4631
4632 if (2 * i < Accum.size()) {
4633 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4634 OddCarry.push_back(CarryOut);
4635 }
4636 }
4637 }
4638}
4639
4640// Custom narrowing of wide multiplies using wide multiply-add instructions.
4641//
// The scalar G_MUL is split into 32-bit parts, multiplied through
// buildMultiply, and the parts are merged back into the original destination.
// MI is erased on that path. When the subtarget reports a 64-bit vector
// multiply and the type is 64 bits wide, the instruction is left unchanged.
// Returns true in both cases.
// NOTE(review): the first line of the signature is not visible in this chunk.
//
4642// TODO: If the multiply is followed by an addition, we should attempt to
4643// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4645 MachineInstr &MI) const {
4646 assert(ST.hasMad64_32());
4647 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4648
4649 MachineIRBuilder &B = Helper.MIRBuilder;
4650 MachineRegisterInfo &MRI = *B.getMRI();
4651
4652 Register DstReg = MI.getOperand(0).getReg();
4653 Register Src0 = MI.getOperand(1).getReg();
4654 Register Src1 = MI.getOperand(2).getReg();
4655
4656 LLT Ty = MRI.getType(DstReg);
4657 assert(Ty.isScalar());
4658
4659 unsigned Size = Ty.getSizeInBits();
// Nothing to do: report success without touching MI.
4660 if (ST.hasVMulU64Inst() && Size == 64)
4661 return true;
4662
4663 unsigned NumParts = Size / 32;
4664 assert((Size % 32) == 0);
4665 assert(NumParts >= 2);
4666
4667 // Whether to use MAD_64_32 for partial products whose high half is
4668 // discarded. This avoids some ADD instructions but risks false dependency
4669 // stalls on some subtargets in some cases.
4670 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4671
4672 // Whether to compute odd-aligned partial products separately. This is
4673 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4674 // in an even-aligned VGPR.
4675 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4676
4677 LLT S32 = LLT::scalar(32);
4678 SmallVector<Register, 2> Src0Parts, Src1Parts;
// NOTE(review): the loop body is not visible in this chunk; presumably it
// creates one 32-bit register per part for each source, since both vectors
// are used as unmerge destinations right after the loop. Confirm.
4679 for (unsigned i = 0; i < NumParts; ++i) {
4682 }
4683 B.buildUnmerge(Src0Parts, Src0);
4684 B.buildUnmerge(Src1Parts, Src1);
4685
// The accumulator starts as NumParts null registers that buildMultiply fills.
4686 SmallVector<Register, 2> AccumRegs(NumParts);
4687 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4688 SeparateOddAlignedProducts);
4689
4690 B.buildMergeLikeInstr(DstReg, AccumRegs);
4691 MI.eraseFromParent();
4692 return true;
4693}
4694
4695// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4696// ctlz/cttz_zero_poison. This allows us to fix up the result for the zero input
4697// case with a single min instruction instead of a compare+select.
4700 MachineIRBuilder &B) const {
4701 Register Dst = MI.getOperand(0).getReg();
4702 Register Src = MI.getOperand(1).getReg();
4703 LLT DstTy = MRI.getType(Dst);
4704 LLT SrcTy = MRI.getType(Src);
4705
4706 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4707 ? AMDGPU::G_AMDGPU_FFBH_U32
4708 : AMDGPU::G_AMDGPU_FFBL_B32;
4709 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4710 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4711
4712 MI.eraseFromParent();
4713 return true;
4714}
4715
// Lowers a count-leading-zeros on a source narrower than 32 bits: the source
// is any-extended to 32 bits and shifted left so its most significant bit
// lands in bit 31, G_AMDGPU_FFBH_U32 is applied to the shifted value, and the
// result is truncated into the destination. MI is erased; returns true.
// NOTE(review): the function name and the first lines of the signature are not
// visible in this chunk. S32 is not declared locally and resolves to a
// definition outside this block.
4718 MachineIRBuilder &B) const {
4719 Register Dst = MI.getOperand(0).getReg();
4720 Register Src = MI.getOperand(1).getReg();
4721 LLT SrcTy = MRI.getType(Src);
4722 TypeSize NumBits = SrcTy.getSizeInBits();
4723
4724 assert(NumBits < 32u);
4725
// Shifting by (32 - NumBits) pushes the undefined bits introduced by the
// any-extend out of the top of the register.
4726 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4727 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4728 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4729 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4730 B.buildTrunc(Dst, Ctlz);
4731 MI.eraseFromParent();
4732 return true;
4733}
4734
4737 MachineIRBuilder &B) const {
4738 Register Dst = MI.getOperand(0).getReg();
4739 Register Src = MI.getOperand(1).getReg();
4740 LLT SrcTy = MRI.getType(Src);
4741 const LLT S32 = LLT::scalar(32);
4742 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4743 unsigned BitWidth = SrcTy.getSizeInBits();
4744
4745 auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
4746 auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth));
4747 B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
4748 MI.eraseFromParent();
4749 return true;
4750}
4751
4752// Check that this is a G_XOR x, -1
4753static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4754 if (MI.getOpcode() != TargetOpcode::G_XOR)
4755 return false;
4756 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4757 return ConstVal == -1;
4758}
4759
4760// Return the use branch instruction, otherwise null if the usage is invalid.
4761static MachineInstr *
4763 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4764 Register CondDef = MI.getOperand(0).getReg();
4765 if (!MRI.hasOneNonDBGUse(CondDef))
4766 return nullptr;
4767
4768 MachineBasicBlock *Parent = MI.getParent();
4769 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4770
4771 if (isNot(MRI, *UseMI)) {
4772 Register NegatedCond = UseMI->getOperand(0).getReg();
4773 if (!MRI.hasOneNonDBGUse(NegatedCond))
4774 return nullptr;
4775
4776 // We're deleting the def of this value, so we need to remove it.
4777 eraseInstr(*UseMI, MRI);
4778
4779 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4780 Negated = true;
4781 }
4782
4783 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4784 return nullptr;
4785
4786 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4787 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4788 if (Next == Parent->end()) {
4789 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4790 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4791 return nullptr;
4792 UncondBrTarget = &*NextMBB;
4793 } else {
4794 if (Next->getOpcode() != AMDGPU::G_BR)
4795 return nullptr;
4796 Br = &*Next;
4797 UncondBrTarget = Br->getOperand(0).getMBB();
4798 }
4799
4800 return UseMI;
4801}
4802
4805 const ArgDescriptor *Arg,
4806 const TargetRegisterClass *ArgRC,
4807 LLT ArgTy) const {
4808 MCRegister SrcReg = Arg->getRegister();
4809 assert(SrcReg.isPhysical() && "Physical register expected");
4810 assert(DstReg.isVirtual() && "Virtual register expected");
4811
4812 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4813 *ArgRC, B.getDebugLoc(), ArgTy);
4814 if (Arg->isMasked()) {
4815 // TODO: Should we try to emit this once in the entry block?
4816 const LLT S32 = LLT::scalar(32);
4817 const unsigned Mask = Arg->getMask();
4818 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4819
4820 Register AndMaskSrc = LiveIn;
4821
4822 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4823 // 0.
4824 if (Shift != 0) {
4825 auto ShiftAmt = B.buildConstant(S32, Shift);
4826 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4827 }
4828
4829 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4830 } else {
4831 B.buildCopy(DstReg, LiveIn);
4832 }
4833}
4834
// Lowers a workgroup-id query into MI's destination register.
// Without cluster support the value is simply loaded from WorkGroupIdPV.
// With cluster support three inputs are loaded (WorkGroupIdPV,
// ClusterWorkGroupIdPV, ClusterMaxIdPV) and the result depends on the kind of
// the function's cluster dimensions, as switched on below.
// Returns false if any input value cannot be loaded; otherwise MI is erased
// and true is returned.
// NOTE(review): the first lines of the signature and the switch's case labels
// are not visible in this chunk.
4839 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4840 Register DstReg = MI.getOperand(0).getReg();
4841 if (!ST.hasClusters()) {
4842 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4843 return false;
4844 MI.eraseFromParent();
4845 return true;
4846 }
4847
4848 // Clusters are supported. Return the global position in the grid. If clusters
4849 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4850
4851 // WorkGroupIdXYZ = ClusterId == 0 ?
4852 // ClusterIdXYZ :
4853 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4854 MachineRegisterInfo &MRI = *B.getMRI();
4855 const LLT S32 = LLT::scalar(32);
4856 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4857 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4858 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4859 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4860 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4861 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4862 return false;
4863
// GlobalIdXYZ = ClusterWorkGroupIdXYZ + ClusterIdXYZ * (ClusterMaxIdXYZ + 1).
// These instructions are emitted before the switch, regardless of which
// case is taken.
4864 auto One = B.buildConstant(S32, 1);
4865 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4866 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4867 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4868
4869 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4870
4871 switch (MFI->getClusterDims().getKind()) {
// First visible case body: the result is the computed global id.
4874 B.buildCopy(DstReg, GlobalIdXYZ);
4875 MI.eraseFromParent();
4876 return true;
4877 }
// Second visible case body: the result is the value loaded from WorkGroupIdPV.
4879 B.buildCopy(DstReg, ClusterIdXYZ);
4880 MI.eraseFromParent();
4881 return true;
4882 }
// Third visible case body: read a hardware register field at run time and
// select the plain id when it is zero, the computed global id otherwise.
4884 using namespace AMDGPU::Hwreg;
4885 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4886 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4887 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4888 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4889 .addDef(ClusterId)
4890 .addImm(ClusterIdField);
4891 auto Zero = B.buildConstant(S32, 0);
4892 auto NoClusters =
4893 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4894 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4895 MI.eraseFromParent();
4896 return true;
4897 }
4898 }
4899
4900 llvm_unreachable("nothing should reach here");
4901}
4902
4904 Register DstReg, MachineIRBuilder &B,
// Define DstReg with the input value identified by ArgType. Returns true if
// DstReg was defined (from a register, as a constant, or as undef) and false
// if the value's descriptor is not a usable register.
4906 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4907 const ArgDescriptor *Arg = nullptr;
4908 const TargetRegisterClass *ArgRC;
4909 LLT ArgTy;
4910
4911 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
// Fixed register descriptors used by the switch below: workgroup ID X is all
// of TTMP9, Y and Z share TTMP7, and the cluster values are consecutive
// 4-bit fields of TTMP6.
4912 const ArgDescriptor WorkGroupIDX =
4913 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4914 // If GridZ is not programmed in an entry function then the hardware will set
4915 // it to all zeros, so there is no need to mask the GridY value in the low
4916 // order bits.
4917 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4918 AMDGPU::TTMP7,
4919 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4920 const ArgDescriptor WorkGroupIDZ =
4921 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4922 const ArgDescriptor ClusterWorkGroupIDX =
4923 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4924 const ArgDescriptor ClusterWorkGroupIDY =
4925 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4926 const ArgDescriptor ClusterWorkGroupIDZ =
4927 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4928 const ArgDescriptor ClusterWorkGroupMaxIDX =
4929 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4930 const ArgDescriptor ClusterWorkGroupMaxIDY =
4931 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4932 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4933 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4934 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4935 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4936
// Helper for the cases whose value is known at compile time: define DstReg
// as the constant N and report success.
4937 auto LoadConstant = [&](unsigned N) {
4938 B.buildConstant(DstReg, N);
4939 return true;
4940 };
4941
// When this condition holds, the values handled by the switch are taken from
// the fixed TTMP descriptors above. Any ArgType the switch does not handle
// leaves Arg null and falls through to the preloaded-value lookup below.
4942 if (ST.hasArchitectedSGPRs() &&
4944 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4945 bool HasFixedDims = ClusterDims.isFixedDims();
4946
4947 switch (ArgType) {
4949 Arg = &WorkGroupIDX;
4950 ArgRC = &AMDGPU::SReg_32RegClass;
4951 ArgTy = LLT::scalar(32);
4952 break;
4954 Arg = &WorkGroupIDY;
4955 ArgRC = &AMDGPU::SReg_32RegClass;
4956 ArgTy = LLT::scalar(32);
4957 break;
4959 Arg = &WorkGroupIDZ;
4960 ArgRC = &AMDGPU::SReg_32RegClass;
4961 ArgTy = LLT::scalar(32);
4962 break;
// With fixed cluster dimensions, a dimension of extent 1 only has ID 0, so
// the register read is replaced by a constant.
4964 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4965 return LoadConstant(0);
4966 Arg = &ClusterWorkGroupIDX;
4967 ArgRC = &AMDGPU::SReg_32RegClass;
4968 ArgTy = LLT::scalar(32);
4969 break;
4971 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4972 return LoadConstant(0);
4973 Arg = &ClusterWorkGroupIDY;
4974 ArgRC = &AMDGPU::SReg_32RegClass;
4975 ArgTy = LLT::scalar(32);
4976 break;
4978 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4979 return LoadConstant(0);
4980 Arg = &ClusterWorkGroupIDZ;
4981 ArgRC = &AMDGPU::SReg_32RegClass;
4982 ArgTy = LLT::scalar(32);
4983 break;
// With fixed cluster dimensions, the maximum ID along a dimension is the
// dimension's extent minus one.
4985 if (HasFixedDims)
4986 return LoadConstant(ClusterDims.getDims()[0] - 1);
4987 Arg = &ClusterWorkGroupMaxIDX;
4988 ArgRC = &AMDGPU::SReg_32RegClass;
4989 ArgTy = LLT::scalar(32);
4990 break;
4992 if (HasFixedDims)
4993 return LoadConstant(ClusterDims.getDims()[1] - 1);
4994 Arg = &ClusterWorkGroupMaxIDY;
4995 ArgRC = &AMDGPU::SReg_32RegClass;
4996 ArgTy = LLT::scalar(32);
4997 break;
4999 if (HasFixedDims)
5000 return LoadConstant(ClusterDims.getDims()[2] - 1);
5001 Arg = &ClusterWorkGroupMaxIDZ;
5002 ArgRC = &AMDGPU::SReg_32RegClass;
5003 ArgTy = LLT::scalar(32);
5004 break;
5006 Arg = &ClusterWorkGroupMaxFlatID;
5007 ArgRC = &AMDGPU::SReg_32RegClass;
5008 ArgTy = LLT::scalar(32);
5009 break;
5010 default:
5011 break;
5012 }
5013 }
5014
// Not resolved above: ask the function info for the preloaded value.
5015 if (!Arg)
5016 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5017
// Still no descriptor: the value is not available to this function, so
// DstReg is defined as zero or undef instead.
5018 if (!Arg) {
5020 // The intrinsic may appear when we have a 0 sized kernarg segment, in
5021 // which case the pointer argument may be missing and we use null.
5022 return LoadConstant(0);
5023 }
5024
5025 // It's undefined behavior if a function marked with the amdgpu-no-*
5026 // attributes uses the corresponding intrinsic.
5027 B.buildUndef(DstReg);
5028 return true;
5029 }
5030
// Only descriptors that name a valid register are handled; anything else is
// reported as a failure to the caller.
5031 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5032 return false; // TODO: Handle these
5033 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
5034 return true;
5035}
5036
// Define MI's result (operand 0) with the input value selected by ArgType.
// MI is erased only on success; on failure it is left untouched and false is
// returned.
5040 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
5041 return false;
5042
5043 MI.eraseFromParent();
5044 return true;
5045}
5046
5048 int64_t C) {
// Define MI's result (operand 0) as the constant C, erase MI, and report
// success.
5049 B.buildConstant(MI.getOperand(0).getReg(), C);
5050 MI.eraseFromParent();
5051 return true;
5052}
5053
5056 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
// Replace MI with the workitem ID for dimension Dim. The subtarget's maximum
// ID for this function is used to fold the result to 0 or to attach a
// zero-extension assertion. Returns false if the input could not be loaded.
5057 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
// A maximum ID of 0 means the ID can only be 0.
5058 if (MaxID == 0)
5059 return replaceWithConstant(B, MI, 0);
5060
5061 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5062 const ArgDescriptor *Arg;
5063 const TargetRegisterClass *ArgRC;
5064 LLT ArgTy;
5065 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5066
5067 Register DstReg = MI.getOperand(0).getReg();
5068 if (!Arg) {
5069 // It's undefined behavior if a function marked with the amdgpu-no-*
5070 // attributes uses the corresponding intrinsic.
5071 B.buildUndef(DstReg);
5072 MI.eraseFromParent();
5073 return true;
5074 }
5075
5076 if (Arg->isMasked()) {
5077 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5078 // masking operations anyway.
5079 //
5080 // TODO: We could assert the top bit is 0 for the source copy.
5081 if (!loadInputValue(DstReg, B, ArgType))
5082 return false;
5083 } else {
// Unpacked ID: load into a temporary, then define DstReg through an
// AssertZExt that records the value fits in bit_width(MaxID) bits.
5085 if (!loadInputValue(TmpReg, B, ArgType))
5086 return false;
5087 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5088 }
5089
5090 MI.eraseFromParent();
5091 return true;
5092}
5093
5096 // This isn't really a constant pool but close enough.
// Hand the pointer info built above back to the caller.
5099 return PtrInfo;
5100}
5101
5103 int64_t Offset) const {
// Build a pointer of type PtrTy that is Offset bytes past the kernarg segment
// pointer and return the register holding it.
5105 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5106
5107 // TODO: If we passed in the base kernel offset we could have a better
5108 // alignment than 4, but we don't really need it.
// Failing to load the segment pointer is treated as unreachable rather than
// as a recoverable error.
5109 if (!loadInputValue(KernArgReg, B,
5111 llvm_unreachable("failed to find kernarg segment ptr");
5112
// The offset is materialized as a 64-bit constant and added to the base.
5113 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5114 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5115}
5116
5117/// Legalize a value that's loaded from kernel arguments. This is only used by
5118/// legacy intrinsics.
5122 Align Alignment) const {
5123 Register DstReg = MI.getOperand(0).getReg();
5124
// Only 32-bit scalar results are supported.
5125 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5126 "unexpected kernarg parameter type");
5127
// Load the value from Ptr, with pointer info advanced by Offset.
// NOTE(review): the visible part of this call passes Align(4), and the
// Alignment parameter is not referenced in the visible lines -- confirm
// whether Alignment is meant to be used here.
5130 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
5133 MI.eraseFromParent();
5134 return true;
5135}
5136
5139 MachineIRBuilder &B) const {
// Dispatch on the result type of MI to the 16-, 32- or 64-bit FDIV
// legalization. Any other result type is rejected by returning false.
5140 Register Dst = MI.getOperand(0).getReg();
5141 LLT DstTy = MRI.getType(Dst);
5142 LLT S16 = LLT::scalar(16);
5143 LLT S32 = LLT::scalar(32);
5144 LLT S64 = LLT::scalar(64);
5145
5146 if (DstTy == S16)
5147 return legalizeFDIV16(MI, MRI, B);
5148 if (DstTy == S32)
5149 return legalizeFDIV32(MI, MRI, B);
5150 if (DstTy == S64)
5151 return legalizeFDIV64(MI, MRI, B);
5152
5153 return false;
5154}
5155
5157 Register DstDivReg,
5158 Register DstRemReg,
5159 Register X,
5160 Register Y) const {
// Emit a 32-bit unsigned X / Y and/or X % Y. DstDivReg and DstRemReg are each
// optional: a result is only written when its register is valid. The
// sequence is a float reciprocal estimate of Y, one integer refinement of
// that estimate, then two conditional corrections of quotient and remainder.
5161 const LLT S1 = LLT::scalar(1);
5162 const LLT S32 = LLT::scalar(32);
5163
5164 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5165 // algorithm used here.
5166
5167 // Initial estimate of inv(y).
5168 auto FloatY = B.buildUITOFP(S32, Y);
5169 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
5170 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5171 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5172 auto Z = B.buildFPTOUI(S32, ScaledY);
5173
5174 // One round of UNR.
// Z += umulh(Z, (0 - Y) * Z)
5175 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5176 auto NegYZ = B.buildMul(S32, NegY, Z);
5177 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5178
5179 // Quotient/remainder estimate.
5180 auto Q = B.buildUMulH(S32, X, Z);
5181 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5182
5183 // First quotient/remainder refinement.
// If R >= Y, bump the quotient by one and take Y off the remainder. The
// remainder is always updated because the second refinement compares it;
// the quotient is only updated when it is requested.
5184 auto One = B.buildConstant(S32, 1);
5185 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5186 if (DstDivReg)
5187 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5188 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5189
5190 // Second quotient/remainder refinement.
// Same correction again, this time writing the requested destinations.
5191 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5192 if (DstDivReg)
5193 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5194
5195 if (DstRemReg)
5196 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5197}
5198
5199// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5200//
5201// Return lo, hi of result
5202//
5203// %cvt.lo = G_UITOFP Val.lo
5204// %cvt.hi = G_UITOFP Val.hi
5205// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5206// %rcp = G_AMDGPU_RCP_IFLAG %mad
5207// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5208// %mul2 = G_FMUL %mul1, 2**(-32)
5209// %trunc = G_INTRINSIC_TRUNC %mul2
5210// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5211// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5212static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5213 Register Val) {
5214 const LLT S32 = LLT::scalar(32);
// Split the input into 32-bit pieces; piece 0 is treated as the low half and
// piece 1 as the high half.
5215 auto Unmerge = B.buildUnmerge(S32, Val);
5216
5217 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5218 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5219
// Recombine the halves as one float: hi * 2**32 + lo.
5220 auto Mad = B.buildFMAD(
5221 S32, CvtHi, // 2**32
5222 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5223
5224 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5225 auto Mul1 = B.buildFMul(
5226 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5227
// Split the scaled reciprocal back into two 32-bit halves: Trunc is the
// integer part of Mul1 * 2**(-32) (the high half), and Mad2 is what is left
// of Mul1 after removing Trunc * 2**32 (the low half).
5228 // 2**(-32)
5229 auto Mul2 = B.buildFMul(
5230 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5231 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5232
5233 // -(2**32)
5234 auto Mad2 = B.buildFMAD(
5235 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5236 Mul1);
5237
5238 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5239 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5240
5241 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5242}
5243
5245 Register DstDivReg,
5246 Register DstRemReg,
5247 Register Numer,
5248 Register Denom) const {
// Emit a 64-bit unsigned Numer / Denom and/or Numer % Denom. DstDivReg and
// DstRemReg are each optional: a result is only written when its register is
// valid. The sequence starts from emitReciprocalU64's estimate, refines it
// twice in integer arithmetic, forms a quotient/remainder estimate, and then
// applies up to two conditional corrections using selects rather than
// branches.
5249 const LLT S32 = LLT::scalar(32);
5250 const LLT S64 = LLT::scalar(64);
5251 const LLT S1 = LLT::scalar(1);
5252 Register RcpLo, RcpHi;
5253
5254 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5255
5256 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5257
5258 auto Zero64 = B.buildConstant(S64, 0);
5259 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5260
// First refinement: Add1 = Rcp + umulh(Rcp, (0 - Denom) * Rcp). The 64-bit
// add is done as two 32-bit halves chained through the carry.
5261 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5262 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5263
5264 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5265 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5266 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5267
5268 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5269 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5270 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5271
// Second refinement, same shape: Add2 = Add1 + umulh(Add1, (0 - Denom) * Add1).
5272 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5273 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5274 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5275 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5276 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5277
5278 auto Zero32 = B.buildConstant(S32, 0);
5279 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5280 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5281 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5282
5283 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5284 Register NumerLo = UnmergeNumer.getReg(0);
5285 Register NumerHi = UnmergeNumer.getReg(1);
5286
// Quotient estimate MulHi3 = umulh(Numer, Add2), and remainder estimate
// Sub1 = Numer - Denom * MulHi3 computed in 32-bit halves. Sub1_Mi is the
// high-half difference without the borrow from the low half; it feeds the
// correction steps below.
5287 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5288 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5289 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5290 Register Mul3_Lo = UnmergeMul3.getReg(0);
5291 Register Mul3_Hi = UnmergeMul3.getReg(1);
5292 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5293 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5294 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5295 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5296
5297 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5298 Register DenomLo = UnmergeDenom.getReg(0);
5299 Register DenomHi = UnmergeDenom.getReg(1);
5300
// C3 is all-ones when Sub1 >= Denom as a 64-bit unsigned compare built from
// the halves: the low-half result decides when the high halves are equal,
// the high-half result otherwise.
5301 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5302 auto C1 = B.buildSExt(S32, CmpHi);
5303
5304 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5305 auto C2 = B.buildSExt(S32, CmpLo);
5306
5307 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5308 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5309
5310 // TODO: Here and below portions of the code can be enclosed into if/endif.
5311 // Currently control flow is unconditional and we have 4 selects after
5312 // potential endif to substitute PHIs.
5313
5314 // if C3 != 0 ...
// First correction candidate: remainder Sub2 = Sub1 - Denom and quotient
// Add3 = MulHi3 + 1.
5315 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5316 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5317 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5318 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5319
5320 auto One64 = B.buildConstant(S64, 1);
5321 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5322
// C6 repeats the C3 comparison for Sub2 against Denom.
5323 auto C4 =
5324 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5325 auto C5 =
5326 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5327 auto C6 = B.buildSelect(
5328 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5329
5330 // if (C6 != 0)
// Second correction candidate: remainder Sub3 = Sub2 - Denom and quotient
// Add4 = Add3 + 1.
5331 auto Add4 = B.buildAdd(S64, Add3, One64);
5332 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5333
5334 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5335 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5336 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5337
5338 // endif C6
5339 // endif C3
5340
// Final selection: the uncorrected value when C3 is zero, otherwise the
// first or second correction depending on C6.
5341 if (DstDivReg) {
5342 auto Sel1 = B.buildSelect(
5343 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5344 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5345 Sel1, MulHi3);
5346 }
5347
5348 if (DstRemReg) {
5349 auto Sel2 = B.buildSelect(
5350 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5351 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5352 Sel2, Sub1);
5353 }
5354}
5355
5358 MachineIRBuilder &B) const {
// Expand G_UDIV, G_UREM or G_UDIVREM for 32- and 64-bit scalars. The opcode
// decides which destination registers are set; an unset register tells the
// Impl helper to skip that result. Returns false for any other type.
5359 Register DstDivReg, DstRemReg;
5360 switch (MI.getOpcode()) {
5361 default:
5362 llvm_unreachable("Unexpected opcode!");
5363 case AMDGPU::G_UDIV: {
5364 DstDivReg = MI.getOperand(0).getReg();
5365 break;
5366 }
5367 case AMDGPU::G_UREM: {
5368 DstRemReg = MI.getOperand(0).getReg();
5369 break;
5370 }
5371 case AMDGPU::G_UDIVREM: {
5372 DstDivReg = MI.getOperand(0).getReg();
5373 DstRemReg = MI.getOperand(1).getReg();
5374 break;
5375 }
5376 }
5377
5378 const LLT S64 = LLT::scalar(64);
5379 const LLT S32 = LLT::scalar(32);
// The sources follow the explicit defs, so their index depends on whether
// the instruction has one result or two.
5380 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5381 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5382 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5383 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5384
5385 if (Ty == S32)
5386 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5387 else if (Ty == S64)
5388 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5389 else
5390 return false;
5391
5392 MI.eraseFromParent();
5393 return true;
5394}
5395
5398 MachineIRBuilder &B) const {
// Expand G_SDIV, G_SREM or G_SDIVREM for 32- and 64-bit scalars by taking
// absolute values, running the unsigned expansion, and then re-applying the
// sign to each requested result. Returns false for any other type.
5399 const LLT S64 = LLT::scalar(64);
5400 const LLT S32 = LLT::scalar(32);
5401
5402 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5403 if (Ty != S32 && Ty != S64)
5404 return false;
5405
5406 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5407 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5408 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5409
// An arithmetic shift by (bits - 1) gives all-ones for a negative input and
// zero otherwise. (x + sign) ^ sign is then |x|.
5410 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5411 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5412 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5413
5414 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5415 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5416
5417 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5418 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5419
// The unsigned results go into temporaries, created only for the results
// this opcode produces, so the sign fix-up below can write the real
// destinations.
5420 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5421 switch (MI.getOpcode()) {
5422 default:
5423 llvm_unreachable("Unexpected opcode!");
5424 case AMDGPU::G_SDIV: {
5425 DstDivReg = MI.getOperand(0).getReg();
5426 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5427 break;
5428 }
5429 case AMDGPU::G_SREM: {
5430 DstRemReg = MI.getOperand(0).getReg();
5431 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5432 break;
5433 }
5434 case AMDGPU::G_SDIVREM: {
5435 DstDivReg = MI.getOperand(0).getReg();
5436 DstRemReg = MI.getOperand(1).getReg();
5437 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5438 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5439 break;
5440 }
5441 }
5442
5443 if (Ty == S32)
5444 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5445 else
5446 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5447
// (v ^ sign) - sign negates v when sign is all-ones and leaves it unchanged
// when sign is zero. The quotient's sign is the XOR of the operand signs.
5448 if (DstDivReg) {
5449 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5450 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5451 B.buildSub(DstDivReg, SignXor, Sign);
5452 }
5453
5454 if (DstRemReg) {
5455 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5456 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5457 B.buildSub(DstRemReg, SignXor, Sign);
5458 }
5459
5460 MI.eraseFromParent();
5461 return true;
5462}
5463
5466 MachineIRBuilder &B) const {
// Try to lower an FDIV to a reciprocal-based sequence when the instruction's
// fast-math flags allow it. Returns true and erases MI if a replacement was
// emitted; returns false with MI untouched otherwise.
5467 Register Res = MI.getOperand(0).getReg();
5468 Register LHS = MI.getOperand(1).getReg();
5469 Register RHS = MI.getOperand(2).getReg();
5470 uint16_t Flags = MI.getFlags();
5471 LLT ResTy = MRI.getType(Res);
5472
5473 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5474
// Constant numerator: +1.0 and -1.0 become a bare rcp (of the negated
// denominator for -1.0). Without afn this is only done for 16-bit results.
5475 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5476 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5477 return false;
5478
5479 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5480 // the CI documentation has a worst case error of 1 ulp.
5481 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5482 // use it as long as we aren't trying to use denormals.
5483 //
5484 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5485
5486 // 1 / x -> RCP(x)
5487 if (CLHS->isExactlyValue(1.0)) {
5488 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5489 .addUse(RHS)
5490 .setMIFlags(Flags);
5491
5492 MI.eraseFromParent();
5493 return true;
5494 }
5495
5496 // -1 / x -> RCP( FNEG(x) )
5497 if (CLHS->isExactlyValue(-1.0)) {
5498 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5499 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5500 .addUse(FNeg.getReg(0))
5501 .setMIFlags(Flags);
5502
5503 MI.eraseFromParent();
5504 return true;
5505 }
5506 }
5507
5508 // For f16 require afn or arcp.
5509 // For f32 require afn.
5510 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5511 !MI.getFlag(MachineInstr::FmArcp)))
5512 return false;
5513
5514 // x / y -> x * (1.0 / y)
5515 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5516 .addUse(RHS)
5517 .setMIFlags(Flags);
5518 B.buildFMul(Res, LHS, RCP, Flags);
5519
5520 MI.eraseFromParent();
5521 return true;
5522}
5523
5526 MachineIRBuilder &B) const {
// Lower an FDIV carrying the afn flag to rcp followed by FMA-based
// refinement steps. Returns false with MI untouched when afn is not set;
// otherwise emits the sequence, erases MI and returns true.
5527 Register Res = MI.getOperand(0).getReg();
5528 Register X = MI.getOperand(1).getReg();
5529 Register Y = MI.getOperand(2).getReg();
5530 uint16_t Flags = MI.getFlags();
5531 LLT ResTy = MRI.getType(Res);
5532
5533 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5534
5535 if (!AllowInaccurateRcp)
5536 return false;
5537
5538 const ConstantFP *CLHS = getConstantFPVRegVal(X, MRI);
5539 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
5540
5541 // Pull out the negation so it folds for free into the source modifiers.
5542 if (IsNegRcp)
5543 X = B.buildFConstant(ResTy, 1.0).getReg(0);
5544
// For -1 / y the running estimate R is negated below instead, so the
// refinement steps use Y itself where the general case uses -Y.
5545 Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
5546 auto One = B.buildFConstant(ResTy, 1.0);
5547
5548 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5549 .addUse(Y)
5550 .setMIFlags(Flags);
5551 if (IsNegRcp)
5552 R = B.buildFNeg(ResTy, R);
5553
// Two refinement rounds, each: Tmp = NegY * R + 1; R = Tmp * R + R.
5554 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5555 R = B.buildFMA(ResTy, Tmp0, R, R);
5556
5557 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5558 R = B.buildFMA(ResTy, Tmp1, R, R);
5559
5560 // Skip the last 2 correction terms for reciprocal.
5561 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0))) {
5562 B.buildCopy(Res, R);
5563 MI.eraseFromParent();
5564 return true;
5565 }
5566
// General numerator: Ret = X * R, then one residual correction
// Res = (NegY * Ret + X) * R + Ret.
5567 auto Ret = B.buildFMul(ResTy, X, R);
5568 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5569
5570 B.buildFMA(Res, Tmp2, R, Ret);
5571 MI.eraseFromParent();
5572 return true;
5573}
5574
5577 MachineIRBuilder &B) const {
// Expand a 16-bit FDIV. The fast reciprocal lowering is tried first;
// otherwise the division is carried out in 32-bit float, truncated back to
// 16 bits, and passed through div_fixup.
5578 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5579 return true;
5580
5581 Register Res = MI.getOperand(0).getReg();
5582 Register LHS = MI.getOperand(1).getReg();
5583 Register RHS = MI.getOperand(2).getReg();
5584
5585 uint16_t Flags = MI.getFlags();
5586
5587 LLT S16 = LLT::scalar(16);
5588 LLT S32 = LLT::scalar(32);
5589
5590 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5591 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5592 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5593 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5594 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5595 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = err * rcp + q
5596 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5597 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5598 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5599 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5600 // q16.u = opx(V_CVT_F16_F32, q32.u);
5601 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5602
5603 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5604 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5605 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5606 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5607 .addUse(RHSExt.getReg(0))
5608 .setMIFlags(Flags);
5609 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
// The error/refine/error steps use FMAD where the subtarget has the f32
// mad/mac instructions and FMA otherwise; the operands are the same.
5611 if (ST.hasMadMacF32Insts()) {
5612 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5613 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5614 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5615 } else {
5616 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5617 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5618 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5619 }
5620 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
// 0xff800000 keeps the top nine bits of the 32-bit value (the sign and
// exponent fields of an f32) and clears the low 23 mantissa bits.
5621 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5622 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5623 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
// div_fixup takes the quotient first, then the denominator, then the
// numerator.
5624 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5625 .addUse(RDst.getReg(0))
5626 .addUse(RHS)
5627 .addUse(LHS)
5628 .setMIFlags(Flags);
5629
5630 MI.eraseFromParent();
5631 return true;
5632}
5633
// Immediate operand passed to the S_GETREG_B32 / S_SETREG_B32 /
// S_SETREG_IMM32_B32 instructions in this file to address the FP32 denormal
// mode.
5634static constexpr unsigned SPDenormModeBitField =
5636
5637// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5638// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5640 const GCNSubtarget &ST,
5642 // Set SP denorm mode to this value.
// Enabling selects FP_DENORM_FLUSH_NONE; "disabling" puts back the SP value
// taken from Mode rather than a fixed flush setting.
5643 unsigned SPDenormMode =
5644 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5645
5646 if (ST.hasDenormModeInst()) {
5647 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5648 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5649
// S_DENORM_MODE takes both fields in one immediate: the SP value in the low
// two bits and the DP value shifted up by two.
5650 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5651 B.buildInstr(AMDGPU::S_DENORM_MODE)
5652 .addImm(NewDenormModeValue);
5653
5654 } else {
// No S_DENORM_MODE on this subtarget: write the SP field with an immediate
// S_SETREG instead.
5655 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5656 .addImm(SPDenormMode)
5657 .addImm(SPDenormModeBitField);
5658 }
5659}
5660
5663 MachineIRBuilder &B) const {
// Expand a 32-bit FDIV. The fast reciprocal lowering is tried first;
// otherwise the sequence is div_scale, rcp, an FMA refinement chain,
// div_fmas and div_fixup, with the FP32 denormal mode switched around the
// FMA chain when the function's mode does not already keep denormals.
5664 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5665 return true;
5666
5667 Register Res = MI.getOperand(0).getReg();
5668 Register LHS = MI.getOperand(1).getReg();
5669 Register RHS = MI.getOperand(2).getReg();
5670 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5671 SIModeRegisterDefaults Mode = MFI->getMode();
5672
5673 uint16_t Flags = MI.getFlags();
5674
5675 LLT S32 = LLT::scalar(32);
5676 LLT S1 = LLT::scalar(1);
5677
5678 auto One = B.buildFConstant(S32, 1.0f);
5679
// The two div_scale calls differ only in their immediate: 0 gives the
// scaled denominator, 1 the scaled numerator. Each also produces an S1
// second result.
5680 auto DenominatorScaled =
5681 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5682 .addUse(LHS)
5683 .addUse(RHS)
5684 .addImm(0)
5685 .setMIFlags(Flags);
5686 auto NumeratorScaled =
5687 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5688 .addUse(LHS)
5689 .addUse(RHS)
5690 .addImm(1)
5691 .setMIFlags(Flags);
5692
5693 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5694 .addUse(DenominatorScaled.getReg(0))
5695 .setMIFlags(Flags);
5696 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5697
5698 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5699 const bool HasDynamicDenormals =
5700 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5701 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5702
// If the function's FP32 mode is not IEEE, enable denormals for the FMA
// chain. With a dynamic mode the current setting is unknown at compile
// time, so it is read into a register first and written back afterwards.
5703 Register SavedSPDenormMode;
5704 if (!PreservesDenormals) {
5705 if (HasDynamicDenormals) {
5706 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5707 B.buildInstr(AMDGPU::S_GETREG_B32)
5708 .addDef(SavedSPDenormMode)
5709 .addImm(SPDenormModeBitField);
5710 }
5711 toggleSPDenormMode(true, B, ST, Mode);
5712 }
5713
// Refinement chain on the scaled operands (d = scaled denominator,
// n = scaled numerator):
//   Fma0 = -d * rcp + 1        Fma1 = Fma0 * rcp + rcp
//   Mul  = n * Fma1            Fma2 = -d * Mul + n
//   Fma3 = Fma2 * Fma1 + Mul   Fma4 = -d * Fma3 + n
5714 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5715 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5716 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5717 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5718 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5719 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5720
// Undo the mode change: write back the saved value in the dynamic case,
// otherwise put back the function's default SP mode.
5721 if (!PreservesDenormals) {
5722 if (HasDynamicDenormals) {
5723 assert(SavedSPDenormMode);
5724 B.buildInstr(AMDGPU::S_SETREG_B32)
5725 .addReg(SavedSPDenormMode)
5726 .addImm(SPDenormModeBitField);
5727 } else
5728 toggleSPDenormMode(false, B, ST, Mode);
5729 }
5730
// div_fmas consumes the S1 second result of the numerator div_scale as its
// last operand.
5731 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5732 .addUse(Fma4.getReg(0))
5733 .addUse(Fma1.getReg(0))
5734 .addUse(Fma3.getReg(0))
5735 .addUse(NumeratorScaled.getReg(1))
5736 .setMIFlags(Flags);
5737
5738 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5739 .addUse(Fmas.getReg(0))
5740 .addUse(RHS)
5741 .addUse(LHS)
5742 .setMIFlags(Flags);
5743
5744 MI.eraseFromParent();
5745 return true;
5746}
5747
5750 MachineIRBuilder &B) const {
// Expand a 64-bit FDIV. The afn-based fast lowering is tried first;
// otherwise the sequence is div_scale, rcp, an FMA refinement chain,
// div_fmas and div_fixup.
5751 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5752 return true;
5753
5754 Register Res = MI.getOperand(0).getReg();
5755 Register LHS = MI.getOperand(1).getReg();
5756 Register RHS = MI.getOperand(2).getReg();
5757
5758 uint16_t Flags = MI.getFlags();
5759
5760 LLT S64 = LLT::scalar(64);
5761 LLT S1 = LLT::scalar(1);
5762
5763 auto One = B.buildFConstant(S64, 1.0);
5764
5765 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5766 .addUse(LHS)
5767 .addUse(RHS)
5768 .addImm(0)
5769 .setMIFlags(Flags);
5770
5771 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5772
5773 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5774 .addUse(DivScale0.getReg(0))
5775 .setMIFlags(Flags);
5776
// Reciprocal refinement on the scaled denominator d:
//   Fma0 = -d * rcp + 1     Fma1 = rcp * Fma0 + rcp
//   Fma2 = -d * Fma1 + 1    Fma3 = Fma1 * Fma2 + Fma1
5777 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5778 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5779 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5780
5781 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5782 .addUse(LHS)
5783 .addUse(RHS)
5784 .addImm(1)
5785 .setMIFlags(Flags);
5786
// Quotient estimate Mul = n * Fma3 and its residual Fma4 = -d * Mul + n,
// with n the scaled numerator from DivScale1.
5787 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5788 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5789 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5790
5791 Register Scale;
5792 if (!ST.hasUsableDivScaleConditionOutput()) {
5793 // Workaround a hardware bug on SI where the condition output from div_scale
5794 // is not usable.
5795
5796 LLT S32 = LLT::scalar(32);
5797
// Rebuild the flag by hand: compare the high 32 bits of each original
// operand with the high 32 bits of its div_scale result, and XOR the two
// equality results.
5798 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5799 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5800 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5801 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5802
5803 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5804 Scale1Unmerge.getReg(1));
5805 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5806 Scale0Unmerge.getReg(1));
5807 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5808 } else {
5809 Scale = DivScale1.getReg(1);
5810 }
5811
5812 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5813 .addUse(Fma4.getReg(0))
5814 .addUse(Fma3.getReg(0))
5815 .addUse(Mul.getReg(0))
5816 .addUse(Scale)
5817 .setMIFlags(Flags);
5818
5819 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5820 .addUse(Fmas.getReg(0))
5821 .addUse(RHS)
5822 .addUse(LHS)
5823 .setMIFlags(Flags);
5824
5825 MI.eraseFromParent();
5826 return true;
5827}
5828
5831 MachineIRBuilder &B) const {
// Expand a two-result frexp: operand 0 receives the mantissa, operand 1 the
// exponent, both computed from operand 2 with the amdgcn frexp_mant and
// frexp_exp intrinsics.
5832 Register Res0 = MI.getOperand(0).getReg();
5833 Register Res1 = MI.getOperand(1).getReg();
5834 Register Val = MI.getOperand(2).getReg();
5835 uint16_t Flags = MI.getFlags();
5836
5837 LLT Ty = MRI.getType(Res0);
// The exponent intrinsic is built with a 16-bit result for 16-bit inputs
// and a 32-bit result otherwise.
5838 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5839
5840 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5841 .addUse(Val)
5842 .setMIFlags(Flags);
5843 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5844 .addUse(Val)
5845 .setMIFlags(Flags);
5846
// On subtargets with the fract bug, the intrinsic results are only used
// when |Val| compares ordered-less-than infinity. Otherwise (infinity or
// NaN) the exponent becomes 0 and the mantissa is Val itself.
5847 if (ST.hasFractBug()) {
5848 auto Fabs = B.buildFAbs(Ty, Val);
5849 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5850 auto IsFinite =
5851 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5852 auto Zero = B.buildConstant(InstrExpTy, 0);
5853 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5854 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5855 }
5856
5857 B.buildCopy(Res0, Mant);
// The exponent is sign-extended or truncated to whatever type Res1 has.
5858 B.buildSExtOrTrunc(Res1, Exp);
5859
5860 MI.eraseFromParent();
5861 return true;
5862}
5863
5866 MachineIRBuilder &B) const {
5867 Register Res = MI.getOperand(0).getReg();
5868 Register LHS = MI.getOperand(2).getReg();
5869 Register RHS = MI.getOperand(3).getReg();
5870 uint16_t Flags = MI.getFlags();
5871
5872 LLT S32 = LLT::scalar(32);
5873 LLT S1 = LLT::scalar(1);
5874
5875 auto Abs = B.buildFAbs(S32, RHS, Flags);
5876 const APFloat C0Val(1.0f);
5877
5878 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5879 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5880 auto C2 = B.buildFConstant(S32, 1.0f);
5881
5882 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5883 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5884
5885 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5886
5887 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5888 .addUse(Mul0.getReg(0))
5889 .setMIFlags(Flags);
5890
5891 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5892
5893 B.buildFMul(Res, Sel, Mul1, Flags);
5894
5895 MI.eraseFromParent();
5896 return true;
5897}
5898
5901 MachineIRBuilder &B) const {
5902 // Bypass the correct expansion a standard promotion through G_FSQRT would
5903 // get. The f32 op is accurate enough for the f16 case.
5904 unsigned Flags = MI.getFlags();
5905 assert(!ST.has16BitInsts());
5906 const LLT F32 = LLT::scalar(32);
5907 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5908 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5909 .addUse(Ext.getReg(0))
5910 .setMIFlags(Flags);
5911 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5912 MI.eraseFromParent();
5913 return true;
5914}
5915
5918 MachineIRBuilder &B) const {
5919 MachineFunction &MF = B.getMF();
5920 Register Dst = MI.getOperand(0).getReg();
5921 Register X = MI.getOperand(1).getReg();
5922 const unsigned Flags = MI.getFlags();
5923 const LLT S1 = LLT::scalar(1);
5924 const LLT F32 = LLT::scalar(32);
5925 const LLT I32 = LLT::scalar(32);
5926
5927 if (allowApproxFunc(MF, Flags)) {
5928 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5929 .addUse(X)
5930 .setMIFlags(Flags);
5931 MI.eraseFromParent();
5932 return true;
5933 }
5934
5935 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5936 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5937 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5938 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5939 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5940
5942 if (needsDenormHandlingF32(MF, X, Flags)) {
5943 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5944 .addUse(SqrtX.getReg(0))
5945 .setMIFlags(Flags);
5946
5947 auto NegOne = B.buildConstant(I32, -1);
5948 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5949
5950 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5951 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5952
5953 auto PosOne = B.buildConstant(I32, 1);
5954 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5955
5956 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5957 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5958
5959 auto Zero = B.buildFConstant(F32, 0.0f);
5960 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5961
5962 SqrtS =
5963 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5964
5965 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5966 SqrtS =
5967 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5968 } else {
5969 auto SqrtR =
5970 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5971 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5972
5973 auto Half = B.buildFConstant(F32, 0.5f);
5974 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5975 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5976 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5977 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5978 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5979 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5980 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5981 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5982 }
5983
5984 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5985
5986 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5987
5988 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5989
5990 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5991 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5992
5993 MI.eraseFromParent();
5994 return true;
5995}
5996
5999 MachineIRBuilder &B) const {
6000 // For the double type, the SQRT and RSQ instructions don't have the required
6001 // precision, so we apply Goldschmidt's algorithm to improve the result:
6002 //
6003 // y0 = rsq(x)
6004 // g0 = x * y0
6005 // h0 = 0.5 * y0
6006 //
6007 // r0 = 0.5 - h0 * g0
6008 // g1 = g0 * r0 + g0
6009 // h1 = h0 * r0 + h0
6010 //
6011 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
6012 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
6013 // h2 = h1 * r1 + h1
6014 //
6015 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
6016 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
6017 //
6018 // sqrt(x) = g3
6019
6020 const LLT S1 = LLT::scalar(1);
6021 const LLT S32 = LLT::scalar(32);
6022 const LLT F64 = LLT::scalar(64);
6023
6024 Register Dst = MI.getOperand(0).getReg();
6025 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
6026
6027 Register X = MI.getOperand(1).getReg();
6028 unsigned Flags = MI.getFlags();
6029
6030 Register SqrtX = X;
6031 Register Scaling, ZeroInt;
6032 if (!MI.getFlag(MachineInstr::FmAfn)) {
6033 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
6034
6035 ZeroInt = B.buildConstant(S32, 0).getReg(0);
6036 Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0);
6037
6038 // Scale up input if it is too small.
6039 auto ScaleUpFactor = B.buildConstant(S32, 256);
6040 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
6041 SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
6042 }
6043
6044 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
6045
6046 auto Half = B.buildFConstant(F64, 0.5);
6047 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
6048 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
6049
6050 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
6051 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
6052
6053 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
6054 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
6055
6056 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
6057 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6058
6059 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6060
6061 Register SqrtRet = SqrtS2.getReg(0);
6062 if (!MI.getFlag(MachineInstr::FmAfn)) {
6063 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6064 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6065 auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6066
6067 // Scale down the result.
6068 auto ScaleDownFactor = B.buildConstant(S32, -128);
6069 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6070 SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6071 }
6072
6073 Register IsZeroOrInf;
6074 if (MI.getFlag(MachineInstr::FmNoInfs)) {
6075 auto ZeroFP = B.buildFConstant(F64, 0.0);
6076 IsZeroOrInf = B.buildFCmp(FCmpInst::FCMP_OEQ, S1, SqrtX, ZeroFP).getReg(0);
6077 } else {
6078 IsZeroOrInf = B.buildIsFPClass(S1, SqrtX, fcZero | fcPosInf).getReg(0);
6079 }
6080
6081 // TODO: Check for DAZ and expand to subnormals
6082
6083 // If x is +INF, +0, or -0, use its original value
6084 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6085
6086 MI.eraseFromParent();
6087 return true;
6088}
6089
6092 MachineIRBuilder &B) const {
6093 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
6094 if (Ty == LLT::scalar(32))
6095 return legalizeFSQRTF32(MI, MRI, B);
6096 if (Ty == LLT::scalar(64))
6097 return legalizeFSQRTF64(MI, MRI, B);
6098 if (Ty == LLT::scalar(16))
6099 return legalizeFSQRTF16(MI, MRI, B);
6100 return false;
6101}
6102
6103// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6104// FIXME: Why do we handle this one but not other removed instructions?
6105//
6106// Reciprocal square root. The clamp prevents infinite results, clamping
6107// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6108// +-max_float.
6111 MachineIRBuilder &B) const {
6112 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6113 return true;
6114
6115 Register Dst = MI.getOperand(0).getReg();
6116 Register Src = MI.getOperand(2).getReg();
6117 auto Flags = MI.getFlags();
6118
6119 LLT Ty = MRI.getType(Dst);
6120
6121 const fltSemantics *FltSemantics;
6122 if (Ty == LLT::scalar(32))
6123 FltSemantics = &APFloat::IEEEsingle();
6124 else if (Ty == LLT::scalar(64))
6125 FltSemantics = &APFloat::IEEEdouble();
6126 else
6127 return false;
6128
6129 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6130 .addUse(Src)
6131 .setMIFlags(Flags);
6132
6133 // We don't need to concern ourselves with the snan handling difference:
6134 // the rsq already quieted it (or not), so use whichever directly selects.
6135 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6136 const bool UseIEEE = MFI->getMode().IEEE;
6137
6138 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
6139 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6140 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6141
6142 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
6143
6144 if (UseIEEE)
6145 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6146 else
6147 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6148 MI.eraseFromParent();
6149 return true;
6150}
6151
6152// TODO: Fix pointer type handling
6155 Intrinsic::ID IID) const {
6156
6157 MachineIRBuilder &B = Helper.MIRBuilder;
6158 MachineRegisterInfo &MRI = *B.getMRI();
6159
6160 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6161 IID == Intrinsic::amdgcn_permlanex16;
6162 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6163 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6164 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6165 IID == Intrinsic::amdgcn_permlane_up ||
6166 IID == Intrinsic::amdgcn_permlane_down ||
6167 IID == Intrinsic::amdgcn_permlane_xor;
6168
6169 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6170 Register Src2, LLT VT) -> Register {
6171 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6172 switch (IID) {
6173 case Intrinsic::amdgcn_readfirstlane:
6174 case Intrinsic::amdgcn_permlane64:
6175 return LaneOp.getReg(0);
6176 case Intrinsic::amdgcn_readlane:
6177 case Intrinsic::amdgcn_set_inactive:
6178 case Intrinsic::amdgcn_set_inactive_chain_arg:
6179 return LaneOp.addUse(Src1).getReg(0);
6180 case Intrinsic::amdgcn_writelane:
6181 case Intrinsic::amdgcn_permlane_bcast:
6182 case Intrinsic::amdgcn_permlane_up:
6183 case Intrinsic::amdgcn_permlane_down:
6184 case Intrinsic::amdgcn_permlane_xor:
6185 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6186 case Intrinsic::amdgcn_permlane16:
6187 case Intrinsic::amdgcn_permlanex16: {
6188 Register Src3 = MI.getOperand(5).getReg();
6189 int64_t Src4 = MI.getOperand(6).getImm();
6190 int64_t Src5 = MI.getOperand(7).getImm();
6191 return LaneOp.addUse(Src1)
6192 .addUse(Src2)
6193 .addUse(Src3)
6194 .addImm(Src4)
6195 .addImm(Src5)
6196 .getReg(0);
6197 }
6198 case Intrinsic::amdgcn_mov_dpp8:
6199 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6200 case Intrinsic::amdgcn_update_dpp:
6201 return LaneOp.addUse(Src1)
6202 .addImm(MI.getOperand(4).getImm())
6203 .addImm(MI.getOperand(5).getImm())
6204 .addImm(MI.getOperand(6).getImm())
6205 .addImm(MI.getOperand(7).getImm())
6206 .getReg(0);
6207 default:
6208 llvm_unreachable("unhandled lane op");
6209 }
6210 };
6211
6212 Register DstReg = MI.getOperand(0).getReg();
6213 Register Src0 = MI.getOperand(2).getReg();
6214 Register Src1, Src2;
6215 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6216 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6217 IsPermlaneShuffle) {
6218 Src1 = MI.getOperand(3).getReg();
6219 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6220 IsPermlaneShuffle) {
6221 Src2 = MI.getOperand(4).getReg();
6222 }
6223 }
6224
6225 LLT Ty = MRI.getType(DstReg);
6226 unsigned Size = Ty.getSizeInBits();
6227
6228 unsigned SplitSize = 32;
6229 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6230 ST.hasDPALU_DPP() &&
6231 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6232 SplitSize = 64;
6233
6234 if (Size == SplitSize) {
6235 // Already legal
6236 return true;
6237 }
6238
6239 if (Size < 32) {
6240 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6241
6242 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6243 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6244
6245 if (IID == Intrinsic::amdgcn_writelane)
6246 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6247
6248 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6249 B.buildTrunc(DstReg, LaneOpDst);
6250 MI.eraseFromParent();
6251 return true;
6252 }
6253
6254 if (Size % SplitSize != 0)
6255 return false;
6256
6257 LLT PartialResTy = LLT::scalar(SplitSize);
6258 bool NeedsBitcast = false;
6259 if (Ty.isVector()) {
6260 LLT EltTy = Ty.getElementType();
6261 unsigned EltSize = EltTy.getSizeInBits();
6262 if (EltSize == SplitSize) {
6263 PartialResTy = EltTy;
6264 } else if (EltSize == 16 || EltSize == 32) {
6265 unsigned NElem = SplitSize / EltSize;
6266 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6267 } else {
6268 // Handle all other cases via S32/S64 pieces
6269 NeedsBitcast = true;
6270 }
6271 }
6272
6273 SmallVector<Register, 4> PartialRes;
6274 unsigned NumParts = Size / SplitSize;
6275 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6276 MachineInstrBuilder Src1Parts, Src2Parts;
6277
6278 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6279 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6280
6281 if (IID == Intrinsic::amdgcn_writelane)
6282 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6283
6284 for (unsigned i = 0; i < NumParts; ++i) {
6285 Src0 = Src0Parts.getReg(i);
6286
6287 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6288 Src1 = Src1Parts.getReg(i);
6289
6290 if (IID == Intrinsic::amdgcn_writelane)
6291 Src2 = Src2Parts.getReg(i);
6292
6293 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6294 }
6295
6296 if (NeedsBitcast)
6297 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6298 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6299 else
6300 B.buildMergeLikeInstr(DstReg, PartialRes);
6301
6302 MI.eraseFromParent();
6303 return true;
6304}
6305
6308 MachineIRBuilder &B) const {
6310 ST.getTargetLowering()->getImplicitParameterOffset(
6312 LLT DstTy = MRI.getType(DstReg);
6313 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6314
6315 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6316 if (!loadInputValue(KernargPtrReg, B,
6318 return false;
6319
6320 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6321 B.buildConstant(IdxTy, Offset).getReg(0));
6322 return true;
6323}
6324
6325/// To create a buffer resource from a 64-bit pointer, mask off the upper 16
6326/// bits of the pointer and replace them with the stride argument, then
6327/// merge_values everything together. In the common case of a raw buffer (the
6328/// stride component is 0), we can just AND off those upper 16 bits.
6331 Register Result = MI.getOperand(0).getReg();
6332 Register Pointer = MI.getOperand(2).getReg();
6333 Register Stride = MI.getOperand(3).getReg();
6334 Register NumRecords = MI.getOperand(4).getReg();
6335 Register Flags = MI.getOperand(5).getReg();
6336
6337 LLT S32 = LLT::scalar(32);
6338 LLT S64 = LLT::scalar(64);
6339
6340 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6341
6342 auto ExtStride = B.buildAnyExt(S32, Stride);
6343
6344 if (ST.has45BitNumRecordsBufferResource()) {
6345 Register Zero = B.buildConstant(S32, 0).getReg(0);
6346 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6347 // num_records.
6348 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6349 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6350 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6351 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6352 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6353
6354 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6355 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
6356 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6357 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6358 auto ExtShiftedStride =
6359 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6360 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6361 auto ExtShiftedFlags =
6362 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6363 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6364 Register HighHalf =
6365 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6366 B.buildMergeValues(Result, {LowHalf, HighHalf});
6367 } else {
6368 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6369 auto Unmerge = B.buildUnmerge(S32, Pointer);
6370 auto LowHalf = Unmerge.getReg(0);
6371 auto HighHalf = Unmerge.getReg(1);
6372
6373 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6374 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6375 auto ShiftConst = B.buildConstant(S32, 16);
6376 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6377 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6378 Register NewHighHalfReg = NewHighHalf.getReg(0);
6379 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6380 }
6381
6382 MI.eraseFromParent();
6383 return true;
6384}
6385
6388 MachineIRBuilder &B) const {
6389 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6390 if (!MFI->isEntryFunction()) {
6391 return legalizePreloadedArgIntrin(MI, MRI, B,
6393 }
6394
6395 Register DstReg = MI.getOperand(0).getReg();
6396 if (!getImplicitArgPtr(DstReg, MRI, B))
6397 return false;
6398
6399 MI.eraseFromParent();
6400 return true;
6401}
6402
6405 MachineIRBuilder &B) const {
6406 Function &F = B.getMF().getFunction();
6407 std::optional<uint32_t> KnownSize =
6409 if (KnownSize.has_value())
6410 B.buildConstant(DstReg, *KnownSize);
6411 return false;
6412}
6413
6416 MachineIRBuilder &B) const {
6417
6418 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6419 if (!MFI->isEntryFunction()) {
6420 return legalizePreloadedArgIntrin(MI, MRI, B,
6422 }
6423
6424 Register DstReg = MI.getOperand(0).getReg();
6425 if (!getLDSKernelId(DstReg, MRI, B))
6426 return false;
6427
6428 MI.eraseFromParent();
6429 return true;
6430}
6431
6435 unsigned AddrSpace) const {
6436 const LLT S32 = LLT::scalar(32);
6437 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6438 Register Hi32 = Unmerge.getReg(1);
6439
6440 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6441 ST.hasGloballyAddressableScratch()) {
6442 Register FlatScratchBaseHi =
6443 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6444 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6445 .getReg(0);
6446 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6447 // Test bits 63..58 against the aperture address.
6448 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6449 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6450 B.buildConstant(S32, 1u << 26));
6451 } else {
6452 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6453 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6454 }
6455 MI.eraseFromParent();
6456 return true;
6457}
6458
6459// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6460// offset (the offset that is included in bounds checking and swizzling, to be
6461// split between the instruction's voffset and immoffset fields) and soffset
6462// (the offset that is excluded from bounds checking and swizzling, to go in
6463// the instruction's soffset field). This function takes the first kind of
6464// offset and figures out how to split it between voffset and immoffset.
6465std::pair<Register, unsigned>
6467 Register OrigOffset) const {
6468 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6469 Register BaseReg;
6470 unsigned ImmOffset;
6471 const LLT S32 = LLT::scalar(32);
6472 MachineRegisterInfo &MRI = *B.getMRI();
6473
6474 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6475 // being added, so we can only safely match a 32-bit addition with no unsigned
6476 // overflow.
6477 bool CheckNUW = ST.hasGFX1250Insts();
6478 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6479 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6480
6481 // If BaseReg is a pointer, convert it to int.
6482 if (MRI.getType(BaseReg).isPointer())
6483 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6484
6485 // If the immediate value is too big for the immoffset field, put only bits
6486 // that would normally fit in the immoffset field. The remaining value that
6487 // is copied/added for the voffset field is a large power of 2, and it
6488 // stands more chance of being CSEd with the copy/add for another similar
6489 // load/store.
6490 // However, do not do that rounding down if that is a negative
6491 // number, as it appears to be illegal to have a negative offset in the
6492 // vgpr, even if adding the immediate offset makes it positive.
6493 unsigned Overflow = ImmOffset & ~MaxImm;
6494 ImmOffset -= Overflow;
6495 if ((int32_t)Overflow < 0) {
6496 Overflow += ImmOffset;
6497 ImmOffset = 0;
6498 }
6499
6500 if (Overflow != 0) {
6501 if (!BaseReg) {
6502 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6503 } else {
6504 auto OverflowVal = B.buildConstant(S32, Overflow);
6505 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6506 }
6507 }
6508
6509 if (!BaseReg)
6510 BaseReg = B.buildConstant(S32, 0).getReg(0);
6511
6512 return std::pair(BaseReg, ImmOffset);
6513}
6514
6515/// Handle register layout difference for f16 images for some subtargets.
6518 Register Reg,
6519 bool ImageStore) const {
6520 const LLT S16 = LLT::scalar(16);
6521 const LLT S32 = LLT::scalar(32);
6522 LLT StoreVT = MRI.getType(Reg);
6523 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6524
6525 if (ST.hasUnpackedD16VMem()) {
6526 auto Unmerge = B.buildUnmerge(S16, Reg);
6527
6528 SmallVector<Register, 4> WideRegs;
6529 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6530 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6531
6532 int NumElts = StoreVT.getNumElements();
6533
6534 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6535 .getReg(0);
6536 }
6537
6538 if (ImageStore && ST.hasImageStoreD16Bug()) {
6539 if (StoreVT.getNumElements() == 2) {
6540 SmallVector<Register, 4> PackedRegs;
6541 Reg = B.buildBitcast(S32, Reg).getReg(0);
6542 PackedRegs.push_back(Reg);
6543 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6544 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6545 .getReg(0);
6546 }
6547
6548 if (StoreVT.getNumElements() == 3) {
6549 SmallVector<Register, 4> PackedRegs;
6550 auto Unmerge = B.buildUnmerge(S16, Reg);
6551 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6552 PackedRegs.push_back(Unmerge.getReg(I));
6553 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6554 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6555 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6556 }
6557
6558 if (StoreVT.getNumElements() == 4) {
6559 SmallVector<Register, 4> PackedRegs;
6560 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6561 auto Unmerge = B.buildUnmerge(S32, Reg);
6562 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6563 PackedRegs.push_back(Unmerge.getReg(I));
6564 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6565 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6566 .getReg(0);
6567 }
6568
6569 llvm_unreachable("invalid data type");
6570 }
6571
6572 if (StoreVT == LLT::fixed_vector(3, S16)) {
6573 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6574 .getReg(0);
6575 }
6576 return Reg;
6577}
6578
6580 Register VData, LLT MemTy,
6581 bool IsFormat) const {
6582 MachineRegisterInfo *MRI = B.getMRI();
6583 LLT Ty = MRI->getType(VData);
6584
6585 const LLT S16 = LLT::scalar(16);
6586
6587 // Fixup buffer resources themselves needing to be v4i32.
6589 return castBufferRsrcToV4I32(VData, B);
6590
6591 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6592 Ty = getBitcastRegisterType(Ty);
6593 VData = B.buildBitcast(Ty, VData).getReg(0);
6594 }
6595 // Fixup illegal register types for i8 and i16 stores.
6596 if (Ty == LLT::scalar(8) || Ty == S16) {
6597 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6598 return AnyExt;
6599 }
6600
6601 if (Ty.isVector()) {
6602 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6603 if (IsFormat)
6604 return handleD16VData(B, *MRI, VData);
6605 }
6606 }
6607
6608 return VData;
6609}
6610
6612 LegalizerHelper &Helper,
6613 bool IsTyped,
6614 bool IsFormat) const {
6615 MachineIRBuilder &B = Helper.MIRBuilder;
6616 MachineRegisterInfo &MRI = *B.getMRI();
6617
6618 Register VData = MI.getOperand(1).getReg();
6619 LLT Ty = MRI.getType(VData);
6620 LLT EltTy = Ty.getScalarType();
6621 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6622 const LLT S32 = LLT::scalar(32);
6623
6624 MachineMemOperand *MMO = *MI.memoperands_begin();
6625 const int MemSize = MMO->getSize().getValue();
6626 LLT MemTy = MMO->getMemoryType();
6627
6628 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6629
6631 Register RSrc = MI.getOperand(2).getReg();
6632
6633 unsigned ImmOffset;
6634
6635 // The typed intrinsics add an immediate after the registers.
6636 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6637
6638 // The struct intrinsic variants add one additional operand over raw.
6639 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6640 Register VIndex;
6641 int OpOffset = 0;
6642 if (HasVIndex) {
6643 VIndex = MI.getOperand(3).getReg();
6644 OpOffset = 1;
6645 } else {
6646 VIndex = B.buildConstant(S32, 0).getReg(0);
6647 }
6648
6649 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6650 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6651
6652 unsigned Format = 0;
6653 if (IsTyped) {
6654 Format = MI.getOperand(5 + OpOffset).getImm();
6655 ++OpOffset;
6656 }
6657
6658 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6659
6660 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6661
6662 unsigned Opc;
6663 if (IsTyped) {
6664 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6665 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6666 } else if (IsFormat) {
6667 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6668 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6669 } else {
6670 switch (MemSize) {
6671 case 1:
6672 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6673 break;
6674 case 2:
6675 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6676 break;
6677 default:
6678 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6679 break;
6680 }
6681 }
6682
6683 auto MIB = B.buildInstr(Opc)
6684 .addUse(VData) // vdata
6685 .addUse(RSrc) // rsrc
6686 .addUse(VIndex) // vindex
6687 .addUse(VOffset) // voffset
6688 .addUse(SOffset) // soffset
6689 .addImm(ImmOffset); // offset(imm)
6690
6691 if (IsTyped)
6692 MIB.addImm(Format);
6693
6694 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6695 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6696 .addMemOperand(MMO);
6697
6698 MI.eraseFromParent();
6699 return true;
6700}
6701
6702static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6703 Register VIndex, Register VOffset, Register SOffset,
6704 unsigned ImmOffset, unsigned Format,
6705 unsigned AuxiliaryData, MachineMemOperand *MMO,
6706 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6707 auto MIB = B.buildInstr(Opc)
6708 .addDef(LoadDstReg) // vdata
6709 .addUse(RSrc) // rsrc
6710 .addUse(VIndex) // vindex
6711 .addUse(VOffset) // voffset
6712 .addUse(SOffset) // soffset
6713 .addImm(ImmOffset); // offset(imm)
6714
6715 if (IsTyped)
6716 MIB.addImm(Format);
6717
6718 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6719 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6720 .addMemOperand(MMO);
6721}
6722
6724 LegalizerHelper &Helper,
6725 bool IsFormat,
6726 bool IsTyped) const {
6727 MachineIRBuilder &B = Helper.MIRBuilder;
6728 MachineRegisterInfo &MRI = *B.getMRI();
6729 GISelChangeObserver &Observer = Helper.Observer;
6730
6731 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6732 MachineMemOperand *MMO = *MI.memoperands_begin();
6733 const LLT MemTy = MMO->getMemoryType();
6734 const LLT S32 = LLT::scalar(32);
6735
6736 Register Dst = MI.getOperand(0).getReg();
6737
6738 Register StatusDst;
6739 int OpOffset = 0;
6740 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6741 bool IsTFE = MI.getNumExplicitDefs() == 2;
6742 if (IsTFE) {
6743 StatusDst = MI.getOperand(1).getReg();
6744 ++OpOffset;
6745 }
6746
6747 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6748 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6749
6750 // The typed intrinsics add an immediate after the registers.
6751 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6752
6753 // The struct intrinsic variants add one additional operand over raw.
6754 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6755 Register VIndex;
6756 if (HasVIndex) {
6757 VIndex = MI.getOperand(3 + OpOffset).getReg();
6758 ++OpOffset;
6759 } else {
6760 VIndex = B.buildConstant(S32, 0).getReg(0);
6761 }
6762
6763 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6764 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6765
6766 unsigned Format = 0;
6767 if (IsTyped) {
6768 Format = MI.getOperand(5 + OpOffset).getImm();
6769 ++OpOffset;
6770 }
6771
6772 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6773 unsigned ImmOffset;
6774
6775 LLT Ty = MRI.getType(Dst);
6776 // Turn loads of addrspace 8 pointers into 4 x s32 loads here, so the rest of
6777 // the logic doesn't have to handle that case.
6778 if (hasBufferRsrcWorkaround(Ty)) {
6779 Observer.changingInstr(MI);
6780 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6781 Observer.changedInstr(MI);
6782 Dst = MI.getOperand(0).getReg();
6783 B.setInsertPt(B.getMBB(), MI);
6784 }
6785 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6786 Ty = getBitcastRegisterType(Ty);
6787 Observer.changingInstr(MI);
6788 Helper.bitcastDst(MI, Ty, 0);
6789 Observer.changedInstr(MI);
6790 Dst = MI.getOperand(0).getReg();
6791 B.setInsertPt(B.getMBB(), MI);
6792 }
6793
6794 LLT EltTy = Ty.getScalarType();
6795 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6796 const bool Unpacked = ST.hasUnpackedD16VMem();
6797
6798 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6799
6800 unsigned Opc;
6801
6802 // TODO: Support TFE for typed and narrow loads.
6803 if (IsTyped) {
6804 if (IsTFE)
6805 return false;
6806 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6807 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6808 } else if (IsFormat) {
6809 if (IsD16) {
6810 if (IsTFE)
6811 return false;
6812 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6813 } else {
6814 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6815 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6816 }
6817 } else {
6818 switch (MemTy.getSizeInBits()) {
6819 case 8:
6820 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6821 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6822 break;
6823 case 16:
6824 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6825 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6826 break;
6827 default:
6828 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6829 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6830 break;
6831 }
6832 }
6833
6834 if (IsTFE) {
6835 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6836 unsigned NumLoadDWords = NumValueDWords + 1;
6837 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6838 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6839 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6840 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6841 if (MemTy.getSizeInBits() < 32) {
6842 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6843 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6844 B.buildTrunc(Dst, ExtDst);
6845 } else if (NumValueDWords == 1) {
6846 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6847 } else {
6848 SmallVector<Register, 5> LoadElts;
6849 for (unsigned I = 0; I != NumValueDWords; ++I)
6850 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6851 LoadElts.push_back(StatusDst);
6852 B.buildUnmerge(LoadElts, LoadDstReg);
6853 LoadElts.truncate(NumValueDWords);
6854 B.buildMergeLikeInstr(Dst, LoadElts);
6855 }
6856 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6857 (IsD16 && !Ty.isVector())) {
6858 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6859 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6860 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6861 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6862 B.buildTrunc(Dst, LoadDstReg);
6863 } else if (Unpacked && IsD16 && Ty.isVector()) {
6864 LLT UnpackedTy = Ty.changeElementSize(32);
6865 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6866 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6867 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6868 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6869 // FIXME: G_TRUNC should work, but legalization currently fails
6870 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6872 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6873 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6874 B.buildMergeLikeInstr(Dst, Repack);
6875 } else {
6876 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6877 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6878 }
6879
6880 MI.eraseFromParent();
6881 return true;
6882}
6883
/// Map a buffer atomic intrinsic ID to the corresponding
/// G_AMDGPU_BUFFER_ATOMIC_* opcode.
///
/// For every operation the raw, raw_ptr, struct and struct_ptr intrinsic
/// variants all return the same opcode. An intrinsic ID that is not listed
/// below reaches llvm_unreachable.
///
/// \param IntrID the buffer atomic intrinsic to translate.
/// \returns the AMDGPU::G_AMDGPU_BUFFER_ATOMIC_* opcode for \p IntrID.
6884 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6885 switch (IntrID) {
6886 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6888 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6891 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6893 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6896 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6898 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6901 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6903 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6906 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6908 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6911 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6913 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6916 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6926 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6928 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6931 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6933 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6936 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6938 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6941 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6943 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6946 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6948 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6951 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6953 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6956 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6958 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6961 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6962 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6963 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6966 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6968 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6969 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6971 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6972 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6973 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6976 default:
6977 llvm_unreachable("unhandled atomic opcode");
6978 }
6979 }
6980
6983 Intrinsic::ID IID) const {
// Lower a buffer atomic intrinsic to the G_AMDGPU_BUFFER_ATOMIC_* pseudo
// chosen by getBufferAtomicPseudo(IID), then erase the original instruction.
// Every path through the visible body returns true.
// NOTE(review): the leading lines of this signature are absent from this
// listing; MI and B used below presumably come from it -- confirm.
6984 const bool IsCmpSwap =
6985 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6986 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6987 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6988 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6989
6990 Register Dst = MI.getOperand(0).getReg();
6991 // Since we don't have 128-bit atomics, we don't need to handle the case of
6992 // p8 arguments to the atomic itself
6993 Register VData = MI.getOperand(2).getReg();
6994
6995 Register CmpVal;
// Number of extra operands seen so far ahead of the remaining fixed operands
// (the compare value for cmpswap, then the optional vindex).
6996 int OpOffset = 0;
6997
6998 if (IsCmpSwap) {
6999 CmpVal = MI.getOperand(3).getReg();
7000 ++OpOffset;
7001 }
7002
7003 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
// Re-read the resource operand after the cast call above.
7004 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
// Total operand count of the instruction when a vindex operand is present;
// cmpswap has one more operand than the other atomics.
7005 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7006
7007 // The struct intrinsic variants add one additional operand over raw.
7008 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
7009 Register VIndex;
7010 if (HasVIndex) {
7011 VIndex = MI.getOperand(4 + OpOffset).getReg();
7012 ++OpOffset;
7013 } else {
// No vindex operand: use a constant zero s32 index instead.
7014 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
7015 }
7016
7017 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
7018 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
7019 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
7020
// Takes the first memory operand of MI; none is checked for here.
7021 MachineMemOperand *MMO = *MI.memoperands_begin();
7022
// Split the offset into a register part (VOffset) and an immediate part
// (ImmOffset), as returned by splitBufferOffsets.
7023 unsigned ImmOffset;
7024 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
7025
7026 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
7027 .addDef(Dst)
7028 .addUse(VData); // vdata
7029
// The compare value is only present for the cmpswap variants and sits
// between vdata and rsrc.
7030 if (IsCmpSwap)
7031 MIB.addReg(CmpVal);
7032
7033 MIB.addUse(RSrc) // rsrc
7034 .addUse(VIndex) // vindex
7035 .addUse(VOffset) // voffset
7036 .addUse(SOffset) // soffset
7037 .addImm(ImmOffset) // offset(imm)
7038 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
7039 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
7040 .addMemOperand(MMO);
7041
7042 MI.eraseFromParent();
7043 return true;
7044 }
7045
7046 /// Pack the s16 typed address operands of \p MI into dword sized vectors
7047 /// with s16 typed elements, appending the results to \p PackedAddrs.
///
/// Operands in the range [Intr->VAddrStart, Intr->VAddrEnd), offset by
/// \p ArgOffset, are visited in order; non-register operands are skipped.
/// Operands that are not 16 bit (as decided by \p IsA16 / \p IsG16 and the
/// operand's position) are bitcast to <2 x s16> rather than packed in pairs.
/// NOTE(review): parts of this signature are absent from this listing; B, MI
/// and Intr used below presumably come from it -- confirm.
7049 SmallVectorImpl<Register> &PackedAddrs,
7050 unsigned ArgOffset,
7052 bool IsA16, bool IsG16) {
7053 const LLT S16 = LLT::scalar(16);
7054 const LLT V2S16 = LLT::fixed_vector(2, 16);
7055 auto EndIdx = Intr->VAddrEnd;
7056
7057 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
7058 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7059 if (!SrcOp.isReg())
7060 continue; // _L to _LZ may have eliminated this.
7061
7062 Register AddrReg = SrcOp.getReg();
7063
// This branch handles operands that are not packed pairwise: anything before
// the gradients, gradients when G16 is off, and coordinates when A16 is off.
7064 if ((I < Intr->GradientStart) ||
7065 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
7066 (I >= Intr->CoordStart && !IsA16)) {
7067 if ((I < Intr->GradientStart) && IsA16 &&
7068 (B.getMRI()->getType(AddrReg) == S16)) {
7069 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7070 // Special handling of bias when A16 is on. Bias is of type half but
7071 // occupies full 32-bit.
7072 PackedAddrs.push_back(
7073 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7074 .getReg(0));
7075 } else {
7076 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7077 "Bias needs to be converted to 16 bit in A16 mode");
7078 // Handle any gradient or coordinate operands that should not be packed
7079 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7080 PackedAddrs.push_back(AddrReg);
7081 }
7082 } else {
7083 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7084 // derivatives dx/dh and dx/dv are packed with undef.
7085 if (((I + 1) >= EndIdx) ||
7086 ((Intr->NumGradients / 2) % 2 == 1 &&
7087 (I == static_cast<unsigned>(Intr->GradientStart +
7088 (Intr->NumGradients / 2) - 1) ||
7089 I == static_cast<unsigned>(Intr->GradientStart +
7090 Intr->NumGradients - 1))) ||
7091 // Check for _L to _LZ optimization
7092 !MI.getOperand(ArgOffset + I + 1).isReg()) {
7093 PackedAddrs.push_back(
7094 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7095 .getReg(0));
7096 } else {
// Pair this operand with the next one and skip the next one in the loop.
7097 PackedAddrs.push_back(
7098 B.buildBuildVector(
7099 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7100 .getReg(0));
7101 ++I;
7102 }
7103 }
7104 }
7105 }
7106
7107 /// Convert from separate vaddr components to a single vector address register,
7108 /// and replace the remaining operands with $noreg.
///
/// The \p NumVAddrs operands of MI starting at index \p DimIdx are examined.
/// Register operands among them are collected and, unless exactly one was
/// found, combined with a build_vector of 32-bit elements that replaces the
/// operand at \p DimIdx. Every register operand after the first is then set
/// to AMDGPU::NoRegister.
/// NOTE(review): the first line of this signature is absent from this
/// listing; B and MI used below presumably come from it -- confirm.
7110 int DimIdx, int NumVAddrs) {
7111 const LLT S32 = LLT::scalar(32);
// S32 is only referenced from the assert below; the cast keeps it "used"
// when asserts are compiled out.
7112 (void)S32;
7113 SmallVector<Register, 8> AddrRegs;
7114 for (int I = 0; I != NumVAddrs; ++I) {
7115 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7116 if (SrcOp.isReg()) {
7117 AddrRegs.push_back(SrcOp.getReg());
7118 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7119 }
7120 }
7121
7122 int NumAddrRegs = AddrRegs.size();
// A single address register is left in place as-is.
7123 if (NumAddrRegs != 1) {
7124 auto VAddr =
7125 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7126 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7127 }
7128
// Starts at 1: the operand at DimIdx keeps the (possibly packed) address.
7129 for (int I = 1; I != NumVAddrs; ++I) {
7130 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7131 if (SrcOp.isReg())
7132 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7133 }
7134 }
7135
7136 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
7137 ///
7138 /// Depending on the subtarget, load/store with 16-bit element data need to be
7139 /// rewritten to use the low half of 32-bit registers, or directly use a packed
7140 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
7141 /// registers.
7142 ///
7143 /// We don't want to directly select image instructions just yet, but also want
7144 /// to expose all register repacking to the legalizer/combiners. We also don't
7145 /// want a selected instruction entering RegBankSelect. In order to avoid
7146 /// defining a multitude of intermediate image instructions, directly hack on
7147 /// the intrinsic's arguments. In cases like a16 addresses, this requires
7148 /// padding now unnecessary arguments with $noreg.
///
/// \returns true when the instruction was rewritten (or needed no change) and
/// false for the combinations rejected below.
/// NOTE(review): several lines of this function, including the start of its
/// signature, are absent from this listing; MI, B and Observer used below
/// presumably come from the missing signature lines -- confirm.
7151 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7152
7153 const MachineFunction &MF = *MI.getMF();
7154 const unsigned NumDefs = MI.getNumExplicitDefs();
// Intrinsic argument indices from Intr are offset by the defs plus one more
// operand (presumably the intrinsic ID operand -- confirm).
7155 const unsigned ArgOffset = NumDefs + 1;
// Two explicit defs means the instruction also produces a TFE status result.
7156 bool IsTFE = NumDefs == 2;
7157 // We are only processing the operands of d16 image operations on subtargets
7158 // that use the unpacked register layout, or need to repack the TFE result.
7159
7160 // TODO: Do we need to guard against already legalized intrinsics?
7161 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7163
7164 MachineRegisterInfo *MRI = B.getMRI();
7165 const LLT S32 = LLT::scalar(32);
7166 const LLT S16 = LLT::scalar(16);
7167 const LLT V2S16 = LLT::fixed_vector(2, 16);
7168
7169 unsigned DMask = 0;
7170 Register VData;
7171 LLT Ty;
7172
// VData/Ty are left default-constructed for no-return opcodes that are not
// stores. With no defs the data is operand 1, otherwise it is operand 0.
7173 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7174 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7175 Ty = MRI->getType(VData);
7176 }
7177
7178 const bool IsAtomicPacked16Bit =
7179 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7180 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7181
7182 // Check for 16 bit addresses and pack if true.
7183 LLT GradTy =
7184 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7185 LLT AddrTy =
7186 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
7187 const bool IsG16 =
7188 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7189 const bool IsA16 = AddrTy == S16;
7190 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7191
// Number of result components selected by the dmask; stays 0 for atomics.
7192 int DMaskLanes = 0;
7193 if (!BaseOpcode->Atomic) {
7194 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7195 if (BaseOpcode->Gather4) {
7196 DMaskLanes = 4;
7197 } else if (DMask != 0) {
7198 DMaskLanes = llvm::popcount(DMask);
7199 } else if (!IsTFE && !BaseOpcode->Store) {
7200 // If dmask is 0, this is a no-op load. This can be eliminated.
7201 B.buildUndef(MI.getOperand(0));
7202 MI.eraseFromParent();
7203 return true;
7204 }
7205 }
7206
// From here on MI is modified in place; the scope_exit object reports the
// change to the observer when this function is left.
7207 Observer.changingInstr(MI);
7208 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7209
7210 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7211 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7212 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7213 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7214 unsigned NewOpcode = LoadOpcode;
7215 if (BaseOpcode->Store)
7216 NewOpcode = StoreOpcode;
7217 else if (BaseOpcode->NoReturn)
7218 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7219
7220 // Track that we legalized this
7221 MI.setDesc(B.getTII().get(NewOpcode));
7222
7223 // Expecting to get an error flag since TFC is on and dmask is 0. Force
7224 // dmask to be at least 1, otherwise the instruction will fail.
7225 if (IsTFE && DMask == 0) {
7226 DMask = 0x1;
7227 DMaskLanes = 1;
7228 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7229 }
7230
7231 if (BaseOpcode->Atomic) {
7232 Register VData0 = MI.getOperand(2).getReg();
// Note: this Ty shadows the outer Ty within the atomic block only.
7233 LLT Ty = MRI->getType(VData0);
7234
7235 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7236 if (Ty.isVector() && !IsAtomicPacked16Bit)
7237 return false;
7238
7239 if (BaseOpcode->AtomicX2) {
7240 Register VData1 = MI.getOperand(3).getReg();
7241 // The two values are packed in one register.
7242 LLT PackedTy = LLT::fixed_vector(2, Ty);
7243 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7244 MI.getOperand(2).setReg(Concat.getReg(0));
7245 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7246 }
7247 }
7248
7249 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7250
7251 // Rewrite the addressing register layout before doing anything else.
7252 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7253 // 16 bit gradients are supported, but are tied to the A16 control
7254 // so both gradients and addresses must be 16 bit
7255 return false;
7256 }
7257
7258 if (IsA16 && !ST.hasA16()) {
7259 // A16 not supported
7260 return false;
7261 }
7262
7263 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7264 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7265
7266 if (IsA16 || IsG16) {
7267 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7268 // instructions expect VGPR_32
7269 SmallVector<Register, 4> PackedRegs;
7270
7271 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7272
7273 // See also below in the non-a16 branch
7274 const bool UseNSA = ST.hasNSAEncoding() &&
7275 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7276 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7277 const bool UsePartialNSA =
7278 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7279
7280 if (UsePartialNSA) {
7281 // Pack registers that would go over NSAMaxSize into last VAddr register
7282 LLT PackedAddrTy =
7283 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7284 auto Concat = B.buildConcatVectors(
7285 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7286 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7287 PackedRegs.resize(NSAMaxSize);
7288 } else if (!UseNSA && PackedRegs.size() > 1) {
// Non-NSA: concatenate every packed dword into one address register.
7289 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7290 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7291 PackedRegs[0] = Concat.getReg(0);
7292 PackedRegs.resize(1);
7293 }
7294
// Write the packed registers back over the leading vaddr operands and clear
// the register operands that are no longer needed.
7295 const unsigned NumPacked = PackedRegs.size();
7296 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7297 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7298 if (!SrcOp.isReg()) {
7299 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7300 continue;
7301 }
7302
7303 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7304
7305 if (I - Intr->VAddrStart < NumPacked)
7306 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7307 else
7308 SrcOp.setReg(AMDGPU::NoRegister);
7309 }
7310 } else {
7311 // If the register allocator cannot place the address registers contiguously
7312 // without introducing moves, then using the non-sequential address encoding
7313 // is always preferable, since it saves VALU instructions and is usually a
7314 // wash in terms of code size or even better.
7315 //
7316 // However, we currently have no way of hinting to the register allocator
7317 // that MIMG addresses should be placed contiguously when it is possible to
7318 // do so, so force non-NSA for the common 2-address case as a heuristic.
7319 //
7320 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7321 // allocation when possible.
7322 //
7323 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7324 // set of the remaining addresses.
7325 const bool UseNSA = ST.hasNSAEncoding() &&
7326 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7327 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7328 const bool UsePartialNSA =
7329 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7330
7331 if (UsePartialNSA) {
7333 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7334 Intr->NumVAddrs - NSAMaxSize + 1);
7335 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7336 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7337 Intr->NumVAddrs);
7338 }
7339 }
7340
// Append a trailing immediate operand recording the address mode:
// bit 0 is set for A16 and bit 1 for G16.
7341 int Flags = 0;
7342 if (IsA16)
7343 Flags |= 1;
7344 if (IsG16)
7345 Flags |= 2;
7346 MI.addOperand(MachineOperand::CreateImm(Flags));
7347
7348 if (BaseOpcode->NoReturn) { // No TFE for stores?
7349 // TODO: Handle dmask trim
7350 if (!Ty.isVector() || !IsD16)
7351 return true;
7352
7353 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7354 if (RepackedReg != VData) {
7355 MI.getOperand(1).setReg(RepackedReg);
7356 }
7357
7358 return true;
7359 }
7360
// Everything below handles instructions that produce a result.
7361 Register DstReg = MI.getOperand(0).getReg();
7362 const LLT EltTy = Ty.getScalarType();
7363 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7364
7365 // Confirm that the return type is large enough for the dmask specified
7366 if (NumElts < DMaskLanes)
7367 return false;
7368
7369 if (NumElts > 4 || DMaskLanes > 4)
7370 return false;
7371
7372 // Image atomic instructions are using DMask to specify how many bits
7373 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7374 // DMaskLanes for image atomic has default value '0'.
7375 // We must be sure that atomic variants (especially packed) will not be
7376 // truncated from v2s16 or v4s16 to s16 type.
7377 //
7378 // ChangeElementCount will be needed for image load where Ty is always scalar.
7379 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7380 const LLT AdjustedTy =
7381 DMaskLanes == 0
7382 ? Ty
7383 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7384
7385 // The raw dword aligned data component of the load. The only legal cases
7386 // where this matters should be when using the packed D16 format, for
7387 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7388 LLT RoundedTy;
7389
7390 // S32 vector to cover all data, plus TFE result element.
7391 LLT TFETy;
7392
7393 // Register type to use for each loaded component. Will be S32 or V2S16.
7394 LLT RegTy;
7395
7396 if (IsD16 && ST.hasUnpackedD16VMem()) {
// Unpacked D16: each 16-bit component occupies its own 32-bit register.
7397 RoundedTy =
7398 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7399 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7400 RegTy = S32;
7401 } else {
// Otherwise round the data size up to a whole number of dwords.
7402 unsigned EltSize = EltTy.getSizeInBits();
7403 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7404 unsigned RoundedSize = 32 * RoundedElts;
7405 RoundedTy = LLT::scalarOrVector(
7406 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7407 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7408 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7409 }
7410
7411 // The return type does not need adjustment.
7412 // TODO: Should we change s16 case to s32 or <2 x s16>?
7413 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7414 return true;
7415
7416 Register Dst1Reg;
7417
7418 // Insert after the instruction.
7419 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7420
7421 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7422 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7423 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7424 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7425
7426 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7427
7428 MI.getOperand(0).setReg(NewResultReg);
7429
7430 // In the IR, TFE is supposed to be used with a 2 element struct return
7431 // type. The instruction really returns these two values in one contiguous
7432 // register, with one additional dword beyond the loaded data. Rewrite the
7433 // return type to use a single register result.
7434
7435 if (IsTFE) {
7436 Dst1Reg = MI.getOperand(1).getReg();
7437 if (MRI->getType(Dst1Reg) != S32)
7438 return false;
7439
7440 // TODO: Make sure the TFE operand bit is set.
7441 MI.removeOperand(1);
7442
7443 // Handle the easy case that requires no repack instructions.
7444 if (Ty == S32) {
7445 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7446 return true;
7447 }
7448 }
7449
7450 // Now figure out how to copy the new result register back into the old
7451 // result.
// Every slot starts as Dst1Reg; the data slots are overwritten below so that
// only the final (TFE) slot, if any, still names Dst1Reg for the unmerge.
7452 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7453
7454 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7455
7456 if (ResultNumRegs == 1) {
7457 assert(!IsTFE);
7458 ResultRegs[0] = NewResultReg;
7459 } else {
7460 // We have to repack into a new vector of some kind.
7461 for (int I = 0; I != NumDataRegs; ++I)
7462 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7463 B.buildUnmerge(ResultRegs, NewResultReg);
7464
7465 // Drop the final TFE element to get the data part. The TFE result is
7466 // directly written to the right place already.
7467 if (IsTFE)
7468 ResultRegs.resize(NumDataRegs);
7469 }
7470
7471 // For an s16 scalar result, we form an s32 result with a truncate regardless
7472 // of packed vs. unpacked.
7473 if (IsD16 && !Ty.isVector()) {
7474 B.buildTrunc(DstReg, ResultRegs[0]);
7475 return true;
7476 }
7477
7478 // Avoid a build/concat_vector of 1 entry.
7479 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7480 B.buildBitcast(DstReg, ResultRegs[0]);
7481 return true;
7482 }
7483
7484 assert(Ty.isVector());
7485
7486 if (IsD16) {
7487 // For packed D16 results with TFE enabled, all the data components are
7488 // S32. Cast back to the expected type.
7489 //
7490 // TODO: We don't really need to use load s32 elements. We would only need one
7491 // cast for the TFE result if a multiple of v2s16 was used.
7492 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7493 for (Register &Reg : ResultRegs)
7494 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7495 } else if (ST.hasUnpackedD16VMem()) {
7496 for (Register &Reg : ResultRegs)
7497 Reg = B.buildTrunc(S16, Reg).getReg(0);
7498 }
7499 }
7500
// Appends NumElts copies of a single undef register of type Ty to ResultRegs.
7501 auto padWithUndef = [&](LLT Ty, int NumElts) {
7502 if (NumElts == 0)
7503 return;
7504 Register Undef = B.buildUndef(Ty).getReg(0);
7505 for (int I = 0; I != NumElts; ++I)
7506 ResultRegs.push_back(Undef);
7507 };
7508
7509 // Pad out any elements eliminated due to the dmask.
7510 LLT ResTy = MRI->getType(ResultRegs[0]);
7511 if (!ResTy.isVector()) {
7512 padWithUndef(ResTy, NumElts - ResultRegs.size());
7513 B.buildBuildVector(DstReg, ResultRegs);
7514 return true;
7515 }
7516
7517 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7518 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7519
7520 // Deal with the one annoying legal case.
7521 const LLT V3S16 = LLT::fixed_vector(3, 16);
7522 if (Ty == V3S16) {
7523 if (IsTFE) {
7524 if (ResultRegs.size() == 1) {
7525 NewResultReg = ResultRegs[0];
7526 } else if (ResultRegs.size() == 2) {
7527 LLT V4S16 = LLT::fixed_vector(4, 16);
7528 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7529 } else {
7530 return false;
7531 }
7532 }
7533
// Trim or pad NewResultReg to the element count of the original result.
7534 if (MRI->getType(DstReg).getNumElements() <
7535 MRI->getType(NewResultReg).getNumElements()) {
7536 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7537 } else {
7538 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7539 }
7540 return true;
7541 }
7542
7543 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7544 B.buildConcatVectors(DstReg, ResultRegs);
7545 return true;
7546 }
7547
7549 MachineInstr &MI) const {
// Rewrite a scalar buffer load intrinsic in place into one of the
// G_AMDGPU_S_BUFFER_LOAD* opcodes, attaching a memory operand and adjusting
// the result type where needed. Every visible path returns true.
// NOTE(review): several lines of this function (including the start of the
// signature, the condition guarding the bitcast block, the memory operand
// construction and the vector widening call) are absent from this listing.
7550 MachineIRBuilder &B = Helper.MIRBuilder;
7551 GISelChangeObserver &Observer = Helper.Observer;
7552
7553 Register OrigDst = MI.getOperand(0).getReg();
7554 Register Dst;
7555 LLT Ty = B.getMRI()->getType(OrigDst);
7556 unsigned Size = Ty.getSizeInBits();
7557 MachineFunction &MF = B.getMF();
7558 unsigned Opc = 0;
7559 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7560 assert(Size == 8 || Size == 16);
7561 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7562 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7563 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7564 // destination register.
7565 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7566 } else {
7567 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7568 Dst = OrigDst;
7569 }
7570
7571 Observer.changingInstr(MI);
7572
7573 // Handle needing to s.buffer.load() a p8 value.
7574 if (hasBufferRsrcWorkaround(Ty)) {
7575 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7576 B.setInsertPt(B.getMBB(), MI);
7577 }
7579 Ty = getBitcastRegisterType(Ty);
7580 Helper.bitcastDst(MI, Ty, 0);
7581 B.setInsertPt(B.getMBB(), MI);
7582 }
7583
7584 // FIXME: We don't really need this intermediate instruction. The intrinsic
7585 // should be fixed to have a memory operand. Since it's readnone, we're not
7586 // allowed to add one.
7587 MI.setDesc(B.getTII().get(Opc));
7588 MI.removeOperand(1); // Remove intrinsic ID
7589
7590 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
// Size in bytes of the original result type, rounded up.
7591 const unsigned MemSize = (Size + 7) / 8;
7592 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7598 MemSize, MemAlign);
7599 MI.addMemOperand(MF, MMO);
// Sub-dword case: the load now defines the fresh s32 register, and the
// original destination is produced by a truncate placed after the load.
7600 if (Dst != OrigDst) {
7601 MI.getOperand(0).setReg(Dst);
7602 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7603 B.buildTrunc(OrigDst, Dst);
7604 }
7605
7606 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7607 // always be legal. We may need to restore this to a 96-bit result if it turns
7608 // out this needs to be converted to a vector load during RegBankSelect.
7609 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7610 if (Ty.isVector())
7612 else
7613 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7614 }
7615
7616 Observer.changedInstr(MI);
7617 return true;
7618 }
7619
7621 MachineInstr &MI) const {
// Rewrite a scalar buffer prefetch intrinsic in place to
// G_AMDGPU_S_BUFFER_PREFETCH, dropping the intrinsic ID operand and notifying
// the change observer around the mutation. Always returns true.
// NOTE(review): the start of the signature and one statement between the
// removeOperand call and changedInstr are absent from this listing.
7622 MachineIRBuilder &B = Helper.MIRBuilder;
7623 GISelChangeObserver &Observer = Helper.Observer;
7624 Observer.changingInstr(MI);
7625 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7626 MI.removeOperand(0); // Remove intrinsic ID
7628 Observer.changedInstr(MI);
7629 return true;
7630 }
7631
7632 // TODO: Move to selection
7635 MachineIRBuilder &B) const {
// Pick a trap lowering based on the subtarget.
// NOTE(review): the start of the signature and the two arms of the final
// conditional expression are absent from this listing.
// Without a trap handler, or with a trap handler ABI other than AMDHSA, the
// trap is lowered by legalizeTrapEndpgm.
7636 if (!ST.hasTrapHandler() ||
7637 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7638 return legalizeTrapEndpgm(MI, MRI, B);
7639
// Otherwise the choice depends on whether the subtarget supports
// getting the doorbell ID.
7640 return ST.supportsGetDoorbellID() ?
7642 }
7643
// Lowers a trap to S_ENDPGM with immediate 0. MI is erased on every path and
// the function always returns true.
// NOTE(review): the signature (source lines 7644-7645) is missing from this
// listing.
7646 const DebugLoc &DL = MI.getDebugLoc();
7647 MachineBasicBlock &BB = B.getMBB();
7648 MachineFunction *MF = BB.getParent();
7649
     // Easy case: MI is the last instruction of a block without successors, so
     // S_ENDPGM can simply be appended to the end of this block.
7650 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7651 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7652 .addImm(0);
7653 MI.eraseFromParent();
7654 return true;
7655 }
7656
7657 // We need a block split to make the real endpgm a terminator. We also don't
7658 // want to break phis in successor blocks, so we can't just delete to the
7659 // end of the block.
7660 BB.splitAt(MI, false /*UpdateLiveIns*/);
     // NOTE(review): the declaration of TrapBB (source line 7661) is missing
     // from this listing.
     // TrapBB is appended to the function and holds only the S_ENDPGM.
7662 MF->push_back(TrapBB);
7663 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7664 .addImm(0);
     // Branch to TrapBB from the position of the original trap instruction.
7665 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7666 .addMBB(TrapBB);
7667
7668 BB.addSuccessor(TrapBB);
7669 MI.eraseFromParent();
7670 return true;
7671}
7672
// Emits S_TRAP with the LLVMAMDHSATrap ID after copying a 64-bit value into
// SGPR0_SGPR1, which is attached to the trap as an implicit use. Two paths are
// visible: one loads the value from the kernarg segment at an
// implicit-parameter offset, the other copies a preloaded live-in register.
// Returns false when the required input cannot be obtained; otherwise erases
// MI and returns true.
// NOTE(review): the signature, the condition guarding the first path and
// several declarations (Param, Offset, PtrInfo, MMO, LoadAddr, the LiveIn
// initializer) are on source lines missing from this listing.
7675 MachineFunction &MF = B.getMF();
7676 const LLT S64 = LLT::scalar(64);
7677
     // Physical register pair that receives the value handed to the trap.
7678 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7679 // For code object version 5, queue_ptr is passed through implicit kernarg.
7685 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7686
7687 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7689
     // Bail out if the kernarg segment pointer input is unavailable.
7690 if (!loadInputValue(KernargPtrReg, B,
7692 return false;
7693
7694 // TODO: can we be smarter about machine pointer info?
7697 PtrInfo.getWithOffset(Offset),
7701
7702 // Pointer address
7705 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7706 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7707 // Load address
7708 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7709 B.buildCopy(SGPR01, Temp);
7710 B.buildInstr(AMDGPU::S_TRAP)
7711 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7712 .addReg(SGPR01, RegState::Implicit);
7713 MI.eraseFromParent();
7714 return true;
7715 }
7716
7717 // Pass queue pointer to trap handler as input, and insert trap instruction
7718 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7719 Register LiveIn =
7722 return false;
7723
7724 B.buildCopy(SGPR01, LiveIn);
7725 B.buildInstr(AMDGPU::S_TRAP)
7726 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7727 .addReg(SGPR01, RegState::Implicit);
7728
7729 MI.eraseFromParent();
7730 return true;
7731}
7732
// Lowers a trap to S_TRAP with the LLVMAMDHSATrap ID and no extra register
// inputs. On subtargets reporting hasPrivEnabledTrap2NopBug() the trap is
// instead expanded through SIInstrInfo::insertSimulatedTrap. MI is erased and
// true is returned on both paths.
// NOTE(review): the first lines of the signature are missing from this
// listing.
7735 MachineIRBuilder &B) const {
7736 // We need to simulate the 's_trap 2' instruction on targets that run in
7737 // PRIV=1 (where it is treated as a nop).
7738 if (ST.hasPrivEnabledTrap2NopBug()) {
7739 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7740 MI.getDebugLoc());
7741 MI.eraseFromParent();
7742 return true;
7743 }
7744
7745 B.buildInstr(AMDGPU::S_TRAP)
7746 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7747 MI.eraseFromParent();
7748 return true;
7749}
7750
// Lowers a debug trap. With an AMDHSA trap handler an S_TRAP carrying the
// LLVMAMDHSADebugTrap ID is emitted; otherwise only a warning diagnostic
// ("debugtrap handler not supported") is produced and no instruction is
// emitted. MI is erased and true is returned either way.
// NOTE(review): the first lines of the signature and the head of the
// diagnostic call (source line 7759) are missing from this listing.
7753 MachineIRBuilder &B) const {
7754 // Is non-HSA path or trap-handler disabled? Then, report a warning
7755 // accordingly
7756 if (!ST.hasTrapHandler() ||
7757 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7758 Function &Fn = B.getMF().getFunction();
7760 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7761 } else {
7762 // Insert debug-trap instruction
7763 B.buildInstr(AMDGPU::S_TRAP)
7764 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7765 }
7766
7767 MI.eraseFromParent();
7768 return true;
7769}
7770
// Lowers a BVH intersect-ray intrinsic to G_AMDGPU_BVH_INTERSECT_RAY. The
// concrete MIMG opcode is chosen here and attached as an immediate, and the
// address operands (node pointer, ray extent, origin, direction, inverse
// direction) are repacked into the layout required by the selected encoding.
// Returns false after emitting a diagnostic when the subtarget lacks the
// GFX10_A encoding; otherwise erases MI and returns true.
// NOTE(review): the first line of the signature, the head of the diagnostic
// call (7789), the declaration of Ops (7825) and the declarations of R1..R3
// (7881-7883) are on source lines missing from this listing.
7772 MachineInstr &MI, MachineIRBuilder &B) const {
7773 MachineRegisterInfo &MRI = *B.getMRI();
7774 const LLT S16 = LLT::scalar(16);
7775 const LLT S32 = LLT::scalar(32);
7776 const LLT V2S16 = LLT::fixed_vector(2, 16);
7777 const LLT V3S32 = LLT::fixed_vector(3, 32);
7778
     // Operand 1 is not read here; the sources start at operand 2.
7779 Register DstReg = MI.getOperand(0).getReg();
7780 Register NodePtr = MI.getOperand(2).getReg();
7781 Register RayExtent = MI.getOperand(3).getReg();
7782 Register RayOrigin = MI.getOperand(4).getReg();
7783 Register RayDir = MI.getOperand(5).getReg();
7784 Register RayInvDir = MI.getOperand(6).getReg();
7785 Register TDescr = MI.getOperand(7).getReg();
7786
7787 if (!ST.hasGFX10_AEncoding()) {
7788 Function &Fn = B.getMF().getFunction();
7790 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7791 return false;
7792 }
7793
7794 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7795 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7796 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
     // A16 is inferred from the ray direction having 16-bit elements; Is64
     // from the node pointer being 64 bits wide.
7797 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7798 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7799 const unsigned NumVDataDwords = 4;
7800 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
     // On GFX11+ the address is counted in grouped operands rather than in
     // individual dwords.
7801 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
     // GFX12+ always takes the NSA path; earlier targets only when the
     // subtarget has NSA encoding and the address count fits its limit.
7802 const bool UseNSA =
7803 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7804
     // Base opcode table indexed as [Is64][IsA16].
7805 const unsigned BaseOpcodes[2][2] = {
7806 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7807 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7808 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7809 int Opcode;
7810 if (UseNSA) {
7811 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7812 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7813 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7814 : AMDGPU::MIMGEncGfx10NSA,
7815 NumVDataDwords, NumVAddrDwords);
7816 } else {
7817 assert(!IsGFX12Plus);
7818 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7819 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7820 : AMDGPU::MIMGEncGfx10Default,
7821 NumVDataDwords, NumVAddrDwords);
7822 }
7823 assert(Opcode != -1);
7824
     // GFX11+ NSA form: each three-component source becomes one V3S32 operand.
7826 if (UseNSA && IsGFX11Plus) {
7827 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7828 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7829 auto Merged = B.buildMergeLikeInstr(
7830 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7831 Ops.push_back(Merged.getReg(0));
7832 };
7833
7834 Ops.push_back(NodePtr);
7835 Ops.push_back(RayExtent);
7836 packLanes(RayOrigin);
7837
7838 if (IsA16) {
     // With A16 the direction and inverse direction share one V3S32: dword i
     // is built from inverse-direction component i (first merge source) and
     // direction component i (second merge source).
7839 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7840 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7841 auto MergedDir = B.buildMergeLikeInstr(
7842 V3S32,
7843 {B.buildBitcast(
7844 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7845 UnmergeRayDir.getReg(0)}))
7846 .getReg(0),
7847 B.buildBitcast(
7848 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7849 UnmergeRayDir.getReg(1)}))
7850 .getReg(0),
7851 B.buildBitcast(
7852 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7853 UnmergeRayDir.getReg(2)}))
7854 .getReg(0)});
7855 Ops.push_back(MergedDir.getReg(0));
7856 } else {
7857 packLanes(RayDir);
7858 packLanes(RayInvDir);
7859 }
7860 } else {
     // Other encodings: every address dword is pushed as its own operand.
7861 if (Is64) {
7862 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7863 Ops.push_back(Unmerge.getReg(0));
7864 Ops.push_back(Unmerge.getReg(1));
7865 } else {
7866 Ops.push_back(NodePtr);
7867 }
7868 Ops.push_back(RayExtent);
7869
7870 auto packLanes = [&Ops, &S32, &B](Register Src) {
7871 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7872 Ops.push_back(Unmerge.getReg(0));
7873 Ops.push_back(Unmerge.getReg(1));
7874 Ops.push_back(Unmerge.getReg(2));
7875 };
7876
7877 packLanes(RayOrigin);
7878 if (IsA16) {
     // The six 16-bit components are packed pairwise, in order, into three
     // registers: (dir0, dir1), (dir2, inv0), (inv1, inv2).
7879 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7880 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7884 B.buildMergeLikeInstr(R1,
7885 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7886 B.buildMergeLikeInstr(
7887 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7888 B.buildMergeLikeInstr(
7889 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7890 Ops.push_back(R1);
7891 Ops.push_back(R2);
7892 Ops.push_back(R3);
7893 } else {
7894 packLanes(RayDir);
7895 packLanes(RayInvDir);
7896 }
7897 }
7898
7899 if (!UseNSA) {
7900 // Build a single vector containing all the operands so far prepared.
7901 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7902 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7903 Ops.clear();
7904 Ops.push_back(MergedOps);
7905 }
7906
     // Operand order of the new instruction: result, MIMG opcode immediate,
     // address operands, texture descriptor, A16 flag.
7907 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7908 .addDef(DstReg)
7909 .addImm(Opcode);
7910
7911 for (Register R : Ops) {
7912 MIB.addUse(R);
7913 }
7914
     // The memory operands of the original intrinsic are carried over.
7915 MIB.addUse(TDescr)
7916 .addImm(IsA16 ? 1 : 0)
7917 .cloneMemRefs(MI);
7918
7919 MI.eraseFromParent();
7920 return true;
7921}
7922
// Lowers the BVH8 and BVH-dual intersect-ray intrinsics to
// G_AMDGPU_BVH8_INTERSECT_RAY or G_AMDGPU_BVH_DUAL_INTERSECT_RAY
// respectively, attaching the selected GFX12 MIMG opcode as an immediate.
// Returns false after emitting a diagnostic when the subtarget reports no
// BVH dual/BVH8 instructions; otherwise erases MI and returns true.
// NOTE(review): the first line of the signature and the head of the
// diagnostic call (source line 7941) are missing from this listing.
7924 MachineInstr &MI, MachineIRBuilder &B) const {
7925 const LLT S32 = LLT::scalar(32);
7926 const LLT V2S32 = LLT::fixed_vector(2, 32);
7927
     // Operands 0-2 are the three results; operand 3 is not read here and the
     // sources start at operand 4.
7928 Register DstReg = MI.getOperand(0).getReg();
7929 Register DstOrigin = MI.getOperand(1).getReg();
7930 Register DstDir = MI.getOperand(2).getReg();
7931 Register NodePtr = MI.getOperand(4).getReg();
7932 Register RayExtent = MI.getOperand(5).getReg();
7933 Register InstanceMask = MI.getOperand(6).getReg();
7934 Register RayOrigin = MI.getOperand(7).getReg();
7935 Register RayDir = MI.getOperand(8).getReg();
7936 Register Offsets = MI.getOperand(9).getReg();
7937 Register TDescr = MI.getOperand(10).getReg();
7938
7939 if (!ST.hasBVHDualAndBVH8Insts()) {
7940 Function &Fn = B.getMF().getFunction();
7942 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7943 return false;
7944 }
7945
     // Both intrinsics share this lowering; only the opcodes and the address
     // dword count differ.
7946 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7947 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7948 const unsigned NumVDataDwords = 10;
7949 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7950 int Opcode = AMDGPU::getMIMGOpcode(
7951 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7952 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7953 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7954 assert(Opcode != -1);
7955
     // The ray extent and the instance mask (any-extended to 32 bits) are
     // passed together as a single two-element vector operand.
7956 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7957 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7958
7959 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7960 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7961 .addDef(DstReg)
7962 .addDef(DstOrigin)
7963 .addDef(DstDir)
7964 .addImm(Opcode)
7965 .addUse(NodePtr)
7966 .addUse(RayExtentInstanceMaskVec.getReg(0))
7967 .addUse(RayOrigin)
7968 .addUse(RayDir)
7969 .addUse(Offsets)
7970 .addUse(TDescr)
7971 .cloneMemRefs(MI);
7972
7973 MI.eraseFromParent();
7974 return true;
7975}
7976
// Replaces MI with G_AMDGPU_WAVE_ADDRESS, defining MI's result register from
// StackPtr. MI is erased and true is returned.
// NOTE(review): the first line of the signature and the declaration of
// StackPtr (source line 7980, presumably obtained through TLI) are missing
// from this listing.
7978 MachineIRBuilder &B) const {
7979 const SITargetLowering *TLI = ST.getTargetLowering();
7981 Register DstReg = MI.getOperand(0).getReg();
7982 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7983 MI.eraseFromParent();
7984 return true;
7985}
7986
// Lowers the wave-ID intrinsic to an unsigned bitfield extract from TTMP8.
// Returns false when the subtarget has no architected SGPRs; otherwise erases
// MI and returns true.
// NOTE(review): the first line of the signature is missing from this listing.
7988 MachineIRBuilder &B) const {
7989 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7990 if (!ST.hasArchitectedSGPRs())
7991 return false;
7992 LLT S32 = LLT::scalar(32);
7993 Register DstReg = MI.getOperand(0).getReg();
7994 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
     // Field position and width matching bits [29:25]: offset 25, 5 bits.
7995 auto LSB = B.buildConstant(S32, 25);
7996 auto Width = B.buildConstant(S32, 5);
7997 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7998 MI.eraseFromParent();
7999 return true;
8000}
8001
// Replaces MI with S_GETREG_B32_const reading Width bits starting at LowBit
// of hardware register HwReg, writing into MI's result register. MI is erased
// and true is returned.
// NOTE(review): the first lines of the signature (source lines 8002-8003) are
// missing from this listing; MI and B are declared there.
8004 AMDGPU::Hwreg::Id HwReg,
8005 unsigned LowBit,
8006 unsigned Width) const {
8007 MachineRegisterInfo &MRI = *B.getMRI();
8008 Register DstReg = MI.getOperand(0).getReg();
     // Give the result a register class if it does not have one yet.
8009 if (!MRI.getRegClassOrNull(DstReg))
8010 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8011 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8012 .addDef(DstReg)
8013 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
8014 MI.eraseFromParent();
8015 return true;
8016}
8017
// Immediate operand given to the amdgcn_s_getreg / amdgcn_s_setreg intrinsics
// for the first 32-bit half of the 64-bit FP environment value.
// NOTE(review): the initializer (source line 8019) is missing from this
// listing.
8018static constexpr unsigned FPEnvModeBitField =
8020
// Immediate operand given to the amdgcn_s_getreg / amdgcn_s_setreg intrinsics
// for the second 32-bit half of the 64-bit FP environment value.
// NOTE(review): the initializer (source line 8022) is missing from this
// listing.
8021static constexpr unsigned FPEnvTrapBitField =
8023
// Builds the 64-bit FP environment value from two amdgcn_s_getreg reads, one
// with FPEnvModeBitField and one with FPEnvTrapBitField, merged in that order
// into MI's operand 0. Returns false unless that operand has type S64;
// otherwise erases MI and returns true.
// NOTE(review): the first lines of the signature are missing from this
// listing.
8026 MachineIRBuilder &B) const {
     // Operand 0 is the register that receives the merged result.
8027 Register Src = MI.getOperand(0).getReg();
8028 if (MRI.getType(Src) != S64)
8029 return false;
8030
8031 auto ModeReg =
8032 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8033 /*HasSideEffects=*/true, /*isConvergent=*/false)
8034 .addImm(FPEnvModeBitField);
8035 auto TrapReg =
8036 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8037 /*HasSideEffects=*/true, /*isConvergent=*/false)
8038 .addImm(FPEnvTrapBitField);
8039 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8040 MI.eraseFromParent();
8041 return true;
8042}
8043
// Splits the 64-bit FP environment value in MI's operand 0 into two 32-bit
// halves and writes them with two amdgcn_s_setreg calls: the first half with
// FPEnvModeBitField, the second with FPEnvTrapBitField. Returns false unless
// the operand has type S64; otherwise erases MI and returns true.
// NOTE(review): the first lines of the signature are missing from this
// listing.
8046 MachineIRBuilder &B) const {
8047 Register Src = MI.getOperand(0).getReg();
8048 if (MRI.getType(Src) != S64)
8049 return false;
8050
8051 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
     // The bit-field constants are narrowed to int16_t before being added as
     // immediates.
8052 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8053 /*HasSideEffects=*/true, /*isConvergent=*/false)
8054 .addImm(static_cast<int16_t>(FPEnvModeBitField))
8055 .addReg(Unmerge.getReg(0));
8056 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8057 /*HasSideEffects=*/true, /*isConvergent=*/false)
8058 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
8059 .addReg(Unmerge.getReg(1));
8060 MI.eraseFromParent();
8061 return true;
8062}
8063
// Legalizes an intrinsic instruction by switching on its intrinsic ID. Most
// cases forward to a dedicated legalize* helper and return its result; a few
// are lowered inline. The visible false returns are a control-flow intrinsic
// that fails verifyCFIntrinsic and the av load/store case on an unsupported
// subtarget. Intrinsics not listed here and not recognized as image-dimension
// intrinsics fall through to "return true" unchanged.
// NOTE(review): the first line of the signature and many source lines are
// missing from this listing; most are the argument tails or whole return
// statements of the dispatch cases. Where a case label is directly followed
// by another label, it cannot be told from here whether it falls through.
8065 MachineInstr &MI) const {
8066 MachineIRBuilder &B = Helper.MIRBuilder;
8067 MachineRegisterInfo &MRI = *B.getMRI();
8068
8069 // Replace the G_BRCOND that uses the intrinsic with the exec-manipulating
     // branch pseudos (applies to the amdgcn_if / amdgcn_else / amdgcn_loop
     // cases below).
8070 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
8071 switch (IntrID) {
8072 case Intrinsic::sponentry:
     // At the bottom of the stack the result comes from G_AMDGPU_SPONENTRY
     // through an integer temporary; otherwise it is the frame index of a new
     // fixed object (size 1, offset 0).
8073 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8074 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8075 // that we can remove this cast.
8076 const LLT S32 = LLT::scalar(32);
8078 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8079
8080 Register DstReg = MI.getOperand(0).getReg();
8081 B.buildIntToPtr(DstReg, TmpReg);
8082 MI.eraseFromParent();
8083 } else {
8084 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8085 1, 0, /*IsImmutable=*/false);
8086 B.buildFrameIndex(MI.getOperand(0), FI);
8087 MI.eraseFromParent();
8088 }
8089 return true;
8090 case Intrinsic::amdgcn_if:
8091 case Intrinsic::amdgcn_else: {
8092 MachineInstr *Br = nullptr;
8093 MachineBasicBlock *UncondBrTarget = nullptr;
8094 bool Negated = false;
     // verifyCFIntrinsic hands back the conditional branch associated with MI,
     // or null, in which case legalization fails.
8095 if (MachineInstr *BrCond =
8096 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8097 const SIRegisterInfo *TRI
8098 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8099
8100 Register Def = MI.getOperand(1).getReg();
8101 Register Use = MI.getOperand(3).getReg();
8102
8103 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8104
8105 if (Negated)
8106 std::swap(CondBrTarget, UncondBrTarget);
8107
     // The pseudo is inserted at the position of the conditional branch and
     // takes the unconditional target as its block operand.
8108 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8109 if (IntrID == Intrinsic::amdgcn_if) {
8110 B.buildInstr(AMDGPU::SI_IF)
8111 .addDef(Def)
8112 .addUse(Use)
8113 .addMBB(UncondBrTarget);
8114 } else {
8115 B.buildInstr(AMDGPU::SI_ELSE)
8116 .addDef(Def)
8117 .addUse(Use)
8118 .addMBB(UncondBrTarget);
8119 }
8120
8121 if (Br) {
8122 Br->getOperand(0).setMBB(CondBrTarget);
8123 } else {
8124 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8125 // since we're swapping branch targets it needs to be reinserted.
8126 // FIXME: IRTranslator should probably not do this
8127 B.buildBr(*CondBrTarget);
8128 }
8129
     // Both mask registers are constrained to the wave mask register class.
8130 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8131 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8132 MI.eraseFromParent();
8133 BrCond->eraseFromParent();
8134 return true;
8135 }
8136
8137 return false;
8138 }
8139 case Intrinsic::amdgcn_loop: {
     // Same scheme as amdgcn_if / amdgcn_else, using SI_LOOP with a single
     // mask register taken from operand 2.
8140 MachineInstr *Br = nullptr;
8141 MachineBasicBlock *UncondBrTarget = nullptr;
8142 bool Negated = false;
8143 if (MachineInstr *BrCond =
8144 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8145 const SIRegisterInfo *TRI
8146 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8147
8148 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8149 Register Reg = MI.getOperand(2).getReg();
8150
8151 if (Negated)
8152 std::swap(CondBrTarget, UncondBrTarget);
8153
8154 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8155 B.buildInstr(AMDGPU::SI_LOOP)
8156 .addUse(Reg)
8157 .addMBB(UncondBrTarget);
8158
8159 if (Br)
8160 Br->getOperand(0).setMBB(CondBrTarget);
8161 else
8162 B.buildBr(*CondBrTarget);
8163
8164 MI.eraseFromParent();
8165 BrCond->eraseFromParent();
8166 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8167 return true;
8168 }
8169
8170 return false;
8171 }
8172 case Intrinsic::amdgcn_addrspacecast_nonnull:
8173 return legalizeAddrSpaceCast(MI, MRI, B);
8174 case Intrinsic::amdgcn_make_buffer_rsrc:
8175 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8176 case Intrinsic::amdgcn_kernarg_segment_ptr:
8177 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8178 // This only makes sense to call in a kernel, so just lower to null.
8179 B.buildConstant(MI.getOperand(0).getReg(), 0);
8180 MI.eraseFromParent();
8181 return true;
8182 }
8183
     // NOTE(review): the kernel-path handling (source lines 8184-8185) is
     // missing from this listing.
8186 case Intrinsic::amdgcn_implicitarg_ptr:
8187 return legalizeImplicitArgPtr(MI, MRI, B);
8188 case Intrinsic::amdgcn_workitem_id_x:
8189 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8191 case Intrinsic::amdgcn_workitem_id_y:
8192 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8194 case Intrinsic::amdgcn_workitem_id_z:
8195 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8197 case Intrinsic::amdgcn_workgroup_id_x:
8198 return legalizeWorkGroupId(
8202 case Intrinsic::amdgcn_workgroup_id_y:
8203 return legalizeWorkGroupId(
8207 case Intrinsic::amdgcn_workgroup_id_z:
8208 return legalizeWorkGroupId(
     // Every cluster intrinsic is gated on ST.hasClusters(): without cluster
     // support the && short-circuits and the case returns false.
8212 case Intrinsic::amdgcn_cluster_id_x:
8213 return ST.hasClusters() &&
8216 case Intrinsic::amdgcn_cluster_id_y:
8217 return ST.hasClusters() &&
8220 case Intrinsic::amdgcn_cluster_id_z:
8221 return ST.hasClusters() &&
8224 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8225 return ST.hasClusters() &&
8228 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8229 return ST.hasClusters() &&
8232 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8233 return ST.hasClusters() &&
8236 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8237 return ST.hasClusters() &&
8239 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8240 return ST.hasClusters() &&
8243 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8244 return ST.hasClusters() &&
8247 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8248 return ST.hasClusters() &&
8251 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8252 return ST.hasClusters() &&
8254 MI, MRI, B,
8256 case Intrinsic::amdgcn_wave_id:
8257 return legalizeWaveID(MI, B);
8258 case Intrinsic::amdgcn_lds_kernel_id:
8259 return legalizePreloadedArgIntrin(MI, MRI, B,
8261 case Intrinsic::amdgcn_dispatch_ptr:
8262 return legalizePreloadedArgIntrin(MI, MRI, B,
8264 case Intrinsic::amdgcn_queue_ptr:
8265 return legalizePreloadedArgIntrin(MI, MRI, B,
8267 case Intrinsic::amdgcn_implicit_buffer_ptr:
8270 case Intrinsic::amdgcn_dispatch_id:
8271 return legalizePreloadedArgIntrin(MI, MRI, B,
8273 case Intrinsic::r600_read_ngroups_x:
8274 // TODO: Emit error for hsa
8277 case Intrinsic::r600_read_ngroups_y:
8280 case Intrinsic::r600_read_ngroups_z:
8283 case Intrinsic::r600_read_local_size_x:
8284 // TODO: Could insert G_ASSERT_ZEXT from s16
8286 case Intrinsic::r600_read_local_size_y:
8287 // TODO: Could insert G_ASSERT_ZEXT from s16
8289 // TODO: Could insert G_ASSERT_ZEXT from s16
8290 case Intrinsic::r600_read_local_size_z:
8293 case Intrinsic::amdgcn_fdiv_fast:
8294 return legalizeFDIVFastIntrin(MI, MRI, B);
8295 case Intrinsic::amdgcn_is_shared:
8297 case Intrinsic::amdgcn_is_private:
8299 case Intrinsic::amdgcn_wavefrontsize: {
     // Replaced by a constant holding the subtarget's wavefront size.
8300 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8301 MI.eraseFromParent();
8302 return true;
8303 }
8304 case Intrinsic::amdgcn_s_buffer_load:
8305 return legalizeSBufferLoad(Helper, MI);
     // Buffer stores and loads: the two trailing booleans select the plain,
     // format and typed (tbuffer) variants of the shared helper.
     // NOTE(review): the parameter names are not visible here, and the flag
     // order differs between legalizeBufferStore and legalizeBufferLoad.
8306 case Intrinsic::amdgcn_raw_buffer_store:
8307 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8308 case Intrinsic::amdgcn_struct_buffer_store:
8309 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8310 return legalizeBufferStore(MI, Helper, false, false);
8311 case Intrinsic::amdgcn_raw_buffer_store_format:
8312 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8313 case Intrinsic::amdgcn_struct_buffer_store_format:
8314 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8315 return legalizeBufferStore(MI, Helper, false, true);
8316 case Intrinsic::amdgcn_raw_tbuffer_store:
8317 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8318 case Intrinsic::amdgcn_struct_tbuffer_store:
8319 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8320 return legalizeBufferStore(MI, Helper, true, true);
8321 case Intrinsic::amdgcn_raw_buffer_load:
8322 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8323 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8324 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8325 case Intrinsic::amdgcn_struct_buffer_load:
8326 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8327 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8328 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8329 return legalizeBufferLoad(MI, Helper, false, false);
8330 case Intrinsic::amdgcn_raw_buffer_load_format:
8331 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8332 case Intrinsic::amdgcn_struct_buffer_load_format:
8333 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8334 return legalizeBufferLoad(MI, Helper, true, false);
8335 case Intrinsic::amdgcn_raw_tbuffer_load:
8336 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8337 case Intrinsic::amdgcn_struct_tbuffer_load:
8338 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8339 return legalizeBufferLoad(MI, Helper, true, true);
     // All raw/struct (and ptr) buffer atomics share one helper, which receives
     // the intrinsic ID.
8340 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8341 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8342 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8343 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8344 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8345 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8346 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8347 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8348 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8349 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8350 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8351 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8352 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8353 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8354 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8355 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8356 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8357 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8358 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8359 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8360 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8361 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8362 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8363 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8364 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8365 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8366 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8367 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8368 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8369 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8370 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8371 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8372 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8373 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8374 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8375 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8376 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8377 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8378 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8379 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8380 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8381 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8382 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8383 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8384 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8385 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8386 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8387 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8388 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8389 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8390 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8391 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8392 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8393 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8394 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8395 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8396 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8397 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8398 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8399 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8400 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8401 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8402 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8403 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8404 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8405 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8406 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8407 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8408 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8409 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8410 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8411 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8412 return legalizeBufferAtomic(MI, B, IntrID);
8413 case Intrinsic::amdgcn_rsq_clamp:
8414 return legalizeRsqClampIntrinsic(MI, MRI, B);
8415 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8417 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8418 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8420 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8421 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8422 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8423 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8424 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8425 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8426 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8427 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
     // The index (operand 5) is normalized to s64: a vector is bitcast, any
     // other non-s64 type is any-extended. MI is updated in place.
8428 Register Index = MI.getOperand(5).getReg();
8429 LLT S64 = LLT::scalar(64);
8430 LLT IndexArgTy = MRI.getType(Index);
8431 if (IndexArgTy != S64) {
8432 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8433 : B.buildAnyExt(S64, Index);
8434 MI.getOperand(5).setReg(NewIndex.getReg(0));
8435 }
8436 return true;
8437 }
8438 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8439 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8440 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8441 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8442 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8443 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8444 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8445 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
     // Here the index (operand 5) is any-extended to s32 when it is not
     // already s32.
8446 Register Index = MI.getOperand(5).getReg();
8447 LLT S32 = LLT::scalar(32);
8448 if (MRI.getType(Index) != S32)
8449 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8450 return true;
8451 }
8452 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8453 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8454 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8455 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8456 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8457 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8458 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8459 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8460 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
     // For this group the index is operand 7; its target type is s64 for the
     // 16x16x128 iu8 variant and s32 for the rest.
8461 Register Index = MI.getOperand(7).getReg();
8462 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8463 ? LLT::scalar(64)
8464 : LLT::scalar(32);
8465 LLT IndexArgTy = MRI.getType(Index);
8466 if (IndexArgTy != IdxTy) {
8467 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8468 : B.buildAnyExt(IdxTy, Index);
8469 MI.getOperand(7).setReg(NewIndex.getReg(0));
8470 }
8471 return true;
8472 }
8473
8474 case Intrinsic::amdgcn_fmed3: {
8475 GISelChangeObserver &Observer = Helper.Observer;
8476
8477 // FIXME: This is to workaround the inability of tablegen match combiners to
8478 // match intrinsics in patterns.
     // MI is converted in place to G_AMDGPU_FMED3; operand 1 (removed below) is
     // the intrinsic ID, since operand 0 is the result.
8479 Observer.changingInstr(MI);
8480 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8481 MI.removeOperand(1);
8482 Observer.changedInstr(MI);
8483 return true;
8484 }
8485 case Intrinsic::amdgcn_readlane:
8486 case Intrinsic::amdgcn_writelane:
8487 case Intrinsic::amdgcn_readfirstlane:
8488 case Intrinsic::amdgcn_permlane16:
8489 case Intrinsic::amdgcn_permlanex16:
8490 case Intrinsic::amdgcn_permlane64:
8491 case Intrinsic::amdgcn_set_inactive:
8492 case Intrinsic::amdgcn_set_inactive_chain_arg:
8493 case Intrinsic::amdgcn_mov_dpp8:
8494 case Intrinsic::amdgcn_update_dpp:
8495 case Intrinsic::amdgcn_permlane_bcast:
8496 case Intrinsic::amdgcn_permlane_up:
8497 case Intrinsic::amdgcn_permlane_down:
8498 case Intrinsic::amdgcn_permlane_xor:
8499 return legalizeLaneOp(Helper, MI, IntrID);
8500 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8501 return legalizeSBufferPrefetch(Helper, MI);
8502 case Intrinsic::amdgcn_dead: {
8503 // TODO: Use poison instead of undef
     // Every result of the intrinsic is defined as undef.
8504 for (const MachineOperand &Def : MI.defs())
8505 B.buildUndef(Def);
8506 MI.eraseFromParent();
8507 return true;
8508 }
     // Cooperative atomic loads/stores become ordinary loads/stores that reuse
     // the single memory operand already attached to MI.
8509 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8510 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8511 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8512 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8513 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8514 MI.eraseFromParent();
8515 return true;
8516 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8517 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8518 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8519 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8520 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8521 MI.eraseFromParent();
8522 return true;
8523 case Intrinsic::amdgcn_av_load_b128:
8524 case Intrinsic::amdgcn_av_store_b128: {
     // Rejected with a diagnostic on subtargets without flat-global
     // instructions; otherwise lowered to a plain load or store using MI's
     // memory operand.
8525 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
8526 if (!ST.hasFlatGlobalInsts()) {
8527 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8528 ? "llvm.amdgcn.av.load.b128"
8529 : "llvm.amdgcn.av.store.b128";
8530 Function &Fn = B.getMF().getFunction();
8532 Fn, Twine(Name) + " not supported on subtarget", MI.getDebugLoc()));
8533 return false;
8534 }
8535 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8536 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8537 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8538 else
8539 B.buildStore(MI.getOperand(2), MI.getOperand(1),
8540 **MI.memoperands_begin());
8541 MI.eraseFromParent();
8542 return true;
8543 }
     // Monitor loads map to dedicated generic opcodes; operands 0 and 2 and the
     // memory operand are forwarded.
8544 case Intrinsic::amdgcn_flat_load_monitor_b32:
8545 case Intrinsic::amdgcn_flat_load_monitor_b64:
8546 case Intrinsic::amdgcn_flat_load_monitor_b128:
8547 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8548 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8549 .add(MI.getOperand(0))
8550 .add(MI.getOperand(2))
8551 .addMemOperand(*MI.memoperands_begin());
8552 MI.eraseFromParent();
8553 return true;
8554 case Intrinsic::amdgcn_global_load_monitor_b32:
8555 case Intrinsic::amdgcn_global_load_monitor_b64:
8556 case Intrinsic::amdgcn_global_load_monitor_b128:
8557 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8558 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8559 .add(MI.getOperand(0))
8560 .add(MI.getOperand(2))
8561 .addMemOperand(*MI.memoperands_begin());
8562 MI.eraseFromParent();
8563 return true;
8564 default: {
     // Image-dimension intrinsics go to legalizeImageIntrinsic; anything else
     // is left untouched and returns true.
     // NOTE(review): the lookup call (source line 8566) is missing from this
     // listing.
8565 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8567 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8568 return true;
8569 }
8570 }
8571
8572 return true;
8573}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:77
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1273
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1197
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:196
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:624
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:383
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:858
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:558
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:1984
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:653
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:461
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:314
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most, stopping at the first 1.
Definition bit.h:204
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1682
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:78
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.