AMDGPULegalizerInfo.cpp (LLVM 17.0.0git)
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
 9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
22#include "llvm/ADT/ScopeExit.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/IR/IntrinsicsR600.h"
30
31#define DEBUG_TYPE "amdgpu-legalinfo"
32
33using namespace llvm;
34using namespace LegalizeActions;
35using namespace LegalizeMutations;
36using namespace LegalityPredicates;
37using namespace MIPatternMatch;
38
39// Hack until load/store selection patterns support any tuple of legal types.
 40static cl::opt<bool> EnableNewLegality(
 41 "amdgpu-global-isel-new-legality",
 42 cl::desc("Use GlobalISel desired legality, rather than try to use "
 43 "rules compatible with selection patterns"),
 44 cl::init(false),
 45 cl::ReallyHidden);
 46
47static constexpr unsigned MaxRegisterSize = 1024;
48
49// Round the number of elements to the next power of two elements
 50static LLT getPow2VectorType(LLT Ty) {
 51 unsigned NElts = Ty.getNumElements();
52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54}
55
56// Round the number of bits to the next power of two bits
 57static LLT getPow2ScalarType(LLT Ty) {
 58 unsigned Bits = Ty.getSizeInBits();
59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60 return LLT::scalar(Pow2Bits);
61}
62
63/// \returns true if this is an odd sized vector which should widen by adding an
64/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65/// excludes s1 vectors, which should always be scalarized.
66static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67 return [=](const LegalityQuery &Query) {
68 const LLT Ty = Query.Types[TypeIdx];
69 if (!Ty.isVector())
70 return false;
71
72 const LLT EltTy = Ty.getElementType();
73 const unsigned EltSize = EltTy.getSizeInBits();
74 return Ty.getNumElements() % 2 != 0 &&
75 EltSize > 1 && EltSize < 32 &&
76 Ty.getSizeInBits() % 32 != 0;
77 };
78}
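// Editor's note (illustrative, not part of the original source): the predicate
// above fires for types like <3 x s16> (odd element count, 16-bit elements,
// 48 bits total, not a multiple of 32), which oneMoreElement() below then
// widens to <4 x s16>; <2 x s16> and <4 x s32> fail the checks and are left
// alone.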
79
80static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81 return [=](const LegalityQuery &Query) {
82 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getSizeInBits() % 32 == 0;
84 };
85}
86
87static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 const LLT EltTy = Ty.getScalarType();
91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92 };
93}
94
95static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getElementType();
99 return std::pair(TypeIdx,
100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101 };
102}
103
 104static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
 105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 unsigned Size = Ty.getSizeInBits();
109 unsigned Pieces = (Size + 63) / 64;
110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111 return std::pair(TypeIdx, LLT::scalarOrVector(
112 ElementCount::getFixed(NewNumElts), EltTy));
113 };
114}
115
116// Increase the number of vector elements to reach the next multiple of 32-bit
117// type.
118static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
126
127 assert(EltSize < 32);
128
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131 };
132}
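// Editor's note (illustrative sketch, not part of the original source): a
// worked example of the mutation above, using the LLT helpers seen in this
// file.
#if 0
  // <3 x s8>: Size = 24, EltSize = 8, NextMul32 = (24 + 31) / 32 = 1,
  // NewNumElts = (32 * 1 + 8 - 1) / 8 = 4, so the result is <4 x s8>.
  const LLT V3S8 = LLT::fixed_vector(3, 8);
  const LLT WidenedTo = LLT::fixed_vector(4, 8); // expected mutation result
#endif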
133
 134static LLT getBitcastRegisterType(const LLT Ty) {
 135 const unsigned Size = Ty.getSizeInBits();
136
137 if (Size <= 32) {
138 // <2 x s8> -> s16
139 // <4 x s8> -> s32
140 return LLT::scalar(Size);
141 }
142
 143 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
 144}
145
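// Editor's note (illustrative, not part of the original source): with the
// helper above, a 32-bit-or-smaller type such as <4 x s8> becomes the scalar
// s32, while a larger register-sized type such as <8 x s8> (64 bits) becomes
// <2 x s32>, matching how values are ultimately held in 32-bit registers.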
146static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147 return [=](const LegalityQuery &Query) {
148 const LLT Ty = Query.Types[TypeIdx];
149 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
150 };
151}
152
 153static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
 154 return [=](const LegalityQuery &Query) {
155 const LLT Ty = Query.Types[TypeIdx];
156 unsigned Size = Ty.getSizeInBits();
157 assert(Size % 32 == 0);
158 return std::pair(
 159 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
 160 };
161}
162
163static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
164 return [=](const LegalityQuery &Query) {
165 const LLT QueryTy = Query.Types[TypeIdx];
166 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
167 };
168}
169
170static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
171 return [=](const LegalityQuery &Query) {
172 const LLT QueryTy = Query.Types[TypeIdx];
173 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
174 };
175}
176
177static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
178 return [=](const LegalityQuery &Query) {
179 const LLT QueryTy = Query.Types[TypeIdx];
180 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
181 };
182}
183
184static bool isRegisterSize(unsigned Size) {
185 return Size % 32 == 0 && Size <= MaxRegisterSize;
186}
187
 188static bool isRegisterVectorElementType(LLT EltTy) {
 189 const int EltSize = EltTy.getSizeInBits();
190 return EltSize == 16 || EltSize % 32 == 0;
191}
192
193static bool isRegisterVectorType(LLT Ty) {
194 const int EltSize = Ty.getElementType().getSizeInBits();
195 return EltSize == 32 || EltSize == 64 ||
196 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
197 EltSize == 128 || EltSize == 256;
198}
199
200static bool isRegisterType(LLT Ty) {
201 if (!isRegisterSize(Ty.getSizeInBits()))
202 return false;
203
204 if (Ty.isVector())
205 return isRegisterVectorType(Ty);
206
207 return true;
208}
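// Editor's note (illustrative, not part of the original source): under these
// checks s32, s64, v2s16, v4s32 and v2s64 are register types, while s48 (not
// a multiple of 32), <3 x s16> (48 bits) and <4 x s8> (8-bit elements) are
// not and must be legalized first.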
209
 210// Any combination of 32 or 64-bit elements up to the maximum register size, and
211// multiples of v2s16.
212static LegalityPredicate isRegisterType(unsigned TypeIdx) {
213 return [=](const LegalityQuery &Query) {
214 return isRegisterType(Query.Types[TypeIdx]);
215 };
216}
217
218static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
219 return [=](const LegalityQuery &Query) {
220 const LLT QueryTy = Query.Types[TypeIdx];
221 if (!QueryTy.isVector())
222 return false;
223 const LLT EltTy = QueryTy.getElementType();
224 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
225 };
226}
227
228// If we have a truncating store or an extending load with a data size larger
229// than 32-bits, we need to reduce to a 32-bit type.
 230static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
 231 return [=](const LegalityQuery &Query) {
232 const LLT Ty = Query.Types[TypeIdx];
233 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
234 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
235 };
236}
237
238// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
239// handle some operations by just promoting the register during
240// selection. There are also d16 loads on GFX9+ which preserve the high bits.
241static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
242 bool IsLoad, bool IsAtomic) {
243 switch (AS) {
 244 case AMDGPUAS::PRIVATE_ADDRESS:
 245 // FIXME: Private element size.
 246 return ST.enableFlatScratch() ? 128 : 32;
 247 case AMDGPUAS::LOCAL_ADDRESS:
 248 return ST.useDS128() ? 128 : 64;
 249 case AMDGPUAS::GLOBAL_ADDRESS:
 250 case AMDGPUAS::CONSTANT_ADDRESS:
 251 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
 252 // Treat constant and global as identical. SMRD loads are sometimes usable for
253 // global loads (ideally constant address space should be eliminated)
254 // depending on the context. Legality cannot be context dependent, but
255 // RegBankSelect can split the load as necessary depending on the pointer
256 // register bank/uniformity and if the memory is invariant or not written in a
257 // kernel.
258 return IsLoad ? 512 : 128;
259 default:
260 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
261 // if they may alias scratch depending on the subtarget. This needs to be
262 // moved to custom handling to use addressMayBeAccessedAsPrivate
263 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
264 }
265}
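// Editor's note (illustrative sketch, not part of the original source): how
// the limit above is typically queried; the values assume a subtarget (ST, a
// GCNSubtarget in scope) without multi-dword flat-scratch addressing.
#if 0
  // Global and constant loads may be up to 512 bits and stores up to 128
  // bits, while a non-atomic flat access hits the default case and is
  // limited to 32 bits on such a subtarget.
  unsigned GlobalLoadMax = maxSizeForAddrSpace(ST, AMDGPUAS::GLOBAL_ADDRESS,
                                               /*IsLoad=*/true,
                                               /*IsAtomic=*/false); // 512
  unsigned FlatMax = maxSizeForAddrSpace(ST, AMDGPUAS::FLAT_ADDRESS,
                                         /*IsLoad=*/true,
                                         /*IsAtomic=*/false); // 32
#endif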
266
267static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
270
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273
274 unsigned RegSize = Ty.getSizeInBits();
275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
278
279 // All of these need to be custom lowered to cast the pointer operand.
 280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
 281 return false;
282
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
286
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289#if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < Size)
292 MemSize = std::max(MemSize, Align);
293#endif
294
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
298
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
300 Query.MMODescrs[0].Ordering !=
301 AtomicOrdering::NotAtomic))
302 return false;
303
304 switch (MemSize) {
305 case 8:
306 case 16:
307 case 32:
308 case 64:
309 case 128:
310 break;
311 case 96:
312 if (!ST.hasDwordx3LoadStores())
313 return false;
314 break;
315 case 256:
316 case 512:
317 // These may contextually need to be broken down.
318 break;
319 default:
320 return false;
321 }
322
323 assert(RegSize >= MemSize);
324
325 if (AlignBits < MemSize) {
326 const SITargetLowering *TLI = ST.getTargetLowering();
327 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
328 Align(AlignBits / 8)))
329 return false;
330 }
331
332 return true;
333}
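// Editor's note (illustrative, not part of the original source): under the
// checks above, an s32 result extended from an 8-bit or 16-bit global memory
// access is accepted, while an s64 result extending from a 16-bit access is
// rejected (extloads must produce exactly 32 bits), as is a 96-bit access on
// subtargets without dwordx3 load/store support.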
334
 335// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so
 336// work around this. Eventually it should ignore the type for loads and only care
337// about the size. Return true in cases where we will workaround this for now by
338// bitcasting.
339static bool loadStoreBitcastWorkaround(const LLT Ty) {
 340 if (EnableNewLegality)
 341 return false;
342
343 const unsigned Size = Ty.getSizeInBits();
344 if (Size <= 64)
345 return false;
346 if (!Ty.isVector())
347 return true;
348
349 LLT EltTy = Ty.getElementType();
350 if (EltTy.isPointer())
351 return true;
352
353 unsigned EltSize = EltTy.getSizeInBits();
354 return EltSize != 32 && EltSize != 64;
355}
356
357static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
358 const LLT Ty = Query.Types[0];
359 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
 360 !loadStoreBitcastWorkaround(Ty);
 361}
362
363/// Return true if a load or store of the type should be lowered with a bitcast
364/// to a different type.
365static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
366 const LLT MemTy) {
367 const unsigned MemSizeInBits = MemTy.getSizeInBits();
368 const unsigned Size = Ty.getSizeInBits();
369 if (Size != MemSizeInBits)
370 return Size <= 32 && Ty.isVector();
371
 372 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
 373 return true;
374
375 // Don't try to handle bitcasting vector ext loads for now.
376 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
377 (Size <= 32 || isRegisterSize(Size)) &&
 378 !isRegisterVectorElementType(Ty.getElementType());
 379}
380
 381/// Return true if we should legalize a load by widening an odd sized memory
 382/// access up to the alignment. Note this is the case where the memory access
 383/// itself changes, not the size of the result register.
384static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
385 uint64_t AlignInBits, unsigned AddrSpace,
386 unsigned Opcode) {
387 unsigned SizeInBits = MemoryTy.getSizeInBits();
388 // We don't want to widen cases that are naturally legal.
389 if (isPowerOf2_32(SizeInBits))
390 return false;
391
392 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
393 // end up widening these for a scalar load during RegBankSelect, since there
394 // aren't 96-bit scalar loads.
395 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
396 return false;
397
398 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
399 return false;
400
401 // A load is known dereferenceable up to the alignment, so it's legal to widen
402 // to it.
403 //
404 // TODO: Could check dereferenceable for less aligned cases.
405 unsigned RoundedSize = NextPowerOf2(SizeInBits);
406 if (AlignInBits < RoundedSize)
407 return false;
408
409 // Do not widen if it would introduce a slow unaligned load.
410 const SITargetLowering *TLI = ST.getTargetLowering();
411 unsigned Fast = 0;
 412 return TLI->allowsMisalignedMemoryAccessesImpl(
 413 RoundedSize, AddrSpace, Align(AlignInBits / 8),
 414 MachineMemOperand::MOLoad, &Fast) &&
 415 Fast;
416}
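// Editor's note (illustrative, not part of the original source): for example,
// a 96-bit load that is 128-bit aligned on a subtarget without dwordx3
// load/store support may be widened here to a 128-bit load, since the
// alignment guarantees the extra bytes are dereferenceable; the same load on
// a dwordx3-capable subtarget is left untouched.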
417
418static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
419 unsigned Opcode) {
420 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
421 return false;
422
423 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
424 Query.MMODescrs[0].AlignInBits,
425 Query.Types[1].getAddressSpace(), Opcode);
426}
427
 428AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 429 const GCNTargetMachine &TM)
430 : ST(ST_) {
431 using namespace TargetOpcode;
432
433 auto GetAddrSpacePtr = [&TM](unsigned AS) {
434 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
435 };
436
437 const LLT S1 = LLT::scalar(1);
438 const LLT S8 = LLT::scalar(8);
439 const LLT S16 = LLT::scalar(16);
440 const LLT S32 = LLT::scalar(32);
441 const LLT S64 = LLT::scalar(64);
442 const LLT S128 = LLT::scalar(128);
443 const LLT S256 = LLT::scalar(256);
444 const LLT S512 = LLT::scalar(512);
445 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
446
447 const LLT V2S8 = LLT::fixed_vector(2, 8);
448 const LLT V2S16 = LLT::fixed_vector(2, 16);
449 const LLT V4S16 = LLT::fixed_vector(4, 16);
450
451 const LLT V2S32 = LLT::fixed_vector(2, 32);
452 const LLT V3S32 = LLT::fixed_vector(3, 32);
453 const LLT V4S32 = LLT::fixed_vector(4, 32);
454 const LLT V5S32 = LLT::fixed_vector(5, 32);
455 const LLT V6S32 = LLT::fixed_vector(6, 32);
456 const LLT V7S32 = LLT::fixed_vector(7, 32);
457 const LLT V8S32 = LLT::fixed_vector(8, 32);
458 const LLT V9S32 = LLT::fixed_vector(9, 32);
459 const LLT V10S32 = LLT::fixed_vector(10, 32);
460 const LLT V11S32 = LLT::fixed_vector(11, 32);
461 const LLT V12S32 = LLT::fixed_vector(12, 32);
462 const LLT V13S32 = LLT::fixed_vector(13, 32);
463 const LLT V14S32 = LLT::fixed_vector(14, 32);
464 const LLT V15S32 = LLT::fixed_vector(15, 32);
465 const LLT V16S32 = LLT::fixed_vector(16, 32);
466 const LLT V32S32 = LLT::fixed_vector(32, 32);
467
468 const LLT V2S64 = LLT::fixed_vector(2, 64);
469 const LLT V3S64 = LLT::fixed_vector(3, 64);
470 const LLT V4S64 = LLT::fixed_vector(4, 64);
471 const LLT V5S64 = LLT::fixed_vector(5, 64);
472 const LLT V6S64 = LLT::fixed_vector(6, 64);
473 const LLT V7S64 = LLT::fixed_vector(7, 64);
474 const LLT V8S64 = LLT::fixed_vector(8, 64);
475 const LLT V16S64 = LLT::fixed_vector(16, 64);
476
477 std::initializer_list<LLT> AllS32Vectors =
478 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
479 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
480 std::initializer_list<LLT> AllS64Vectors =
481 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
482
483 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
484 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
485 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
486 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
487 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
488 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
489 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
490
491 const LLT CodePtr = FlatPtr;
492
493 const std::initializer_list<LLT> AddrSpaces64 = {
494 GlobalPtr, ConstantPtr, FlatPtr
495 };
496
497 const std::initializer_list<LLT> AddrSpaces32 = {
498 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
499 };
500
501 const std::initializer_list<LLT> FPTypesBase = {
502 S32, S64
503 };
504
505 const std::initializer_list<LLT> FPTypes16 = {
506 S32, S64, S16
507 };
508
509 const std::initializer_list<LLT> FPTypesPK16 = {
510 S32, S64, S16, V2S16
511 };
512
513 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
514
515 // s1 for VCC branches, s32 for SCC branches.
516 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
517
518 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
519 // elements for v3s16
 520 getActionDefinitionsBuilder(G_PHI)
 521 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
522 .legalFor(AllS32Vectors)
523 .legalFor(AllS64Vectors)
524 .legalFor(AddrSpaces64)
525 .legalFor(AddrSpaces32)
526 .legalIf(isPointer(0))
527 .clampScalar(0, S16, S256)
 528 .widenScalarToNextPow2(0, 32)
 529 .clampMaxNumElements(0, S32, 16)
 530 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 531 .scalarize(0);
532
533 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
534 // Full set of gfx9 features.
535 getActionDefinitionsBuilder({G_ADD, G_SUB})
536 .legalFor({S32, S16, V2S16})
537 .clampMaxNumElementsStrict(0, S16, 2)
538 .scalarize(0)
539 .minScalar(0, S16)
 540 .widenScalarToNextMultipleOf(0, 32)
 541 .maxScalar(0, S32);
542
 543 getActionDefinitionsBuilder(G_MUL)
 544 .legalFor({S32, S16, V2S16})
545 .clampMaxNumElementsStrict(0, S16, 2)
546 .scalarize(0)
547 .minScalar(0, S16)
 548 .widenScalarToNextMultipleOf(0, 32)
 549 .custom();
550 assert(ST.hasMad64_32());
551
552 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
553 .legalFor({S32, S16, V2S16}) // Clamp modifier
554 .minScalarOrElt(0, S16)
555 .clampMaxNumElementsStrict(0, S16, 2)
556 .scalarize(0)
 557 .widenScalarToNextPow2(0, 32)
 558 .lower();
559 } else if (ST.has16BitInsts()) {
560 getActionDefinitionsBuilder({G_ADD, G_SUB})
561 .legalFor({S32, S16})
562 .minScalar(0, S16)
 563 .widenScalarToNextMultipleOf(0, 32)
 564 .maxScalar(0, S32)
565 .scalarize(0);
566
 567 getActionDefinitionsBuilder(G_MUL)
 568 .legalFor({S32, S16})
569 .scalarize(0)
570 .minScalar(0, S16)
571 .widenScalarToNextMultipleOf(0, 32)
572 .custom();
573 assert(ST.hasMad64_32());
574
575 // Technically the saturating operations require clamp bit support, but this
576 // was introduced at the same time as 16-bit operations.
577 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
578 .legalFor({S32, S16}) // Clamp modifier
579 .minScalar(0, S16)
580 .scalarize(0)
 581 .widenScalarToNextPow2(0, 32)
 582 .lower();
583
584 // We're just lowering this, but it helps get a better result to try to
585 // coerce to the desired type first.
586 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
587 .minScalar(0, S16)
588 .scalarize(0)
589 .lower();
590 } else {
591 getActionDefinitionsBuilder({G_ADD, G_SUB})
592 .legalFor({S32})
593 .widenScalarToNextMultipleOf(0, 32)
594 .clampScalar(0, S32, S32)
595 .scalarize(0);
596
597 auto &Mul = getActionDefinitionsBuilder(G_MUL)
598 .legalFor({S32})
599 .scalarize(0)
600 .minScalar(0, S32)
601 .widenScalarToNextMultipleOf(0, 32);
602
603 if (ST.hasMad64_32())
604 Mul.custom();
605 else
606 Mul.maxScalar(0, S32);
607
608 if (ST.hasIntClamp()) {
609 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
610 .legalFor({S32}) // Clamp modifier.
611 .scalarize(0)
612 .minScalarOrElt(0, S32)
613 .lower();
614 } else {
615 // Clamp bit support was added in VI, along with 16-bit operations.
616 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
617 .minScalar(0, S32)
618 .scalarize(0)
619 .lower();
620 }
621
622 // FIXME: DAG expansion gets better results. The widening uses the smaller
623 // range values and goes for the min/max lowering directly.
624 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
625 .minScalar(0, S32)
626 .scalarize(0)
627 .lower();
628 }
629
 630 getActionDefinitionsBuilder(
 631 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
632 .customFor({S32, S64})
633 .clampScalar(0, S32, S64)
 634 .widenScalarToNextPow2(0, 32)
 635 .scalarize(0);
636
637 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
638 .legalFor({S32})
639 .maxScalar(0, S32);
640
641 if (ST.hasVOP3PInsts()) {
642 Mulh
643 .clampMaxNumElements(0, S8, 2)
644 .lowerFor({V2S8});
645 }
646
647 Mulh
648 .scalarize(0)
649 .lower();
650
651 // Report legal for any types we can handle anywhere. For the cases only legal
652 // on the SALU, RegBankSelect will be able to re-legalize.
653 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
654 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
 655 .clampScalar(0, S32, S64)
 656 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 657 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
 658 .widenScalarToNextPow2(0)
 659 .scalarize(0);
660
 661 getActionDefinitionsBuilder(
 662 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
663 .legalFor({{S32, S1}, {S32, S32}})
664 .clampScalar(0, S32, S32)
665 .scalarize(0);
666
 667 getActionDefinitionsBuilder(G_BITCAST)
 668 // Don't worry about the size constraint.
 669 .legalIf(all(isRegisterType(0), isRegisterType(1)))
 670 .lower();
671
672
 673 getActionDefinitionsBuilder(G_CONSTANT)
 674 .legalFor({S1, S32, S64, S16, GlobalPtr,
675 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
676 .legalIf(isPointer(0))
 677 .clampScalar(0, S32, S64)
 678 .widenScalarToNextPow2(0);
 679
680 getActionDefinitionsBuilder(G_FCONSTANT)
681 .legalFor({S32, S64, S16})
682 .clampScalar(0, S16, S64);
683
684 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
685 .legalIf(isRegisterType(0))
686 // s1 and s16 are special cases because they have legal operations on
687 // them, but don't really occupy registers in the normal way.
688 .legalFor({S1, S16})
689 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
690 .clampScalarOrElt(0, S32, MaxScalar)
 691 .widenScalarToNextPow2(0, 32)
 692 .clampMaxNumElements(0, S32, 16);
693
694 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
695
696 // If the amount is divergent, we have to do a wave reduction to get the
697 // maximum value, so this is expanded during RegBankSelect.
698 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
699 .legalFor({{PrivatePtr, S32}});
700
701 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
702 .customIf(typeIsNot(0, PrivatePtr));
703
704 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
705
706 auto &FPOpActions = getActionDefinitionsBuilder(
707 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
708 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
709 .legalFor({S32, S64});
710 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
711 .customFor({S32, S64});
712 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
713 .customFor({S32, S64});
714
715 if (ST.has16BitInsts()) {
716 if (ST.hasVOP3PInsts())
717 FPOpActions.legalFor({S16, V2S16});
718 else
719 FPOpActions.legalFor({S16});
720
721 TrigActions.customFor({S16});
722 FDIVActions.customFor({S16});
723 }
724
725 auto &MinNumMaxNum = getActionDefinitionsBuilder({
726 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
727
728 if (ST.hasVOP3PInsts()) {
729 MinNumMaxNum.customFor(FPTypesPK16)
 730 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
 731 .clampMaxNumElements(0, S16, 2)
732 .clampScalar(0, S16, S64)
733 .scalarize(0);
734 } else if (ST.has16BitInsts()) {
735 MinNumMaxNum.customFor(FPTypes16)
736 .clampScalar(0, S16, S64)
737 .scalarize(0);
738 } else {
739 MinNumMaxNum.customFor(FPTypesBase)
740 .clampScalar(0, S32, S64)
741 .scalarize(0);
742 }
743
744 if (ST.hasVOP3PInsts())
745 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
746
747 FPOpActions
748 .scalarize(0)
749 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
750
751 TrigActions
752 .scalarize(0)
753 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
754
755 FDIVActions
756 .scalarize(0)
757 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
758
759 getActionDefinitionsBuilder({G_FNEG, G_FABS})
760 .legalFor(FPTypesPK16)
761 .clampMaxNumElementsStrict(0, S16, 2)
762 .scalarize(0)
763 .clampScalar(0, S16, S64);
764
765 if (ST.has16BitInsts()) {
766 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
767 .legalFor({S32, S64, S16})
768 .scalarize(0)
769 .clampScalar(0, S16, S64);
770 } else {
 771 getActionDefinitionsBuilder(G_FSQRT)
 772 .legalFor({S32, S64})
773 .scalarize(0)
774 .clampScalar(0, S32, S64);
775
776 if (ST.hasFractBug()) {
 777 getActionDefinitionsBuilder(G_FFLOOR)
 778 .customFor({S64})
779 .legalFor({S32, S64})
780 .scalarize(0)
781 .clampScalar(0, S32, S64);
782 } else {
 783 getActionDefinitionsBuilder(G_FFLOOR)
 784 .legalFor({S32, S64})
785 .scalarize(0)
786 .clampScalar(0, S32, S64);
787 }
788 }
789
 790 getActionDefinitionsBuilder(G_FPTRUNC)
 791 .legalFor({{S32, S64}, {S16, S32}})
792 .scalarize(0)
793 .lower();
794
 795 getActionDefinitionsBuilder(G_FPEXT)
 796 .legalFor({{S64, S32}, {S32, S16}})
797 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
798 .scalarize(0);
799
800 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
801 if (ST.has16BitInsts()) {
802 FSubActions
803 // Use actual fsub instruction
804 .legalFor({S32, S16})
805 // Must use fadd + fneg
806 .lowerFor({S64, V2S16});
807 } else {
808 FSubActions
809 // Use actual fsub instruction
810 .legalFor({S32})
811 // Must use fadd + fneg
812 .lowerFor({S64, S16, V2S16});
813 }
814
815 FSubActions
816 .scalarize(0)
817 .clampScalar(0, S32, S64);
818
819 // Whether this is legal depends on the floating point mode for the function.
820 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
821 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
822 FMad.customFor({S32, S16});
823 else if (ST.hasMadMacF32Insts())
824 FMad.customFor({S32});
825 else if (ST.hasMadF16())
826 FMad.customFor({S16});
827 FMad.scalarize(0)
828 .lower();
829
830 auto &FRem = getActionDefinitionsBuilder(G_FREM);
831 if (ST.has16BitInsts()) {
832 FRem.customFor({S16, S32, S64});
833 } else {
834 FRem.minScalar(0, S32)
835 .customFor({S32, S64});
836 }
837 FRem.scalarize(0);
838
839 // TODO: Do we need to clamp maximum bitwidth?
 840 getActionDefinitionsBuilder(G_TRUNC)
 841 .legalIf(isScalar(0))
842 .legalFor({{V2S16, V2S32}})
843 .clampMaxNumElements(0, S16, 2)
844 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
845 // situations (like an invalid implicit use), we don't want to infinite loop
846 // in the legalizer.
 847 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
 848 .alwaysLegal();
849
850 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
851 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
852 {S32, S1}, {S64, S1}, {S16, S1}})
853 .scalarize(0)
854 .clampScalar(0, S32, S64)
855 .widenScalarToNextPow2(1, 32);
856
857 // TODO: Split s1->s64 during regbankselect for VALU.
858 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
859 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
860 .lowerIf(typeIs(1, S1))
861 .customFor({{S32, S64}, {S64, S64}});
862 if (ST.has16BitInsts())
863 IToFP.legalFor({{S16, S16}});
864 IToFP.clampScalar(1, S32, S64)
865 .minScalar(0, S32)
866 .scalarize(0)
 867 .widenScalarToNextPow2(1);
 868
869 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
870 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
871 .customFor({{S64, S32}, {S64, S64}})
872 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
873 if (ST.has16BitInsts())
874 FPToI.legalFor({{S16, S16}});
875 else
876 FPToI.minScalar(1, S32);
877
878 FPToI.minScalar(0, S32)
 879 .widenScalarToNextPow2(1, 32)
 880 .scalarize(0)
881 .lower();
882
883 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
884 .customFor({S16, S32})
885 .scalarize(0)
886 .lower();
887
888 // Lower roundeven into G_FRINT
889 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
890 .scalarize(0)
891 .lower();
892
893 if (ST.has16BitInsts()) {
894 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
895 .legalFor({S16, S32, S64})
896 .clampScalar(0, S16, S64)
897 .scalarize(0);
898 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
899 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
900 .legalFor({S32, S64})
901 .clampScalar(0, S32, S64)
902 .scalarize(0);
903 } else {
904 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
905 .legalFor({S32})
906 .customFor({S64})
907 .clampScalar(0, S32, S64)
908 .scalarize(0);
909 }
910
911 getActionDefinitionsBuilder(G_PTR_ADD)
912 .legalIf(all(isPointer(0), sameSize(0, 1)))
913 .scalarize(0)
914 .scalarSameSizeAs(1, 0);
915
916 getActionDefinitionsBuilder(G_PTRMASK)
917 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
918 .scalarSameSizeAs(1, 0)
919 .scalarize(0);
920
921 auto &CmpBuilder =
922 getActionDefinitionsBuilder(G_ICMP)
923 // The compare output type differs based on the register bank of the output,
924 // so make both s1 and s32 legal.
925 //
926 // Scalar compares producing output in scc will be promoted to s32, as that
927 // is the allocatable register type that will be needed for the copy from
928 // scc. This will be promoted during RegBankSelect, and we assume something
929 // before that won't try to use s32 result types.
930 //
931 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
932 // bank.
933 .legalForCartesianProduct(
934 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
935 .legalForCartesianProduct(
936 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
937 if (ST.has16BitInsts()) {
938 CmpBuilder.legalFor({{S1, S16}});
939 }
940
941 CmpBuilder
942 .widenScalarToNextPow2(1)
943 .clampScalar(1, S32, S64)
944 .scalarize(0)
945 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
946
947 getActionDefinitionsBuilder(G_FCMP)
948 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
949 .widenScalarToNextPow2(1)
950 .clampScalar(1, S32, S64)
951 .scalarize(0);
952
953 // FIXME: fpow has a selection pattern that should move to custom lowering.
954 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
955 if (ST.has16BitInsts())
956 Exp2Ops.legalFor({S32, S16});
957 else
958 Exp2Ops.legalFor({S32});
959 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
960 Exp2Ops.scalarize(0);
961
962 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
963 if (ST.has16BitInsts())
964 ExpOps.customFor({{S32}, {S16}});
965 else
966 ExpOps.customFor({S32});
967 ExpOps.clampScalar(0, MinScalarFPTy, S32)
968 .scalarize(0);
969
970 getActionDefinitionsBuilder(G_FPOWI)
971 .clampScalar(0, MinScalarFPTy, S32)
972 .lower();
973
974 // The 64-bit versions produce 32-bit results, but only on the SALU.
975 getActionDefinitionsBuilder(G_CTPOP)
976 .legalFor({{S32, S32}, {S32, S64}})
977 .clampScalar(0, S32, S32)
978 .widenScalarToNextPow2(1, 32)
979 .clampScalar(1, S32, S64)
980 .scalarize(0)
981 .widenScalarToNextPow2(0, 32);
982
983 // If no 16 bit instr is available, lower into different instructions.
984 if (ST.has16BitInsts())
985 getActionDefinitionsBuilder(G_IS_FPCLASS)
986 .legalForCartesianProduct({S1}, FPTypes16)
987 .widenScalarToNextPow2(1)
988 .scalarize(0)
989 .lower();
990 else
991 getActionDefinitionsBuilder(G_IS_FPCLASS)
992 .legalForCartesianProduct({S1}, FPTypesBase)
993 .lowerFor({S1, S16})
994 .widenScalarToNextPow2(1)
995 .scalarize(0)
996 .lower();
997
998 // The hardware instructions return a different result on 0 than the generic
999 // instructions expect. The hardware produces -1, but these produce the
1000 // bitwidth.
1001 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1002 .scalarize(0)
1003 .clampScalar(0, S32, S32)
1004 .clampScalar(1, S32, S64)
1005 .widenScalarToNextPow2(0, 32)
1006 .widenScalarToNextPow2(1, 32)
1007 .custom();
1008
1009 // The 64-bit versions produce 32-bit results, but only on the SALU.
1010 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1011 .legalFor({{S32, S32}, {S32, S64}})
1012 .clampScalar(0, S32, S32)
1013 .clampScalar(1, S32, S64)
1014 .scalarize(0)
1015 .widenScalarToNextPow2(0, 32)
1016 .widenScalarToNextPow2(1, 32);
1017
1018 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1019 // RegBankSelect.
1020 getActionDefinitionsBuilder(G_BITREVERSE)
1021 .legalFor({S32, S64})
1022 .clampScalar(0, S32, S64)
1023 .scalarize(0)
1024 .widenScalarToNextPow2(0);
1025
1026 if (ST.has16BitInsts()) {
1027 getActionDefinitionsBuilder(G_BSWAP)
1028 .legalFor({S16, S32, V2S16})
1029 .clampMaxNumElementsStrict(0, S16, 2)
1030 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1031 // narrowScalar limitation.
1032 .widenScalarToNextPow2(0)
1033 .clampScalar(0, S16, S32)
1034 .scalarize(0);
1035
1036 if (ST.hasVOP3PInsts()) {
1037 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1038 .legalFor({S32, S16, V2S16})
1039 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1040 .clampMaxNumElements(0, S16, 2)
1041 .minScalar(0, S16)
1042 .widenScalarToNextPow2(0)
1043 .scalarize(0)
1044 .lower();
1045 } else {
1046 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1047 .legalFor({S32, S16})
1048 .widenScalarToNextPow2(0)
1049 .minScalar(0, S16)
1050 .scalarize(0)
1051 .lower();
1052 }
1053 } else {
1054 // TODO: Should have same legality without v_perm_b32
1055 getActionDefinitionsBuilder(G_BSWAP)
1056 .legalFor({S32})
1057 .lowerIf(scalarNarrowerThan(0, 32))
1058 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1059 // narrowScalar limitation.
1060 .widenScalarToNextPow2(0)
1061 .maxScalar(0, S32)
1062 .scalarize(0)
1063 .lower();
1064
1065 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1066 .legalFor({S32})
1067 .minScalar(0, S32)
1068 .widenScalarToNextPow2(0)
1069 .scalarize(0)
1070 .lower();
1071 }
1072
1073 getActionDefinitionsBuilder(G_INTTOPTR)
1074 // List the common cases
1075 .legalForCartesianProduct(AddrSpaces64, {S64})
1076 .legalForCartesianProduct(AddrSpaces32, {S32})
1077 .scalarize(0)
1078 // Accept any address space as long as the size matches
1079 .legalIf(sameSize(0, 1))
1080 .widenScalarIf(smallerThan(1, 0),
1081 [](const LegalityQuery &Query) {
1082 return std::pair(
1083 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1084 })
1085 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1086 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1087 });
1088
1089 getActionDefinitionsBuilder(G_PTRTOINT)
1090 // List the common cases
1091 .legalForCartesianProduct(AddrSpaces64, {S64})
1092 .legalForCartesianProduct(AddrSpaces32, {S32})
1093 .scalarize(0)
1094 // Accept any address space as long as the size matches
1095 .legalIf(sameSize(0, 1))
1096 .widenScalarIf(smallerThan(0, 1),
1097 [](const LegalityQuery &Query) {
1098 return std::pair(
1099 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1100 })
1101 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1102 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1103 });
1104
1105 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1106 .scalarize(0)
1107 .custom();
1108
1109 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1110 bool IsLoad) -> bool {
1111 const LLT DstTy = Query.Types[0];
1112
1113 // Split vector extloads.
1114 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1115
1116 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1117 return true;
1118
1119 const LLT PtrTy = Query.Types[1];
1120 unsigned AS = PtrTy.getAddressSpace();
1121 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1122 Query.MMODescrs[0].Ordering !=
 1123 AtomicOrdering::NotAtomic))
 1124 return true;
1125
1126 // Catch weird sized loads that don't evenly divide into the access sizes
1127 // TODO: May be able to widen depending on alignment etc.
1128 unsigned NumRegs = (MemSize + 31) / 32;
1129 if (NumRegs == 3) {
1130 if (!ST.hasDwordx3LoadStores())
1131 return true;
1132 } else {
1133 // If the alignment allows, these should have been widened.
1134 if (!isPowerOf2_32(NumRegs))
1135 return true;
1136 }
1137
1138 return false;
1139 };
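// Editor's note (illustrative, not part of the original source): with the
// lambda above, a 256-bit global load is fine (it divides into 8 dwords), a
// 96-bit access is only fine when dwordx3 load/stores exist, and anything
// larger than the per-address-space limit (for example a 256-bit private
// access without flat-scratch) is flagged for splitting by the rules below.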
1140
1141 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1142 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1143 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1144
1145 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1146 // LDS
1147 // TODO: Unsupported flat for SI.
1148
1149 for (unsigned Op : {G_LOAD, G_STORE}) {
1150 const bool IsStore = Op == G_STORE;
1151
1152 auto &Actions = getActionDefinitionsBuilder(Op);
1153 // Explicitly list some common cases.
1154 // TODO: Does this help compile time at all?
1155 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1156 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1157 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1158 {S64, GlobalPtr, S64, GlobalAlign32},
1159 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1160 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1161 {S32, GlobalPtr, S8, GlobalAlign8},
1162 {S32, GlobalPtr, S16, GlobalAlign16},
1163
1164 {S32, LocalPtr, S32, 32},
1165 {S64, LocalPtr, S64, 32},
1166 {V2S32, LocalPtr, V2S32, 32},
1167 {S32, LocalPtr, S8, 8},
1168 {S32, LocalPtr, S16, 16},
1169 {V2S16, LocalPtr, S32, 32},
1170
1171 {S32, PrivatePtr, S32, 32},
1172 {S32, PrivatePtr, S8, 8},
1173 {S32, PrivatePtr, S16, 16},
1174 {V2S16, PrivatePtr, S32, 32},
1175
1176 {S32, ConstantPtr, S32, GlobalAlign32},
1177 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1178 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1179 {S64, ConstantPtr, S64, GlobalAlign32},
1180 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1181 Actions.legalIf(
1182 [=](const LegalityQuery &Query) -> bool {
1183 return isLoadStoreLegal(ST, Query);
1184 });
1185
1186 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1187 // 64-bits.
1188 //
1189 // TODO: Should generalize bitcast action into coerce, which will also cover
1190 // inserting addrspacecasts.
1191 Actions.customIf(typeIs(1, Constant32Ptr));
1192
1193 // Turn any illegal element vectors into something easier to deal
1194 // with. These will ultimately produce 32-bit scalar shifts to extract the
1195 // parts anyway.
1196 //
1197 // For odd 16-bit element vectors, prefer to split those into pieces with
1198 // 16-bit vector parts.
1199 Actions.bitcastIf(
1200 [=](const LegalityQuery &Query) -> bool {
1201 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1202 Query.MMODescrs[0].MemoryTy);
1203 }, bitcastToRegisterType(0));
1204
1205 if (!IsStore) {
1206 // Widen suitably aligned loads by loading extra bytes. The standard
1207 // legalization actions can't properly express widening memory operands.
1208 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1209 return shouldWidenLoad(ST, Query, G_LOAD);
1210 });
1211 }
1212
1213 // FIXME: load/store narrowing should be moved to lower action
1214 Actions
1215 .narrowScalarIf(
1216 [=](const LegalityQuery &Query) -> bool {
1217 return !Query.Types[0].isVector() &&
1218 needToSplitMemOp(Query, Op == G_LOAD);
1219 },
1220 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1221 const LLT DstTy = Query.Types[0];
1222 const LLT PtrTy = Query.Types[1];
1223
1224 const unsigned DstSize = DstTy.getSizeInBits();
1225 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1226
1227 // Split extloads.
1228 if (DstSize > MemSize)
1229 return std::pair(0, LLT::scalar(MemSize));
1230
1231 unsigned MaxSize = maxSizeForAddrSpace(
1232 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1233 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1234 if (MemSize > MaxSize)
1235 return std::pair(0, LLT::scalar(MaxSize));
1236
1237 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1238 return std::pair(0, LLT::scalar(Align));
1239 })
1240 .fewerElementsIf(
1241 [=](const LegalityQuery &Query) -> bool {
1242 return Query.Types[0].isVector() &&
1243 needToSplitMemOp(Query, Op == G_LOAD);
1244 },
1245 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1246 const LLT DstTy = Query.Types[0];
1247 const LLT PtrTy = Query.Types[1];
1248
1249 LLT EltTy = DstTy.getElementType();
1250 unsigned MaxSize = maxSizeForAddrSpace(
1251 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1252 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1253
1254 // FIXME: Handle widened to power of 2 results better. This ends
1255 // up scalarizing.
1256 // FIXME: 3 element stores scalarized on SI
1257
1258 // Split if it's too large for the address space.
1259 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1260 if (MemSize > MaxSize) {
1261 unsigned NumElts = DstTy.getNumElements();
1262 unsigned EltSize = EltTy.getSizeInBits();
1263
1264 if (MaxSize % EltSize == 0) {
1265 return std::pair(
 1266 0, LLT::scalarOrVector(
 1267 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1268 }
1269
1270 unsigned NumPieces = MemSize / MaxSize;
1271
1272 // FIXME: Refine when odd breakdowns handled
1273 // The scalars will need to be re-legalized.
1274 if (NumPieces == 1 || NumPieces >= NumElts ||
1275 NumElts % NumPieces != 0)
1276 return std::pair(0, EltTy);
1277
1278 return std::pair(0,
1279 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1280 }
1281
1282 // FIXME: We could probably handle weird extending loads better.
1283 if (DstTy.getSizeInBits() > MemSize)
1284 return std::pair(0, EltTy);
1285
1286 unsigned EltSize = EltTy.getSizeInBits();
1287 unsigned DstSize = DstTy.getSizeInBits();
1288 if (!isPowerOf2_32(DstSize)) {
1289 // We're probably decomposing an odd sized store. Try to split
1290 // to the widest type. TODO: Account for alignment. As-is it
1291 // should be OK, since the new parts will be further legalized.
1292 unsigned FloorSize = llvm::bit_floor(DstSize);
1293 return std::pair(
 1294 0, LLT::scalarOrVector(
 1295 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1296 }
1297
1298 // May need relegalization for the scalars.
1299 return std::pair(0, EltTy);
1300 })
1301 .minScalar(0, S32)
1302 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1303 .widenScalarToNextPow2(0)
1304 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1305 .lower();
1306 }
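// Editor's note (illustrative, not part of the original source): taken
// together, the rules in the loop above mean that, for example, an
// <8 x s16> global load is bitcast to <4 x s32> (a register type), an s96
// private load without flat-scratch is narrowed into 32-bit pieces, and a
// suitably aligned odd-sized load may instead be widened by the customIf
// hook that calls shouldWidenLoad().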
1307
1308 // FIXME: Unaligned accesses not lowered.
1309 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1310 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1311 {S32, GlobalPtr, S16, 2 * 8},
1312 {S32, LocalPtr, S8, 8},
1313 {S32, LocalPtr, S16, 16},
1314 {S32, PrivatePtr, S8, 8},
1315 {S32, PrivatePtr, S16, 16},
1316 {S32, ConstantPtr, S8, 8},
1317 {S32, ConstantPtr, S16, 2 * 8}})
1318 .legalIf(
1319 [=](const LegalityQuery &Query) -> bool {
1320 return isLoadStoreLegal(ST, Query);
1321 });
1322
1323 if (ST.hasFlatAddressSpace()) {
1324 ExtLoads.legalForTypesWithMemDesc(
1325 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1326 }
1327
1328 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1329 // 64-bits.
1330 //
1331 // TODO: Should generalize bitcast action into coerce, which will also cover
1332 // inserting addrspacecasts.
1333 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1334
1335 ExtLoads.clampScalar(0, S32, S32)
1336 .widenScalarToNextPow2(0)
1337 .lower();
1338
1339 auto &Atomics = getActionDefinitionsBuilder(
1340 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1341 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1342 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1343 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1344 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1345 {S64, GlobalPtr}, {S64, LocalPtr},
1346 {S32, RegionPtr}, {S64, RegionPtr}});
1347 if (ST.hasFlatAddressSpace()) {
1348 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1349 }
1350
1351 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1352 if (ST.hasLDSFPAtomicAdd()) {
1353 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1354 if (ST.hasGFX90AInsts())
1355 Atomic.legalFor({{S64, LocalPtr}});
1356 if (ST.hasGFX940Insts())
1357 Atomic.legalFor({{V2S16, LocalPtr}});
1358 }
1359 if (ST.hasAtomicFaddInsts())
1360 Atomic.legalFor({{S32, GlobalPtr}});
1361 if (ST.hasFlatAtomicFaddF32Inst())
1362 Atomic.legalFor({{S32, FlatPtr}});
1363
1364 if (ST.hasGFX90AInsts()) {
1365 // These are legal with some caveats, and should have undergone expansion in
1366 // the IR in most situations
1367 // TODO: Move atomic expansion into legalizer
1368 Atomic.legalFor({
1369 {S32, GlobalPtr},
1370 {S64, GlobalPtr},
1371 {S64, FlatPtr}
1372 });
1373 }
1374
1375 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1376 // demarshalling
1377 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1378 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1379 {S32, FlatPtr}, {S64, FlatPtr}})
1380 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1381 {S32, RegionPtr}, {S64, RegionPtr}});
1382 // TODO: Pointer types, any 32-bit or 64-bit vector
1383
1384 // Condition should be s32 for scalar, s1 for vector.
1385 getActionDefinitionsBuilder(G_SELECT)
1386 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1387 LocalPtr, FlatPtr, PrivatePtr,
1388 LLT::fixed_vector(2, LocalPtr),
1389 LLT::fixed_vector(2, PrivatePtr)},
1390 {S1, S32})
1391 .clampScalar(0, S16, S64)
1392 .scalarize(1)
1393 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1394 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1395 .clampMaxNumElements(0, S32, 2)
1396 .clampMaxNumElements(0, LocalPtr, 2)
1397 .clampMaxNumElements(0, PrivatePtr, 2)
1398 .scalarize(0)
1399 .widenScalarToNextPow2(0)
1400 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1401
1402 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1403 // be more flexible with the shift amount type.
1404 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1405 .legalFor({{S32, S32}, {S64, S32}});
1406 if (ST.has16BitInsts()) {
1407 if (ST.hasVOP3PInsts()) {
1408 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1409 .clampMaxNumElements(0, S16, 2);
1410 } else
1411 Shifts.legalFor({{S16, S16}});
1412
1413 // TODO: Support 16-bit shift amounts for all types
1414 Shifts.widenScalarIf(
1415 [=](const LegalityQuery &Query) {
1416 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1417 // 32-bit amount.
1418 const LLT ValTy = Query.Types[0];
1419 const LLT AmountTy = Query.Types[1];
1420 return ValTy.getSizeInBits() <= 16 &&
1421 AmountTy.getSizeInBits() < 16;
1422 }, changeTo(1, S16));
1423 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1424 Shifts.clampScalar(1, S32, S32);
1425 Shifts.widenScalarToNextPow2(0, 16);
1426 Shifts.clampScalar(0, S16, S64);
1427
1428 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1429 .minScalar(0, S16)
1430 .scalarize(0)
1431 .lower();
1432 } else {
1433 // Make sure we legalize the shift amount type first, as the general
1434 // expansion for the shifted type will produce much worse code if it hasn't
1435 // been truncated already.
1436 Shifts.clampScalar(1, S32, S32);
1437 Shifts.widenScalarToNextPow2(0, 32);
1438 Shifts.clampScalar(0, S32, S64);
1439
1440 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1441 .minScalar(0, S32)
1442 .scalarize(0)
1443 .lower();
1444 }
1445 Shifts.scalarize(0);
1446
1447 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1448 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1449 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1450 unsigned IdxTypeIdx = 2;
1451
1452 getActionDefinitionsBuilder(Op)
1453 .customIf([=](const LegalityQuery &Query) {
1454 const LLT EltTy = Query.Types[EltTypeIdx];
1455 const LLT VecTy = Query.Types[VecTypeIdx];
1456 const LLT IdxTy = Query.Types[IdxTypeIdx];
1457 const unsigned EltSize = EltTy.getSizeInBits();
1458 return (EltSize == 32 || EltSize == 64) &&
1459 VecTy.getSizeInBits() % 32 == 0 &&
1460 VecTy.getSizeInBits() <= MaxRegisterSize &&
1461 IdxTy.getSizeInBits() == 32;
1462 })
1463 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1464 bitcastToVectorElement32(VecTypeIdx))
1465 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1466 .bitcastIf(
1467 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1468 [=](const LegalityQuery &Query) {
1469 // For > 64-bit element types, try to turn this into a 64-bit
1470 // element vector since we may be able to do better indexing
1471 // if this is scalar. If not, fall back to 32.
1472 const LLT EltTy = Query.Types[EltTypeIdx];
1473 const LLT VecTy = Query.Types[VecTypeIdx];
1474 const unsigned DstEltSize = EltTy.getSizeInBits();
1475 const unsigned VecSize = VecTy.getSizeInBits();
1476
1477 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1478 return std::pair(
1479 VecTypeIdx,
1480 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1481 })
1482 .clampScalar(EltTypeIdx, S32, S64)
1483 .clampScalar(VecTypeIdx, S32, S64)
1484 .clampScalar(IdxTypeIdx, S32, S32)
1485 .clampMaxNumElements(VecTypeIdx, S32, 32)
1486 // TODO: Clamp elements for 64-bit vectors?
1487 // It should only be necessary with variable indexes.
1488 // As a last resort, lower to the stack
1489 .lower();
1490 }
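// Editor's note (illustrative, not part of the original source): as a
// consequence of the bitcastIf rules above, extracting an s16 element from
// <8 x s16> is rewritten as an access into <4 x s32> (with the shifts implied
// by the new element size), while 32-bit and 64-bit element vectors indexed
// by an s32 take the custom path.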
1491
1492 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1493 .unsupportedIf([=](const LegalityQuery &Query) {
1494 const LLT &EltTy = Query.Types[1].getElementType();
1495 return Query.Types[0] != EltTy;
1496 });
1497
1498 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1499 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1500 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1501
1502 // FIXME: Doesn't handle extract of illegal sizes.
1503 getActionDefinitionsBuilder(Op)
1504 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1505 .lowerIf([=](const LegalityQuery &Query) {
1506 // Sub-vector(or single element) insert and extract.
1507 // TODO: verify immediate offset here since lower only works with
1508 // whole elements.
1509 const LLT BigTy = Query.Types[BigTyIdx];
1510 return BigTy.isVector();
1511 })
1512 // FIXME: Multiples of 16 should not be legal.
1513 .legalIf([=](const LegalityQuery &Query) {
1514 const LLT BigTy = Query.Types[BigTyIdx];
1515 const LLT LitTy = Query.Types[LitTyIdx];
1516 return (BigTy.getSizeInBits() % 32 == 0) &&
1517 (LitTy.getSizeInBits() % 16 == 0);
1518 })
1519 .widenScalarIf(
1520 [=](const LegalityQuery &Query) {
1521 const LLT BigTy = Query.Types[BigTyIdx];
1522 return (BigTy.getScalarSizeInBits() < 16);
1523 },
 1524 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
 1525 .widenScalarIf(
1526 [=](const LegalityQuery &Query) {
1527 const LLT LitTy = Query.Types[LitTyIdx];
1528 return (LitTy.getScalarSizeInBits() < 16);
1529 },
 1530 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
 1531 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1532 .widenScalarToNextPow2(BigTyIdx, 32);
1533
1534 }
1535
1536 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1537 .legalForCartesianProduct(AllS32Vectors, {S32})
1538 .legalForCartesianProduct(AllS64Vectors, {S64})
1539 .clampNumElements(0, V16S32, V32S32)
1540 .clampNumElements(0, V2S64, V16S64)
1541 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1542
1543 if (ST.hasScalarPackInsts()) {
1544 BuildVector
1545 // FIXME: Should probably widen s1 vectors straight to s32
1546 .minScalarOrElt(0, S16)
1547 .minScalar(1, S16);
1548
1549 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1550 .legalFor({V2S16, S32})
1551 .lower();
1552 } else {
1553 BuildVector.customFor({V2S16, S16});
1554 BuildVector.minScalarOrElt(0, S32);
1555
1556 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1557 .customFor({V2S16, S32})
1558 .lower();
1559 }
1560
1561 BuildVector.legalIf(isRegisterType(0));
1562
1563 // FIXME: Clamp maximum size
1564 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1565 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1566 .clampMaxNumElements(0, S32, 32)
1567 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1568 .clampMaxNumElements(0, S16, 64);
1569
1570 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1571
1572 // Merge/Unmerge
1573 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1574 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1575 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1576
1577 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1578 const LLT Ty = Query.Types[TypeIdx];
1579 if (Ty.isVector()) {
1580 const LLT &EltTy = Ty.getElementType();
1581 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1582 return true;
1583 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1584 return true;
1585 }
1586 return false;
1587 };
1588
1589 auto &Builder = getActionDefinitionsBuilder(Op)
1590 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1591 .lowerFor({{S16, V2S16}})
1592 .lowerIf([=](const LegalityQuery &Query) {
1593 const LLT BigTy = Query.Types[BigTyIdx];
1594 return BigTy.getSizeInBits() == 32;
1595 })
1596 // Try to widen to s16 first for small types.
1597 // TODO: Only do this on targets with legal s16 shifts
1598 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1599 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1600 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1601 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1602 elementTypeIs(1, S16)),
1603 changeTo(1, V2S16))
1604 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1605 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1606 // valid.
1607 .clampScalar(LitTyIdx, S32, S512)
1608 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1609 // Break up vectors with weird elements into scalars
1610 .fewerElementsIf(
1611 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1612 scalarize(0))
1613 .fewerElementsIf(
1614 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1615 scalarize(1))
1616 .clampScalar(BigTyIdx, S32, MaxScalar);
1617
1618 if (Op == G_MERGE_VALUES) {
1619 Builder.widenScalarIf(
1620 // TODO: Use 16-bit shifts if legal for 8-bit values?
1621 [=](const LegalityQuery &Query) {
1622 const LLT Ty = Query.Types[LitTyIdx];
1623 return Ty.getSizeInBits() < 32;
1624 },
1625 changeTo(LitTyIdx, S32));
1626 }
1627
1628 Builder.widenScalarIf(
1629 [=](const LegalityQuery &Query) {
1630 const LLT Ty = Query.Types[BigTyIdx];
1631 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1632 Ty.getSizeInBits() % 16 != 0;
1633 },
1634 [=](const LegalityQuery &Query) {
1635 // Pick the next power of 2, or a multiple of 64 over 128.
1636 // Whichever is smaller.
1637 const LLT &Ty = Query.Types[BigTyIdx];
1638 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1639 if (NewSizeInBits >= 256) {
1640 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1641 if (RoundedTo < NewSizeInBits)
1642 NewSizeInBits = RoundedTo;
1643 }
1644 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1645 })
1646 // Any vectors left are the wrong size. Scalarize them.
1647 .scalarize(0)
1648 .scalarize(1);
1649 }
1650
1651 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1652 // RegBankSelect.
1653 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1654 .legalFor({{S32}, {S64}});
1655
1656 if (ST.hasVOP3PInsts()) {
1657 SextInReg.lowerFor({{V2S16}})
1658 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1659 // get more vector shift opportunities, since we'll get those when
1660 // expanded.
1661 .clampMaxNumElementsStrict(0, S16, 2);
1662 } else if (ST.has16BitInsts()) {
1663 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1664 } else {
1665 // Prefer to promote to s32 before lowering if we don't have 16-bit
1666 // shifts. This avoid a lot of intermediate truncate and extend operations.
1667 SextInReg.lowerFor({{S32}, {S64}});
1668 }
1669
1670 SextInReg
1671 .scalarize(0)
1672 .clampScalar(0, S32, S64)
1673 .lower();
1674
1675 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1676 .scalarize(0)
1677 .lower();
1678
 1679 // TODO: Only try to form v2s16 with legal packed instructions.
1680 getActionDefinitionsBuilder(G_FSHR)
1681 .legalFor({{S32, S32}})
1682 .lowerFor({{V2S16, V2S16}})
1683 .clampMaxNumElementsStrict(0, S16, 2)
1684 .scalarize(0)
1685 .lower();
1686
1687 if (ST.hasVOP3PInsts()) {
1688 getActionDefinitionsBuilder(G_FSHL)
1689 .lowerFor({{V2S16, V2S16}})
1690 .clampMaxNumElementsStrict(0, S16, 2)
1691 .scalarize(0)
1692 .lower();
1693 } else {
1694 getActionDefinitionsBuilder(G_FSHL)
1695 .scalarize(0)
1696 .lower();
1697 }
1698
1699 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1700 .legalFor({S64});
1701
1702 getActionDefinitionsBuilder(G_FENCE)
1703 .alwaysLegal();
1704
1705 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1706 .scalarize(0)
1707 .minScalar(0, S32)
1708 .lower();
1709
1710 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1711 .legalFor({{S32, S32}, {S64, S32}})
1712 .clampScalar(1, S32, S32)
1713 .clampScalar(0, S32, S64)
1714 .widenScalarToNextPow2(0)
1715 .scalarize(0);
1716
1717 getActionDefinitionsBuilder({
1718 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1719 G_FCOPYSIGN,
1720
1721 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1722 G_ATOMICRMW_NAND,
1723 G_ATOMICRMW_FSUB,
1724 G_READ_REGISTER,
1725 G_WRITE_REGISTER,
1726
1727 G_SADDO, G_SSUBO,
1728
1729 // TODO: Implement
1730 G_FMINIMUM, G_FMAXIMUM}).lower();
1731
1732 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1733 .lower();
1734
1735 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1736 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1737 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1738 .unsupported();
1739
1740 getLegacyLegalizerInfo().computeTables();
1741 verify(*ST.getInstrInfo());
1742}
1743
 1744bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
 1745 MachineInstr &MI) const {
1746 MachineIRBuilder &B = Helper.MIRBuilder;
1747 MachineRegisterInfo &MRI = *B.getMRI();
1748
1749 switch (MI.getOpcode()) {
1750 case TargetOpcode::G_ADDRSPACE_CAST:
1751 return legalizeAddrSpaceCast(MI, MRI, B);
1752 case TargetOpcode::G_FRINT:
1753 return legalizeFrint(MI, MRI, B);
1754 case TargetOpcode::G_FCEIL:
1755 return legalizeFceil(MI, MRI, B);
1756 case TargetOpcode::G_FREM:
1757 return legalizeFrem(MI, MRI, B);
1758 case TargetOpcode::G_INTRINSIC_TRUNC:
1759 return legalizeIntrinsicTrunc(MI, MRI, B);
1760 case TargetOpcode::G_SITOFP:
1761 return legalizeITOFP(MI, MRI, B, true);
1762 case TargetOpcode::G_UITOFP:
1763 return legalizeITOFP(MI, MRI, B, false);
1764 case TargetOpcode::G_FPTOSI:
1765 return legalizeFPTOI(MI, MRI, B, true);
1766 case TargetOpcode::G_FPTOUI:
1767 return legalizeFPTOI(MI, MRI, B, false);
1768 case TargetOpcode::G_FMINNUM:
1769 case TargetOpcode::G_FMAXNUM:
1770 case TargetOpcode::G_FMINNUM_IEEE:
1771 case TargetOpcode::G_FMAXNUM_IEEE:
1772 return legalizeMinNumMaxNum(Helper, MI);
1773 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1774 return legalizeExtractVectorElt(MI, MRI, B);
1775 case TargetOpcode::G_INSERT_VECTOR_ELT:
1776 return legalizeInsertVectorElt(MI, MRI, B);
1777 case TargetOpcode::G_FSIN:
1778 case TargetOpcode::G_FCOS:
1779 return legalizeSinCos(MI, MRI, B);
1780 case TargetOpcode::G_GLOBAL_VALUE:
1781 return legalizeGlobalValue(MI, MRI, B);
1782 case TargetOpcode::G_LOAD:
1783 case TargetOpcode::G_SEXTLOAD:
1784 case TargetOpcode::G_ZEXTLOAD:
1785 return legalizeLoad(Helper, MI);
1786 case TargetOpcode::G_FMAD:
1787 return legalizeFMad(MI, MRI, B);
1788 case TargetOpcode::G_FDIV:
1789 return legalizeFDIV(MI, MRI, B);
1790 case TargetOpcode::G_UDIV:
1791 case TargetOpcode::G_UREM:
1792 case TargetOpcode::G_UDIVREM:
1793 return legalizeUnsignedDIV_REM(MI, MRI, B);
1794 case TargetOpcode::G_SDIV:
1795 case TargetOpcode::G_SREM:
1796 case TargetOpcode::G_SDIVREM:
1797 return legalizeSignedDIV_REM(MI, MRI, B);
1798 case TargetOpcode::G_ATOMIC_CMPXCHG:
1799 return legalizeAtomicCmpXChg(MI, MRI, B);
1800 case TargetOpcode::G_FLOG:
1801 return legalizeFlog(MI, B, numbers::ln2f);
1802 case TargetOpcode::G_FLOG10:
1803 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1804 case TargetOpcode::G_FEXP:
1805 return legalizeFExp(MI, B);
1806 case TargetOpcode::G_FPOW:
1807 return legalizeFPow(MI, B);
1808 case TargetOpcode::G_FFLOOR:
1809 return legalizeFFloor(MI, MRI, B);
1810 case TargetOpcode::G_BUILD_VECTOR:
1811 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
1812 return legalizeBuildVector(MI, MRI, B);
1813 case TargetOpcode::G_MUL:
1814 return legalizeMul(Helper, MI);
1815 case TargetOpcode::G_CTLZ:
1816 case TargetOpcode::G_CTTZ:
1817 return legalizeCTLZ_CTTZ(MI, MRI, B);
1818 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1819 return legalizeFPTruncRound(MI, B);
1820 default:
1821 return false;
1822 }
1823
1824 llvm_unreachable("expected switch to return");
1825}
1826
1827 Register AMDGPULegalizerInfo::getSegmentAperture(
1828     unsigned AS,
1829     MachineRegisterInfo &MRI,
1830     MachineIRBuilder &B) const {
1831 MachineFunction &MF = B.getMF();
1832 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1833 const LLT S32 = LLT::scalar(32);
1834 const LLT S64 = LLT::scalar(64);
1835
1837
1838 if (ST.hasApertureRegs()) {
1839 // Note: this register is somewhat broken. When used as a 32-bit operand,
1840 // it only returns zeroes. The real value is in the upper 32 bits.
1841 // Thus, we must emit code to extract the high 32 bits.
1842 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
1843 ? AMDGPU::SRC_SHARED_BASE
1844 : AMDGPU::SRC_PRIVATE_BASE;
1845 // FIXME: It would be more natural to emit a COPY here, but then copy
1846 // coalescing would kick in and it would think it's okay to use the "HI"
1847 // subregister (instead of extracting the HI 32 bits) which is an artificial
1848 // (unusable) register.
1849 // Register TableGen definitions would need an overhaul to get rid of the
1850 // artificial "HI" aperture registers and prevent this kind of issue from
1851 // happening.
1852 Register Dst = MRI.createGenericVirtualRegister(S64);
1853 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
1854 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
1855 return B.buildUnmerge(S32, Dst).getReg(1);
1856 }
1857
1858 // TODO: can we be smarter about machine pointer info?
1860 Register LoadAddr = MRI.createGenericVirtualRegister(
1862 // For code object version 5, private_base and shared_base are passed through
1863 // implicit kernargs.
1869 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1870
1871 Register KernargPtrReg = MRI.createGenericVirtualRegister(
1873
1874 if (!loadInputValue(KernargPtrReg, B,
1876 return Register();
1877
1879 PtrInfo,
1883
1884 // Pointer address
1885 B.buildPtrAdd(LoadAddr, KernargPtrReg,
1886 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1887 // Load address
1888 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1889 }
1890
1891 Register QueuePtr = MRI.createGenericVirtualRegister(
1893
1895 return Register();
1896
1897 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1898 // private_segment_aperture_base_hi.
1899 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1900
1902 PtrInfo,
1905 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1906
1907 B.buildPtrAdd(LoadAddr, QueuePtr,
1908 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1909 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1910}
1911
1912/// Return true if the value is a known valid address, such that a null check is
1913/// not necessary.
1914 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1915                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1916 MachineInstr *Def = MRI.getVRegDef(Val);
1917 switch (Def->getOpcode()) {
1918 case AMDGPU::G_FRAME_INDEX:
1919 case AMDGPU::G_GLOBAL_VALUE:
1920 case AMDGPU::G_BLOCK_ADDR:
1921 return true;
1922 case AMDGPU::G_CONSTANT: {
1923 const ConstantInt *CI = Def->getOperand(1).getCImm();
1924 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1925 }
1926 default:
1927 return false;
1928 }
1929
1930 return false;
1931}
1932
1933 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1934     MachineInstr &MI, MachineRegisterInfo &MRI,
1935     MachineIRBuilder &B) const {
1936 MachineFunction &MF = B.getMF();
1937
1938 const LLT S32 = LLT::scalar(32);
1939 Register Dst = MI.getOperand(0).getReg();
1940 Register Src = MI.getOperand(1).getReg();
1941
1942 LLT DstTy = MRI.getType(Dst);
1943 LLT SrcTy = MRI.getType(Src);
1944 unsigned DestAS = DstTy.getAddressSpace();
1945 unsigned SrcAS = SrcTy.getAddressSpace();
1946
1947 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1948 // vector element.
1949 assert(!DstTy.isVector());
1950
1951 const AMDGPUTargetMachine &TM
1952 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1953
1954 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1955 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1956 return true;
1957 }
1958
1959 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1960 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1961 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1962 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1963 // Extract low 32-bits of the pointer.
1964 B.buildExtract(Dst, Src, 0);
1965 MI.eraseFromParent();
1966 return true;
1967 }
1968
1969 unsigned NullVal = TM.getNullPointerValue(DestAS);
1970
1971 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1972 auto FlatNull = B.buildConstant(SrcTy, 0);
1973
1974 // Extract low 32-bits of the pointer.
1975 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1976
1977 auto CmpRes =
1978 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1979 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1980
1981 MI.eraseFromParent();
1982 return true;
1983 }
1984
1985 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1986 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1987 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1988 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1989 if (!ApertureReg.isValid())
1990 return false;
1991
1992 // Coerce the type of the low half of the result so we can use merge_values.
1993 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1994
1995 // TODO: Should we allow mismatched types but matching sizes in merges to
1996 // avoid the ptrtoint?
1997 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
1998
1999 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2000 B.buildCopy(Dst, BuildPtr);
2001 MI.eraseFromParent();
2002 return true;
2003 }
2004
2005 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2006 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2007
2008 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2009 SegmentNull.getReg(0));
2010
2011 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2012
2013 MI.eraseFromParent();
2014 return true;
2015 }
2016
2017 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2018 SrcTy.getSizeInBits() == 64) {
2019 // Truncate.
2020 B.buildExtract(Dst, Src, 0);
2021 MI.eraseFromParent();
2022 return true;
2023 }
2024
2025 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2026 DstTy.getSizeInBits() == 64) {
2028 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2029 auto PtrLo = B.buildPtrToInt(S32, Src);
2030 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2031 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2032 MI.eraseFromParent();
2033 return true;
2034 }
2035
2036 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2037 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2038
2039 LLVMContext &Ctx = MF.getFunction().getContext();
2040 Ctx.diagnose(InvalidAddrSpaceCast);
2041 B.buildUndef(Dst);
2042 MI.eraseFromParent();
2043 return true;
2044}
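// Sketch of what the FLAT <-> LOCAL/PRIVATE paths above produce (illustrative
// generic MIR, not taken from actual compiler output):
//   ; local/private -> flat
//   %lo:_(s32)  = G_PTRTOINT %src
//   %hi:_(s32)  = <aperture: high half of src_shared/private_base, or a load
//                  from the implicit kernargs / queue pointer>
//   %cast:_(p0) = G_MERGE_VALUES %lo, %hi
//   %dst        = G_SELECT (%src != segment_null), %cast, flat_null
//   ; flat -> local/private
//   %dst        = G_SELECT (%src != flat_null), G_EXTRACT(%src, 0), segment_null
// The select and the null constants are skipped entirely when isKnownNonNull()
// proves the source cannot be the null pointer of its address space.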
2045
2046 bool AMDGPULegalizerInfo::legalizeFrint(
2047     MachineInstr &MI, MachineRegisterInfo &MRI,
2048     MachineIRBuilder &B) const {
2049 Register Src = MI.getOperand(1).getReg();
2050 LLT Ty = MRI.getType(Src);
2051 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2052
2053 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2054 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2055
2056 auto C1 = B.buildFConstant(Ty, C1Val);
2057 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2058
2059 // TODO: Should this propagate fast-math-flags?
2060 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2061 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2062
2063 auto C2 = B.buildFConstant(Ty, C2Val);
2064 auto Fabs = B.buildFAbs(Ty, Src);
2065
2066 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2067 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2068 MI.eraseFromParent();
2069 return true;
2070}
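// Note on the constants above (explanatory, not from the original source):
// adding copysign(2^52, x) and then subtracting it again forces the FPU to
// drop the fractional bits, because for |x| < 2^52 the sum has no room for
// them in a double's 52-bit mantissa, and the default round-to-nearest-even
// mode performs exactly the rounding G_FRINT requires. For example,
// 3.3 + 2^52 rounds to 2^52 + 3, and subtracting 2^52 yields 3.0. C2Val
// (just below 2^52) guards values whose magnitude is already too large to
// carry a fraction; for those the source is returned unchanged.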
2071
2072 bool AMDGPULegalizerInfo::legalizeFceil(
2073     MachineInstr &MI, MachineRegisterInfo &MRI,
2074     MachineIRBuilder &B) const {
2075
2076 const LLT S1 = LLT::scalar(1);
2077 const LLT S64 = LLT::scalar(64);
2078
2079 Register Src = MI.getOperand(1).getReg();
2080 assert(MRI.getType(Src) == S64);
2081
2082 // result = trunc(src)
2083 // if (src > 0.0 && src != result)
2084 // result += 1.0
2085
2086 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2087
2088 const auto Zero = B.buildFConstant(S64, 0.0);
2089 const auto One = B.buildFConstant(S64, 1.0);
2090 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2091 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2092 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2093 auto Add = B.buildSelect(S64, And, One, Zero);
2094
2095 // TODO: Should this propagate fast-math-flags?
2096 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2097 MI.eraseFromParent();
2098 return true;
2099}
2100
2101 bool AMDGPULegalizerInfo::legalizeFrem(
2102     MachineInstr &MI, MachineRegisterInfo &MRI,
2103     MachineIRBuilder &B) const {
2104 Register DstReg = MI.getOperand(0).getReg();
2105 Register Src0Reg = MI.getOperand(1).getReg();
2106 Register Src1Reg = MI.getOperand(2).getReg();
2107 auto Flags = MI.getFlags();
2108 LLT Ty = MRI.getType(DstReg);
2109
2110 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2111 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2112 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2113 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2114 MI.eraseFromParent();
2115 return true;
2116}
2117
2118 static MachineInstrBuilder extractF64Exponent(Register Hi,
2119                                               MachineIRBuilder &B) {
2120 const unsigned FractBits = 52;
2121 const unsigned ExpBits = 11;
2122 LLT S32 = LLT::scalar(32);
2123
2124 auto Const0 = B.buildConstant(S32, FractBits - 32);
2125 auto Const1 = B.buildConstant(S32, ExpBits);
2126
2127 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2128 .addUse(Hi)
2129 .addUse(Const0.getReg(0))
2130 .addUse(Const1.getReg(0));
2131
2132 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2133}
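// Background for the constants above: in an IEEE-754 double the biased
// exponent occupies bits [62:52], i.e. bits [30:20] of the high 32-bit word.
// The amdgcn.ubfe call extracts ExpBits (11) bits starting at bit
// FractBits - 32 (20) of Hi, and subtracting the bias 1023 gives the unbiased
// exponent. E.g. for 8.0 = 2^3 the stored field is 1026 and the result is 3.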
2134
2135 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2136     MachineInstr &MI, MachineRegisterInfo &MRI,
2137     MachineIRBuilder &B) const {
2138 const LLT S1 = LLT::scalar(1);
2139 const LLT S32 = LLT::scalar(32);
2140 const LLT S64 = LLT::scalar(64);
2141
2142 Register Src = MI.getOperand(1).getReg();
2143 assert(MRI.getType(Src) == S64);
2144
2145 // TODO: Should this use extract since the low half is unused?
2146 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2147 Register Hi = Unmerge.getReg(1);
2148
2149 // Extract the upper half, since this is where we will find the sign and
2150 // exponent.
2151 auto Exp = extractF64Exponent(Hi, B);
2152
2153 const unsigned FractBits = 52;
2154
2155 // Extract the sign bit.
2156 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2157 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2158
2159 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2160
2161 const auto Zero32 = B.buildConstant(S32, 0);
2162
2163 // Extend back to 64-bits.
2164 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2165
2166 auto Shr = B.buildAShr(S64, FractMask, Exp);
2167 auto Not = B.buildNot(S64, Shr);
2168 auto Tmp0 = B.buildAnd(S64, Src, Not);
2169 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2170
2171 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2172 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2173
2174 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2175 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2176 MI.eraseFromParent();
2177 return true;
2178}
2179
2180 bool AMDGPULegalizerInfo::legalizeITOFP(
2181     MachineInstr &MI, MachineRegisterInfo &MRI,
2182     MachineIRBuilder &B, bool Signed) const {
2183
2184 Register Dst = MI.getOperand(0).getReg();
2185 Register Src = MI.getOperand(1).getReg();
2186
2187 const LLT S64 = LLT::scalar(64);
2188 const LLT S32 = LLT::scalar(32);
2189
2190 assert(MRI.getType(Src) == S64);
2191
2192 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2193 auto ThirtyTwo = B.buildConstant(S32, 32);
2194
2195 if (MRI.getType(Dst) == S64) {
2196 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2197 : B.buildUITOFP(S64, Unmerge.getReg(1));
2198
2199 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2200 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2201 .addUse(CvtHi.getReg(0))
2202 .addUse(ThirtyTwo.getReg(0));
2203
2204 // TODO: Should this propagate fast-math-flags?
2205 B.buildFAdd(Dst, LdExp, CvtLo);
2206 MI.eraseFromParent();
2207 return true;
2208 }
2209
2210 assert(MRI.getType(Dst) == S32);
2211
2212 auto One = B.buildConstant(S32, 1);
2213
2214 MachineInstrBuilder ShAmt;
2215 if (Signed) {
2216 auto ThirtyOne = B.buildConstant(S32, 31);
2217 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2218 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2219 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2220 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2221 /*HasSideEffects=*/false)
2222 .addUse(Unmerge.getReg(1));
2223 auto LS2 = B.buildSub(S32, LS, One);
2224 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2225 } else
2226 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2227 auto Norm = B.buildShl(S64, Src, ShAmt);
2228 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2229 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2230 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2231 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2232 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2233 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2234 /*HasSideEffects=*/false)
2235 .addUse(FVal.getReg(0))
2236 .addUse(Scale.getReg(0));
2237 MI.eraseFromParent();
2238 return true;
2239}
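// Summary of the 64-bit integer to 32-bit float path above (descriptive
// only): the source is shifted left so its leading significant bit moves
// toward bit 63, the low word of the shifted value is folded into bit 0 as a
// sticky bit so rounding still sees any discarded ones, the high 32 bits are
// converted, and ldexp scales the result back by 32 - shift.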
2240
2241// TODO: Copied from DAG implementation. Verify logic and document how this
2242// actually works.
2243 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2244                                         MachineRegisterInfo &MRI,
2245                                         MachineIRBuilder &B,
2246                                         bool Signed) const {
2247
2248 Register Dst = MI.getOperand(0).getReg();
2249 Register Src = MI.getOperand(1).getReg();
2250
2251 const LLT S64 = LLT::scalar(64);
2252 const LLT S32 = LLT::scalar(32);
2253
2254 const LLT SrcLT = MRI.getType(Src);
2255 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2256
2257 unsigned Flags = MI.getFlags();
2258
2259 // The basic idea of converting a floating point number into a pair of 32-bit
2260 // integers is illustrated as follows:
2261 //
2262 // tf := trunc(val);
2263 // hif := floor(tf * 2^-32);
2264 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2265 // hi := fptoi(hif);
2266 // lo := fptoi(lof);
2267 //
2268 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2270 if (Signed && SrcLT == S32) {
2271 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2272 // is not enough to hold all the significant bits of `lof` if val is
2273 // negative. To avoid the loss of precision, we need to take the absolute
2274 // value after truncating and flip the result back based on the original
2275 // signedness.
2276 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2277 Trunc = B.buildFAbs(S32, Trunc, Flags);
2278 }
2279 MachineInstrBuilder K0, K1;
2280 if (SrcLT == S64) {
2281 K0 = B.buildFConstant(S64,
2282 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2283 K1 = B.buildFConstant(S64,
2284 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2285 } else {
2286 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2287 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2288 }
2289
2290 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2291 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2292 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2293
2294 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2295 : B.buildFPTOUI(S32, FloorMul);
2296 auto Lo = B.buildFPTOUI(S32, Fma);
2297
2298 if (Signed && SrcLT == S32) {
2299 // Flip the result based on the signedness, which is either all 0s or 1s.
2300 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2301 // r := xor({lo, hi}, sign) - sign;
2302 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2303 Sign);
2304 } else
2305 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2306 MI.eraseFromParent();
2307
2308 return true;
2309}
2310
2311 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2312                                                 MachineInstr &MI) const {
2313 MachineFunction &MF = Helper.MIRBuilder.getMF();
2315
2316 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2317 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2318
2319 // With ieee_mode disabled, the instructions have the correct behavior
2320 // already for G_FMINNUM/G_FMAXNUM
2321 if (!MFI->getMode().IEEE)
2322 return !IsIEEEOp;
2323
2324 if (IsIEEEOp)
2325 return true;
2326
2327 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2328}
2329
2332 MachineIRBuilder &B) const {
2333 // TODO: Should move some of this into LegalizerHelper.
2334
2335 // TODO: Promote dynamic indexing of s16 to s32
2336
2337 // FIXME: Artifact combiner probably should have replaced the truncated
2338 // constant before this, so we shouldn't need
2339 // getIConstantVRegValWithLookThrough.
2340 std::optional<ValueAndVReg> MaybeIdxVal =
2341 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2342 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2343 return true;
2344 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2345
2346 Register Dst = MI.getOperand(0).getReg();
2347 Register Vec = MI.getOperand(1).getReg();
2348
2349 LLT VecTy = MRI.getType(Vec);
2350 LLT EltTy = VecTy.getElementType();
2351 assert(EltTy == MRI.getType(Dst));
2352
2353 if (IdxVal < VecTy.getNumElements()) {
2354 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2355 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2356 } else {
2357 B.buildUndef(Dst);
2358 }
2359
2360 MI.eraseFromParent();
2361 return true;
2362}
2363
2366 MachineIRBuilder &B) const {
2367 // TODO: Should move some of this into LegalizerHelper.
2368
2369 // TODO: Promote dynamic indexing of s16 to s32
2370
2371 // FIXME: Artifact combiner probably should have replaced the truncated
2372 // constant before this, so we shouldn't need
2373 // getIConstantVRegValWithLookThrough.
2374 std::optional<ValueAndVReg> MaybeIdxVal =
2375 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2376 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2377 return true;
2378
2379 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2380 Register Dst = MI.getOperand(0).getReg();
2381 Register Vec = MI.getOperand(1).getReg();
2382 Register Ins = MI.getOperand(2).getReg();
2383
2384 LLT VecTy = MRI.getType(Vec);
2385 LLT EltTy = VecTy.getElementType();
2386 assert(EltTy == MRI.getType(Ins));
2387 (void)Ins;
2388
2389 unsigned NumElts = VecTy.getNumElements();
2390 if (IdxVal < NumElts) {
2392 for (unsigned i = 0; i < NumElts; ++i)
2393 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2394 B.buildUnmerge(SrcRegs, Vec);
2395
2396 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2397 B.buildMergeLikeInstr(Dst, SrcRegs);
2398 } else {
2399 B.buildUndef(Dst);
2400 }
2401
2402 MI.eraseFromParent();
2403 return true;
2404}
2405
2408 MachineIRBuilder &B) const {
2409
2410 Register DstReg = MI.getOperand(0).getReg();
2411 Register SrcReg = MI.getOperand(1).getReg();
2412 LLT Ty = MRI.getType(DstReg);
2413 unsigned Flags = MI.getFlags();
2414
2415 Register TrigVal;
2416 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2417 if (ST.hasTrigReducedRange()) {
2418 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2419 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2420 .addUse(MulVal.getReg(0))
2421 .setMIFlags(Flags).getReg(0);
2422 } else
2423 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2424
2425 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2426 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2427 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
2428 .addUse(TrigVal)
2429 .setMIFlags(Flags);
2430 MI.eraseFromParent();
2431 return true;
2432}
2433
2434 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2435                                                   MachineIRBuilder &B,
2436                                                   const GlobalValue *GV,
2437 int64_t Offset,
2438 unsigned GAFlags) const {
2439 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2440 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2441 // to the following code sequence:
2442 //
2443 // For constant address space:
2444 // s_getpc_b64 s[0:1]
2445 // s_add_u32 s0, s0, $symbol
2446 // s_addc_u32 s1, s1, 0
2447 //
2448 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2449 // a fixup or relocation is emitted to replace $symbol with a literal
2450 // constant, which is a pc-relative offset from the encoding of the $symbol
2451 // operand to the global variable.
2452 //
2453 // For global address space:
2454 // s_getpc_b64 s[0:1]
2455 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2456 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2457 //
2458 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2459 // fixups or relocations are emitted to replace $symbol@*@lo and
2460 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2461 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2462 // operand to the global variable.
2463 //
2464 // What we want here is an offset from the value returned by s_getpc
2465 // (which is the address of the s_add_u32 instruction) to the global
2466 // variable, but since the encoding of $symbol starts 4 bytes after the start
2467 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2468 // small. This requires us to add 4 to the global variable offset in order to
2469 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2470 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2471 // instruction.
2472
2474
2475 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2476 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2477
2478 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2479 .addDef(PCReg);
2480
2481 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2482 if (GAFlags == SIInstrInfo::MO_NONE)
2483 MIB.addImm(0);
2484 else
2485 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2486
2487 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2488
2489 if (PtrTy.getSizeInBits() == 32)
2490 B.buildExtract(DstReg, PCReg, 0);
2491 return true;
2492 }
2493
2494 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2495     MachineInstr &MI, MachineRegisterInfo &MRI,
2496     MachineIRBuilder &B) const {
2497 Register DstReg = MI.getOperand(0).getReg();
2498 LLT Ty = MRI.getType(DstReg);
2499 unsigned AS = Ty.getAddressSpace();
2500
2501 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2502 MachineFunction &MF = B.getMF();
2503 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2504
2505 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2506 if (!MFI->isModuleEntryFunction() &&
2507 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2508 const Function &Fn = MF.getFunction();
2509 DiagnosticInfoUnsupported BadLDSDecl(
2510 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2511 DS_Warning);
2512 Fn.getContext().diagnose(BadLDSDecl);
2513
2514 // We currently don't have a way to correctly allocate LDS objects that
2515 // aren't directly associated with a kernel. We do force inlining of
2516 // functions that use local objects. However, if these dead functions are
2517 // not eliminated, we don't want a compile time error. Just emit a warning
2518 // and a trap, since there should be no callable path here.
2519 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2520 B.buildUndef(DstReg);
2521 MI.eraseFromParent();
2522 return true;
2523 }
2524
2525 // TODO: We could emit code to handle the initialization somewhere.
2526 // We ignore the initializer for now and legalize it to allow selection.
2527 // The initializer will be diagnosed during assembly emission anyway.
2528 const SITargetLowering *TLI = ST.getTargetLowering();
2529 if (!TLI->shouldUseLDSConstAddress(GV)) {
2530 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2531 return true; // Leave in place.
2532 }
2533
2534 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2535 Type *Ty = GV->getValueType();
2536 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2537 // zero-sized type in other languages to declare the dynamic shared
2538 // memory, whose size is not known at compile time. They will be
2539 // allocated by the runtime and placed directly after the statically
2540 // allocated ones. They all share the same offset.
2541 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2542 // Adjust alignment for that dynamic shared memory array.
2543 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2544 LLT S32 = LLT::scalar(32);
2545 auto Sz =
2546 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2547 B.buildIntToPtr(DstReg, Sz);
2548 MI.eraseFromParent();
2549 return true;
2550 }
2551 }
2552
2553 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2554 *cast<GlobalVariable>(GV)));
2555 MI.eraseFromParent();
2556 return true;
2557 }
2558
2559 const SITargetLowering *TLI = ST.getTargetLowering();
2560
2561 if (TLI->shouldEmitFixup(GV)) {
2562 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2563 MI.eraseFromParent();
2564 return true;
2565 }
2566
2567 if (TLI->shouldEmitPCReloc(GV)) {
2568 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2569 MI.eraseFromParent();
2570 return true;
2571 }
2572
2574 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2575
2576 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2581 LoadTy, Align(8));
2582
2583 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2584
2585 if (Ty.getSizeInBits() == 32) {
2586 // Truncate if this is a 32-bit constant address.
2587 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2588 B.buildExtract(DstReg, Load, 0);
2589 } else
2590 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2591
2592 MI.eraseFromParent();
2593 return true;
2594}
2595
2596 static LLT widenToNextPowerOf2(LLT Ty) {
2597 if (Ty.isVector())
2598 return Ty.changeElementCount(
2599         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2600 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2601}
2602
2603 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2604                                        MachineInstr &MI) const {
2605 MachineIRBuilder &B = Helper.MIRBuilder;
2606 MachineRegisterInfo &MRI = *B.getMRI();
2607 GISelChangeObserver &Observer = Helper.Observer;
2608
2609 Register PtrReg = MI.getOperand(1).getReg();
2610 LLT PtrTy = MRI.getType(PtrReg);
2611 unsigned AddrSpace = PtrTy.getAddressSpace();
2612
2613 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2615 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2616 Observer.changingInstr(MI);
2617 MI.getOperand(1).setReg(Cast.getReg(0));
2618 Observer.changedInstr(MI);
2619 return true;
2620 }
2621
2622 if (MI.getOpcode() != AMDGPU::G_LOAD)
2623 return false;
2624
2625 Register ValReg = MI.getOperand(0).getReg();
2626 LLT ValTy = MRI.getType(ValReg);
2627
2628 MachineMemOperand *MMO = *MI.memoperands_begin();
2629 const unsigned ValSize = ValTy.getSizeInBits();
2630 const LLT MemTy = MMO->getMemoryType();
2631 const Align MemAlign = MMO->getAlign();
2632 const unsigned MemSize = MemTy.getSizeInBits();
2633 const uint64_t AlignInBits = 8 * MemAlign.value();
2634
2635 // Widen non-power-of-2 loads to the alignment if needed
2636 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2637 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2638
2639 // This was already the correct extending load result type, so just adjust
2640 // the memory type.
2641 if (WideMemSize == ValSize) {
2642 MachineFunction &MF = B.getMF();
2643
2644 MachineMemOperand *WideMMO =
2645 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2646 Observer.changingInstr(MI);
2647 MI.setMemRefs(MF, {WideMMO});
2648 Observer.changedInstr(MI);
2649 return true;
2650 }
2651
2652 // Don't bother handling an edge case that should probably never be produced.
2653 if (ValSize > WideMemSize)
2654 return false;
2655
2656 LLT WideTy = widenToNextPowerOf2(ValTy);
2657
2658 Register WideLoad;
2659 if (!WideTy.isVector()) {
2660 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2661 B.buildTrunc(ValReg, WideLoad).getReg(0);
2662 } else {
2663 // Extract the subvector.
2664
2665 if (isRegisterType(ValTy)) {
2666 // If this a case where G_EXTRACT is legal, use it.
2667 // (e.g. <3 x s32> -> <4 x s32>)
2668 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2669 B.buildExtract(ValReg, WideLoad, 0);
2670 } else {
2671 // For cases where the widened type isn't a nice register value, unmerge
2672 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2673 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2674 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2675 }
2676 }
2677
2678 MI.eraseFromParent();
2679 return true;
2680 }
2681
2682 return false;
2683}
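// Example of the widening above (illustrative): a G_LOAD of <3 x s32> has a
// 96-bit memory type, which is widened to 128 bits. The value is loaded as
// <4 x s32> and the original result is recovered with G_EXTRACT, while an
// awkward type such as <3 x s16> instead drops the trailing element of the
// widened <4 x s16> load.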
2684
2687 MachineIRBuilder &B) const {
2688 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2689 assert(Ty.isScalar());
2690
2691 MachineFunction &MF = B.getMF();
2693
2694 // TODO: Always legal with future ftz flag.
2695 // FIXME: Do we need just output?
2696 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2697 return true;
2698 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2699 return true;
2700
2701 MachineIRBuilder HelperBuilder(MI);
2702 GISelObserverWrapper DummyObserver;
2703 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2704 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2705}
2706
2709 Register DstReg = MI.getOperand(0).getReg();
2710 Register PtrReg = MI.getOperand(1).getReg();
2711 Register CmpVal = MI.getOperand(2).getReg();
2712 Register NewVal = MI.getOperand(3).getReg();
2713
2714 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2715 "this should not have been custom lowered");
2716
2717 LLT ValTy = MRI.getType(CmpVal);
2718 LLT VecTy = LLT::fixed_vector(2, ValTy);
2719
2720 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2721
2722 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2723 .addDef(DstReg)
2724 .addUse(PtrReg)
2725 .addUse(PackedVal)
2726 .setMemRefs(MI.memoperands());
2727
2728 MI.eraseFromParent();
2729 return true;
2730}
2731
2732 bool AMDGPULegalizerInfo::legalizeFlog(
2733     MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2734 Register Dst = MI.getOperand(0).getReg();
2735 Register Src = MI.getOperand(1).getReg();
2736 LLT Ty = B.getMRI()->getType(Dst);
2737 unsigned Flags = MI.getFlags();
2738
2739 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2740 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2741
2742 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2743 MI.eraseFromParent();
2744 return true;
2745}
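// The change-of-base identity behind Log2BaseInverted (explanatory):
//   log_b(x) = log2(x) * (1 / log2(b)) = log2(x) * (ln(2) / ln(b))
// so G_FLOG passes ln(2) and G_FLOG10 passes ln(2)/ln(10).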
2746
2747 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2748                                        MachineIRBuilder &B) const {
2749 Register Dst = MI.getOperand(0).getReg();
2750 Register Src = MI.getOperand(1).getReg();
2751 unsigned Flags = MI.getFlags();
2752 LLT Ty = B.getMRI()->getType(Dst);
2753
2754 auto K = B.buildFConstant(Ty, numbers::log2e);
2755 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2756 B.buildFExp2(Dst, Mul, Flags);
2757 MI.eraseFromParent();
2758 return true;
2759}
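// Similarly, the exp lowering above relies on exp(x) = 2^(x * log2(e)), so
// the scaled input can be fed directly to the native G_FEXP2.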
2760
2762 MachineIRBuilder &B) const {
2763 Register Dst = MI.getOperand(0).getReg();
2764 Register Src0 = MI.getOperand(1).getReg();
2765 Register Src1 = MI.getOperand(2).getReg();
2766 unsigned Flags = MI.getFlags();
2767 LLT Ty = B.getMRI()->getType(Dst);
2768 const LLT S16 = LLT::scalar(16);
2769 const LLT S32 = LLT::scalar(32);
2770
2771 if (Ty == S32) {
2772 auto Log = B.buildFLog2(S32, Src0, Flags);
2773 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2774 .addUse(Log.getReg(0))
2775 .addUse(Src1)
2776 .setMIFlags(Flags);
2777 B.buildFExp2(Dst, Mul, Flags);
2778 } else if (Ty == S16) {
2779 // There's no f16 fmul_legacy, so we need to convert for it.
2780 auto Log = B.buildFLog2(S16, Src0, Flags);
2781 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2782 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2783 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2784 .addUse(Ext0.getReg(0))
2785 .addUse(Ext1.getReg(0))
2786 .setMIFlags(Flags);
2787
2788 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2789 } else
2790 return false;
2791
2792 MI.eraseFromParent();
2793 return true;
2794}
2795
2796// Find a source register, ignoring any possible source modifiers.
2797 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2798 Register ModSrc = OrigSrc;
2799 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2800 ModSrc = SrcFNeg->getOperand(1).getReg();
2801 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2802 ModSrc = SrcFAbs->getOperand(1).getReg();
2803 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2804 ModSrc = SrcFAbs->getOperand(1).getReg();
2805 return ModSrc;
2806}
2807
2810 MachineIRBuilder &B) const {
2811
2812 const LLT S1 = LLT::scalar(1);
2813 const LLT S64 = LLT::scalar(64);
2814 Register Dst = MI.getOperand(0).getReg();
2815 Register OrigSrc = MI.getOperand(1).getReg();
2816 unsigned Flags = MI.getFlags();
2817 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2818 "this should not have been custom lowered");
2819
2820 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2821 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2822 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2823 // V_FRACT bug is:
2824 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2825 //
2826 // Convert floor(x) to (x - fract(x))
2827
2828 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2829 .addUse(OrigSrc)
2830 .setMIFlags(Flags);
2831
2832 // Give source modifier matching some assistance before obscuring a foldable
2833 // pattern.
2834
2835 // TODO: We can avoid the neg on the fract? The input sign to fract
2836 // shouldn't matter?
2837 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2838
2839 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2840
2841 Register Min = MRI.createGenericVirtualRegister(S64);
2842
2843 // We don't need to concern ourselves with the snan handling difference, so
2844 // use the one which will directly select.
2845 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2846 if (MFI->getMode().IEEE)
2847 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2848 else
2849 B.buildFMinNum(Min, Fract, Const, Flags);
2850
2851 Register CorrectedFract = Min;
2852 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2853 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2854 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2855 }
2856
2857 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2858 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2859
2860 MI.eraseFromParent();
2861 return true;
2862}
2863
2864// Turn an illegal packed v2s16 build vector into bit operations.
2865// TODO: This should probably be a bitcast action in LegalizerHelper.
2866 bool AMDGPULegalizerInfo::legalizeBuildVector(
2867     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2868 Register Dst = MI.getOperand(0).getReg();
2869 const LLT S32 = LLT::scalar(32);
2870 const LLT S16 = LLT::scalar(16);
2871 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2872
2873 Register Src0 = MI.getOperand(1).getReg();
2874 Register Src1 = MI.getOperand(2).getReg();
2875
2876 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
2877 assert(MRI.getType(Src0) == S32);
2878 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
2879 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
2880 }
2881
2882 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
2883 B.buildBitcast(Dst, Merge);
2884
2885 MI.eraseFromParent();
2886 return true;
2887}
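// The merge + bitcast above is equivalent to packing the two halves into a
// 32-bit integer, conceptually (zext(Src1) << 16) | zext(Src0), and then
// reinterpreting that register as <2 x s16>.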
2888
2889// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2890//
2891// Source and accumulation registers must all be 32-bits.
2892//
2893// TODO: When the multiply is uniform, we should produce a code sequence
2894// that is better suited to instruction selection on the SALU. Instead of
2895// the outer loop going over parts of the result, the outer loop should go
2896// over parts of one of the factors. This should result in instruction
2897// selection that makes full use of S_ADDC_U32 instructions.
2901 bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2902 // Use (possibly empty) vectors of S1 registers to represent the set of
2903 // carries from one pair of positions to the next.
2904 using Carry = SmallVector<Register, 2>;
2905
2906 MachineIRBuilder &B = Helper.MIRBuilder;
2907
2908 const LLT S1 = LLT::scalar(1);
2909 const LLT S32 = LLT::scalar(32);
2910 const LLT S64 = LLT::scalar(64);
2911
2912 Register Zero32;
2913 Register Zero64;
2914
2915 auto getZero32 = [&]() -> Register {
2916 if (!Zero32)
2917 Zero32 = B.buildConstant(S32, 0).getReg(0);
2918 return Zero32;
2919 };
2920 auto getZero64 = [&]() -> Register {
2921 if (!Zero64)
2922 Zero64 = B.buildConstant(S64, 0).getReg(0);
2923 return Zero64;
2924 };
2925
2926 // Merge the given carries into the 32-bit LocalAccum, which is modified
2927 // in-place.
2928 //
2929 // Returns the carry-out, which is a single S1 register or null.
2930 auto mergeCarry =
2931 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2932 if (CarryIn.empty())
2933 return Register();
2934
2935 bool HaveCarryOut = true;
2936 Register CarryAccum;
2937 if (CarryIn.size() == 1) {
2938 if (!LocalAccum) {
2939 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2940 return Register();
2941 }
2942
2943 CarryAccum = getZero32();
2944 } else {
2945 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2946 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2947 CarryAccum =
2948 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2949 .getReg(0);
2950 }
2951
2952 if (!LocalAccum) {
2953 LocalAccum = getZero32();
2954 HaveCarryOut = false;
2955 }
2956 }
2957
2958 auto Add =
2959 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2960 LocalAccum = Add.getReg(0);
2961 return HaveCarryOut ? Add.getReg(1) : Register();
2962 };
2963
2964 // Build a multiply-add chain to compute
2965 //
2966 // LocalAccum + (partial products at DstIndex)
2967 // + (opportunistic subset of CarryIn)
2968 //
2969 // LocalAccum is an array of one or two 32-bit registers that are updated
2970 // in-place. The incoming registers may be null.
2971 //
2972 // In some edge cases, carry-ins can be consumed "for free". In that case,
2973 // the consumed carry bits are removed from CarryIn in-place.
2974 auto buildMadChain =
2975 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2976 -> Carry {
2977 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2978 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2979
2980 Carry CarryOut;
2981 unsigned j0 = 0;
2982
2983 // Use plain 32-bit multiplication for the most significant part of the
2984 // result by default.
2985 if (LocalAccum.size() == 1 &&
2986 (!UsePartialMad64_32 || !CarryIn.empty())) {
2987 do {
2988 unsigned j1 = DstIndex - j0;
2989 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
2990 if (!LocalAccum[0]) {
2991 LocalAccum[0] = Mul.getReg(0);
2992 } else {
2993 if (CarryIn.empty()) {
2994 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
2995 } else {
2996 LocalAccum[0] =
2997 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
2998 .getReg(0);
2999 CarryIn.pop_back();
3000 }
3001 }
3002 ++j0;
3003 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3004 }
3005
3006 // Build full 64-bit multiplies.
3007 if (j0 <= DstIndex) {
3008 bool HaveSmallAccum = false;
3009 Register Tmp;
3010
3011 if (LocalAccum[0]) {
3012 if (LocalAccum.size() == 1) {
3013 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3014 HaveSmallAccum = true;
3015 } else if (LocalAccum[1]) {
3016 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3017 HaveSmallAccum = false;
3018 } else {
3019 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3020 HaveSmallAccum = true;
3021 }
3022 } else {
3023 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3024 Tmp = getZero64();
3025 HaveSmallAccum = true;
3026 }
3027
3028 do {
3029 unsigned j1 = DstIndex - j0;
3030 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3031 {Src0[j0], Src1[j1], Tmp});
3032 Tmp = Mad.getReg(0);
3033 if (!HaveSmallAccum)
3034 CarryOut.push_back(Mad.getReg(1));
3035 HaveSmallAccum = false;
3036 ++j0;
3037 } while (j0 <= DstIndex);
3038
3039 auto Unmerge = B.buildUnmerge(S32, Tmp);
3040 LocalAccum[0] = Unmerge.getReg(0);
3041 if (LocalAccum.size() > 1)
3042 LocalAccum[1] = Unmerge.getReg(1);
3043 }
3044
3045 return CarryOut;
3046 };
3047
3048 // Outer multiply loop, iterating over destination parts from least
3049 // significant to most significant parts.
3050 //
3051 // The columns of the following diagram correspond to the destination parts
3052 // affected by one iteration of the outer loop (ignoring boundary
3053 // conditions).
3054 //
3055 // Dest index relative to 2 * i: 1 0 -1
3056 // ------
3057 // Carries from previous iteration: e o
3058 // Even-aligned partial product sum: E E .
3059 // Odd-aligned partial product sum: O O
3060 //
3061 // 'o' is OddCarry, 'e' is EvenCarry.
3062 // EE and OO are computed from partial products via buildMadChain and use
3063 // accumulation where possible and appropriate.
3064 //
3065 Register SeparateOddCarry;
3066 Carry EvenCarry;
3067 Carry OddCarry;
3068
3069 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3070 Carry OddCarryIn = std::move(OddCarry);
3071 Carry EvenCarryIn = std::move(EvenCarry);
3072 OddCarry.clear();
3073 EvenCarry.clear();
3074
3075 // Partial products at offset 2 * i.
3076 if (2 * i < Accum.size()) {
3077 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3078 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3079 }
3080
3081 // Partial products at offset 2 * i - 1.
3082 if (i > 0) {
3083 if (!SeparateOddAlignedProducts) {
3084 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3085 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3086 } else {
3087 bool IsHighest = 2 * i >= Accum.size();
3088 Register SeparateOddOut[2];
3089 auto LocalAccum = MutableArrayRef(SeparateOddOut)
3090 .take_front(IsHighest ? 1 : 2);
3091 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3092
3094
3095 if (i == 1) {
3096 if (!IsHighest)
3097 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3098 else
3099 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3100 } else {
3101 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3102 SeparateOddCarry);
3103 }
3104 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3105
3106 if (!IsHighest) {
3107 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3108 Lo->getOperand(1).getReg());
3109 Accum[2 * i] = Hi.getReg(0);
3110 SeparateOddCarry = Hi.getReg(1);
3111 }
3112 }
3113 }
3114
3115 // Add in the carries from the previous iteration
3116 if (i > 0) {
3117 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3118 EvenCarryIn.push_back(CarryOut);
3119
3120 if (2 * i < Accum.size()) {
3121 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3122 OddCarry.push_back(CarryOut);
3123 }
3124 }
3125 }
3126}
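// Worked example (descriptive, not from the original source): for a
// 64 x 64 -> 64 bit multiply split into 32-bit parts a1:a0 and b1:b0, the
// loop above produces, modulo 2^32,
//   Accum[0] = lo(a0*b0)
//   Accum[1] = hi(a0*b0) + lo(a0*b1) + lo(a1*b0)
// The a0*b0 product comes from a single MAD_64_32 with a zero accumulator;
// the cross terms are added either with plain 32-bit multiplies or with
// further MAD_64_32 ops, depending on the flags chosen by the caller, and
// carries out of the highest part are simply dropped.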
3127
3128// Custom narrowing of wide multiplies using wide multiply-add instructions.
3129//
3130// TODO: If the multiply is followed by an addition, we should attempt to
3131// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3132 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3133                                       MachineInstr &MI) const {
3134 assert(ST.hasMad64_32());
3135 assert(MI.getOpcode() == TargetOpcode::G_MUL);
3136
3137 MachineIRBuilder &B = Helper.MIRBuilder;
3138 MachineRegisterInfo &MRI = *B.getMRI();
3139
3140 Register DstReg = MI.getOperand(0).getReg();
3141 Register Src0 = MI.getOperand(1).getReg();
3142 Register Src1 = MI.getOperand(2).getReg();
3143
3144 LLT Ty = MRI.getType(DstReg);
3145 assert(Ty.isScalar());
3146
3147 unsigned Size = Ty.getSizeInBits();
3148 unsigned NumParts = Size / 32;
3149 assert((Size % 32) == 0);
3150 assert(NumParts >= 2);
3151
3152 // Whether to use MAD_64_32 for partial products whose high half is
3153 // discarded. This avoids some ADD instructions but risks false dependency
3154 // stalls on some subtargets in some cases.
3155 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3156
3157 // Whether to compute odd-aligned partial products separately. This is
3158 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3159 // in an even-aligned VGPR.
3160 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3161
3162 LLT S32 = LLT::scalar(32);
3163 SmallVector<Register, 2> Src0Parts, Src1Parts;
3164 for (unsigned i = 0; i < NumParts; ++i) {
3165 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3166 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3167 }
3168 B.buildUnmerge(Src0Parts, Src0);
3169 B.buildUnmerge(Src1Parts, Src1);
3170
3171 SmallVector<Register, 2> AccumRegs(NumParts);
3172 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3173 SeparateOddAlignedProducts);
3174
3175 B.buildMergeLikeInstr(DstReg, AccumRegs);
3176 MI.eraseFromParent();
3177 return true;
3178
3179}
3180
3181// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3182// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3183// case with a single min instruction instead of a compare+select.
3184 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3185                                             MachineRegisterInfo &MRI,
3186                                             MachineIRBuilder &B) const {
3187 Register Dst = MI.getOperand(0).getReg();
3188 Register Src = MI.getOperand(1).getReg();
3189 LLT DstTy = MRI.getType(Dst);
3190 LLT SrcTy = MRI.getType(Src);
3191
3192 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3193 ? AMDGPU::G_AMDGPU_FFBH_U32
3194 : AMDGPU::G_AMDGPU_FFBL_B32;
3195 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3196 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3197
3198 MI.eraseFromParent();
3199 return true;
3200}
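// This works because the AMDGPU ffbh/ffbl operations return -1 (all ones) for
// a zero input; taking the unsigned minimum with the source bit width
// therefore yields the well-defined G_CTLZ/G_CTTZ result for zero without a
// separate compare and select.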
3201
3202// Check that this is a G_XOR x, -1
3203static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3204 if (MI.getOpcode() != TargetOpcode::G_XOR)
3205 return false;
3206 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3207 return ConstVal && *ConstVal == -1;
3208}
3209
3210// Return the use branch instruction, otherwise null if the usage is invalid.
3211static MachineInstr *
3213 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3214 Register CondDef = MI.getOperand(0).getReg();
3215 if (!MRI.hasOneNonDBGUse(CondDef))
3216 return nullptr;
3217
3218 MachineBasicBlock *Parent = MI.getParent();
3219 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3220
3221 if (isNot(MRI, *UseMI)) {
3222 Register NegatedCond = UseMI->getOperand(0).getReg();
3223 if (!MRI.hasOneNonDBGUse(NegatedCond))
3224 return nullptr;
3225
3226 // We're deleting the def of this value, so we need to remove it.
3227 eraseInstr(*UseMI, MRI);
3228
3229 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3230 Negated = true;
3231 }
3232
3233 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3234 return nullptr;
3235
3236 // Make sure the cond br is followed by a G_BR, or is the last instruction.
3237 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3238 if (Next == Parent->end()) {
3239 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3240 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3241 return nullptr;
3242 UncondBrTarget = &*NextMBB;
3243 } else {
3244 if (Next->getOpcode() != AMDGPU::G_BR)
3245 return nullptr;
3246 Br = &*Next;
3247 UncondBrTarget = Br->getOperand(0).getMBB();
3248 }
3249
3250 return UseMI;
3251}
3252
3254 const ArgDescriptor *Arg,
3255 const TargetRegisterClass *ArgRC,
3256 LLT ArgTy) const {
3257 MCRegister SrcReg = Arg->getRegister();
3258 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3259 assert(DstReg.isVirtual() && "Virtual register expected");
3260
3261 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3262 *ArgRC, B.getDebugLoc(), ArgTy);
3263 if (Arg->isMasked()) {
3264 // TODO: Should we try to emit this once in the entry block?
3265 const LLT S32 = LLT::scalar(32);
3266 const unsigned Mask = Arg->getMask();
3267 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
3268
3269 Register AndMaskSrc = LiveIn;
3270
3271 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3272 // 0.
3273 if (Shift != 0) {
3274 auto ShiftAmt = B.buildConstant(S32, Shift);
3275 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3276 }
3277
3278 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3279 } else {
3280 B.buildCopy(DstReg, LiveIn);
3281 }
3282
3283 return true;
3284}
3285
3287 Register DstReg, MachineIRBuilder &B,
3289 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3290 const ArgDescriptor *Arg;
3291 const TargetRegisterClass *ArgRC;
3292 LLT ArgTy;
3293 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3294
3295 if (!Arg) {
3296 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3297 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3298 // case the pointer argument may be missing and we use null.
3299 B.buildConstant(DstReg, 0);
3300 return true;
3301 }
3302
3303 // It's undefined behavior if a function marked with the amdgpu-no-*
3304 // attributes uses the corresponding intrinsic.
3305 B.buildUndef(DstReg);
3306 return true;
3307 }
3308
3309 if (!Arg->isRegister() || !Arg->getRegister().isValid())
3310 return false; // TODO: Handle these
3311 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3312}
3313
3317 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3318 return false;
3319
3320 MI.eraseFromParent();
3321 return true;
3322}
3323
3325 int64_t C) {
3326 B.buildConstant(MI.getOperand(0).getReg(), C);
3327 MI.eraseFromParent();
3328 return true;
3329}
3330
3333 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3334 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3335 if (MaxID == 0)
3336 return replaceWithConstant(B, MI, 0);
3337
3338 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3339 const ArgDescriptor *Arg;
3340 const TargetRegisterClass *ArgRC;
3341 LLT ArgTy;
3342 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3343
3344 Register DstReg = MI.getOperand(0).getReg();
3345 if (!Arg) {
3346 // It's undefined behavior if a function marked with the amdgpu-no-*
3347 // attributes uses the corresponding intrinsic.
3348 B.buildUndef(DstReg);
3349 MI.eraseFromParent();
3350 return true;
3351 }
3352
3353 if (Arg->isMasked()) {
3354 // Don't bother inserting AssertZext for packed IDs since we're emitting the
3355 // masking operations anyway.
3356 //
3357 // TODO: We could assert the top bit is 0 for the source copy.
3358 if (!loadInputValue(DstReg, B, ArgType))
3359 return false;
3360 } else {
3361 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3362 if (!loadInputValue(TmpReg, B, ArgType))
3363 return false;
3364 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
3365 }
3366
3367 MI.eraseFromParent();
3368 return true;
3369}
3370
3372 int64_t Offset) const {
3374 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3375
3376 // TODO: If we passed in the base kernel offset we could have a better
3377 // alignment than 4, but we don't really need it.
3378 if (!loadInputValue(KernArgReg, B,
3380 llvm_unreachable("failed to find kernarg segment ptr");
3381
3382 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3383 // TODO: Should get nuw
3384 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3385}
3386
3387/// Legalize a value that's loaded from kernel arguments. This is only used by
3388/// legacy intrinsics.
3392 Align Alignment) const {
3393 Register DstReg = MI.getOperand(0).getReg();
3394
3395 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3396 "unexpected kernarg parameter type");
3397
3400 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3403 MI.eraseFromParent();
3404 return true;
3405}
3406
3409 MachineIRBuilder &B) const {
3410 Register Dst = MI.getOperand(0).getReg();
3411 LLT DstTy = MRI.getType(Dst);
3412 LLT S16 = LLT::scalar(16);
3413 LLT S32 = LLT::scalar(32);
3414 LLT S64 = LLT::scalar(64);
3415
3416 if (DstTy == S16)
3417 return legalizeFDIV16(MI, MRI, B);
3418 if (DstTy == S32)
3419 return legalizeFDIV32(MI, MRI, B);
3420 if (DstTy == S64)
3421 return legalizeFDIV64(MI, MRI, B);
3422
3423 return false;
3424}
3425
3427 Register DstDivReg,
3428 Register DstRemReg,
3429 Register X,
3430 Register Y) const {
3431 const LLT S1 = LLT::scalar(1);
3432 const LLT S32 = LLT::scalar(32);
3433
3434 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3435 // algorithm used here.
3436
3437 // Initial estimate of inv(y).
3438 auto FloatY = B.buildUITOFP(S32, Y);
3439 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3440 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3441 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3442 auto Z = B.buildFPTOUI(S32, ScaledY);
3443
3444 // One round of UNR.
3445 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3446 auto NegYZ = B.buildMul(S32, NegY, Z);
3447 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3448
3449 // Quotient/remainder estimate.
3450 auto Q = B.buildUMulH(S32, X, Z);
3451 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3452
3453 // First quotient/remainder refinement.
3454 auto One = B.buildConstant(S32, 1);
3455 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3456 if (DstDivReg)
3457 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3458 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3459
3460 // Second quotient/remainder refinement.
3461 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3462 if (DstDivReg)
3463 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3464
3465 if (DstRemReg)
3466 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3467}
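// Note on the refinement above (descriptive): Z approximates 2^32 / Y, and
// the update Z += umulh(Z, -Y*Z) is the "one round of UNR" mentioned in the
// code, i.e. a Newton-Raphson reciprocal step z' = z + z*(1 - y*z) carried
// out in 32-bit fixed point. The two conditional "R >= Y" corrections that
// follow adjust the quotient and remainder when the estimate comes out low.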
3468
3469// Build integer reciprocal sequence around V_RCP_IFLAG_F32
3470//
3471// Return lo, hi of result
3472//
3473// %cvt.lo = G_UITOFP Val.lo
3474// %cvt.hi = G_UITOFP Val.hi
3475// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3476// %rcp = G_AMDGPU_RCP_IFLAG %mad
3477// %mul1 = G_FMUL %rcp, 0x5f7ffffc
3478// %mul2 = G_FMUL %mul1, 2**(-32)
3479// %trunc = G_INTRINSIC_TRUNC %mul2
3480// %mad2 = G_FMAD %trunc, -(2**32), %mul1
3481// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3482static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3483 Register Val) {
3484 const LLT S32 = LLT::scalar(32);
3485 auto Unmerge = B.buildUnmerge(S32, Val);
3486
3487 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3488 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3489
3490 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3491 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3492
3493 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3494 auto Mul1 =
3495 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3496
3497 // 2**(-32)
3498 auto Mul2 =
3499 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3500 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3501
3502 // -(2**32)
3503 auto Mad2 = B.buildFMAD(S32, Trunc,
3504 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3505
3506 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3507 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3508
3509 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3510}
3511
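// For reference (not from the upstream source): the float bit patterns used
// above are
//   0x4f800000 = 2^32, 0x5f7ffffc = just below 2^64,
//   0x2f800000 = 2^-32, 0xcf800000 = -(2^32),
// so the sequence produces a ~64-bit fixed-point reciprocal estimate of Val.
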
3513 Register DstDivReg,
3514 Register DstRemReg,
3515 Register Numer,
3516 Register Denom) const {
3517 const LLT S32 = LLT::scalar(32);
3518 const LLT S64 = LLT::scalar(64);
3519 const LLT S1 = LLT::scalar(1);
3520 Register RcpLo, RcpHi;
3521
3522 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3523
3524 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
3525
3526 auto Zero64 = B.buildConstant(S64, 0);
3527 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3528
3529 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3530 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3531
3532 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3533 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3534 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3535
3536 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3537 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3538 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
3539
3540 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3541 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3542 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3543 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3544 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3545
3546 auto Zero32 = B.buildConstant(S32, 0);
3547 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3548 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3549 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
3550
3551 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3552 Register NumerLo = UnmergeNumer.getReg(0);
3553 Register NumerHi = UnmergeNumer.getReg(1);
3554
3555 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3556 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3557 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3558 Register Mul3_Lo = UnmergeMul3.getReg(0);
3559 Register Mul3_Hi = UnmergeMul3.getReg(1);
3560 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3561 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3562 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3563 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
3564
3565 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3566 Register DenomLo = UnmergeDenom.getReg(0);
3567 Register DenomHi = UnmergeDenom.getReg(1);
3568
3569 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3570 auto C1 = B.buildSExt(S32, CmpHi);
3571
3572 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3573 auto C2 = B.buildSExt(S32, CmpLo);
3574
3575 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3576 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3577
3578   // TODO: Here and below, portions of the code could be enclosed in if/endif.
3579   // Currently the control flow is unconditional, and we use 4 selects after
3580   // the potential endif as a substitute for PHIs.
3581
3582 // if C3 != 0 ...
3583 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3584 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3585 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3586 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
3587
3588 auto One64 = B.buildConstant(S64, 1);
3589 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3590
3591 auto C4 =
3592 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3593 auto C5 =
3594 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3595 auto C6 = B.buildSelect(
3596 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3597
3598 // if (C6 != 0)
3599 auto Add4 = B.buildAdd(S64, Add3, One64);
3600 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3601
3602 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3603 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3604 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
3605
3606 // endif C6
3607 // endif C3
3608
3609 if (DstDivReg) {
3610 auto Sel1 = B.buildSelect(
3611 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3612 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3613 Sel1, MulHi3);
3614 }
3615
3616 if (DstRemReg) {
3617 auto Sel2 = B.buildSelect(
3618 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3619 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3620 Sel2, Sub1);
3621 }
3622}
3623
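// Illustrative summary (not from the upstream source): the 64-bit expansion
// above takes the fixed-point reciprocal from emitReciprocalU64, refines it
// with two correction steps carried out in 32-bit halves (Add1, Add2), forms
// the quotient estimate MulHi3 = umulh(Numer, Add2) and its remainder Sub1,
// and then applies at most two conditional corrections (the C3 and C6 selects)
// to both the quotient and the remainder.
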
3626 MachineIRBuilder &B) const {
3627 Register DstDivReg, DstRemReg;
3628 switch (MI.getOpcode()) {
3629 default:
3630 llvm_unreachable("Unexpected opcode!");
3631 case AMDGPU::G_UDIV: {
3632 DstDivReg = MI.getOperand(0).getReg();
3633 break;
3634 }
3635 case AMDGPU::G_UREM: {
3636 DstRemReg = MI.getOperand(0).getReg();
3637 break;
3638 }
3639 case AMDGPU::G_UDIVREM: {
3640 DstDivReg = MI.getOperand(0).getReg();
3641 DstRemReg = MI.getOperand(1).getReg();
3642 break;
3643 }
3644 }
3645
3646 const LLT S64 = LLT::scalar(64);
3647 const LLT S32 = LLT::scalar(32);
3648 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3649 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3650 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3651 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3652
3653 if (Ty == S32)
3654 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3655 else if (Ty == S64)
3656 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3657 else
3658 return false;
3659
3660 MI.eraseFromParent();
3661 return true;
3662}
3663
3666 MachineIRBuilder &B) const {
3667 const LLT S64 = LLT::scalar(64);
3668 const LLT S32 = LLT::scalar(32);
3669
3670 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3671 if (Ty != S32 && Ty != S64)
3672 return false;
3673
3674 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3675 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3676 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3677
3678 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3679 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3680 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3681
3682 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3683 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3684
3685 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3686 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3687
3688 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3689 switch (MI.getOpcode()) {
3690 default:
3691 llvm_unreachable("Unexpected opcode!");
3692 case AMDGPU::G_SDIV: {
3693 DstDivReg = MI.getOperand(0).getReg();
3694 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3695 break;
3696 }
3697 case AMDGPU::G_SREM: {
3698 DstRemReg = MI.getOperand(0).getReg();
3699 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3700 break;
3701 }
3702 case AMDGPU::G_SDIVREM: {
3703 DstDivReg = MI.getOperand(0).getReg();
3704 DstRemReg = MI.getOperand(1).getReg();
3705 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3706 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3707 break;
3708 }
3709 }
3710
3711 if (Ty == S32)
3712 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3713 else
3714 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3715
3716 if (DstDivReg) {
3717 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3718 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3719 B.buildSub(DstDivReg, SignXor, Sign);
3720 }
3721
3722 if (DstRemReg) {
3723 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3724 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3725 B.buildSub(DstRemReg, SignXor, Sign);
3726 }
3727
3728 MI.eraseFromParent();
3729 return true;
3730}
3731
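// Illustrative host-side sketch (not part of the upstream legalizer) of the
// sign handling above, reusing the divRem32Sketch helper from the earlier
// sketch; the INT32_MIN corner cases are ignored, matching the wrap-around
// behaviour of the generated sequence.
static void sdivRem32Sketch(int32_t LHS, int32_t RHS, int32_t &Div, int32_t &Rem) {
  uint32_t LSign = (uint32_t)(LHS >> 31);           // 0 or 0xffffffff
  uint32_t RSign = (uint32_t)(RHS >> 31);
  // abs(x) computed as (x + sign) ^ sign, in wrapping unsigned arithmetic.
  uint32_t UL = ((uint32_t)LHS + LSign) ^ LSign;
  uint32_t UR = ((uint32_t)RHS + RSign) ^ RSign;
  uint32_t Q, R;
  divRem32Sketch(UL, UR, Q, R);                     // unsigned expansion
  uint32_t DSign = LSign ^ RSign;                   // quotient sign
  Div = (int32_t)((Q ^ DSign) - DSign);             // conditional negate
  Rem = (int32_t)((R ^ LSign) - LSign);             // remainder follows LHS
}
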
3734 MachineIRBuilder &B) const {
3735 Register Res = MI.getOperand(0).getReg();
3736 Register LHS = MI.getOperand(1).getReg();
3737 Register RHS = MI.getOperand(2).getReg();
3738 uint16_t Flags = MI.getFlags();
3739 LLT ResTy = MRI.getType(Res);
3740
3741 const MachineFunction &MF = B.getMF();
3742 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3743 MI.getFlag(MachineInstr::FmAfn);
3744
3745 if (!AllowInaccurateRcp)
3746 return false;
3747
3748 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3749 // 1 / x -> RCP(x)
3750 if (CLHS->isExactlyValue(1.0)) {
3751 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3752 .addUse(RHS)
3753 .setMIFlags(Flags);
3754
3755 MI.eraseFromParent();
3756 return true;
3757 }
3758
3759 // -1 / x -> RCP( FNEG(x) )
3760 if (CLHS->isExactlyValue(-1.0)) {
3761 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3762 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3763 .addUse(FNeg.getReg(0))
3764 .setMIFlags(Flags);
3765
3766 MI.eraseFromParent();
3767 return true;
3768 }
3769 }
3770
3771 // x / y -> x * (1.0 / y)
3772 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3773 .addUse(RHS)
3774 .setMIFlags(Flags);
3775 B.buildFMul(Res, LHS, RCP, Flags);
3776
3777 MI.eraseFromParent();
3778 return true;
3779}
3780
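// Illustrative note (not from the upstream source): with afn/unsafe-fp-math
// the rewrites above are
//    1.0 / y  ->  rcp(y)
//   -1.0 / y  ->  rcp(-y)
//     x / y   ->  x * rcp(y)
// trading a correctly rounded division for a single hardware reciprocal.
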
3783 MachineIRBuilder &B) const {
3784 Register Res = MI.getOperand(0).getReg();
3785 Register X = MI.getOperand(1).getReg();
3786 Register Y = MI.getOperand(2).getReg();
3787 uint16_t Flags = MI.getFlags();
3788 LLT ResTy = MRI.getType(Res);
3789
3790 const MachineFunction &MF = B.getMF();
3791 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3792 MI.getFlag(MachineInstr::FmAfn);
3793
3794 if (!AllowInaccurateRcp)
3795 return false;
3796
3797 auto NegY = B.buildFNeg(ResTy, Y);
3798 auto One = B.buildFConstant(ResTy, 1.0);
3799
3800 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3801 .addUse(Y)
3802 .setMIFlags(Flags);
3803
3804 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3805 R = B.buildFMA(ResTy, Tmp0, R, R);
3806
3807 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3808 R = B.buildFMA(ResTy, Tmp1, R, R);
3809
3810 auto Ret = B.buildFMul(ResTy, X, R);
3811 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3812
3813 B.buildFMA(Res, Tmp2, R, Ret);
3814 MI.eraseFromParent();
3815 return true;
3816}
3817
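// Illustrative note (not from the upstream source): the variant above refines
// r0 = rcp(y) with two FMA Newton-Raphson steps,
// r_(k+1) = fma(fma(-y, r_k, 1.0), r_k, r_k), then forms q = x * r and applies
// one residual correction, fma(fma(-y, q, x), r, q).
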
3820 MachineIRBuilder &B) const {
3822 return true;
3823
3824 Register Res = MI.getOperand(0).getReg();
3825 Register LHS = MI.getOperand(1).getReg();
3826 Register RHS = MI.getOperand(2).getReg();
3827
3828 uint16_t Flags = MI.getFlags();
3829
3830 LLT S16 = LLT::scalar(16);
3831 LLT S32 = LLT::scalar(32);
3832
3833 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3834 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3835
3836 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3837 .addUse(RHSExt.getReg(0))
3838 .setMIFlags(Flags);
3839
3840 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3841 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3842
3843 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3844 .addUse(RDst.getReg(0))
3845 .addUse(RHS)
3846 .addUse(LHS)
3847 .setMIFlags(Flags);
3848
3849 MI.eraseFromParent();
3850 return true;
3851}
3852
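// Illustrative note (not from the upstream source): the f16 division above is
// evaluated in f32 as fptrunc(fpext(x) * rcp(fpext(y))), with amdgcn.div.fixup
// performing the final special-case fixup on the f16 result.
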
3853// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3854// to enable denorm mode; when 'Enable' is false, emit instructions to disable it.
3857 const GCNSubtarget &ST,
3859 // Set SP denorm mode to this value.
3860 unsigned SPDenormMode =
3861 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3862
3863 if (ST.hasDenormModeInst()) {
3864 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3865 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3866
3867 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3868 B.buildInstr(AMDGPU::S_DENORM_MODE)
3869 .addImm(NewDenormModeValue);
3870
3871 } else {
3872 // Select FP32 bit field in mode register.
3873 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3876
3877 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3878 .addImm(SPDenormMode)
3879 .addImm(SPDenormModeBitField);
3880 }
3881}
3882
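// Illustrative note (not from the upstream source): on subtargets with
// S_DENORM_MODE the 4-bit immediate built above packs the FP32 denorm mode
// into bits [1:0] and the preserved FP64/FP16 mode into bits [3:2]; on older
// subtargets the FP32 field of the MODE register is written via S_SETREG
// instead.
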
3885 MachineIRBuilder &B) const {
3887 return true;
3888
3889 Register Res = MI.getOperand(0).getReg();
3890 Register LHS = MI.getOperand(1).getReg();
3891 Register RHS = MI.getOperand(2).getReg();
3892 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3894
3895 uint16_t Flags = MI.getFlags();
3896
3897 LLT S32 = LLT::scalar(32);
3898 LLT S1 = LLT::scalar(1);
3899
3900 auto One = B.buildFConstant(S32, 1.0f);
3901
3902 auto DenominatorScaled =
3903 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3904 .addUse(LHS)
3905 .addUse(RHS)
3906 .addImm(0)
3907 .setMIFlags(Flags);
3908 auto NumeratorScaled =
3909 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3910 .addUse(LHS)
3911 .addUse(RHS)
3912 .addImm(1)
3913 .setMIFlags(Flags);
3914
3915 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3916 .addUse(DenominatorScaled.getReg(0))
3917 .setMIFlags(Flags);
3918 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3919
3920 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3921 // aren't modeled as reading it.
3922 if (!Mode.allFP32Denormals())
3923 toggleSPDenormMode(true, B, ST, Mode);
3924
3925 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3926 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3927 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3928 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3929 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3930 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3931
3932 if (!Mode.allFP32Denormals())
3933 toggleSPDenormMode(false, B, ST, Mode);
3934
3935 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3936 .addUse(Fma4.getReg(0))
3937 .addUse(Fma1.getReg(0))
3938 .addUse(Fma3.getReg(0))
3939 .addUse(NumeratorScaled.getReg(1))
3940 .setMIFlags(Flags);
3941
3942 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3943 .addUse(Fmas.getReg(0))
3944 .addUse(RHS)
3945 .addUse(LHS)
3946 .setMIFlags(Flags);
3947
3948 MI.eraseFromParent();
3949 return true;
3950}
3951
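// Illustrative note (not from the upstream source): with d and n the
// div_scale'd denominator and numerator, the sequence above refines
// r0 = rcp(d) once (r1 = fma(fma(-d, r0, 1.0), r0, r0)), forms a quotient
// estimate q = n * r1, corrects it via its residual
// (q1 = fma(fma(-d, q, n), r1, q)), and lets div_fmas/div_fixup apply the
// final residual, rescaling, and special-case fixups; FP32 denormals are
// temporarily enabled around the FMA chain when the mode would flush them.
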
3954 MachineIRBuilder &B) const {
3956 return true;
3957
3958 Register Res = MI.getOperand(0).getReg();
3959 Register LHS = MI.getOperand(1).getReg();
3960 Register RHS = MI.getOperand(2).getReg();
3961
3962 uint16_t Flags = MI.getFlags();
3963
3964 LLT S64 = LLT::scalar(64);
3965 LLT S1 = LLT::scalar(1);
3966
3967 auto One = B.buildFConstant(S64, 1.0);
3968
3969 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3970 .addUse(LHS)
3971 .addUse(RHS)
3972 .addImm(0)
3973 .setMIFlags(Flags);
3974
3975 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3976
3977 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3978 .addUse(DivScale0.getReg(0))
3979 .setMIFlags(Flags);
3980
3981 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3982 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3983 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3984
3985 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3986 .addUse(LHS)
3987 .addUse(RHS)
3988 .addImm(1)
3989 .setMIFlags(Flags);
3990
3991 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3992 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3993 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3994
3995 Register Scale;
3997    // Work around a hardware bug on SI where the condition output from div_scale
3998 // is not usable.
3999
4000 LLT S32 = LLT::scalar(32);
4001
4002 auto NumUnmerge = B.buildUnmerge(S32, LHS);
4003 auto DenUnmerge = B.buildUnmerge(S32, RHS);
4004 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4005 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4006
4007 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4008 Scale1Unmerge.getReg(1));
4009 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4010 Scale0Unmerge.getReg(1));
4011 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4012 } else {
4013 Scale = DivScale1.getReg(1);
4014 }
4015
4016 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4017 .addUse(Fma4.getReg(0))
4018 .addUse(Fma3.getReg(0))
4019 .addUse(Mul.getReg(0))
4020 .addUse(Scale)
4021 .setMIFlags(Flags);
4022
4023 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
4024 .addUse(Fmas.getReg(0))
4025 .addUse(RHS)
4026 .addUse(LHS)
4027 .setMIFlags(Flags);
4028
4029 MI.eraseFromParent();
4030 return true;
4031}
4032
4035 MachineIRBuilder &B) const {
4036 Register Res = MI.getOperand(0).getReg();
4037 Register LHS = MI.getOperand(2).getReg();
4038 Register RHS = MI.getOperand(3).getReg();
4039 uint16_t Flags = MI.getFlags();
4040
4041 LLT S32 = LLT::scalar(32);
4042 LLT S1 = LLT::scalar(1);
4043
4044 auto Abs = B.buildFAbs(S32, RHS, Flags);
4045 const APFloat C0Val(1.0f);
4046
4047 auto C0 = B.buildConstant(S32, 0x6f800000);
4048 auto C1 = B.buildConstant(S32, 0x2f800000);
4049 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4050
4051 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4052 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4053
4054 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4055
4056 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4057 .addUse(Mul0.getReg(0))
4058 .setMIFlags(Flags);
4059
4060 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4061
4062 B.buildFMul(Res, Sel, Mul1, Flags);
4063
4064 MI.eraseFromParent();
4065 return true;
4066}
4067
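// Illustrative note (not from the upstream source): in the fdiv.fast expansion
// above, 0x6f800000 is 2^96 and 0x2f800000 is 2^-32; when |y| > 2^96 the
// denominator is pre-scaled by 2^-32 so its reciprocal cannot underflow, and
// the quotient is multiplied by the same scale factor afterwards to
// compensate.
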
4068// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4069// FIXME: Why do we handle this one but not other removed instructions?
4070//
4071// Reciprocal square root. The clamp prevents infinite results, clamping
4072// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
4073// +-max_float.
4076 MachineIRBuilder &B) const {
4078 return true;
4079
4080 Register Dst = MI.getOperand(0).getReg();
4081 Register Src = MI.getOperand(2).getReg();
4082 auto Flags = MI.getFlags();
4083
4084 LLT Ty = MRI.getType(Dst);
4085
4086 const fltSemantics *FltSemantics;
4087 if (Ty == LLT::scalar(32))
4088 FltSemantics = &APFloat::IEEEsingle();
4089 else if (Ty == LLT::scalar(64))
4090 FltSemantics = &APFloat::IEEEdouble();
4091 else
4092 return false;
4093
4094 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4095 .addUse(Src)
4096 .setMIFlags(Flags);
4097
4098  // We don't need to concern ourselves with the sNaN handling difference, since
4099  // the rsq result is quieted (or not) either way; use the variant that selects directly.
4100 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4101 const bool UseIEEE = MFI->getMode().IEEE;
4102
4103 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4104 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4105 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4106
4107 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4108
4109 if (UseIEEE)
4110 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4111 else
4112 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4113 MI.eraseFromParent();
4114 return true;
4115}
4116
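// Illustrative note (not from the upstream source): the rsq.clamp expansion
// above computes fmaxnum(fminnum(rsq(x), +max_float), -max_float), selecting
// the IEEE variants of min/max when the function runs in IEEE mode.
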
4118 switch (IID) {
4119 case Intrinsic::amdgcn_ds_fadd:
4120 return AMDGPU::G_ATOMICRMW_FADD;
4121 case Intrinsic::amdgcn_ds_fmin:
4122 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4123 case Intrinsic::amdgcn_ds_fmax:
4124 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4125 default:
4126 llvm_unreachable("not a DS FP intrinsic");
4127 }
4128}
4129
4132 Intrinsic::ID IID) const {
4133 GISelChangeObserver &Observer = Helper.Observer;
4134 Observer.changingInstr(MI);
4135
4136 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4137
4138 // The remaining operands were used to set fields in the MemOperand on
4139 // construction.
4140 for (int I = 6; I > 3; --I)
4141 MI.removeOperand(I);
4142
4143 MI.removeOperand(1); // Remove the intrinsic ID.
4144 Observer.changedInstr(MI);
4145 return true;
4146}
4147
4150 MachineIRBuilder &B) const {
4154 LLT DstTy = MRI.getType(DstReg);
4155 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4156
4157 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4158 if (!loadInputValue(KernargPtrReg, B,
4160 return false;
4161
4162 // FIXME: This should be nuw
4163 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4164 return true;
4165}
4166
4169 MachineIRBuilder &B) const {
4170 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4171 if (!MFI->isEntryFunction()) {
4174 }
4175
4176 Register DstReg = MI.getOperand(0).getReg();
4177 if (!getImplicitArgPtr(DstReg, MRI, B))
4178 return false;
4179
4180 MI.eraseFromParent();
4181 return true;
4182}
4183
4186 MachineIRBuilder &B) const {
4187 Function &F = B.getMF().getFunction();
4188 std::optional<uint32_t> KnownSize =
4190 if (KnownSize.has_value())
4191 B.buildConstant(DstReg, *KnownSize);
4192 return false;
4193}
4194
4197 MachineIRBuilder &B) const {
4198
4199 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4200 if (!MFI->isEntryFunction()) {
4203 }
4204
4205 Register DstReg = MI.getOperand(0).getReg();
4206 if (!getLDSKernelId(DstReg, MRI, B))
4207 return false;
4208
4209 MI.eraseFromParent();
4210 return true;
4211}
4212
4216 unsigned AddrSpace) const {
4217 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4218 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4219 Register Hi32 = Unmerge.getReg(1);
4220
4221 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4222 MI.eraseFromParent();
4223 return true;
4224}
4225
4226// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4227// offset (the offset that is included in bounds checking and swizzling, to be
4228// split between the instruction's voffset and immoffset fields) and soffset
4229// (the offset that is excluded from bounds checking and swizzling, to go in
4230// the instruction's soffset field). This function takes the first kind of
4231// offset and figures out how to split it between voffset and immoffset.
4232std::pair<Register, unsigned>
4234 Register OrigOffset) const {
4235 const unsigned MaxImm = 4095;
4236 Register BaseReg;
4237 unsigned ImmOffset;
4238 const LLT S32 = LLT::scalar(32);
4239 MachineRegisterInfo &MRI = *B.getMRI();
4240
4241 std::tie(BaseReg, ImmOffset) =
4243
4244 // If BaseReg is a pointer, convert it to int.
4245 if (MRI.getType(BaseReg).isPointer())
4246 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4247
4248  // If the immediate value is too big for the immoffset field, keep only the
4249  // low 12 bits (value & 4095) in the immoffset field, so that the part that is
4250  // copied/added into the voffset field is a multiple of 4096 and stands more
4251  // chance of being CSEd with the copy/add for another similar load/store.
4252  // However, do not do that rounding down to a multiple of 4096 if the rounded
4253  // part is negative, as it appears to be illegal to have a negative offset
4254  // in the vgpr, even if adding the immediate offset makes it positive.
4255 unsigned Overflow = ImmOffset & ~MaxImm;
4256 ImmOffset -= Overflow;
4257 if ((int32_t)Overflow < 0) {
4258 Overflow += ImmOffset;
4259 ImmOffset = 0;
4260 }
4261
4262 if (Overflow != 0) {
4263 if (!BaseReg) {
4264 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4265 } else {
4266 auto OverflowVal = B.buildConstant(S32, Overflow);
4267 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4268 }
4269 }
4270
4271 if (!BaseReg)
4272 BaseReg = B.buildConstant(S32, 0).getReg(0);
4273
4274 return std::pair(BaseReg, ImmOffset);
4275}
4276
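// Illustrative host-side sketch (not part of the upstream legalizer) of how
// the logic above splits a known constant offset between the voffset register
// part and the 12-bit immoffset field.
static void splitOffsetSketch(unsigned Offset, unsigned &VOffPart, unsigned &ImmPart) {
  const unsigned MaxImm = 4095;
  ImmPart = Offset;
  VOffPart = ImmPart & ~MaxImm;   // the multiple-of-4096 part goes to voffset
  ImmPart -= VOffPart;            // the low 12 bits stay in the immediate
  if ((int)VOffPart < 0) {        // don't leave a negative value in the vgpr
    VOffPart += ImmPart;
    ImmPart = 0;
  }
}
// For example, Offset = 5000 splits into VOffPart = 4096 and ImmPart = 904,
// while Offset = 100 splits into VOffPart = 0 and ImmPart = 100.
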
4277/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4279 Register VOffset, Register SOffset,
4280 unsigned ImmOffset, Register VIndex,
4281 MachineRegisterInfo &MRI) const {
4282 std::optional<ValueAndVReg> MaybeVOffsetVal =
4284 std::optional<ValueAndVReg> MaybeSOffsetVal =
4286 std::optional<ValueAndVReg> MaybeVIndexVal =
4288 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4289 // update the MMO with that offset. The stride is unknown so we can only do
4290 // this if VIndex is constant 0.
4291 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4292 MaybeVIndexVal->Value == 0) {
4293 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4294 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4295 MMO->setOffset(TotalOffset);
4296 } else {
4297 // We don't have a constant combined offset to use in the MMO. Give up.
4298 MMO->setValue((Value *)nullptr);
4299 }
4300}
4301
4302/// Handle register layout difference for f16 images for some subtargets.
4305 Register Reg,
4306 bool ImageStore) const {
4307 const LLT S16 = LLT::scalar(16);
4308 const LLT S32 = LLT::scalar(32);
4309 LLT StoreVT = MRI.getType(Reg);
4310 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4311
4312 if (ST.hasUnpackedD16VMem()) {
4313 auto Unmerge = B.buildUnmerge(S16, Reg);
4314
4315 SmallVector<Register, 4> WideRegs;
4316 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4317 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4318
4319 int NumElts = StoreVT.getNumElements();
4320
4321 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4322 .getReg(0);
4323 }
4324
4325 if (ImageStore && ST.hasImageStoreD16Bug()) {
4326 if (StoreVT.getNumElements() == 2) {
4327 SmallVector<Register, 4> PackedRegs;
4328 Reg = B.buildBitcast(S32, Reg).getReg(0);
4329 PackedRegs.push_back(Reg);
4330 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4331 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4332 .getReg(0);
4333 }
4334
4335 if (StoreVT.getNumElements() == 3) {
4336 SmallVector<Register, 4> PackedRegs;
4337 auto Unmerge = B.buildUnmerge(S16, Reg);
4338 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4339 PackedRegs.push_back(Unmerge.getReg(I));
4340 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4341 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4342 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4343 }
4344
4345 if (StoreVT.getNumElements() == 4) {
4346 SmallVector<Register, 4> PackedRegs;
4347 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4348 auto Unmerge = B.buildUnmerge(S32, Reg);
4349 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4350 PackedRegs.push_back(Unmerge.getReg(I));
4351 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4352 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4353 .getReg(0);
4354 }
4355
4356 llvm_unreachable("invalid data type");
4357 }
4358
4359 if (StoreVT == LLT::fixed_vector(3, S16)) {
4360 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4361 .getReg(0);
4362 }
4363 return Reg;
4364}
4365
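// Illustrative note (not from the upstream source): on unpacked-D16 subtargets
// a <4 x s16> store value becomes a <4 x s32> with every element any-extended
// into its own dword; on subtargets with the image-store D16 bug the elements
// are instead padded out to whole dwords (e.g. <3 x s16> -> <6 x s16> padded
// with undef, then bitcast to <3 x s32>).
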
4367 MachineIRBuilder &B, Register VData, bool IsFormat) const {
4368 MachineRegisterInfo *MRI = B.getMRI();
4369 LLT Ty = MRI->getType(VData);
4370
4371 const LLT S16 = LLT::scalar(16);
4372
4373 // Fixup illegal register types for i8 stores.
4374 if (Ty == LLT::scalar(8) || Ty == S16) {
4375 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4376 return AnyExt;
4377 }
4378
4379 if (Ty.isVector()) {
4380 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4381 if (IsFormat)
4382 return handleD16VData(B, *MRI, VData);
4383 }
4384 }
4385
4386 return VData;
4387}
4388
4392 bool IsTyped,
4393 bool IsFormat) const {
4394 Register VData = MI.getOperand(1).getReg();
4395 LLT Ty = MRI.getType(VData);
4396 LLT EltTy = Ty.getScalarType();
4397 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4398 const LLT S32 = LLT::scalar(32);
4399
4400 VData = fixStoreSourceType(B, VData, IsFormat);
4401 Register RSrc = MI.getOperand(2).getReg();
4402
4403 MachineMemOperand *MMO = *MI.memoperands_begin();
4404 const int MemSize = MMO->getSize();
4405
4406 unsigned ImmOffset;
4407
4408 // The typed intrinsics add an immediate after the registers.
4409 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4410
4411 // The struct intrinsic variants add one additional operand over raw.
4412 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4413 Register VIndex;
4414 int OpOffset = 0;
4415 if (HasVIndex) {
4416 VIndex = MI.getOperand(3).getReg();
4417 OpOffset = 1;
4418 } else {
4419 VIndex = B.buildConstant(S32, 0).getReg(0);
4420 }
4421
4422 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4423 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4424
4425 unsigned Format = 0;
4426 if (IsTyped) {
4427 Format = MI.getOperand(5 + OpOffset).getImm();
4428 ++OpOffset;
4429 }
4430
4431 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4432
4433 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4434 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4435
4436 unsigned Opc;
4437 if (IsTyped) {
4438 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4439 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4440 } else if (IsFormat) {
4441 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4442 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4443 } else {
4444 switch (MemSize) {
4445 case 1:
4446 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4447 break;
4448 case 2:
4449 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4450 break;
4451 default:
4452 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4453 break;
4454 }
4455 }
4456
4457 auto MIB = B.buildInstr(Opc)
4458 .addUse(VData) // vdata
4459 .addUse(RSrc) // rsrc
4460 .addUse(VIndex) // vindex
4461 .addUse(VOffset) // voffset
4462 .addUse(SOffset) // soffset
4463 .addImm(ImmOffset); // offset(imm)
4464
4465 if (IsTyped)
4466 MIB.addImm(Format);
4467
4468 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4469 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4470 .addMemOperand(MMO);
4471
4472 MI.eraseFromParent();
4473 return true;
4474}
4475
4476static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
4477 Register VIndex, Register VOffset, Register SOffset,
4478 unsigned ImmOffset, unsigned Format,
4479 unsigned AuxiliaryData, MachineMemOperand *MMO,
4480 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
4481 auto MIB = B.buildInstr(Opc)
4482 .addDef(LoadDstReg) // vdata
4483 .addUse(RSrc) // rsrc
4484 .addUse(VIndex) // vindex
4485 .addUse(VOffset) // voffset
4486 .addUse(SOffset) // soffset
4487 .addImm(ImmOffset); // offset(imm)
4488
4489 if (IsTyped)
4490 MIB.addImm(Format);
4491
4492 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4493 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4494 .addMemOperand(MMO);
4495}
4496
4500 bool IsFormat,
4501 bool IsTyped) const {
4502 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4503 MachineMemOperand *MMO = *MI.memoperands_begin();
4504 const LLT MemTy = MMO->getMemoryType();
4505 const LLT S32 = LLT::scalar(32);
4506
4507 Register Dst = MI.getOperand(0).getReg();
4508
4509 Register StatusDst;
4510 int OpOffset = 0;
4511 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
4512 bool IsTFE = MI.getNumExplicitDefs() == 2;
4513 if (IsTFE) {
4514 StatusDst = MI.getOperand(1).getReg();
4515 ++OpOffset;
4516 }
4517
4518 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
4519
4520 // The typed intrinsics add an immediate after the registers.
4521 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4522
4523 // The struct intrinsic variants add one additional operand over raw.
4524 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
4525 Register VIndex;
4526 if (HasVIndex) {
4527 VIndex = MI.getOperand(3 + OpOffset).getReg();
4528 ++OpOffset;
4529 } else {
4530 VIndex = B.buildConstant(S32, 0).getReg(0);
4531 }
4532
4533 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4534 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4535
4536 unsigned Format = 0;
4537 if (IsTyped) {
4538 Format = MI.getOperand(5 + OpOffset).getImm();
4539 ++OpOffset;
4540 }
4541
4542 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4543 unsigned ImmOffset;
4544
4545 LLT Ty = MRI.getType(Dst);
4546 LLT EltTy = Ty.getScalarType();
4547 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4548 const bool Unpacked = ST.hasUnpackedD16VMem();
4549
4550 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4551 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4552
4553 unsigned Opc;
4554
4555 // TODO: Support TFE for typed and narrow loads.
4556 if (IsTyped) {
4557 if (IsTFE)
4558 return false;
4559 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4560 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4561 } else if (IsFormat) {
4562 if (IsD16) {
4563 if (IsTFE)
4564 return false;
4565 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
4566 } else {
4567 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
4568 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4569 }
4570 } else {
4571 if (IsTFE)
4572 return false;
4573 switch (MemTy.getSizeInBits()) {
4574 case 8:
4575 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4576 break;
4577 case 16:
4578 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4579 break;
4580 default:
4581 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4582 break;
4583 }
4584 }
4585
4586 if (IsTFE) {
4587 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
4588 unsigned NumLoadDWords = NumValueDWords + 1;
4589 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
4590 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
4591 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4592 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4593 if (NumValueDWords == 1) {
4594 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
4595 } else {
4596 SmallVector<Register, 5> LoadElts;
4597 for (unsigned I = 0; I != NumValueDWords; ++I)
4598 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
4599 LoadElts.push_back(StatusDst);
4600 B.buildUnmerge(LoadElts, LoadDstReg);
4601 LoadElts.truncate(NumValueDWords);
4602 B.buildMergeLikeInstr(Dst, LoadElts);
4603 }
4604 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
4605 (IsD16 && !Ty.isVector())) {
4606 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4607 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4608 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4609 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4610 B.buildTrunc(Dst, LoadDstReg);
4611 } else if (Unpacked && IsD16 && Ty.isVector()) {
4612 LLT UnpackedTy = Ty.changeElementSize(32);
4613 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4614 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4615 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4616 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4617 // FIXME: G_TRUNC should work, but legalization currently fails
4618 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4620 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4621 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4622 B.buildMergeLikeInstr(Dst, Repack);
4623 } else {
4624 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
4625 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4626 }
4627
4628 MI.eraseFromParent();
4629 return true;
4630}
4631
4634 bool IsInc) const {
4635 unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP :
4636 AMDGPU::G_ATOMICRMW_UDEC_WRAP;
4637 B.buildInstr(Opc)
4638 .addDef(MI.getOperand(0).getReg())
4639 .addUse(MI.getOperand(2).getReg())
4640 .addUse(MI.getOperand(3).getReg())
4641 .cloneMemRefs(MI);
4642 MI.eraseFromParent();
4643 return true;
4644}
4645
4646static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4647 switch (IntrID) {
4648 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4649 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4650 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4651 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4652 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4653 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4654 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4655 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4656 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4657 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4658 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4659 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4660 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4661 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4662 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4663 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4664 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4665 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4666 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4667 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4668 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4669 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4670 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4671 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4672 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4673 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4674 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4675 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4676 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4677 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4678 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4679 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4680 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4681 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4682 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4683 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4684 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4685 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4686 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4687 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4688 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4689 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4690 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4691 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4692 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4693 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4694 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4695 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4696 default:
4697 llvm_unreachable("unhandled atomic opcode");
4698 }
4699}
4700
4703 Intrinsic::ID IID) const {
4704 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4705 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4706 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4707
4708 Register Dst;
4709
4710 int OpOffset = 0;
4711 if (HasReturn) {
4712 // A few FP atomics do not support return values.
4713 Dst = MI.getOperand(0).getReg();
4714 } else {
4715 OpOffset = -1;
4716 }
4717
4718 Register VData = MI.getOperand(2 + OpOffset).getReg();
4719 Register CmpVal;
4720
4721 if (IsCmpSwap) {
4722 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4723 ++OpOffset;
4724 }
4725
4726 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4727 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4728
4729 // The struct intrinsic variants add one additional operand over raw.
4730 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4731 Register VIndex;
4732 if (HasVIndex) {
4733 VIndex = MI.getOperand(4 + OpOffset).getReg();
4734 ++OpOffset;
4735 } else {
4736 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4737 }
4738
4739 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4740 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4741 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4742
4743 MachineMemOperand *MMO = *MI.memoperands_begin();
4744
4745 unsigned ImmOffset;
4746 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4747 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4748
4749 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4750
4751 if (HasReturn)
4752 MIB.addDef(Dst);
4753
4754 MIB.addUse(VData); // vdata
4755
4756 if (IsCmpSwap)
4757 MIB.addReg(CmpVal);
4758
4759 MIB.addUse(RSrc) // rsrc
4760 .addUse(VIndex) // vindex
4761 .addUse(VOffset) // voffset
4762 .addUse(SOffset) // soffset
4763 .addImm(ImmOffset) // offset(imm)
4764 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4765 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4766 .addMemOperand(MMO);
4767
4768 MI.eraseFromParent();
4769 return true;
4770}
4771
4772/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4773/// vector with s16 typed elements.
4775 SmallVectorImpl<Register> &PackedAddrs,
4776 unsigned ArgOffset,
4778 bool IsA16, bool IsG16) {
4779 const LLT S16 = LLT::scalar(16);
4780 const LLT V2S16 = LLT::fixed_vector(2, 16);
4781 auto EndIdx = Intr->VAddrEnd;
4782
4783 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4784 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4785 if (!SrcOp.isReg())
4786 continue; // _L to _LZ may have eliminated this.
4787
4788 Register AddrReg = SrcOp.getReg();
4789
4790 if ((I < Intr->GradientStart) ||
4791 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4792 (I >= Intr->CoordStart && !IsA16)) {
4793 if ((I < Intr->GradientStart) && IsA16 &&
4794 (B.getMRI()->getType(AddrReg) == S16)) {
4795 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4796        // Special handling of bias when A16 is on: the bias is of type half but
4797        // occupies a full 32-bit dword.
4798 PackedAddrs.push_back(
4799 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4800 .getReg(0));
4801 } else {
4802 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4803 "Bias needs to be converted to 16 bit in A16 mode");
4804 // Handle any gradient or coordinate operands that should not be packed
4805 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4806 PackedAddrs.push_back(AddrReg);
4807 }
4808 } else {
4809 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4810 // derivatives dx/dh and dx/dv are packed with undef.
4811 if (((I + 1) >= EndIdx) ||
4812 ((Intr->NumGradients / 2) % 2 == 1 &&
4813 (I == static_cast<unsigned>(Intr->GradientStart +
4814 (Intr->NumGradients / 2) - 1) ||
4815 I == static_cast<unsigned>(Intr->GradientStart +
4816 Intr->NumGradients - 1))) ||
4817 // Check for _L to _LZ optimization
4818 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4819 PackedAddrs.push_back(
4820 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4821 .getReg(0));
4822 } else {
4823 PackedAddrs.push_back(
4824 B.buildBuildVector(
4825 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4826 .getReg(0));
4827 ++I;
4828 }
4829 }
4830 }
4831}
4832
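// Illustrative note (not from the upstream source): for an A16/G16 2D sample
// the s16 coordinate pair (x, y) is packed into a single <2 x s16> vaddr
// dword; an odd trailing coordinate (and, in 1D, each derivative) is paired
// with undef, and a 16-bit bias operand is widened so it still occupies a
// full dword.
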
4833/// Convert from separate vaddr components to a single vector address register,
4834/// and replace the remaining operands with $noreg.
4836 int DimIdx, int NumVAddrs) {
4837 const LLT S32 = LLT::scalar(32);
4838 (void)S32;
4839 SmallVector<Register, 8> AddrRegs;
4840 for (int I = 0; I != NumVAddrs; ++I) {
4841 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4842 if (SrcOp.isReg()) {
4843 AddrRegs.push_back(SrcOp.getReg());
4844 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4845 }
4846 }
4847
4848 int NumAddrRegs = AddrRegs.size();
4849 if (NumAddrRegs != 1) {
4850 auto VAddr =
4851 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4852 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4853 }
4854
4855 for (int I = 1; I != NumVAddrs; ++I) {
4856 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4857 if (SrcOp.isReg())
4858 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4859 }
4860}
4861
4862/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4863///
4864/// Depending on the subtarget, load/store with 16-bit element data need to be
4865/// rewritten to use the low half of 32-bit registers, or directly use a packed
4866/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4867/// registers.
4868///
4869/// We don't want to directly select image instructions just yet, but also want
4870/// to expose all register repacking to the legalizer/combiners. We also don't
4871/// want a selected instruction entering RegBankSelect. In order to avoid
4872/// defining a multitude of intermediate image instructions, directly hack on
4873/// the intrinsic's arguments. In cases like a16 addresses, this requires
4874/// padding now-unnecessary arguments with $noreg.
4877 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4878
4879 const MachineFunction &MF = *MI.getMF();
4880 const unsigned NumDefs = MI.getNumExplicitDefs();
4881 const unsigned ArgOffset = NumDefs + 1;
4882 bool IsTFE = NumDefs == 2;
4883 // We are only processing the operands of d16 image operations on subtargets
4884 // that use the unpacked register layout, or that need to repack the TFE result.
4885
4886 // TODO: Do we need to guard against already legalized intrinsics?
4887 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4889
4890 MachineRegisterInfo *MRI = B.getMRI();
4891 const LLT S32 = LLT::scalar(32);
4892 const LLT S16 = LLT::scalar(16);
4893 const LLT V2S16 = LLT::fixed_vector(2, 16);
4894
4895 unsigned DMask = 0;
4896 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4897 LLT Ty = MRI->getType(VData);
4898
4899 // Check for 16-bit addresses and pack them if so.
4900 LLT GradTy =
4901 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4902 LLT AddrTy =
4903 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4904 const bool IsG16 = GradTy == S16;
4905 const bool IsA16 = AddrTy == S16;
4906 const bool IsD16 = Ty.getScalarType() == S16;
4907
4908 int DMaskLanes = 0;
4909 if (!BaseOpcode->Atomic) {
4910 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4911 if (BaseOpcode->Gather4) {
4912 DMaskLanes = 4;
4913 } else if (DMask != 0) {
4914 DMaskLanes = llvm::popcount(DMask);
4915 } else if (!IsTFE && !BaseOpcode->Store) {
4916 // If dmask is 0, this is a no-op load. This can be eliminated.
4917 B.buildUndef(MI.getOperand(0));
4918 MI.eraseFromParent();
4919 return true;
4920 }
4921 }
4922
4923 Observer.changingInstr(MI);
4924 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4925
4926 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4927 : AMDGPU::G_AMDGPU_INTRI