1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41  "amdgpu-global-isel-new-legality",
42  cl::desc("Use GlobalISel desired legality, rather than try to use"
43  "rules compatible with selection patterns"),
44  cl::init(false),
45  cl::ReallyHidden);
46 
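// MaxRegisterSize below is the widest value the legalizer keeps in registers:
// 1024 bits, i.e. a 32 x 32-bit register tuple.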
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements.
50 static LLT getPow2VectorType(LLT Ty) {
51  unsigned NElts = Ty.getNumElements();
52  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55 
56 // Round the number of bits to the next power of two bits.
57 static LLT getPow2ScalarType(LLT Ty) {
58  unsigned Bits = Ty.getSizeInBits();
59  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60  return LLT::scalar(Pow2Bits);
61 }
62 
63 /// \returns true if this is an odd-sized vector which should be widened by adding an
64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65 /// excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67  return [=](const LegalityQuery &Query) {
68  const LLT Ty = Query.Types[TypeIdx];
69  if (!Ty.isVector())
70  return false;
71 
72  const LLT EltTy = Ty.getElementType();
73  const unsigned EltSize = EltTy.getSizeInBits();
74  return Ty.getNumElements() % 2 != 0 &&
75  EltSize > 1 && EltSize < 32 &&
76  Ty.getSizeInBits() % 32 != 0;
77  };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81  return [=](const LegalityQuery &Query) {
82  const LLT Ty = Query.Types[TypeIdx];
83  return Ty.getSizeInBits() % 32 == 0;
84  };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88  return [=](const LegalityQuery &Query) {
89  const LLT Ty = Query.Types[TypeIdx];
90  const LLT EltTy = Ty.getScalarType();
91  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92  };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96  return [=](const LegalityQuery &Query) {
97  const LLT Ty = Query.Types[TypeIdx];
98  const LLT EltTy = Ty.getElementType();
99  return std::make_pair(TypeIdx,
100  LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101  };
102 }
103 
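// Shrink the element count so the vector splits into pieces of at most 64 bits,
// keeping the element type unchanged.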
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105  return [=](const LegalityQuery &Query) {
106  const LLT Ty = Query.Types[TypeIdx];
107  const LLT EltTy = Ty.getElementType();
108  unsigned Size = Ty.getSizeInBits();
109  unsigned Pieces = (Size + 63) / 64;
110  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111  return std::make_pair(
112  TypeIdx,
113  LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114  };
115 }
116 
117 // Increase the number of vector elements to reach the next multiple of 32-bit
118 // type.
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120  return [=](const LegalityQuery &Query) {
121  const LLT Ty = Query.Types[TypeIdx];
122 
123  const LLT EltTy = Ty.getElementType();
124  const int Size = Ty.getSizeInBits();
125  const int EltSize = EltTy.getSizeInBits();
126  const int NextMul32 = (Size + 31) / 32;
127 
128  assert(EltSize < 32);
129 
130  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131  return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132  };
133 }
134 
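// Pick an equivalently sized type to bitcast to: a single scalar for 32 bits or
// less, otherwise a vector of 32-bit elements.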
135 static LLT getBitcastRegisterType(const LLT Ty) {
136  const unsigned Size = Ty.getSizeInBits();
137 
138  if (Size <= 32) {
139  // <2 x s8> -> s16
140  // <4 x s8> -> s32
141  return LLT::scalar(Size);
142  }
143 
144  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148  return [=](const LegalityQuery &Query) {
149  const LLT Ty = Query.Types[TypeIdx];
150  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151  };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155  return [=](const LegalityQuery &Query) {
156  const LLT Ty = Query.Types[TypeIdx];
157  unsigned Size = Ty.getSizeInBits();
158  assert(Size % 32 == 0);
159  return std::make_pair(
160  TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161  };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165  return [=](const LegalityQuery &Query) {
166  const LLT QueryTy = Query.Types[TypeIdx];
167  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168  };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172  return [=](const LegalityQuery &Query) {
173  const LLT QueryTy = Query.Types[TypeIdx];
174  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175  };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179  return [=](const LegalityQuery &Query) {
180  const LLT QueryTy = Query.Types[TypeIdx];
181  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182  };
183 }
184 
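// A size fits the register file if it is a multiple of 32 bits and no wider than
// MaxRegisterSize (1024 bits).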
185 static bool isRegisterSize(unsigned Size) {
186  return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190  const int EltSize = EltTy.getSizeInBits();
191  return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195  const int EltSize = Ty.getElementType().getSizeInBits();
196  return EltSize == 32 || EltSize == 64 ||
197  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198  EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202  if (!isRegisterSize(Ty.getSizeInBits()))
203  return false;
204 
205  if (Ty.isVector())
206  return isRegisterVectorType(Ty);
207 
208  return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214  return [=](const LegalityQuery &Query) {
215  return isRegisterType(Query.Types[TypeIdx]);
216  };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220  return [=](const LegalityQuery &Query) {
221  const LLT QueryTy = Query.Types[TypeIdx];
222  if (!QueryTy.isVector())
223  return false;
224  const LLT EltTy = QueryTy.getElementType();
225  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226  };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232  return [=](const LegalityQuery &Query) {
233  const LLT Ty = Query.Types[TypeIdx];
234  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236  };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243  bool IsLoad) {
244  switch (AS) {
245  case AMDGPUAS::PRIVATE_ADDRESS:
246  // FIXME: Private element size.
247  return ST.enableFlatScratch() ? 128 : 32;
248  case AMDGPUAS::LOCAL_ADDRESS:
249  return ST.useDS128() ? 128 : 64;
250  case AMDGPUAS::GLOBAL_ADDRESS:
251  case AMDGPUAS::CONSTANT_ADDRESS:
252  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253  // Treat constant and global as identical. SMRD loads are sometimes usable for
254  // global loads (ideally constant address space should be eliminated)
255  // depending on the context. Legality cannot be context dependent, but
256  // RegBankSelect can split the load as necessary depending on the pointer
257  // register bank/uniformity and if the memory is invariant or not written in a
258  // kernel.
259  return IsLoad ? 512 : 128;
260  default:
261  // Flat addresses may contextually need to be split to 32-bit parts if they
262  // may alias scratch depending on the subtarget.
263  return 128;
264  }
265 }
266 
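// Return true if a load/store with the queried register type, memory size,
// address space and alignment can be selected without splitting or widening it.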
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268  const LegalityQuery &Query) {
269  const LLT Ty = Query.Types[0];
270 
271  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274  unsigned RegSize = Ty.getSizeInBits();
275  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277  unsigned AS = Query.Types[1].getAddressSpace();
278 
279  // All of these need to be custom lowered to cast the pointer operand.
280  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281  return false;
282 
283  // Do not handle extending vector loads.
284  if (Ty.isVector() && MemSize != RegSize)
285  return false;
286 
287  // TODO: We should be able to widen loads if the alignment is high enough, but
288  // we also need to modify the memory access size.
289 #if 0
290  // Accept widening loads based on alignment.
291  if (IsLoad && MemSize < Size)
292  MemSize = std::max(MemSize, Align);
293 #endif
294 
295  // Only 1-byte and 2-byte to 32-bit extloads are valid.
296  if (MemSize != RegSize && RegSize != 32)
297  return false;
298 
299  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300  return false;
301 
302  switch (MemSize) {
303  case 8:
304  case 16:
305  case 32:
306  case 64:
307  case 128:
308  break;
309  case 96:
310  if (!ST.hasDwordx3LoadStores())
311  return false;
312  break;
313  case 256:
314  case 512:
315  // These may contextually need to be broken down.
316  break;
317  default:
318  return false;
319  }
320 
321  assert(RegSize >= MemSize);
322 
323  if (AlignBits < MemSize) {
324  const SITargetLowering *TLI = ST.getTargetLowering();
325  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326  Align(AlignBits / 8)))
327  return false;
328  }
329 
330  return true;
331 }
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334 // work around this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will work around this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338  if (EnableNewLegality)
339  return false;
340 
341  const unsigned Size = Ty.getSizeInBits();
342  if (Size <= 64)
343  return false;
344  if (!Ty.isVector())
345  return true;
346 
347  LLT EltTy = Ty.getElementType();
348  if (EltTy.isPointer())
349  return true;
350 
351  unsigned EltSize = EltTy.getSizeInBits();
352  return EltSize != 32 && EltSize != 64;
353 }
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356  const LLT Ty = Query.Types[0];
357  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358  !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364  const LLT MemTy) {
365  const unsigned MemSizeInBits = MemTy.getSizeInBits();
366  const unsigned Size = Ty.getSizeInBits();
367  if (Size != MemSizeInBits)
368  return Size <= 32 && Ty.isVector();
369 
370  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371  return true;
372 
373  // Don't try to handle bitcasting vector ext loads for now.
374  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375  (Size <= 32 || isRegisterSize(Size)) &&
376  !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note this is the case where the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383  uint64_t AlignInBits, unsigned AddrSpace,
384  unsigned Opcode) {
385  unsigned SizeInBits = MemoryTy.getSizeInBits();
386  // We don't want to widen cases that are naturally legal.
387  if (isPowerOf2_32(SizeInBits))
388  return false;
389 
390  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391  // end up widening these for a scalar load during RegBankSelect, since there
392  // aren't 96-bit scalar loads.
393  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394  return false;
395 
396  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397  return false;
398 
399  // A load is known dereferenceable up to the alignment, so it's legal to widen
400  // to it.
401  //
402  // TODO: Could check dereferenceable for less aligned cases.
403  unsigned RoundedSize = NextPowerOf2(SizeInBits);
404  if (AlignInBits < RoundedSize)
405  return false;
406 
407  // Do not widen if it would introduce a slow unaligned load.
408  const SITargetLowering *TLI = ST.getTargetLowering();
409  bool Fast = false;
410  return TLI->allowsMisalignedMemoryAccessesImpl(
411  RoundedSize, AddrSpace, Align(AlignInBits / 8),
412  MachineMemOperand::MOLoad, &Fast) &&
413  Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417  unsigned Opcode) {
418  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419  return false;
420 
421  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422  Query.MMODescrs[0].AlignInBits,
423  Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
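// The constructor below builds the legalization rule table for every generic
// opcode; each rule chain is tried top to bottom. As a reading aid (not code from
// this file), a chain such as
//   getActionDefinitionsBuilder(G_ADD).legalFor({S32}).clampScalar(0, S32, S32).scalarize(0);
// marks 32-bit adds legal, clamps other scalars to 32 bits, and scalarizes vectors.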
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427  const GCNTargetMachine &TM)
428  : ST(ST_) {
429  using namespace TargetOpcode;
430 
431  auto GetAddrSpacePtr = [&TM](unsigned AS) {
432  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433  };
434 
435  const LLT S1 = LLT::scalar(1);
436  const LLT S8 = LLT::scalar(8);
437  const LLT S16 = LLT::scalar(16);
438  const LLT S32 = LLT::scalar(32);
439  const LLT S64 = LLT::scalar(64);
440  const LLT S128 = LLT::scalar(128);
441  const LLT S256 = LLT::scalar(256);
442  const LLT S512 = LLT::scalar(512);
443  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445  const LLT V2S8 = LLT::fixed_vector(2, 8);
446  const LLT V2S16 = LLT::fixed_vector(2, 16);
447  const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449  const LLT V2S32 = LLT::fixed_vector(2, 32);
450  const LLT V3S32 = LLT::fixed_vector(3, 32);
451  const LLT V4S32 = LLT::fixed_vector(4, 32);
452  const LLT V5S32 = LLT::fixed_vector(5, 32);
453  const LLT V6S32 = LLT::fixed_vector(6, 32);
454  const LLT V7S32 = LLT::fixed_vector(7, 32);
455  const LLT V8S32 = LLT::fixed_vector(8, 32);
456  const LLT V9S32 = LLT::fixed_vector(9, 32);
457  const LLT V10S32 = LLT::fixed_vector(10, 32);
458  const LLT V11S32 = LLT::fixed_vector(11, 32);
459  const LLT V12S32 = LLT::fixed_vector(12, 32);
460  const LLT V13S32 = LLT::fixed_vector(13, 32);
461  const LLT V14S32 = LLT::fixed_vector(14, 32);
462  const LLT V15S32 = LLT::fixed_vector(15, 32);
463  const LLT V16S32 = LLT::fixed_vector(16, 32);
464  const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466  const LLT V2S64 = LLT::fixed_vector(2, 64);
467  const LLT V3S64 = LLT::fixed_vector(3, 64);
468  const LLT V4S64 = LLT::fixed_vector(4, 64);
469  const LLT V5S64 = LLT::fixed_vector(5, 64);
470  const LLT V6S64 = LLT::fixed_vector(6, 64);
471  const LLT V7S64 = LLT::fixed_vector(7, 64);
472  const LLT V8S64 = LLT::fixed_vector(8, 64);
473  const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475  std::initializer_list<LLT> AllS32Vectors =
476  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478  std::initializer_list<LLT> AllS64Vectors =
479  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489  const LLT CodePtr = FlatPtr;
490 
491  const std::initializer_list<LLT> AddrSpaces64 = {
492  GlobalPtr, ConstantPtr, FlatPtr
493  };
494 
495  const std::initializer_list<LLT> AddrSpaces32 = {
496  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497  };
498 
499  const std::initializer_list<LLT> FPTypesBase = {
500  S32, S64
501  };
502 
503  const std::initializer_list<LLT> FPTypes16 = {
504  S32, S64, S16
505  };
506 
507  const std::initializer_list<LLT> FPTypesPK16 = {
508  S32, S64, S16, V2S16
509  };
510 
511  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513  // s1 for VCC branches, s32 for SCC branches.
514  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517  // elements for v3s16
518  getActionDefinitionsBuilder(G_PHI)
519  .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520  .legalFor(AllS32Vectors)
521  .legalFor(AllS64Vectors)
522  .legalFor(AddrSpaces64)
523  .legalFor(AddrSpaces32)
524  .legalIf(isPointer(0))
525  .clampScalar(0, S16, S256)
526  .widenScalarToNextPow2(0, 32)
527  .clampMaxNumElements(0, S32, 16)
528  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529  .scalarize(0);
530 
531  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532  // Full set of gfx9 features.
533  getActionDefinitionsBuilder({G_ADD, G_SUB})
534  .legalFor({S32, S16, V2S16})
535  .clampMaxNumElementsStrict(0, S16, 2)
536  .scalarize(0)
537  .minScalar(0, S16)
538  .widenScalarToNextMultipleOf(0, 32)
539  .maxScalar(0, S32);
540 
541  getActionDefinitionsBuilder(G_MUL)
542  .legalFor({S32, S16, V2S16})
543  .clampMaxNumElementsStrict(0, S16, 2)
544  .scalarize(0)
545  .minScalar(0, S16)
546  .widenScalarToNextMultipleOf(0, 32)
547  .custom();
548  assert(ST.hasMad64_32());
549 
550  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
551  .legalFor({S32, S16, V2S16}) // Clamp modifier
552  .minScalarOrElt(0, S16)
553  .clampMaxNumElementsStrict(0, S16, 2)
554  .scalarize(0)
555  .widenScalarToNextPow2(0, 32)
556  .lower();
557  } else if (ST.has16BitInsts()) {
558  getActionDefinitionsBuilder({G_ADD, G_SUB})
559  .legalFor({S32, S16})
560  .minScalar(0, S16)
561  .widenScalarToNextMultipleOf(0, 32)
562  .maxScalar(0, S32)
563  .scalarize(0);
564 
565  getActionDefinitionsBuilder(G_MUL)
566  .legalFor({S32, S16})
567  .scalarize(0)
568  .minScalar(0, S16)
569  .widenScalarToNextMultipleOf(0, 32)
570  .custom();
571  assert(ST.hasMad64_32());
572 
573  // Technically the saturating operations require clamp bit support, but this
574  // was introduced at the same time as 16-bit operations.
575  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576  .legalFor({S32, S16}) // Clamp modifier
577  .minScalar(0, S16)
578  .scalarize(0)
579  .widenScalarToNextPow2(0, 16)
580  .lower();
581 
582  // We're just lowering this, but it helps get a better result to try to
583  // coerce to the desired type first.
584  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
585  .minScalar(0, S16)
586  .scalarize(0)
587  .lower();
588  } else {
589  getActionDefinitionsBuilder({G_ADD, G_SUB})
590  .legalFor({S32})
591  .widenScalarToNextMultipleOf(0, 32)
592  .clampScalar(0, S32, S32)
593  .scalarize(0);
594 
595  auto &Mul = getActionDefinitionsBuilder(G_MUL)
596  .legalFor({S32})
597  .scalarize(0)
598  .minScalar(0, S32)
599  .widenScalarToNextMultipleOf(0, 32);
600 
601  if (ST.hasMad64_32())
602  Mul.custom();
603  else
604  Mul.maxScalar(0, S32);
605 
606  if (ST.hasIntClamp()) {
607  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
608  .legalFor({S32}) // Clamp modifier.
609  .scalarize(0)
610  .minScalarOrElt(0, S32)
611  .lower();
612  } else {
613  // Clamp bit support was added in VI, along with 16-bit operations.
614  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
615  .minScalar(0, S32)
616  .scalarize(0)
617  .lower();
618  }
619 
620  // FIXME: DAG expansion gets better results. The widening uses the smaller
621  // range values and goes for the min/max lowering directly.
622  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
623  .minScalar(0, S32)
624  .scalarize(0)
625  .lower();
626  }
627 
628  getActionDefinitionsBuilder(
629  {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
630  .customFor({S32, S64})
631  .clampScalar(0, S32, S64)
632  .widenScalarToNextPow2(0, 32)
633  .scalarize(0);
634 
635  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
636  .legalFor({S32})
637  .maxScalar(0, S32);
638 
639  if (ST.hasVOP3PInsts()) {
640  Mulh
641  .clampMaxNumElements(0, S8, 2)
642  .lowerFor({V2S8});
643  }
644 
645  Mulh
646  .scalarize(0)
647  .lower();
648 
649  // Report legal for any types we can handle anywhere. For the cases only legal
650  // on the SALU, RegBankSelect will be able to re-legalize.
651  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
652  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
653  .clampScalar(0, S32, S64)
657  .scalarize(0);
658 
659  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
660  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
661  .legalFor({{S32, S1}, {S32, S32}})
662  .minScalar(0, S32)
663  .scalarize(0)
664  .lower();
665 
666  getActionDefinitionsBuilder(G_BITCAST)
667  // Don't worry about the size constraint.
668  .legalIf(all(isRegisterType(0), isRegisterType(1)))
669  .lower();
670 
671 
672  getActionDefinitionsBuilder(G_CONSTANT)
673  .legalFor({S1, S32, S64, S16, GlobalPtr,
674  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
675  .legalIf(isPointer(0))
676  .clampScalar(0, S32, S64)
677  .widenScalarToNextPow2(0);
678 
679  getActionDefinitionsBuilder(G_FCONSTANT)
680  .legalFor({S32, S64, S16})
681  .clampScalar(0, S16, S64);
682 
683  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
684  .legalIf(isRegisterType(0))
685  // s1 and s16 are special cases because they have legal operations on
686  // them, but don't really occupy registers in the normal way.
687  .legalFor({S1, S16})
688  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689  .clampScalarOrElt(0, S32, MaxScalar)
690  .widenScalarToNextPow2(0, 32)
691  .clampMaxNumElements(0, S32, 16);
692 
693  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
694 
695  // If the amount is divergent, we have to do a wave reduction to get the
696  // maximum value, so this is expanded during RegBankSelect.
697  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
698  .legalFor({{PrivatePtr, S32}});
699 
700  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
701  .customIf(typeIsNot(0, PrivatePtr));
702 
703  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
704 
705  auto &FPOpActions = getActionDefinitionsBuilder(
706  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
707  .legalFor({S32, S64});
708  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
709  .customFor({S32, S64});
710  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
711  .customFor({S32, S64});
712 
713  if (ST.has16BitInsts()) {
714  if (ST.hasVOP3PInsts())
715  FPOpActions.legalFor({S16, V2S16});
716  else
717  FPOpActions.legalFor({S16});
718 
719  TrigActions.customFor({S16});
720  FDIVActions.customFor({S16});
721  }
722 
723  auto &MinNumMaxNum = getActionDefinitionsBuilder({
724  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
725 
726  if (ST.hasVOP3PInsts()) {
727  MinNumMaxNum.customFor(FPTypesPK16)
728  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
729  .clampMaxNumElements(0, S16, 2)
730  .clampScalar(0, S16, S64)
731  .scalarize(0);
732  } else if (ST.has16BitInsts()) {
733  MinNumMaxNum.customFor(FPTypes16)
734  .clampScalar(0, S16, S64)
735  .scalarize(0);
736  } else {
737  MinNumMaxNum.customFor(FPTypesBase)
738  .clampScalar(0, S32, S64)
739  .scalarize(0);
740  }
741 
742  if (ST.hasVOP3PInsts())
743  FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
744 
745  FPOpActions
746  .scalarize(0)
747  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
748 
749  TrigActions
750  .scalarize(0)
751  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
752 
753  FDIVActions
754  .scalarize(0)
755  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
756 
757  getActionDefinitionsBuilder({G_FNEG, G_FABS})
758  .legalFor(FPTypesPK16)
759  .clampMaxNumElementsStrict(0, S16, 2)
760  .scalarize(0)
761  .clampScalar(0, S16, S64);
762 
763  if (ST.has16BitInsts()) {
764  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
765  .legalFor({S32, S64, S16})
766  .scalarize(0)
767  .clampScalar(0, S16, S64);
768  } else {
769  getActionDefinitionsBuilder(G_FSQRT)
770  .legalFor({S32, S64})
771  .scalarize(0)
772  .clampScalar(0, S32, S64);
773 
774  if (ST.hasFractBug()) {
775  getActionDefinitionsBuilder(G_FFLOOR)
776  .customFor({S64})
777  .legalFor({S32, S64})
778  .scalarize(0)
779  .clampScalar(0, S32, S64);
780  } else {
781  getActionDefinitionsBuilder(G_FFLOOR)
782  .legalFor({S32, S64})
783  .scalarize(0)
784  .clampScalar(0, S32, S64);
785  }
786  }
787 
788  getActionDefinitionsBuilder(G_FPTRUNC)
789  .legalFor({{S32, S64}, {S16, S32}})
790  .scalarize(0)
791  .lower();
792 
793  getActionDefinitionsBuilder(G_FPEXT)
794  .legalFor({{S64, S32}, {S32, S16}})
795  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
796  .scalarize(0);
797 
798  auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
799  if (ST.has16BitInsts()) {
800  FSubActions
801  // Use actual fsub instruction
802  .legalFor({S32, S16})
803  // Must use fadd + fneg
804  .lowerFor({S64, V2S16});
805  } else {
806  FSubActions
807  // Use actual fsub instruction
808  .legalFor({S32})
809  // Must use fadd + fneg
810  .lowerFor({S64, S16, V2S16});
811  }
812 
813  FSubActions
814  .scalarize(0)
815  .clampScalar(0, S32, S64);
816 
817  // Whether this is legal depends on the floating point mode for the function.
818  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
819  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
820  FMad.customFor({S32, S16});
821  else if (ST.hasMadMacF32Insts())
822  FMad.customFor({S32});
823  else if (ST.hasMadF16())
824  FMad.customFor({S16});
825  FMad.scalarize(0)
826  .lower();
827 
828  auto &FRem = getActionDefinitionsBuilder(G_FREM);
829  if (ST.has16BitInsts()) {
830  FRem.customFor({S16, S32, S64});
831  } else {
832  FRem.minScalar(0, S32)
833  .customFor({S32, S64});
834  }
835  FRem.scalarize(0);
836 
837  // TODO: Do we need to clamp maximum bitwidth?
838  getActionDefinitionsBuilder(G_TRUNC)
839  .legalIf(isScalar(0))
840  .legalFor({{V2S16, V2S32}})
841  .clampMaxNumElements(0, S16, 2)
842  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
843  // situations (like an invalid implicit use), we don't want to infinite loop
844  // in the legalizer.
845  .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
846  .alwaysLegal();
847 
848  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
849  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
850  {S32, S1}, {S64, S1}, {S16, S1}})
851  .scalarize(0)
852  .clampScalar(0, S32, S64)
853  .widenScalarToNextPow2(1, 32);
854 
855  // TODO: Split s1->s64 during regbankselect for VALU.
856  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
857  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
858  .lowerIf(typeIs(1, S1))
859  .customFor({{S32, S64}, {S64, S64}});
860  if (ST.has16BitInsts())
861  IToFP.legalFor({{S16, S16}});
862  IToFP.clampScalar(1, S32, S64)
863  .minScalar(0, S32)
864  .scalarize(0)
865  .widenScalarToNextPow2(1);
866 
867  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
868  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
869  .customFor({{S64, S32}, {S64, S64}})
870  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
871  if (ST.has16BitInsts())
872  FPToI.legalFor({{S16, S16}});
873  else
874  FPToI.minScalar(1, S32);
875 
876  FPToI.minScalar(0, S32)
877  .widenScalarToNextPow2(0, 32)
878  .scalarize(0)
879  .lower();
880 
881  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
882  .customFor({S16, S32})
883  .scalarize(0)
884  .lower();
885 
886  // Lower roundeven into G_FRINT
887  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
888  .scalarize(0)
889  .lower();
890 
891  if (ST.has16BitInsts()) {
892  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
893  .legalFor({S16, S32, S64})
894  .clampScalar(0, S16, S64)
895  .scalarize(0);
896  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
897  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
898  .legalFor({S32, S64})
899  .clampScalar(0, S32, S64)
900  .scalarize(0);
901  } else {
902  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
903  .legalFor({S32})
904  .customFor({S64})
905  .clampScalar(0, S32, S64)
906  .scalarize(0);
907  }
908 
909  getActionDefinitionsBuilder(G_PTR_ADD)
910  .legalIf(all(isPointer(0), sameSize(0, 1)))
911  .scalarize(0)
912  .scalarSameSizeAs(1, 0);
913 
914  getActionDefinitionsBuilder(G_PTRMASK)
915  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
916  .scalarSameSizeAs(1, 0)
917  .scalarize(0);
918 
919  auto &CmpBuilder =
920  getActionDefinitionsBuilder(G_ICMP)
921  // The compare output type differs based on the register bank of the output,
922  // so make both s1 and s32 legal.
923  //
924  // Scalar compares producing output in scc will be promoted to s32, as that
925  // is the allocatable register type that will be needed for the copy from
926  // scc. This will be promoted during RegBankSelect, and we assume something
927  // before that won't try to use s32 result types.
928  //
929  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
930  // bank.
931  .legalForCartesianProduct(
932  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
933  .legalForCartesianProduct(
934  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
935  if (ST.has16BitInsts()) {
936  CmpBuilder.legalFor({{S1, S16}});
937  }
938 
939  CmpBuilder
940  .widenScalarToNextPow2(1)
941  .clampScalar(1, S32, S64)
942  .scalarize(0)
943  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
944 
945  getActionDefinitionsBuilder(G_FCMP)
946  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
947  .widenScalarToNextPow2(1)
948  .clampScalar(1, S32, S64)
949  .scalarize(0);
950 
951  // FIXME: fpow has a selection pattern that should move to custom lowering.
952  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
953  if (ST.has16BitInsts())
954  Exp2Ops.legalFor({S32, S16});
955  else
956  Exp2Ops.legalFor({S32});
957  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
958  Exp2Ops.scalarize(0);
959 
960  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
961  if (ST.has16BitInsts())
962  ExpOps.customFor({{S32}, {S16}});
963  else
964  ExpOps.customFor({S32});
965  ExpOps.clampScalar(0, MinScalarFPTy, S32)
966  .scalarize(0);
967 
968  getActionDefinitionsBuilder(G_FPOWI)
969  .clampScalar(0, MinScalarFPTy, S32)
970  .lower();
971 
972  // The 64-bit versions produce 32-bit results, but only on the SALU.
973  getActionDefinitionsBuilder(G_CTPOP)
974  .legalFor({{S32, S32}, {S32, S64}})
975  .clampScalar(0, S32, S32)
976  .widenScalarToNextPow2(1, 32)
977  .clampScalar(1, S32, S64)
978  .scalarize(0)
979  .widenScalarToNextPow2(0, 32);
980 
981 
982  // The hardware instructions return a different result on 0 than the generic
983  // instructions expect. The hardware produces -1, but these produce the
984  // bitwidth.
985  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
986  .scalarize(0)
987  .clampScalar(0, S32, S32)
988  .clampScalar(1, S32, S64)
989  .widenScalarToNextPow2(0, 32)
990  .widenScalarToNextPow2(1, 32)
991  .custom();
992 
993  // The 64-bit versions produce 32-bit results, but only on the SALU.
994  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
995  .legalFor({{S32, S32}, {S32, S64}})
996  .clampScalar(0, S32, S32)
997  .clampScalar(1, S32, S64)
998  .scalarize(0)
999  .widenScalarToNextPow2(0, 32)
1000  .widenScalarToNextPow2(1, 32);
1001 
1002  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1003  // RegBankSelect.
1004  getActionDefinitionsBuilder(G_BITREVERSE)
1005  .legalFor({S32, S64})
1006  .clampScalar(0, S32, S64)
1007  .scalarize(0)
1008  .widenScalarToNextPow2(0);
1009 
1010  if (ST.has16BitInsts()) {
1011  getActionDefinitionsBuilder(G_BSWAP)
1012  .legalFor({S16, S32, V2S16})
1013  .clampMaxNumElementsStrict(0, S16, 2)
1014  // FIXME: Fixing non-power-of-2 before clamp is workaround for
1015  // narrowScalar limitation.
1016  .widenScalarToNextPow2(0)
1017  .clampScalar(0, S16, S32)
1018  .scalarize(0);
1019 
1020  if (ST.hasVOP3PInsts()) {
1021  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1022  .legalFor({S32, S16, V2S16})
1023  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1024  .clampMaxNumElements(0, S16, 2)
1025  .minScalar(0, S16)
1026  .widenScalarToNextPow2(0)
1027  .scalarize(0)
1028  .lower();
1029  } else {
1030  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1031  .legalFor({S32, S16})
1032  .widenScalarToNextPow2(0)
1033  .minScalar(0, S16)
1034  .scalarize(0)
1035  .lower();
1036  }
1037  } else {
1038  // TODO: Should have same legality without v_perm_b32
1039  getActionDefinitionsBuilder(G_BSWAP)
1040  .legalFor({S32})
1041  .lowerIf(scalarNarrowerThan(0, 32))
1042  // FIXME: Fixing non-power-of-2 before clamp is workaround for
1043  // narrowScalar limitation.
1044  .widenScalarToNextPow2(0)
1045  .maxScalar(0, S32)
1046  .scalarize(0)
1047  .lower();
1048 
1049  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1050  .legalFor({S32})
1051  .minScalar(0, S32)
1052  .widenScalarToNextPow2(0)
1053  .scalarize(0)
1054  .lower();
1055  }
1056 
1057  getActionDefinitionsBuilder(G_INTTOPTR)
1058  // List the common cases
1059  .legalForCartesianProduct(AddrSpaces64, {S64})
1060  .legalForCartesianProduct(AddrSpaces32, {S32})
1061  .scalarize(0)
1062  // Accept any address space as long as the size matches
1063  .legalIf(sameSize(0, 1))
1064  .widenScalarIf(smallerThan(1, 0),
1065  [](const LegalityQuery &Query) {
1066  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1067  })
1068  .narrowScalarIf(largerThan(1, 0),
1069  [](const LegalityQuery &Query) {
1070  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1071  });
1072 
1073  getActionDefinitionsBuilder(G_PTRTOINT)
1074  // List the common cases
1075  .legalForCartesianProduct(AddrSpaces64, {S64})
1076  .legalForCartesianProduct(AddrSpaces32, {S32})
1077  .scalarize(0)
1078  // Accept any address space as long as the size matches
1079  .legalIf(sameSize(0, 1))
1080  .widenScalarIf(smallerThan(0, 1),
1081  [](const LegalityQuery &Query) {
1082  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1083  })
1084  .narrowScalarIf(
1085  largerThan(0, 1),
1086  [](const LegalityQuery &Query) {
1087  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1088  });
1089 
1090  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1091  .scalarize(0)
1092  .custom();
1093 
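// Helper used by the load/store rules below: decide whether a memory access must
// be split, either because it exceeds the per-address-space limit or because it
// does not break evenly into 32-bit registers.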
1094  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1095  bool IsLoad) -> bool {
1096  const LLT DstTy = Query.Types[0];
1097 
1098  // Split vector extloads.
1099  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1100 
1101  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1102  return true;
1103 
1104  const LLT PtrTy = Query.Types[1];
1105  unsigned AS = PtrTy.getAddressSpace();
1106  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1107  return true;
1108 
1109  // Catch weird sized loads that don't evenly divide into the access sizes
1110  // TODO: May be able to widen depending on alignment etc.
1111  unsigned NumRegs = (MemSize + 31) / 32;
1112  if (NumRegs == 3) {
1113  if (!ST.hasDwordx3LoadStores())
1114  return true;
1115  } else {
1116  // If the alignment allows, these should have been widened.
1117  if (!isPowerOf2_32(NumRegs))
1118  return true;
1119  }
1120 
1121  return false;
1122  };
1123 
1124  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1125  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1126  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1127 
1128  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1129  // LDS
1130  // TODO: Unsupported flat for SI.
1131 
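// Loads and stores share one rule set; the only store-specific difference is that
// stores skip the load-widening custom rule added under !IsStore below.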
1132  for (unsigned Op : {G_LOAD, G_STORE}) {
1133  const bool IsStore = Op == G_STORE;
1134 
1135  auto &Actions = getActionDefinitionsBuilder(Op);
1136  // Explicitly list some common cases.
1137  // TODO: Does this help compile time at all?
1138  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1139  {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1140  {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1141  {S64, GlobalPtr, S64, GlobalAlign32},
1142  {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1143  {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1144  {S32, GlobalPtr, S8, GlobalAlign8},
1145  {S32, GlobalPtr, S16, GlobalAlign16},
1146 
1147  {S32, LocalPtr, S32, 32},
1148  {S64, LocalPtr, S64, 32},
1149  {V2S32, LocalPtr, V2S32, 32},
1150  {S32, LocalPtr, S8, 8},
1151  {S32, LocalPtr, S16, 16},
1152  {V2S16, LocalPtr, S32, 32},
1153 
1154  {S32, PrivatePtr, S32, 32},
1155  {S32, PrivatePtr, S8, 8},
1156  {S32, PrivatePtr, S16, 16},
1157  {V2S16, PrivatePtr, S32, 32},
1158 
1159  {S32, ConstantPtr, S32, GlobalAlign32},
1160  {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1161  {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1162  {S64, ConstantPtr, S64, GlobalAlign32},
1163  {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1164  Actions.legalIf(
1165  [=](const LegalityQuery &Query) -> bool {
1166  return isLoadStoreLegal(ST, Query);
1167  });
1168 
1169  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1170  // 64-bits.
1171  //
1172  // TODO: Should generalize bitcast action into coerce, which will also cover
1173  // inserting addrspacecasts.
1174  Actions.customIf(typeIs(1, Constant32Ptr));
1175 
1176  // Turn any illegal element vectors into something easier to deal
1177  // with. These will ultimately produce 32-bit scalar shifts to extract the
1178  // parts anyway.
1179  //
1180  // For odd 16-bit element vectors, prefer to split those into pieces with
1181  // 16-bit vector parts.
1182  Actions.bitcastIf(
1183  [=](const LegalityQuery &Query) -> bool {
1184  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1185  Query.MMODescrs[0].MemoryTy);
1186  }, bitcastToRegisterType(0));
1187 
1188  if (!IsStore) {
1189  // Widen suitably aligned loads by loading extra bytes. The standard
1190  // legalization actions can't properly express widening memory operands.
1191  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1192  return shouldWidenLoad(ST, Query, G_LOAD);
1193  });
1194  }
1195 
1196  // FIXME: load/store narrowing should be moved to lower action
1197  Actions
1198  .narrowScalarIf(
1199  [=](const LegalityQuery &Query) -> bool {
1200  return !Query.Types[0].isVector() &&
1201  needToSplitMemOp(Query, Op == G_LOAD);
1202  },
1203  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1204  const LLT DstTy = Query.Types[0];
1205  const LLT PtrTy = Query.Types[1];
1206 
1207  const unsigned DstSize = DstTy.getSizeInBits();
1208  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1209 
1210  // Split extloads.
1211  if (DstSize > MemSize)
1212  return std::make_pair(0, LLT::scalar(MemSize));
1213 
1214  unsigned MaxSize = maxSizeForAddrSpace(ST,
1215  PtrTy.getAddressSpace(),
1216  Op == G_LOAD);
1217  if (MemSize > MaxSize)
1218  return std::make_pair(0, LLT::scalar(MaxSize));
1219 
1220  uint64_t Align = Query.MMODescrs[0].AlignInBits;
1221  return std::make_pair(0, LLT::scalar(Align));
1222  })
1223  .fewerElementsIf(
1224  [=](const LegalityQuery &Query) -> bool {
1225  return Query.Types[0].isVector() &&
1226  needToSplitMemOp(Query, Op == G_LOAD);
1227  },
1228  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1229  const LLT DstTy = Query.Types[0];
1230  const LLT PtrTy = Query.Types[1];
1231 
1232  LLT EltTy = DstTy.getElementType();
1233  unsigned MaxSize = maxSizeForAddrSpace(ST,
1234  PtrTy.getAddressSpace(),
1235  Op == G_LOAD);
1236 
1237  // FIXME: Handle widened to power of 2 results better. This ends
1238  // up scalarizing.
1239  // FIXME: 3 element stores scalarized on SI
1240 
1241  // Split if it's too large for the address space.
1242  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1243  if (MemSize > MaxSize) {
1244  unsigned NumElts = DstTy.getNumElements();
1245  unsigned EltSize = EltTy.getSizeInBits();
1246 
1247  if (MaxSize % EltSize == 0) {
1248  return std::make_pair(
1249  0, LLT::scalarOrVector(
1250  ElementCount::getFixed(MaxSize / EltSize), EltTy));
1251  }
1252 
1253  unsigned NumPieces = MemSize / MaxSize;
1254 
1255  // FIXME: Refine when odd breakdowns handled
1256  // The scalars will need to be re-legalized.
1257  if (NumPieces == 1 || NumPieces >= NumElts ||
1258  NumElts % NumPieces != 0)
1259  return std::make_pair(0, EltTy);
1260 
1261  return std::make_pair(
1262  0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1263  }
1264 
1265  // FIXME: We could probably handle weird extending loads better.
1266  if (DstTy.getSizeInBits() > MemSize)
1267  return std::make_pair(0, EltTy);
1268 
1269  unsigned EltSize = EltTy.getSizeInBits();
1270  unsigned DstSize = DstTy.getSizeInBits();
1271  if (!isPowerOf2_32(DstSize)) {
1272  // We're probably decomposing an odd sized store. Try to split
1273  // to the widest type. TODO: Account for alignment. As-is it
1274  // should be OK, since the new parts will be further legalized.
1275  unsigned FloorSize = PowerOf2Floor(DstSize);
1276  return std::make_pair(
1277  0, LLT::scalarOrVector(
1278  ElementCount::getFixed(FloorSize / EltSize), EltTy));
1279  }
1280 
1281  // May need relegalization for the scalars.
1282  return std::make_pair(0, EltTy);
1283  })
1284  .minScalar(0, S32)
1285  .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1286  .widenScalarToNextPow2(0)
1287  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1288  .lower();
1289  }
1290 
1291  // FIXME: Unaligned accesses not lowered.
1292  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1293  .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1294  {S32, GlobalPtr, S16, 2 * 8},
1295  {S32, LocalPtr, S8, 8},
1296  {S32, LocalPtr, S16, 16},
1297  {S32, PrivatePtr, S8, 8},
1298  {S32, PrivatePtr, S16, 16},
1299  {S32, ConstantPtr, S8, 8},
1300  {S32, ConstantPtr, S16, 2 * 8}})
1301  .legalIf(
1302  [=](const LegalityQuery &Query) -> bool {
1303  return isLoadStoreLegal(ST, Query);
1304  });
1305 
1306  if (ST.hasFlatAddressSpace()) {
1307  ExtLoads.legalForTypesWithMemDesc(
1308  {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1309  }
1310 
1311  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1312  // 64-bits.
1313  //
1314  // TODO: Should generalize bitcast action into coerce, which will also cover
1315  // inserting addrspacecasts.
1316  ExtLoads.customIf(typeIs(1, Constant32Ptr));
1317 
1318  ExtLoads.clampScalar(0, S32, S32)
1319  .widenScalarToNextPow2(0)
1320  .lower();
1321 
1322  auto &Atomics = getActionDefinitionsBuilder(
1323  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1324  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1325  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1326  G_ATOMICRMW_UMIN})
1327  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1328  {S64, GlobalPtr}, {S64, LocalPtr},
1329  {S32, RegionPtr}, {S64, RegionPtr}});
1330  if (ST.hasFlatAddressSpace()) {
1331  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1332  }
1333 
1334  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1335  if (ST.hasLDSFPAtomicAdd()) {
1336  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1337  if (ST.hasGFX90AInsts())
1338  Atomic.legalFor({{S64, LocalPtr}});
1339  if (ST.hasGFX940Insts())
1340  Atomic.legalFor({{V2S16, LocalPtr}});
1341  }
1342  if (ST.hasAtomicFaddInsts())
1343  Atomic.legalFor({{S32, GlobalPtr}});
1344 
1345  if (ST.hasGFX90AInsts()) {
1346  // These are legal with some caveats, and should have undergone expansion in
1347  // the IR in most situations
1348  // TODO: Move atomic expansion into legalizer
1349  // TODO: Also supports <2 x f16>
1350  Atomic.legalFor({
1351  {S32, GlobalPtr},
1352  {S64, GlobalPtr},
1353  {S64, FlatPtr}
1354  });
1355  }
1356 
1357  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1358  // demarshalling
1359  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1360  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1361  {S32, FlatPtr}, {S64, FlatPtr}})
1362  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1363  {S32, RegionPtr}, {S64, RegionPtr}});
1364  // TODO: Pointer types, any 32-bit or 64-bit vector
1365 
1366  // Condition should be s32 for scalar, s1 for vector.
1367  getActionDefinitionsBuilder(G_SELECT)
1368  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1369  LocalPtr, FlatPtr, PrivatePtr,
1370  LLT::fixed_vector(2, LocalPtr),
1371  LLT::fixed_vector(2, PrivatePtr)},
1372  {S1, S32})
1373  .clampScalar(0, S16, S64)
1374  .scalarize(1)
1375  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1376  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1377  .clampMaxNumElements(0, S32, 2)
1378  .clampMaxNumElements(0, LocalPtr, 2)
1379  .clampMaxNumElements(0, PrivatePtr, 2)
1380  .scalarize(0)
1381  .widenScalarToNextPow2(0)
1382  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1383 
1384  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1385  // be more flexible with the shift amount type.
1386  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1387  .legalFor({{S32, S32}, {S64, S32}});
1388  if (ST.has16BitInsts()) {
1389  if (ST.hasVOP3PInsts()) {
1390  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1391  .clampMaxNumElements(0, S16, 2);
1392  } else
1393  Shifts.legalFor({{S16, S16}});
1394 
1395  // TODO: Support 16-bit shift amounts for all types
1396  Shifts.widenScalarIf(
1397  [=](const LegalityQuery &Query) {
1398  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1399  // 32-bit amount.
1400  const LLT ValTy = Query.Types[0];
1401  const LLT AmountTy = Query.Types[1];
1402  return ValTy.getSizeInBits() <= 16 &&
1403  AmountTy.getSizeInBits() < 16;
1404  }, changeTo(1, S16));
1405  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1406  Shifts.clampScalar(1, S32, S32);
1407  Shifts.widenScalarToNextPow2(0, 16);
1408  Shifts.clampScalar(0, S16, S64);
1409 
1410  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1411  .minScalar(0, S16)
1412  .scalarize(0)
1413  .lower();
1414  } else {
1415  // Make sure we legalize the shift amount type first, as the general
1416  // expansion for the shifted type will produce much worse code if it hasn't
1417  // been truncated already.
1418  Shifts.clampScalar(1, S32, S32);
1419  Shifts.widenScalarToNextPow2(0, 32);
1420  Shifts.clampScalar(0, S32, S64);
1421 
1422  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1423  .minScalar(0, S32)
1424  .scalarize(0)
1425  .lower();
1426  }
1427  Shifts.scalarize(0);
1428 
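// Dynamic vector element insert/extract: 32- and 64-bit elements go through custom
// lowering, other element sizes are bitcast to 32-bit (or 64-bit) element vectors,
// and anything remaining is lowered via the stack.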
1429  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1430  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1431  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1432  unsigned IdxTypeIdx = 2;
1433 
1434  getActionDefinitionsBuilder(Op)
1435  .customIf([=](const LegalityQuery &Query) {
1436  const LLT EltTy = Query.Types[EltTypeIdx];
1437  const LLT VecTy = Query.Types[VecTypeIdx];
1438  const LLT IdxTy = Query.Types[IdxTypeIdx];
1439  const unsigned EltSize = EltTy.getSizeInBits();
1440  return (EltSize == 32 || EltSize == 64) &&
1441  VecTy.getSizeInBits() % 32 == 0 &&
1442  VecTy.getSizeInBits() <= MaxRegisterSize &&
1443  IdxTy.getSizeInBits() == 32;
1444  })
1445  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1446  bitcastToVectorElement32(VecTypeIdx))
1447  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1448  .bitcastIf(
1449  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1450  [=](const LegalityQuery &Query) {
1451  // For > 64-bit element types, try to turn this into a 64-bit
1452  // element vector since we may be able to do better indexing
1453  // if this is scalar. If not, fall back to 32.
1454  const LLT EltTy = Query.Types[EltTypeIdx];
1455  const LLT VecTy = Query.Types[VecTypeIdx];
1456  const unsigned DstEltSize = EltTy.getSizeInBits();
1457  const unsigned VecSize = VecTy.getSizeInBits();
1458 
1459  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1460  return std::make_pair(
1461  VecTypeIdx,
1462  LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1463  })
1464  .clampScalar(EltTypeIdx, S32, S64)
1465  .clampScalar(VecTypeIdx, S32, S64)
1466  .clampScalar(IdxTypeIdx, S32, S32)
1467  .clampMaxNumElements(VecTypeIdx, S32, 32)
1468  // TODO: Clamp elements for 64-bit vectors?
1469  // It should only be necessary with variable indexes.
1470  // As a last resort, lower to the stack
1471  .lower();
1472  }
1473 
1474  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1475  .unsupportedIf([=](const LegalityQuery &Query) {
1476  const LLT &EltTy = Query.Types[1].getElementType();
1477  return Query.Types[0] != EltTy;
1478  });
1479 
1480  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1481  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1482  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1483 
1484  // FIXME: Doesn't handle extract of illegal sizes.
1485  getActionDefinitionsBuilder(Op)
1486  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1487  .lowerIf([=](const LegalityQuery &Query) {
1488  // Sub-vector(or single element) insert and extract.
1489  // TODO: verify immediate offset here since lower only works with
1490  // whole elements.
1491  const LLT BigTy = Query.Types[BigTyIdx];
1492  return BigTy.isVector();
1493  })
1494  // FIXME: Multiples of 16 should not be legal.
1495  .legalIf([=](const LegalityQuery &Query) {
1496  const LLT BigTy = Query.Types[BigTyIdx];
1497  const LLT LitTy = Query.Types[LitTyIdx];
1498  return (BigTy.getSizeInBits() % 32 == 0) &&
1499  (LitTy.getSizeInBits() % 16 == 0);
1500  })
1501  .widenScalarIf(
1502  [=](const LegalityQuery &Query) {
1503  const LLT BigTy = Query.Types[BigTyIdx];
1504  return (BigTy.getScalarSizeInBits() < 16);
1505  },
1506  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1507  .widenScalarIf(
1508  [=](const LegalityQuery &Query) {
1509  const LLT LitTy = Query.Types[LitTyIdx];
1510  return (LitTy.getScalarSizeInBits() < 16);
1511  },
1512  LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1513  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1514  .widenScalarToNextPow2(BigTyIdx, 32);
1515 
1516  }
1517 
1518  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1519  .legalForCartesianProduct(AllS32Vectors, {S32})
1520  .legalForCartesianProduct(AllS64Vectors, {S64})
1521  .clampNumElements(0, V16S32, V32S32)
1522  .clampNumElements(0, V2S64, V16S64)
1523  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1524 
1525  if (ST.hasScalarPackInsts()) {
1526  BuildVector
1527  // FIXME: Should probably widen s1 vectors straight to s32
1528  .minScalarOrElt(0, S16)
1529  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1530  .minScalar(1, S32);
1531 
1532  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1533  .legalFor({V2S16, S32})
1534  .lower();
1535  BuildVector.minScalarOrElt(0, S32);
1536  } else {
1537  BuildVector.customFor({V2S16, S16});
1538  BuildVector.minScalarOrElt(0, S32);
1539 
1540  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1541  .customFor({V2S16, S32})
1542  .lower();
1543  }
1544 
1545  BuildVector.legalIf(isRegisterType(0));
1546 
1547  // FIXME: Clamp maximum size
1548  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1549  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1550  .clampMaxNumElements(0, S32, 32)
1551  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1552  .clampMaxNumElements(0, S16, 64);
1553 
1554  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1555  // pre-legalize.
1556  if (ST.hasVOP3PInsts()) {
1557  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1558  .customFor({V2S16, V2S16})
1559  .lower();
1560  } else
1561  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1562 
1563  // Merge/Unmerge
1564  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1565  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1566  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1567 
1568  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1569  const LLT Ty = Query.Types[TypeIdx];
1570  if (Ty.isVector()) {
1571  const LLT &EltTy = Ty.getElementType();
1572  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1573  return true;
1574  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1575  return true;
1576  }
1577  return false;
1578  };
1579 
1580  auto &Builder = getActionDefinitionsBuilder(Op)
1581  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1582  .lowerFor({{S16, V2S16}})
1583  .lowerIf([=](const LegalityQuery &Query) {
1584  const LLT BigTy = Query.Types[BigTyIdx];
1585  return BigTy.getSizeInBits() == 32;
1586  })
1587  // Try to widen to s16 first for small types.
1588  // TODO: Only do this on targets with legal s16 shifts
1589  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1590  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1591  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1592  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1593  elementTypeIs(1, S16)),
1594  changeTo(1, V2S16))
1595  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1596  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1597  // valid.
1598  .clampScalar(LitTyIdx, S32, S512)
1599  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1600  // Break up vectors with weird elements into scalars
1601  .fewerElementsIf(
1602  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1603  scalarize(0))
1604  .fewerElementsIf(
1605  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1606  scalarize(1))
1607  .clampScalar(BigTyIdx, S32, MaxScalar);
1608 
1609  if (Op == G_MERGE_VALUES) {
1610  Builder.widenScalarIf(
1611  // TODO: Use 16-bit shifts if legal for 8-bit values?
1612  [=](const LegalityQuery &Query) {
1613  const LLT Ty = Query.Types[LitTyIdx];
1614  return Ty.getSizeInBits() < 32;
1615  },
1616  changeTo(LitTyIdx, S32));
1617  }
1618 
1619  Builder.widenScalarIf(
1620  [=](const LegalityQuery &Query) {
1621  const LLT Ty = Query.Types[BigTyIdx];
1622  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1623  Ty.getSizeInBits() % 16 != 0;
1624  },
1625  [=](const LegalityQuery &Query) {
1626  // Pick the next power of 2, or a multiple of 64 over 128.
1627  // Whichever is smaller.
1628  const LLT &Ty = Query.Types[BigTyIdx];
1629  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1630  if (NewSizeInBits >= 256) {
1631  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1632  if (RoundedTo < NewSizeInBits)
1633  NewSizeInBits = RoundedTo;
1634  }
1635  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1636  })
1637  // Any vectors left are the wrong size. Scalarize them.
1638  .scalarize(0)
1639  .scalarize(1);
1640  }
1641 
1642  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1643  // RegBankSelect.
1644  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1645  .legalFor({{S32}, {S64}});
1646 
1647  if (ST.hasVOP3PInsts()) {
1648  SextInReg.lowerFor({{V2S16}})
1649  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1650  // get more vector shift opportunities, since we'll get those when
1651  // expanded.
1652  .clampMaxNumElementsStrict(0, S16, 2);
1653  } else if (ST.has16BitInsts()) {
1654  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1655  } else {
1656  // Prefer to promote to s32 before lowering if we don't have 16-bit
1657  // shifts. This avoid a lot of intermediate truncate and extend operations.
1658  SextInReg.lowerFor({{S32}, {S64}});
1659  }
1660 
1661  SextInReg
1662  .scalarize(0)
1663  .clampScalar(0, S32, S64)
1664  .lower();
1665 
1666  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1667  .scalarize(0)
1668  .lower();
1669 
1670  // TODO: Only try to form v2s16 with legal packed instructions.
1671  getActionDefinitionsBuilder(G_FSHR)
1672  .legalFor({{S32, S32}})
1673  .lowerFor({{V2S16, V2S16}})
1674  .clampMaxNumElementsStrict(0, S16, 2)
1675  .scalarize(0)
1676  .lower();
1677 
1678  if (ST.hasVOP3PInsts()) {
1679  getActionDefinitionsBuilder(G_FSHL)
1680  .lowerFor({{V2S16, V2S16}})
1681  .clampMaxNumElementsStrict(0, S16, 2)
1682  .scalarize(0)
1683  .lower();
1684  } else {
1685  getActionDefinitionsBuilder(G_FSHL)
1686  .scalarize(0)
1687  .lower();
1688  }
1689 
1690  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1691  .legalFor({S64});
1692 
1693  getActionDefinitionsBuilder(G_FENCE)
1694  .alwaysLegal();
1695 
1696  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1697  .scalarize(0)
1698  .minScalar(0, S32)
1699  .lower();
1700 
1701  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1702  .legalFor({{S32, S32}, {S64, S32}})
1703  .clampScalar(1, S32, S32)
1704  .clampScalar(0, S32, S64)
1705  .widenScalarToNextPow2(0)
1706  .scalarize(0);
1707 
1708  getActionDefinitionsBuilder({
1709  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1710  G_FCOPYSIGN,
1711 
1712  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1713  G_ATOMICRMW_NAND,
1714  G_ATOMICRMW_FSUB,
1715  G_READ_REGISTER,
1716  G_WRITE_REGISTER,
1717 
1718  G_SADDO, G_SSUBO,
1719 
1720  // TODO: Implement
1721  G_FMINIMUM, G_FMAXIMUM}).lower();
1722 
1723  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1724  .lower();
1725 
1726  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1727  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1728  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1729  .unsupported();
1730 
1731  getLegacyLegalizerInfo().computeTables();
1732  verify(*ST.getInstrInfo());
1733 }
1734 
1735 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1736  MachineInstr &MI) const {
1737  MachineIRBuilder &B = Helper.MIRBuilder;
1738  MachineRegisterInfo &MRI = *B.getMRI();
1739 
1740  switch (MI.getOpcode()) {
1741  case TargetOpcode::G_ADDRSPACE_CAST:
1742  return legalizeAddrSpaceCast(MI, MRI, B);
1743  case TargetOpcode::G_FRINT:
1744  return legalizeFrint(MI, MRI, B);
1745  case TargetOpcode::G_FCEIL:
1746  return legalizeFceil(MI, MRI, B);
1747  case TargetOpcode::G_FREM:
1748  return legalizeFrem(MI, MRI, B);
1749  case TargetOpcode::G_INTRINSIC_TRUNC:
1750  return legalizeIntrinsicTrunc(MI, MRI, B);
1751  case TargetOpcode::G_SITOFP:
1752  return legalizeITOFP(MI, MRI, B, true);
1753  case TargetOpcode::G_UITOFP:
1754  return legalizeITOFP(MI, MRI, B, false);
1755  case TargetOpcode::G_FPTOSI:
1756  return legalizeFPTOI(MI, MRI, B, true);
1757  case TargetOpcode::G_FPTOUI:
1758  return legalizeFPTOI(MI, MRI, B, false);
1759  case TargetOpcode::G_FMINNUM:
1760  case TargetOpcode::G_FMAXNUM:
1761  case TargetOpcode::G_FMINNUM_IEEE:
1762  case TargetOpcode::G_FMAXNUM_IEEE:
1763  return legalizeMinNumMaxNum(Helper, MI);
1764  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1765  return legalizeExtractVectorElt(MI, MRI, B);
1766  case TargetOpcode::G_INSERT_VECTOR_ELT:
1767  return legalizeInsertVectorElt(MI, MRI, B);
1768  case TargetOpcode::G_SHUFFLE_VECTOR:
1769  return legalizeShuffleVector(MI, MRI, B);
1770  case TargetOpcode::G_FSIN:
1771  case TargetOpcode::G_FCOS:
1772  return legalizeSinCos(MI, MRI, B);
1773  case TargetOpcode::G_GLOBAL_VALUE:
1774  return legalizeGlobalValue(MI, MRI, B);
1775  case TargetOpcode::G_LOAD:
1776  case TargetOpcode::G_SEXTLOAD:
1777  case TargetOpcode::G_ZEXTLOAD:
1778  return legalizeLoad(Helper, MI);
1779  case TargetOpcode::G_FMAD:
1780  return legalizeFMad(MI, MRI, B);
1781  case TargetOpcode::G_FDIV:
1782  return legalizeFDIV(MI, MRI, B);
1783  case TargetOpcode::G_UDIV:
1784  case TargetOpcode::G_UREM:
1785  case TargetOpcode::G_UDIVREM:
1786  return legalizeUnsignedDIV_REM(MI, MRI, B);
1787  case TargetOpcode::G_SDIV:
1788  case TargetOpcode::G_SREM:
1789  case TargetOpcode::G_SDIVREM:
1790  return legalizeSignedDIV_REM(MI, MRI, B);
1791  case TargetOpcode::G_ATOMIC_CMPXCHG:
1792  return legalizeAtomicCmpXChg(MI, MRI, B);
1793  case TargetOpcode::G_FLOG:
1794  return legalizeFlog(MI, B, numbers::ln2f);
1795  case TargetOpcode::G_FLOG10:
1796  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1797  case TargetOpcode::G_FEXP:
1798  return legalizeFExp(MI, B);
1799  case TargetOpcode::G_FPOW:
1800  return legalizeFPow(MI, B);
1801  case TargetOpcode::G_FFLOOR:
1802  return legalizeFFloor(MI, MRI, B);
1803  case TargetOpcode::G_BUILD_VECTOR:
1804  return legalizeBuildVector(MI, MRI, B);
1805  case TargetOpcode::G_MUL:
1806  return legalizeMul(Helper, MI);
1807  case TargetOpcode::G_CTLZ:
1808  case TargetOpcode::G_CTTZ:
1809  return legalizeCTLZ_CTTZ(MI, MRI, B);
1810  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1811  return legalizeFPTruncRound(MI, B);
1812  default:
1813  return false;
1814  }
1815 
1816  llvm_unreachable("expected switch to return");
1817 }
1818 
1819 Register AMDGPULegalizerInfo::getSegmentAperture(
1820  unsigned AS,
1821  MachineRegisterInfo &MRI,
1822  MachineIRBuilder &B) const {
1823  MachineFunction &MF = B.getMF();
1824  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1825  const LLT S32 = LLT::scalar(32);
1826 
1828 
1829  if (ST.hasApertureRegs()) {
1830  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1831  // getreg.
1832  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1833  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1834  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1835  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1836  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1837  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1838  unsigned Encoding =
1839  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1840  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1841  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1842 
1843  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1844 
1845  B.buildInstr(AMDGPU::S_GETREG_B32)
1846  .addDef(GetReg)
1847  .addImm(Encoding);
1848  MRI.setType(GetReg, S32);
1849 
1850  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1851  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1852  }
1853 
1854  // TODO: can we be smarter about machine pointer info?
1858  // For code object version 5, private_base and shared_base are passed through
1859  // implicit kernargs.
1864  uint64_t Offset =
1865  ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1866 
1867  Register KernargPtrReg = MRI.createGenericVirtualRegister(
1869 
1870  if (!loadInputValue(KernargPtrReg, B,
1872  return Register();
1873 
1875  PtrInfo,
1878  LLT::scalar(32), commonAlignment(Align(64), Offset));
1879 
1880  // Pointer address
1881  B.buildPtrAdd(LoadAddr, KernargPtrReg,
1882  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1883  // Load address
1884  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1885  }
1886 
1889 
1891  return Register();
1892 
1893  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1894  // private_segment_aperture_base_hi.
1895  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1896 
1898  PtrInfo,
1901  LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1902 
1903  B.buildPtrAdd(LoadAddr, QueuePtr,
1904  B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1905  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1906 }
1907 
1908 /// Return true if the value is a known valid address, such that a null check is
1909 /// not necessary.
1911  const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1912  MachineInstr *Def = MRI.getVRegDef(Val);
1913  switch (Def->getOpcode()) {
1914  case AMDGPU::G_FRAME_INDEX:
1915  case AMDGPU::G_GLOBAL_VALUE:
1916  case AMDGPU::G_BLOCK_ADDR:
1917  return true;
1918  case AMDGPU::G_CONSTANT: {
1919  const ConstantInt *CI = Def->getOperand(1).getCImm();
1920  return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1921  }
1922  default:
1923  return false;
1924  }
1925 
1926  return false;
1927 }
1928 
1929 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1930  MachineInstr &MI, MachineRegisterInfo &MRI,
1931  MachineIRBuilder &B) const {
1932  MachineFunction &MF = B.getMF();
1933 
1934  const LLT S32 = LLT::scalar(32);
1935  Register Dst = MI.getOperand(0).getReg();
1936  Register Src = MI.getOperand(1).getReg();
1937 
1938  LLT DstTy = MRI.getType(Dst);
1939  LLT SrcTy = MRI.getType(Src);
1940  unsigned DestAS = DstTy.getAddressSpace();
1941  unsigned SrcAS = SrcTy.getAddressSpace();
1942 
1943  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1944  // vector element.
1945  assert(!DstTy.isVector());
1946 
1947  const AMDGPUTargetMachine &TM
1948  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1949 
1950  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1951  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1952  return true;
1953  }
1954 
1955  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1956  (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1957  DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1958  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1959  // Extract low 32-bits of the pointer.
1960  B.buildExtract(Dst, Src, 0);
1961  MI.eraseFromParent();
1962  return true;
1963  }
1964 
1965  unsigned NullVal = TM.getNullPointerValue(DestAS);
1966 
1967  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1968  auto FlatNull = B.buildConstant(SrcTy, 0);
1969 
1970  // Extract low 32-bits of the pointer.
1971  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1972 
1973  auto CmpRes =
1974  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1975  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1976 
1977  MI.eraseFromParent();
1978  return true;
1979  }
1980 
1981  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1982  (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1983  SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1984  if (!ST.hasFlatAddressSpace())
1985  return false;
1986 
1987  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1988  if (!ApertureReg.isValid())
1989  return false;
1990 
1991  // Coerce the type of the low half of the result so we can use merge_values.
1992  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1993 
1994  // TODO: Should we allow mismatched types but matching sizes in merges to
1995  // avoid the ptrtoint?
1996  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1997 
1998  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1999  B.buildCopy(Dst, BuildPtr);
2000  MI.eraseFromParent();
2001  return true;
2002  }
2003 
2004  auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2005  auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2006 
2007  auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2008  SegmentNull.getReg(0));
2009 
2010  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2011 
2012  MI.eraseFromParent();
2013  return true;
2014  }
2015 
2016  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2017  SrcTy.getSizeInBits() == 64) {
2018  // Truncate.
2019  B.buildExtract(Dst, Src, 0);
2020  MI.eraseFromParent();
2021  return true;
2022  }
2023 
2024  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2025  DstTy.getSizeInBits() == 64) {
2026  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2027  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2028 
2029  // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
2030  // another pointer. Merge operands are required to be the same type, but
2031  // creating an extra ptrtoint would be kind of pointless.
2032  auto HighAddr = B.buildConstant(
2033  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
2034  B.buildMerge(Dst, {Src, HighAddr});
2035  MI.eraseFromParent();
2036  return true;
2037  }
2038 
2039  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2040  MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2041 
2042  LLVMContext &Ctx = MF.getFunction().getContext();
2043  Ctx.diagnose(InvalidAddrSpaceCast);
2044  B.buildUndef(Dst);
2045  MI.eraseFromParent();
2046  return true;
2047 }
2048 
2049 bool AMDGPULegalizerInfo::legalizeFrint(
2050  MachineInstr &MI, MachineRegisterInfo &MRI,
2051  MachineIRBuilder &B) const {
2052  Register Src = MI.getOperand(1).getReg();
2053  LLT Ty = MRI.getType(Src);
2054  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2055 
2056  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2057  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2058 
2059  auto C1 = B.buildFConstant(Ty, C1Val);
2060  auto CopySign = B.buildFCopysign(Ty, C1, Src);
2061 
2062  // TODO: Should this propagate fast-math-flags?
2063  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2064  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2065 
2066  auto C2 = B.buildFConstant(Ty, C2Val);
2067  auto Fabs = B.buildFAbs(Ty, Src);
2068 
2069  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2070  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2071  MI.eraseFromParent();
2072  return true;
2073 }
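
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// rounding trick used above: adding and subtracting copysign(2^52, x) pushes
// the fraction bits out of a double's mantissa, rounding to nearest even,
// while magnitudes already at or above 2^52 pass through unchanged. The
// helper name frintSketch is an illustrative assumption, not LLVM code.
#include <cmath>
static double frintSketch(double X) {
  const double C1 = 0x1.0p+52;              // 2^52
  const double C2 = 0x1.fffffffffffffp+51;  // largest double below 2^52
  double CopySign = std::copysign(C1, X);
  double Rounded = (X + CopySign) - CopySign;
  return std::fabs(X) > C2 ? X : Rounded;   // big values are already integers
}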
2074 
2075 bool AMDGPULegalizerInfo::legalizeFceil(
2076  MachineInstr &MI, MachineRegisterInfo &MRI,
2077  MachineIRBuilder &B) const {
2078 
2079  const LLT S1 = LLT::scalar(1);
2080  const LLT S64 = LLT::scalar(64);
2081 
2082  Register Src = MI.getOperand(1).getReg();
2083  assert(MRI.getType(Src) == S64);
2084 
2085  // result = trunc(src)
2086  // if (src > 0.0 && src != result)
2087  // result += 1.0
2088 
2089  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2090 
2091  const auto Zero = B.buildFConstant(S64, 0.0);
2092  const auto One = B.buildFConstant(S64, 1.0);
2093  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2094  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2095  auto And = B.buildAnd(S1, Lt0, NeTrunc);
2096  auto Add = B.buildSelect(S64, And, One, Zero);
2097 
2098  // TODO: Should this propagate fast-math-flags?
2099  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2100  MI.eraseFromParent();
2101  return true;
2102 }
2103 
2104 bool AMDGPULegalizerInfo::legalizeFrem(
2105  MachineInstr &MI, MachineRegisterInfo &MRI,
2106  MachineIRBuilder &B) const {
2107  Register DstReg = MI.getOperand(0).getReg();
2108  Register Src0Reg = MI.getOperand(1).getReg();
2109  Register Src1Reg = MI.getOperand(2).getReg();
2110  auto Flags = MI.getFlags();
2111  LLT Ty = MRI.getType(DstReg);
2112 
2113  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2114  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2115  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2116  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2117  MI.eraseFromParent();
2118  return true;
2119 }
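
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// frem expansion above: q = trunc(x / y), result = fma(-q, y, x). The helper
// name fremSketch is an illustrative assumption, not LLVM code.
#include <cmath>
static double fremSketch(double X, double Y) {
  double Q = std::trunc(X / Y);
  return std::fma(-Q, Y, X); // X - Q*Y computed with a single rounding
}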
2120 
2121 static MachineInstrBuilder extractF64Exponent(Register Hi,
2122  MachineIRBuilder &B) {
2123  const unsigned FractBits = 52;
2124  const unsigned ExpBits = 11;
2125  LLT S32 = LLT::scalar(32);
2126 
2127  auto Const0 = B.buildConstant(S32, FractBits - 32);
2128  auto Const1 = B.buildConstant(S32, ExpBits);
2129 
2130  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2131  .addUse(Hi)
2132  .addUse(Const0.getReg(0))
2133  .addUse(Const1.getReg(0));
2134 
2135  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2136 }
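
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of what
// the ubfe + sub above computes: the 11 exponent bits of an IEEE-754 double
// sit at bit 20 (52 - 32) of the high word, and 1023 is the exponent bias.
// The helper name exponentSketch is an illustrative assumption.
#include <cstdint>
static int32_t exponentSketch(uint32_t Hi) {
  uint32_t ExpField = (Hi >> (52 - 32)) & ((1u << 11) - 1);
  return int32_t(ExpField) - 1023;
}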
2137 
2138 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2139  MachineInstr &MI, MachineRegisterInfo &MRI,
2140  MachineIRBuilder &B) const {
2141  const LLT S1 = LLT::scalar(1);
2142  const LLT S32 = LLT::scalar(32);
2143  const LLT S64 = LLT::scalar(64);
2144 
2145  Register Src = MI.getOperand(1).getReg();
2146  assert(MRI.getType(Src) == S64);
2147 
2148  // TODO: Should this use extract since the low half is unused?
2149  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2150  Register Hi = Unmerge.getReg(1);
2151 
2152  // Extract the upper half, since this is where we will find the sign and
2153  // exponent.
2154  auto Exp = extractF64Exponent(Hi, B);
2155 
2156  const unsigned FractBits = 52;
2157 
2158  // Extract the sign bit.
2159  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2160  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2161 
2162  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2163 
2164  const auto Zero32 = B.buildConstant(S32, 0);
2165 
2166  // Extend back to 64-bits.
2167  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2168 
2169  auto Shr = B.buildAShr(S64, FractMask, Exp);
2170  auto Not = B.buildNot(S64, Shr);
2171  auto Tmp0 = B.buildAnd(S64, Src, Not);
2172  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2173 
2174  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2175  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2176 
2177  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2178  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2179  MI.eraseFromParent();
2180  return true;
2181 }
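
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// exponent-driven masking above: bits below the binary point are cleared,
// values with a negative exponent keep only their sign, and values whose
// exponent exceeds 51 (including inf/nan) are already integral. The helper
// name truncSketch is an illustrative assumption, not LLVM code.
#include <cmath>
#include <cstdint>
#include <cstring>
static double truncSketch(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  int32_t Exp = int32_t((Bits >> 52) & 0x7ff) - 1023;
  if (Exp < 0)
    return std::copysign(0.0, X);        // |X| < 1: only the sign survives
  if (Exp > 51)
    return X;                            // already an integer (or inf/nan)
  uint64_t FractMask = (uint64_t(1) << 52) - 1;
  Bits &= ~(FractMask >> Exp);           // clear the sub-integer bits
  double Result;
  std::memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}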
2182 
2183 bool AMDGPULegalizerInfo::legalizeITOFP(
2184  MachineInstr &MI, MachineRegisterInfo &MRI,
2185  MachineIRBuilder &B, bool Signed) const {
2186 
2187  Register Dst = MI.getOperand(0).getReg();
2188  Register Src = MI.getOperand(1).getReg();
2189 
2190  const LLT S64 = LLT::scalar(64);
2191  const LLT S32 = LLT::scalar(32);
2192 
2193  assert(MRI.getType(Src) == S64);
2194 
2195  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2196  auto ThirtyTwo = B.buildConstant(S32, 32);
2197 
2198  if (MRI.getType(Dst) == S64) {
2199  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2200  : B.buildUITOFP(S64, Unmerge.getReg(1));
2201 
2202  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2203  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2204  .addUse(CvtHi.getReg(0))
2205  .addUse(ThirtyTwo.getReg(0));
2206 
2207  // TODO: Should this propagate fast-math-flags?
2208  B.buildFAdd(Dst, LdExp, CvtLo);
2209  MI.eraseFromParent();
2210  return true;
2211  }
2212 
2213  assert(MRI.getType(Dst) == S32);
2214 
2215  auto One = B.buildConstant(S32, 1);
2216 
2217  MachineInstrBuilder ShAmt;
2218  if (Signed) {
2219  auto ThirtyOne = B.buildConstant(S32, 31);
2220  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2221  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2222  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2223  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2224  /*HasSideEffects=*/false)
2225  .addUse(Unmerge.getReg(1));
2226  auto LS2 = B.buildSub(S32, LS, One);
2227  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2228  } else
2229  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2230  auto Norm = B.buildShl(S64, Src, ShAmt);
2231  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2232  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2233  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2234  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2235  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2236  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2237  /*HasSideEffects=*/false)
2238  .addUse(FVal.getReg(0))
2239  .addUse(Scale.getReg(0));
2240  MI.eraseFromParent();
2241  return true;
2242 }
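
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// S64 -> F64 path above: convert the two 32-bit halves separately and
// recombine them as ldexp(hi, 32) + lo. The helper name uitofp64Sketch is an
// illustrative assumption; the signed variant converts the high half as a
// signed value instead.
#include <cmath>
#include <cstdint>
static double uitofp64Sketch(uint64_t V) {
  double Hi = double(uint32_t(V >> 32));
  double Lo = double(uint32_t(V));
  return std::ldexp(Hi, 32) + Lo; // Hi * 2^32 + Lo
}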
2243 
2244 // TODO: Copied from DAG implementation. Verify logic and document how this
2245 // actually works.
2246 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2247  MachineRegisterInfo &MRI,
2248  MachineIRBuilder &B,
2249  bool Signed) const {
2250 
2251  Register Dst = MI.getOperand(0).getReg();
2252  Register Src = MI.getOperand(1).getReg();
2253 
2254  const LLT S64 = LLT::scalar(64);
2255  const LLT S32 = LLT::scalar(32);
2256 
2257  const LLT SrcLT = MRI.getType(Src);
2258  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2259 
2260  unsigned Flags = MI.getFlags();
2261 
2262  // The basic idea of converting a floating point number into a pair of 32-bit
2263  // integers is illustrated as follows:
2264  //
2265  // tf := trunc(val);
2266  // hif := floor(tf * 2^-32);
2267  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2268  // hi := fptoi(hif);
2269  // lo := fptoi(lof);
2270  //
2271  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2272  MachineInstrBuilder Sign;
2273  if (Signed && SrcLT == S32) {
2274  // However, a 32-bit floating point number has only a 23-bit mantissa, which
2275  // is not enough to hold all the significant bits of `lof` if val is
2276  // negative. To avoid the loss of precision, we take the absolute value
2277  // after truncating and flip the result back based on the original
2278  // signedness.
2279  Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2280  Trunc = B.buildFAbs(S32, Trunc, Flags);
2281  }
2282  MachineInstrBuilder K0, K1;
2283  if (SrcLT == S64) {
2284  K0 = B.buildFConstant(S64,
2285  BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2286  K1 = B.buildFConstant(S64,
2287  BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2288  } else {
2289  K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2290  K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2291  }
2292 
2293  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2294  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2295  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2296 
2297  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2298  : B.buildFPTOUI(S32, FloorMul);
2299  auto Lo = B.buildFPTOUI(S32, Fma);
2300 
2301  if (Signed && SrcLT == S32) {
2302  // Flip the result based on the signedness, which is either all 0s or 1s.
2303  Sign = B.buildMerge(S64, {Sign, Sign});
2304  // r := xor({lo, hi}, sign) - sign;
2305  B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2306  } else
2307  B.buildMerge(Dst, {Lo, Hi});
2308  MI.eraseFromParent();
2309 
2310  return true;
2311 }
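
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// split above for the unsigned double -> 64-bit case: hif = floor(trunc(x) *
// 2^-32), lof = trunc(x) - hif * 2^32 (done with an fma so only one rounding
// occurs), then both halves are converted independently. The helper name
// fptoui64Sketch is an illustrative assumption; the signed 32-bit source case
// additionally applies the xor/subtract sign flip described above.
#include <cmath>
#include <cstdint>
static uint64_t fptoui64Sketch(double X) {
  double T = std::trunc(X);
  double HiF = std::floor(T * 0x1.0p-32);
  double LoF = std::fma(HiF, -0x1.0p+32, T); // non-negative because of floor
  uint32_t Hi = uint32_t(HiF);
  uint32_t Lo = uint32_t(LoF);
  return (uint64_t(Hi) << 32) | Lo;
}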
2312 
2313 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2314  MachineInstr &MI) const {
2315  MachineFunction &MF = Helper.MIRBuilder.getMF();
2316  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2317 
2318  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2319  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2320 
2321  // With ieee_mode disabled, the instructions have the correct behavior
2322  // already for G_FMINNUM/G_FMAXNUM
2323  if (!MFI->getMode().IEEE)
2324  return !IsIEEEOp;
2325 
2326  if (IsIEEEOp)
2327  return true;
2328 
2329  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2330  }
2331 
2332 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2333  MachineInstr &MI, MachineRegisterInfo &MRI,
2334  MachineIRBuilder &B) const {
2335  // TODO: Should move some of this into LegalizerHelper.
2336 
2337  // TODO: Promote dynamic indexing of s16 to s32
2338 
2339  // FIXME: Artifact combiner probably should have replaced the truncated
2340  // constant before this, so we shouldn't need
2341  // getIConstantVRegValWithLookThrough.
2342  Optional<ValueAndVReg> MaybeIdxVal =
2343  getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2344  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2345  return true;
2346  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2347 
2348  Register Dst = MI.getOperand(0).getReg();
2349  Register Vec = MI.getOperand(1).getReg();
2350 
2351  LLT VecTy = MRI.getType(Vec);
2352  LLT EltTy = VecTy.getElementType();
2353  assert(EltTy == MRI.getType(Dst));
2354 
2355  if (IdxVal < VecTy.getNumElements()) {
2356  auto Unmerge = B.buildUnmerge(EltTy, Vec);
2357  B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2358  } else {
2359  B.buildUndef(Dst);
2360  }
2361 
2362  MI.eraseFromParent();
2363  return true;
2364 }
2365 
2366 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2367  MachineInstr &MI, MachineRegisterInfo &MRI,
2368  MachineIRBuilder &B) const {
2369  // TODO: Should move some of this into LegalizerHelper.
2370 
2371  // TODO: Promote dynamic indexing of s16 to s32
2372 
2373  // FIXME: Artifact combiner probably should have replaced the truncated
2374  // constant before this, so we shouldn't need
2375  // getIConstantVRegValWithLookThrough.
2376  Optional<ValueAndVReg> MaybeIdxVal =
2377  getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2378  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2379  return true;
2380 
2381  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2382  Register Dst = MI.getOperand(0).getReg();
2383  Register Vec = MI.getOperand(1).getReg();
2384  Register Ins = MI.getOperand(2).getReg();
2385 
2386  LLT VecTy = MRI.getType(Vec);
2387  LLT EltTy = VecTy.getElementType();
2388  assert(EltTy == MRI.getType(Ins));
2389  (void)Ins;
2390 
2391  unsigned NumElts = VecTy.getNumElements();
2392  if (IdxVal < NumElts) {
2393  SmallVector<Register, 8> SrcRegs;
2394  for (unsigned i = 0; i < NumElts; ++i)
2395  SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2396  B.buildUnmerge(SrcRegs, Vec);
2397 
2398  SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2399  B.buildMerge(Dst, SrcRegs);
2400  } else {
2401  B.buildUndef(Dst);
2402  }
2403 
2404  MI.eraseFromParent();
2405  return true;
2406 }
2407 
2408 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2409  MachineInstr &MI, MachineRegisterInfo &MRI,
2410  MachineIRBuilder &B) const {
2411  const LLT V2S16 = LLT::fixed_vector(2, 16);
2412 
2413  Register Dst = MI.getOperand(0).getReg();
2414  Register Src0 = MI.getOperand(1).getReg();
2415  LLT DstTy = MRI.getType(Dst);
2416  LLT SrcTy = MRI.getType(Src0);
2417 
2418  if (SrcTy == V2S16 && DstTy == V2S16 &&
2419  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2420  return true;
2421 
2422  MachineIRBuilder HelperBuilder(MI);
2423  GISelObserverWrapper DummyObserver;
2424  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2425  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2426 }
2427 
2428 bool AMDGPULegalizerInfo::legalizeSinCos(
2429  MachineInstr &MI, MachineRegisterInfo &MRI,
2430  MachineIRBuilder &B) const {
2431 
2432  Register DstReg = MI.getOperand(0).getReg();
2433  Register SrcReg = MI.getOperand(1).getReg();
2434  LLT Ty = MRI.getType(DstReg);
2435  unsigned Flags = MI.getFlags();
2436 
2437  Register TrigVal;
2438  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2439  if (ST.hasTrigReducedRange()) {
2440  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2441  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2442  .addUse(MulVal.getReg(0))
2443  .setMIFlags(Flags).getReg(0);
2444  } else
2445  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2446 
2447  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2448  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2449  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2450  .addUse(TrigVal)
2451  .setMIFlags(Flags);
2452  MI.eraseFromParent();
2453  return true;
2454 }
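
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// scaling above. The hardware sin/cos intrinsics take the angle in
// revolutions, so the input is multiplied by 1/(2*pi) first; on subtargets
// with the reduced-range requirement the product is also wrapped into [0, 1)
// with fract. The helper name sinSketch is an illustrative assumption and
// models the hardware op with libm.
#include <cmath>
static float sinSketch(float X) {
  const float OneOver2Pi = 0.15915494309189533f;
  float Revs = X * OneOver2Pi;
  float Reduced = Revs - std::floor(Revs);        // fract(): wrap into [0, 1)
  return std::sin(Reduced * 6.283185307179586f);  // back to radians for libm
}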
2455 
2456 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2457  MachineIRBuilder &B,
2458  const GlobalValue *GV,
2459  int64_t Offset,
2460  unsigned GAFlags) const {
2461  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2462  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2463  // to the following code sequence:
2464  //
2465  // For constant address space:
2466  // s_getpc_b64 s[0:1]
2467  // s_add_u32 s0, s0, $symbol
2468  // s_addc_u32 s1, s1, 0
2469  //
2470  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2471  // a fixup or relocation is emitted to replace $symbol with a literal
2472  // constant, which is a pc-relative offset from the encoding of the $symbol
2473  // operand to the global variable.
2474  //
2475  // For global address space:
2476  // s_getpc_b64 s[0:1]
2477  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2478  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2479  //
2480  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2481  // fixups or relocations are emitted to replace $symbol@*@lo and
2482  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2483  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2484  // operand to the global variable.
2485  //
2486  // What we want here is an offset from the value returned by s_getpc
2487  // (which is the address of the s_add_u32 instruction) to the global
2488  // variable, but since the encoding of $symbol starts 4 bytes after the start
2489  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2490  // small. This requires us to add 4 to the global variable offset in order to
2491  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2492  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2493  // instruction.
2494 
2495  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2496 
2497  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2498  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2499 
2500  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2501  .addDef(PCReg);
2502 
2503  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2504  if (GAFlags == SIInstrInfo::MO_NONE)
2505  MIB.addImm(0);
2506  else
2507  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2508 
2509  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2510 
2511  if (PtrTy.getSizeInBits() == 32)
2512  B.buildExtract(DstReg, PCReg, 0);
2513  return true;
2514  }
2515 
2516 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2517  MachineInstr &MI, MachineRegisterInfo &MRI,
2518  MachineIRBuilder &B) const {
2519  Register DstReg = MI.getOperand(0).getReg();
2520  LLT Ty = MRI.getType(DstReg);
2521  unsigned AS = Ty.getAddressSpace();
2522 
2523  const GlobalValue *GV = MI.getOperand(1).getGlobal();
2524  MachineFunction &MF = B.getMF();
2525  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2526 
2527  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2528  if (!MFI->isModuleEntryFunction() &&
2529  !GV->getName().equals("llvm.amdgcn.module.lds")) {
2530  const Function &Fn = MF.getFunction();
2531  DiagnosticInfoUnsupported BadLDSDecl(
2532  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2533  DS_Warning);
2534  Fn.getContext().diagnose(BadLDSDecl);
2535 
2536  // We currently don't have a way to correctly allocate LDS objects that
2537  // aren't directly associated with a kernel. We do force inlining of
2538  // functions that use local objects. However, if these dead functions are
2539  // not eliminated, we don't want a compile time error. Just emit a warning
2540  // and a trap, since there should be no callable path here.
2541  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2542  B.buildUndef(DstReg);
2543  MI.eraseFromParent();
2544  return true;
2545  }
2546 
2547  // TODO: We could emit code to handle the initialization somewhere.
2548  // We ignore the initializer for now and legalize it to allow selection.
2549  // The initializer will be rejected during assembly emission anyway.
2550  const SITargetLowering *TLI = ST.getTargetLowering();
2551  if (!TLI->shouldUseLDSConstAddress(GV)) {
2552  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2553  return true; // Leave in place;
2554  }
2555 
2556  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2557  Type *Ty = GV->getValueType();
2558  // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2559  // zero-sized type in other languages to declare dynamic shared memory
2560  // whose size is not known at compile time. It will be allocated by the
2561  // runtime and placed directly after the statically allocated ones. All
2562  // such arrays share the same offset.
2563  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2564  // Adjust alignment for that dynamic shared memory array.
2565  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2566  LLT S32 = LLT::scalar(32);
2567  auto Sz =
2568  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2569  B.buildIntToPtr(DstReg, Sz);
2570  MI.eraseFromParent();
2571  return true;
2572  }
2573  }
2574 
2575  B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2576  *cast<GlobalVariable>(GV)));
2577  MI.eraseFromParent();
2578  return true;
2579  }
2580 
2581  const SITargetLowering *TLI = ST.getTargetLowering();
2582 
2583  if (TLI->shouldEmitFixup(GV)) {
2584  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2585  MI.eraseFromParent();
2586  return true;
2587  }
2588 
2589  if (TLI->shouldEmitPCReloc(GV)) {
2590  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2591  MI.eraseFromParent();
2592  return true;
2593  }
2594 
2595  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2596  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2597 
2598  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2603  LoadTy, Align(8));
2604 
2605  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2606 
2607  if (Ty.getSizeInBits() == 32) {
2608  // Truncate if this is a 32-bit constant address.
2609  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2610  B.buildExtract(DstReg, Load, 0);
2611  } else
2612  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2613 
2614  MI.eraseFromParent();
2615  return true;
2616 }
2617 
2618 static LLT widenToNextPowerOf2(LLT Ty) {
2619  if (Ty.isVector())
2620  return Ty.changeElementCount(
2621  ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2622  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2623 }
2624 
2625 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2626  MachineInstr &MI) const {
2627  MachineIRBuilder &B = Helper.MIRBuilder;
2628  MachineRegisterInfo &MRI = *B.getMRI();
2629  GISelChangeObserver &Observer = Helper.Observer;
2630 
2631  Register PtrReg = MI.getOperand(1).getReg();
2632  LLT PtrTy = MRI.getType(PtrReg);
2633  unsigned AddrSpace = PtrTy.getAddressSpace();
2634 
2635  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2636  const LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2637  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2638  Observer.changingInstr(MI);
2639  MI.getOperand(1).setReg(Cast.getReg(0));
2640  Observer.changedInstr(MI);
2641  return true;
2642  }
2643 
2644  if (MI.getOpcode() != AMDGPU::G_LOAD)
2645  return false;
2646 
2647  Register ValReg = MI.getOperand(0).getReg();
2648  LLT ValTy = MRI.getType(ValReg);
2649 
2650  MachineMemOperand *MMO = *MI.memoperands_begin();
2651  const unsigned ValSize = ValTy.getSizeInBits();
2652  const LLT MemTy = MMO->getMemoryType();
2653  const Align MemAlign = MMO->getAlign();
2654  const unsigned MemSize = MemTy.getSizeInBits();
2655  const uint64_t AlignInBits = 8 * MemAlign.value();
2656 
2657  // Widen non-power-of-2 loads to the alignment if needed
2658  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2659  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2660 
2661  // This was already the correct extending load result type, so just adjust
2662  // the memory type.
2663  if (WideMemSize == ValSize) {
2664  MachineFunction &MF = B.getMF();
2665 
2666  MachineMemOperand *WideMMO =
2667  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2668  Observer.changingInstr(MI);
2669  MI.setMemRefs(MF, {WideMMO});
2670  Observer.changedInstr(MI);
2671  return true;
2672  }
2673 
2674  // Don't bother handling edge case that should probably never be produced.
2675  if (ValSize > WideMemSize)
2676  return false;
2677 
2678  LLT WideTy = widenToNextPowerOf2(ValTy);
2679 
2680  Register WideLoad;
2681  if (!WideTy.isVector()) {
2682  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2683  B.buildTrunc(ValReg, WideLoad).getReg(0);
2684  } else {
2685  // Extract the subvector.
2686 
2687  if (isRegisterType(ValTy)) {
2688  // If this a case where G_EXTRACT is legal, use it.
2689  // (e.g. <3 x s32> -> <4 x s32>)
2690  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2691  B.buildExtract(ValReg, WideLoad, 0);
2692  } else {
2693  // For cases where the widened type isn't a nice register value, unmerge
2694  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2695  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2696  B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2697  }
2698  }
2699 
2700  MI.eraseFromParent();
2701  return true;
2702  }
2703 
2704  return false;
2705 }
2706 
2707 bool AMDGPULegalizerInfo::legalizeFMad(
2708  MachineInstr &MI, MachineRegisterInfo &MRI,
2709  MachineIRBuilder &B) const {
2710  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2711  assert(Ty.isScalar());
2712 
2713  MachineFunction &MF = B.getMF();
2715 
2716  // TODO: Always legal with future ftz flag.
2717  // FIXME: Do we need just output?
2718  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2719  return true;
2720  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2721  return true;
2722 
2723  MachineIRBuilder HelperBuilder(MI);
2724  GISelObserverWrapper DummyObserver;
2725  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2726  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2727 }
2728 
2729 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2730  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2731  Register DstReg = MI.getOperand(0).getReg();
2732  Register PtrReg = MI.getOperand(1).getReg();
2733  Register CmpVal = MI.getOperand(2).getReg();
2734  Register NewVal = MI.getOperand(3).getReg();
2735 
2737  "this should not have been custom lowered");
2738 
2739  LLT ValTy = MRI.getType(CmpVal);
2740  LLT VecTy = LLT::fixed_vector(2, ValTy);
2741 
2742  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2743 
2744  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2745  .addDef(DstReg)
2746  .addUse(PtrReg)
2747  .addUse(PackedVal)
2748  .setMemRefs(MI.memoperands());
2749 
2750  MI.eraseFromParent();
2751  return true;
2752 }
2753 
2754 bool AMDGPULegalizerInfo::legalizeFlog(
2755  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2756  Register Dst = MI.getOperand(0).getReg();
2757  Register Src = MI.getOperand(1).getReg();
2758  LLT Ty = B.getMRI()->getType(Dst);
2759  unsigned Flags = MI.getFlags();
2760 
2761  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2762  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2763 
2764  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2765  MI.eraseFromParent();
2766  return true;
2767 }
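
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// change of base used above: log_b(x) = log2(x) * (1 / log2(b)). G_FLOG
// passes ln(2) and G_FLOG10 passes ln(2)/ln(10) as Log2BaseInverted. The
// helper name flogSketch is an illustrative assumption.
#include <cmath>
static double flogSketch(double X, double Log2BaseInverted) {
  return std::log2(X) * Log2BaseInverted;
}
// flogSketch(X, 0.69314718055994531)  ~= ln(X)
// flogSketch(X, 0.30102999566398120)  ~= log10(X)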
2768 
2769 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2770  MachineIRBuilder &B) const {
2771  Register Dst = MI.getOperand(0).getReg();
2772  Register Src = MI.getOperand(1).getReg();
2773  unsigned Flags = MI.getFlags();
2774  LLT Ty = B.getMRI()->getType(Dst);
2775 
2776  auto K = B.buildFConstant(Ty, numbers::log2e);
2777  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2778  B.buildFExp2(Dst, Mul, Flags);
2779  MI.eraseFromParent();
2780  return true;
2781 }
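
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// rewrite above: e^x == 2^(x * log2(e)). The helper name fexpSketch is an
// illustrative assumption.
#include <cmath>
static double fexpSketch(double X) {
  const double Log2E = 1.4426950408889634; // log2(e)
  return std::exp2(X * Log2E);
}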
2782 
2783 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2784  MachineIRBuilder &B) const {
2785  Register Dst = MI.getOperand(0).getReg();
2786  Register Src0 = MI.getOperand(1).getReg();
2787  Register Src1 = MI.getOperand(2).getReg();
2788  unsigned Flags = MI.getFlags();
2789  LLT Ty = B.getMRI()->getType(Dst);
2790  const LLT S16 = LLT::scalar(16);
2791  const LLT S32 = LLT::scalar(32);
2792 
2793  if (Ty == S32) {
2794  auto Log = B.buildFLog2(S32, Src0, Flags);
2795  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2796  .addUse(Log.getReg(0))
2797  .addUse(Src1)
2798  .setMIFlags(Flags);
2799  B.buildFExp2(Dst, Mul, Flags);
2800  } else if (Ty == S16) {
2801  // There's no f16 fmul_legacy, so we need to convert for it.
2802  auto Log = B.buildFLog2(S16, Src0, Flags);
2803  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2804  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2805  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2806  .addUse(Ext0.getReg(0))
2807  .addUse(Ext1.getReg(0))
2808  .setMIFlags(Flags);
2809 
2810  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2811  } else
2812  return false;
2813 
2814  MI.eraseFromParent();
2815  return true;
2816 }
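
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// pow expansion above: pow(x, y) == 2^(y * log2(x)), valid for x > 0 just
// like the legacy-multiply based sequence. The helper name fpowSketch is an
// illustrative assumption.
#include <cmath>
static float fpowSketch(float X, float Y) {
  return std::exp2(Y * std::log2(X));
}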
2817 
2818 // Find a source register, ignoring any possible source modifiers.
2819 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2820  Register ModSrc = OrigSrc;
2821  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2822  ModSrc = SrcFNeg->getOperand(1).getReg();
2823  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2824  ModSrc = SrcFAbs->getOperand(1).getReg();
2825  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2826  ModSrc = SrcFAbs->getOperand(1).getReg();
2827  return ModSrc;
2828 }
2829 
2830 bool AMDGPULegalizerInfo::legalizeFFloor(
2831  MachineInstr &MI, MachineRegisterInfo &MRI,
2832  MachineIRBuilder &B) const {
2833 
2834  const LLT S1 = LLT::scalar(1);
2835  const LLT S64 = LLT::scalar(64);
2836  Register Dst = MI.getOperand(0).getReg();
2837  Register OrigSrc = MI.getOperand(1).getReg();
2838  unsigned Flags = MI.getFlags();
2839  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2840  "this should not have been custom lowered");
2841 
2842  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2843  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2844  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2845  // V_FRACT bug is:
2846  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2847  //
2848  // Convert floor(x) to (x - fract(x))
2849 
2850  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2851  .addUse(OrigSrc)
2852  .setMIFlags(Flags);
2853 
2854  // Give source modifier matching some assistance before obscuring a foldable
2855  // pattern.
2856 
2857  // TODO: We can avoid the neg on the fract? The input sign to fract
2858  // shouldn't matter?
2859  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2860 
2861  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2862 
2863  Register Min = MRI.createGenericVirtualRegister(S64);
2864 
2865  // We don't need to concern ourselves with the snan handling difference, so
2866  // use the one which will directly select.
2867  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2868  if (MFI->getMode().IEEE)
2869  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2870  else
2871  B.buildFMinNum(Min, Fract, Const, Flags);
2872 
2873  Register CorrectedFract = Min;
2874  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2875  auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2876  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2877  }
2878 
2879  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2880  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2881 
2882  MI.eraseFromParent();
2883  return true;
2884 }
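
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// floor-from-fract workaround above, with the hardware fract op modeled via
// libm. The clamp constant is the largest double below 1.0 (the
// 0x3fefffffffffffff used above) and NaN inputs pass through. The helper
// name ffloorSketch is an illustrative assumption.
#include <cmath>
static double ffloorSketch(double X) {
  double Fract = X - std::floor(X);                       // models V_FRACT_F64
  double Clamped = std::fmin(Fract, 0x1.fffffffffffffp-1);
  double Corrected = std::isnan(X) ? X : Clamped;
  return X - Corrected;
}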
2885 
2886 // Turn an illegal packed v2s16 build vector into bit operations.
2887 // TODO: This should probably be a bitcast action in LegalizerHelper.
2888 bool AMDGPULegalizerInfo::legalizeBuildVector(
2889  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2890  Register Dst = MI.getOperand(0).getReg();
2891  const LLT S32 = LLT::scalar(32);
2892  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2893 
2894  Register Src0 = MI.getOperand(1).getReg();
2895  Register Src1 = MI.getOperand(2).getReg();
2896  assert(MRI.getType(Src0) == LLT::scalar(16));
2897 
2898  auto Merge = B.buildMerge(S32, {Src0, Src1});
2899  B.buildBitcast(Dst, Merge);
2900 
2901  MI.eraseFromParent();
2902  return true;
2903 }
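
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// packing performed by the merge + bitcast above: element 0 ends up in the
// low 16 bits of the 32-bit word. The helper name packV2S16Sketch is an
// illustrative assumption.
#include <cstdint>
static uint32_t packV2S16Sketch(uint16_t Elt0, uint16_t Elt1) {
  return uint32_t(Elt0) | (uint32_t(Elt1) << 16);
}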
2904 
2905 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2906 //
2907 // Source and accumulation registers must all be 32-bits.
2908 //
2909 // TODO: When the multiply is uniform, we should produce a code sequence
2910 // that is better suited to instruction selection on the SALU. Instead of
2911 // the outer loop going over parts of the result, the outer loop should go
2912 // over parts of one of the factors. This should result in instruction
2913 // selection that makes full use of S_ADDC_U32 instructions.
2914 void AMDGPULegalizerInfo::buildMultiply(
2915  LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
2916  ArrayRef<Register> Src0, ArrayRef<Register> Src1,
2917  bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2918  // Use (possibly empty) vectors of S1 registers to represent the set of
2919  // carries from one pair of positions to the next.
2920  using Carry = SmallVector<Register, 2>;
2921 
2922  MachineIRBuilder &B = Helper.MIRBuilder;
2923 
2924  const LLT S1 = LLT::scalar(1);
2925  const LLT S32 = LLT::scalar(32);
2926  const LLT S64 = LLT::scalar(64);
2927 
2928  Register Zero32;
2929  Register Zero64;
2930 
2931  auto getZero32 = [&]() -> Register {
2932  if (!Zero32)
2933  Zero32 = B.buildConstant(S32, 0).getReg(0);
2934  return Zero32;
2935  };
2936  auto getZero64 = [&]() -> Register {
2937  if (!Zero64)
2938  Zero64 = B.buildConstant(S64, 0).getReg(0);
2939  return Zero64;
2940  };
2941 
2942  // Merge the given carries into the 32-bit LocalAccum, which is modified
2943  // in-place.
2944  //
2945  // Returns the carry-out, which is a single S1 register or null.
2946  auto mergeCarry =
2947  [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2948  if (CarryIn.empty())
2949  return Register();
2950 
2951  bool HaveCarryOut = true;
2952  Register CarryAccum;
2953  if (CarryIn.size() == 1) {
2954  if (!LocalAccum) {
2955  LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2956  return Register();
2957  }
2958 
2959  CarryAccum = getZero32();
2960  } else {
2961  CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2962  for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2963  CarryAccum =
2964  B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2965  .getReg(0);
2966  }
2967 
2968  if (!LocalAccum) {
2969  LocalAccum = getZero32();
2970  HaveCarryOut = false;
2971  }
2972  }
2973 
2974  auto Add =
2975  B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2976  LocalAccum = Add.getReg(0);
2977  return HaveCarryOut ? Add.getReg(1) : Register();
2978  };
2979 
2980  // Build a multiply-add chain to compute
2981  //
2982  // LocalAccum + (partial products at DstIndex)
2983  // + (opportunistic subset of CarryIn)
2984  //
2985  // LocalAccum is an array of one or two 32-bit registers that are updated
2986  // in-place. The incoming registers may be null.
2987  //
2988  // In some edge cases, carry-ins can be consumed "for free". In that case,
2989  // the consumed carry bits are removed from CarryIn in-place.
2990  auto buildMadChain =
2991  [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2992  -> Carry {
2993  assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2994  (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2995 
2996  Carry CarryOut;
2997  unsigned j0 = 0;
2998 
2999  // Use plain 32-bit multiplication for the most significant part of the
3000  // result by default.
3001  if (LocalAccum.size() == 1 &&
3002  (!UsePartialMad64_32 || !CarryIn.empty())) {
3003  do {
3004  unsigned j1 = DstIndex - j0;
3005  auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3006  if (!LocalAccum[0]) {
3007  LocalAccum[0] = Mul.getReg(0);
3008  } else {
3009  if (CarryIn.empty()) {
3010  LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3011  } else {
3012  LocalAccum[0] =
3013  B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3014  .getReg(0);
3015  CarryIn.pop_back();
3016  }
3017  }
3018  ++j0;
3019  } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3020  }
3021 
3022  // Build full 64-bit multiplies.
3023  if (j0 <= DstIndex) {
3024  bool HaveSmallAccum = false;
3025  Register Tmp;
3026 
3027  if (LocalAccum[0]) {
3028  if (LocalAccum.size() == 1) {
3029  Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3030  HaveSmallAccum = true;
3031  } else if (LocalAccum[1]) {
3032  Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
3033  HaveSmallAccum = false;
3034  } else {
3035  Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3036  HaveSmallAccum = true;
3037  }
3038  } else {
3039  assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3040  Tmp = getZero64();
3041  HaveSmallAccum = true;
3042  }
3043 
3044  do {
3045  unsigned j1 = DstIndex - j0;
3046  auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3047  {Src0[j0], Src1[j1], Tmp});
3048  Tmp = Mad.getReg(0);
3049  if (!HaveSmallAccum)
3050  CarryOut.push_back(Mad.getReg(1));
3051  HaveSmallAccum = false;
3052  ++j0;
3053  } while (j0 <= DstIndex);
3054 
3055  auto Unmerge = B.buildUnmerge(S32, Tmp);
3056  LocalAccum[0] = Unmerge.getReg(0);
3057  if (LocalAccum.size() > 1)
3058  LocalAccum[1] = Unmerge.getReg(1);
3059  }
3060 
3061  return CarryOut;
3062  };
3063 
3064  // Outer multiply loop, iterating over destination parts from least
3065  // significant to most significant parts.
3066  //
3067  // The columns of the following diagram correspond to the destination parts
3068  // affected by one iteration of the outer loop (ignoring boundary
3069  // conditions).
3070  //
3071  // Dest index relative to 2 * i: 1 0 -1
3072  // ------
3073  // Carries from previous iteration: e o
3074  // Even-aligned partial product sum: E E .
3075  // Odd-aligned partial product sum: O O
3076  //
3077  // 'o' is OddCarry, 'e' is EvenCarry.
3078  // EE and OO are computed from partial products via buildMadChain and use
3079  // accumulation where possible and appropriate.
3080  //
3081  Register SeparateOddCarry;
3082  Carry EvenCarry;
3083  Carry OddCarry;
3084 
3085  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3086  Carry OddCarryIn = std::move(OddCarry);
3087  Carry EvenCarryIn = std::move(EvenCarry);
3088  OddCarry.clear();
3089  EvenCarry.clear();
3090 
3091  // Partial products at offset 2 * i.
3092  if (2 * i < Accum.size()) {
3093  auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3094  EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3095  }
3096 
3097  // Partial products at offset 2 * i - 1.
3098  if (i > 0) {
3099  if (!SeparateOddAlignedProducts) {
3100  auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3101  OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3102  } else {
3103  bool IsHighest = 2 * i >= Accum.size();
3104  Register SeparateOddOut[2];
3105  auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
3106  .take_front(IsHighest ? 1 : 2);
3107  OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3108 
3109  MachineInstr *Lo;
3110 
3111  if (i == 1) {
3112  if (!IsHighest)
3113  Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3114  else
3115  Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3116  } else {
3117  Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3118  SeparateOddCarry);
3119  }
3120  Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3121 
3122  if (!IsHighest) {
3123  auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3124  Lo->getOperand(1).getReg());
3125  Accum[2 * i] = Hi.getReg(0);
3126  SeparateOddCarry = Hi.getReg(1);
3127  }
3128  }
3129  }
3130 
3131  // Add in the carries from the previous iteration
3132  if (i > 0) {
3133  if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3134  EvenCarryIn.push_back(CarryOut);
3135 
3136  if (2 * i < Accum.size()) {
3137  if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3138  OddCarry.push_back(CarryOut);
3139  }
3140  }
3141  }
3142 }
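
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of the
// partial-product scheme above on plain host integers. Each 32x32->64
// multiply-add plays the role of one MAD_64_32; the legalizer keeps only the
// low Accum.size() limbs because G_MUL returns the low half of the product.
// The helper name mulLimbsSketch is an illustrative assumption.
#include <cstdint>
#include <vector>
static std::vector<uint32_t> mulLimbsSketch(const std::vector<uint32_t> &A,
                                            const std::vector<uint32_t> &B) {
  std::vector<uint32_t> Acc(A.size() + B.size(), 0);
  for (size_t i = 0; i < A.size(); ++i) {
    uint64_t Carry = 0;
    for (size_t j = 0; j < B.size(); ++j) {
      uint64_t Mad = uint64_t(A[i]) * B[j] + Acc[i + j] + Carry; // 64-bit MAD
      Acc[i + j] = uint32_t(Mad);
      Carry = Mad >> 32;
    }
    Acc[i + B.size()] = uint32_t(Carry);
  }
  return Acc; // limbs of the full product, least significant first
}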
3143 
3144 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3145 //
3146 // TODO: If the multiply is followed by an addition, we should attempt to
3147 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3148 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3149  MachineInstr &MI) const {
3150  assert(ST.hasMad64_32());
3151  assert(MI.getOpcode() == TargetOpcode::G_MUL);
3152 
3153  MachineIRBuilder &B = Helper.MIRBuilder;
3154  MachineRegisterInfo &MRI = *B.getMRI();
3155 
3156  Register DstReg = MI.getOperand(0).getReg();
3157  Register Src0 = MI.getOperand(1).getReg();
3158  Register Src1 = MI.getOperand(2).getReg();
3159 
3160  LLT Ty = MRI.getType(DstReg);
3161  assert(Ty.isScalar());
3162 
3163  unsigned Size = Ty.getSizeInBits();
3164  unsigned NumParts = Size / 32;
3165  assert((Size % 32) == 0);
3166  assert(NumParts >= 2);
3167 
3168  // Whether to use MAD_64_32 for partial products whose high half is
3169  // discarded. This avoids some ADD instructions but risks false dependency
3170  // stalls on some subtargets in some cases.
3171  const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3172 
3173  // Whether to compute odd-aligned partial products separately. This is
3174  // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3175  // in an even-aligned VGPR.
3176  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3177 
3178  LLT S32 = LLT::scalar(32);
3179  SmallVector<Register, 2> Src0Parts, Src1Parts;
3180  for (unsigned i = 0; i < NumParts; ++i) {
3181  Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3182  Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3183  }
3184  B.buildUnmerge(Src0Parts, Src0);
3185  B.buildUnmerge(Src1Parts, Src1);
3186 
3187  SmallVector<Register, 2> AccumRegs(NumParts);
3188  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3189  SeparateOddAlignedProducts);
3190 
3191  B.buildMerge(DstReg, AccumRegs);
3192  MI.eraseFromParent();
3193  return true;
3194 
3195 }
3196 
3197 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3198 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3199 // case with a single min instruction instead of a compare+select.
3200 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(
3201  MachineInstr &MI, MachineRegisterInfo &MRI,
3202  MachineIRBuilder &B) const {
3203  Register Dst = MI.getOperand(0).getReg();
3204  Register Src = MI.getOperand(1).getReg();
3205  LLT DstTy = MRI.getType(Dst);
3206  LLT SrcTy = MRI.getType(Src);
3207 
3208  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3209  ? AMDGPU::G_AMDGPU_FFBH_U32
3210  : AMDGPU::G_AMDGPU_FFBL_B32;
3211  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3212  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3213 
3214  MI.eraseFromParent();
3215  return true;
3216 }
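
// A minimal standalone sketch (not part of AMDGPULegalizerInfo.cpp) of why
// the single unsigned min above suffices: an ffbh/ffbl style count returns
// all-ones (0xffffffff) for a zero input, and umin with the bit width turns
// that into the value G_CTLZ/G_CTTZ expect. The helper name ctlzSketch is an
// illustrative assumption.
#include <cstdint>
static uint32_t ctlzSketch(uint32_t X) {
  uint32_t Ffbh = 0xffffffffu;          // hardware result for X == 0
  for (uint32_t i = 0; i < 32; ++i) {
    if (X & (0x80000000u >> i)) {
      Ffbh = i;
      break;
    }
  }
  return Ffbh < 32 ? Ffbh : 32;         // the trailing G_UMIN with 32
}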
3217 
3218 // Check that this is a G_XOR x, -1
3219 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3220  if (MI.getOpcode() != TargetOpcode::G_XOR)
3221  return false;
3222  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3223  return ConstVal && *ConstVal == -1;
3224 }
3225 
3226 // Return the use branch instruction, otherwise null if the usage is invalid.
3227 static MachineInstr *
3228 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3229  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3230  Register CondDef = MI.getOperand(0).getReg();
3231  if (!MRI.hasOneNonDBGUse(CondDef))
3232  return nullptr;
3233 
3234  MachineBasicBlock *Parent = MI.getParent();
3235  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3236 
3237  if (isNot(MRI, *UseMI)) {
3238  Register NegatedCond = UseMI->getOperand(0).getReg();
3239  if (!MRI.hasOneNonDBGUse(NegatedCond))
3240  return nullptr;
3241 
3242  // We're deleting the def of this value, so we need to remove it.
3243  eraseInstr(*UseMI, MRI);
3244 
3245  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3246  Negated = true;
3247  }
3248 
3249  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3250  return nullptr;
3251 
3252  // Make sure the cond br is followed by a G_BR, or is the last instruction.
3253  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3254  if (Next == Parent->end()) {
3255  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3256  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3257  return nullptr;
3258  UncondBrTarget = &*NextMBB;
3259  } else {
3260  if (Next->getOpcode() != AMDGPU::G_BR)
3261  return nullptr;
3262  Br = &*Next;
3263  UncondBrTarget = Br->getOperand(0).getMBB();
3264  }
3265 
3266  return UseMI;
3267 }
3268 
3269 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3270  const ArgDescriptor *Arg,
3271  const TargetRegisterClass *ArgRC,
3272  LLT ArgTy) const {
3273  MCRegister SrcReg = Arg->getRegister();
3274  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3275  assert(DstReg.isVirtual() && "Virtual register expected");
3276 
3277  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3278  *ArgRC, B.getDebugLoc(), ArgTy);
3279  if (Arg->isMasked()) {
3280  // TODO: Should we try to emit this once in the entry block?
3281  const LLT S32 = LLT::scalar(32);
3282  const unsigned Mask = Arg->getMask();
3283  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
3284 
3285  Register AndMaskSrc = LiveIn;
3286 
3287  // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3288  // 0.
3289  if (Shift != 0) {
3290  auto ShiftAmt = B.buildConstant(S32, Shift);
3291  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3292  }
3293 
3294  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3295  } else {
3296  B.buildCopy(DstReg, LiveIn);
3297  }
3298 
3299  return true;
3300 }
3301 
3302 bool AMDGPULegalizerInfo::loadInputValue(
3303  Register DstReg, MachineIRBuilder &B,
3304  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3305  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3306  const ArgDescriptor *Arg;
3307  const TargetRegisterClass *ArgRC;
3308  LLT ArgTy;
3309  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3310 
3311  if (!Arg) {
3312  if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3313  // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3314  // case the pointer argument may be missing and we use null.
3315  B.buildConstant(DstReg, 0);
3316  return true;
3317  }
3318 
3319  // It's undefined behavior if a function marked with the amdgpu-no-*
3320  // attributes uses the corresponding intrinsic.
3321  B.buildUndef(DstReg);
3322  return true;
3323  }
3324 
3325  if (!Arg->isRegister() || !Arg->getRegister().isValid())
3326  return false; // TODO: Handle these
3327  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3328 }
3329 
3330 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3331  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3332  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3333  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3334  return false;
3335 
3336  MI.eraseFromParent();
3337  return true;
3338 }
3339 
3340 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3341  int64_t C) {
3342  B.buildConstant(MI.getOperand(0).getReg(), C);
3343  MI.eraseFromParent();
3344  return true;
3345 }
3346 
3347 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3348  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3349  unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3350  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3351  if (MaxID == 0)
3352  return replaceWithConstant(B, MI, 0);
3353 
3354  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3355  const ArgDescriptor *Arg;
3356  const TargetRegisterClass *ArgRC;
3357  LLT ArgTy;
3358  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3359 
3360  Register DstReg = MI.getOperand(0).getReg();
3361  if (!Arg) {
3362  // It's undefined behavior if a function marked with the amdgpu-no-*
3363  // attributes uses the corresponding intrinsic.
3364  B.buildUndef(DstReg);
3365  MI.eraseFromParent();
3366  return true;
3367  }
3368 
3369  if (Arg->isMasked()) {
3370  // Don't bother inserting AssertZext for packed IDs since we're emitting the
3371  // masking operations anyway.
3372  //
3373  // TODO: We could assert the top bit is 0 for the source copy.
3374  if (!loadInputValue(DstReg, B, ArgType))
3375  return false;
3376  } else {
3377  Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3378  if (!loadInputValue(TmpReg, B, ArgType))
3379  return false;
3380  B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
3381  }
3382 
3383  MI.eraseFromParent();
3384  return true;
3385 }
3386 
3387 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3388                                                      int64_t Offset) const {
3389  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3390  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3391 
3392  // TODO: If we passed in the base kernel offset we could have a better
3393  // alignment than 4, but we don't really need it.
3394  if (!loadInputValue(KernArgReg, B,
3395                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3396     llvm_unreachable("failed to find kernarg segment ptr");
3397 
3398  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3399  // TODO: Should get nuw
3400  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3401 }
3402 
3403 /// Legalize a value that's loaded from kernel arguments. This is only used by
3404 /// legacy intrinsics.
3405 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3406                                                       MachineIRBuilder &B,
3407                                                       uint64_t Offset,
3408  Align Alignment) const {
3409  Register DstReg = MI.getOperand(0).getReg();
3410 
3411  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3412  "unexpected kernarg parameter type");
3413 
3414  Register Ptr = getKernargParameterPtr(B, Offset);
3415  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3416  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3417              MachineMemOperand::MODereferenceable |
3418                  MachineMemOperand::MOInvariant);
3419  MI.eraseFromParent();
3420  return true;
3421 }
3422 
3423 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3424                                        MachineRegisterInfo &MRI,
3425                                        MachineIRBuilder &B) const {
3426  Register Dst = MI.getOperand(0).getReg();
3427  LLT DstTy = MRI.getType(Dst);
3428  LLT S16 = LLT::scalar(16);
3429  LLT S32 = LLT::scalar(32);
3430  LLT S64 = LLT::scalar(64);
3431 
3432  if (DstTy == S16)
3433  return legalizeFDIV16(MI, MRI, B);
3434  if (DstTy == S32)
3435  return legalizeFDIV32(MI, MRI, B);
3436  if (DstTy == S64)
3437  return legalizeFDIV64(MI, MRI, B);
3438 
3439  return false;
3440 }
3441 
3442 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3443                                                         Register DstDivReg,
3444  Register DstRemReg,
3445  Register X,
3446  Register Y) const {
3447  const LLT S1 = LLT::scalar(1);
3448  const LLT S32 = LLT::scalar(32);
3449 
3450  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3451  // algorithm used here.
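  // In outline: approximate Z ~= 2^32 / Y with the hardware RCP on a float
  // conversion of Y, sharpen it with one Newton-Raphson step done in integer
  // arithmetic (Z += umulh(Z, -Y * Z)), then form Q = umulh(X, Z) and
  // R = X - Q * Y. Since Z slightly underestimates 2^32 / Y, the quotient can
  // come out low, so two conditional +1 / -Y correction steps follow.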
3452 
3453  // Initial estimate of inv(y).
3454  auto FloatY = B.buildUITOFP(S32, Y);
3455  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3456  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3457  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3458  auto Z = B.buildFPTOUI(S32, ScaledY);
3459 
3460  // One round of UNR.
3461  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3462  auto NegYZ = B.buildMul(S32, NegY, Z);
3463  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3464 
3465  // Quotient/remainder estimate.
3466  auto Q = B.buildUMulH(S32, X, Z);
3467  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3468 
3469  // First quotient/remainder refinement.
3470  auto One = B.buildConstant(S32, 1);
3471  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3472  if (DstDivReg)
3473  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3474  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3475 
3476  // Second quotient/remainder refinement.
3477  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3478  if (DstDivReg)
3479  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3480 
3481  if (DstRemReg)
3482  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3483 }
3484 
3485 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3486 //
3487 // Return lo, hi of result
3488 //
3489 // %cvt.lo = G_UITOFP Val.lo
3490 // %cvt.hi = G_UITOFP Val.hi
3491 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3492 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3493 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3494 // %mul2 = G_FMUL %mul1, 2**(-32)
3495 // %trunc = G_INTRINSIC_TRUNC %mul2
3496 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3497 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
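 // Note on the constants used below: 0x4f800000 is 2^32, 0x2f800000 is 2^-32
 // and 0xcf800000 is -(2^32) as IEEE-754 floats, while 0x5f7ffffc is just
 // below 2^64. The net effect is an estimate of 2^64 / Val, returned as
 // {lo, hi} 32-bit halves.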
3498 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3499  Register Val) {
3500  const LLT S32 = LLT::scalar(32);
3501  auto Unmerge = B.buildUnmerge(S32, Val);
3502 
3503  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3504  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3505 
3506  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3507  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3508 
3509  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3510  auto Mul1 =
3511  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3512 
3513  // 2**(-32)
3514  auto Mul2 =
3515  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3516  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3517 
3518  // -(2**32)
3519  auto Mad2 = B.buildFMAD(S32, Trunc,
3520  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3521 
3522  auto ResultLo = B.buildFPTOUI(S32, Mad2);
3523  auto ResultHi = B.buildFPTOUI(S32, Trunc);
3524 
3525  return {ResultLo.getReg(0), ResultHi.getReg(0)};
3526 }
3527 
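 // 64-bit unsigned division/remainder. emitReciprocalU64 supplies an initial
 // estimate of 2^64 / Denom; two Newton-Raphson refinement rounds are then
 // carried out on 32-bit halves using add/sub-with-carry, and, as in the
 // 32-bit path, the quotient/remainder estimates are fixed up with two
 // conditional correction steps.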
3528 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3529                                                         Register DstDivReg,
3530  Register DstRemReg,
3531  Register Numer,
3532  Register Denom) const {
3533  const LLT S32 = LLT::scalar(32);
3534  const LLT S64 = LLT::scalar(64);
3535  const LLT S1 = LLT::scalar(1);
3536  Register RcpLo, RcpHi;
3537 
3538  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3539 
3540  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3541 
3542  auto Zero64 = B.buildConstant(S64, 0);
3543  auto NegDenom = B.buildSub(S64, Zero64, Denom);
3544 
3545  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3546  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3547 
3548  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3549  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3550  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3551 
3552  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3553  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3554  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3555 
3556  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3557  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3558  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3559  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3560  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3561 
3562  auto Zero32 = B.buildConstant(S32, 0);
3563  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3564  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3565  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3566 
3567  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3568  Register NumerLo = UnmergeNumer.getReg(0);
3569  Register NumerHi = UnmergeNumer.getReg(1);
3570 
3571  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3572  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3573  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3574  Register Mul3_Lo = UnmergeMul3.getReg(0);
3575  Register Mul3_Hi = UnmergeMul3.getReg(1);
3576  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3577  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3578  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3579  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3580 
3581  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3582  Register DenomLo = UnmergeDenom.getReg(0);
3583  Register DenomHi = UnmergeDenom.getReg(1);
3584 
3585  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3586  auto C1 = B.buildSExt(S32, CmpHi);
3587 
3588  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3589  auto C2 = B.buildSExt(S32, CmpLo);
3590 
3591  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3592  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3593 
3594  // TODO: Here and below portions of the code can be enclosed into if/endif.
3595  // Currently control flow is unconditional and we have 4 selects after
3596  // potential endif to substitute PHIs.
3597 
3598  // if C3 != 0 ...
3599  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3600  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3601  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3602  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3603 
3604  auto One64 = B.buildConstant(S64, 1);
3605  auto Add3 = B.buildAdd(S64, MulHi3, One64);
3606 
3607  auto C4 =
3608  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3609  auto C5 =
3610  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3611  auto C6 = B.buildSelect(
3612  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3613 
3614  // if (C6 != 0)
3615  auto Add4 = B.buildAdd(S64, Add3, One64);
3616  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3617 
3618  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3619  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3620  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3621 
3622  // endif C6
3623  // endif C3
3624 
3625  if (DstDivReg) {
3626  auto Sel1 = B.buildSelect(
3627  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3628  B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3629  Sel1, MulHi3);
3630  }
3631 
3632  if (DstRemReg) {
3633  auto Sel2 = B.buildSelect(
3634  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3635  B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3636  Sel2, Sub1);
3637  }
3638 }
3639 
3640 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3641                                                   MachineRegisterInfo &MRI,
3642                                                   MachineIRBuilder &B) const {
3643  Register DstDivReg, DstRemReg;
3644  switch (MI.getOpcode()) {
3645  default:
3646  llvm_unreachable("Unexpected opcode!");
3647  case AMDGPU::G_UDIV: {
3648  DstDivReg = MI.getOperand(0).getReg();
3649  break;
3650  }
3651  case AMDGPU::G_UREM: {
3652  DstRemReg = MI.getOperand(0).getReg();
3653  break;
3654  }
3655  case AMDGPU::G_UDIVREM: {
3656  DstDivReg = MI.getOperand(0).getReg();
3657  DstRemReg = MI.getOperand(1).getReg();
3658  break;
3659  }
3660  }
3661 
3662  const LLT S64 = LLT::scalar(64);
3663  const LLT S32 = LLT::scalar(32);
3664  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3665  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3666  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3667  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3668 
3669  if (Ty == S32)
3670  legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3671  else if (Ty == S64)
3672  legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3673  else
3674  return false;
3675 
3676  MI.eraseFromParent();
3677  return true;
3678 }
3679 
3680 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3681                                                 MachineRegisterInfo &MRI,
3682                                                 MachineIRBuilder &B) const {
3683  const LLT S64 = LLT::scalar(64);
3684  const LLT S32 = LLT::scalar(32);
3685 
3686  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3687  if (Ty != S32 && Ty != S64)
3688  return false;
3689 
3690  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3691  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3692  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3693 
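  // Take absolute values via the (x + sign) ^ sign identity, where
  // sign = x >> (bits - 1) is all-ones for negative x and zero otherwise. The
  // same sign values are reused below to restore the result signs: the
  // quotient gets sign(LHS) ^ sign(RHS) and the remainder gets sign(LHS).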
3694  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3695  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3696  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3697 
3698  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3699  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3700 
3701  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3702  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3703 
3704  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3705  switch (MI.getOpcode()) {
3706  default:
3707  llvm_unreachable("Unexpected opcode!");
3708  case AMDGPU::G_SDIV: {
3709  DstDivReg = MI.getOperand(0).getReg();
3710  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3711  break;
3712  }
3713  case AMDGPU::G_SREM: {
3714  DstRemReg = MI.getOperand(0).getReg();
3715  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3716  break;
3717  }
3718  case AMDGPU::G_SDIVREM: {
3719  DstDivReg = MI.getOperand(0).getReg();
3720  DstRemReg = MI.getOperand(1).getReg();
3721  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3722  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3723  break;
3724  }
3725  }
3726 
3727  if (Ty == S32)
3728  legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3729  else
3730  legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3731 
3732  if (DstDivReg) {
3733  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3734  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3735  B.buildSub(DstDivReg, SignXor, Sign);
3736  }
3737 
3738  if (DstRemReg) {
3739  auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3740  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3741  B.buildSub(DstRemReg, SignXor, Sign);
3742  }
3743 
3744  MI.eraseFromParent();
3745  return true;
3746 }
3747 
3748 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3749                                                  MachineRegisterInfo &MRI,
3750                                                  MachineIRBuilder &B) const {
3751  Register Res = MI.getOperand(0).getReg();
3752  Register LHS = MI.getOperand(1).getReg();
3753  Register RHS = MI.getOperand(2).getReg();
3754  uint16_t Flags = MI.getFlags();
3755  LLT ResTy = MRI.getType(Res);
3756 
3757  const MachineFunction &MF = B.getMF();
3758  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3759  MI.getFlag(MachineInstr::FmAfn);
3760 
3761  if (!AllowInaccurateRcp)
3762  return false;
3763 
3764  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3765  // 1 / x -> RCP(x)
3766  if (CLHS->isExactlyValue(1.0)) {
3767  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3768  .addUse(RHS)
3769  .setMIFlags(Flags);
3770 
3771  MI.eraseFromParent();
3772  return true;
3773  }
3774 
3775  // -1 / x -> RCP( FNEG(x) )
3776  if (CLHS->isExactlyValue(-1.0)) {
3777  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3778  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3779  .addUse(FNeg.getReg(0))
3780  .setMIFlags(Flags);
3781 
3782  MI.eraseFromParent();
3783  return true;
3784  }
3785  }
3786 
3787  // x / y -> x * (1.0 / y)
3788  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3789  .addUse(RHS)
3790  .setMIFlags(Flags);
3791  B.buildFMul(Res, LHS, RCP, Flags);
3792 
3793  MI.eraseFromParent();
3794  return true;
3795 }
3796 
3797 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3798                                                    MachineRegisterInfo &MRI,
3799                                                    MachineIRBuilder &B) const {
3800  Register Res = MI.getOperand(0).getReg();
3801  Register X = MI.getOperand(1).getReg();
3802  Register Y = MI.getOperand(2).getReg();
3803  uint16_t Flags = MI.getFlags();
3804  LLT ResTy = MRI.getType(Res);
3805 
3806  const MachineFunction &MF = B.getMF();
3807  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3808  MI.getFlag(MachineInstr::FmAfn);
3809 
3810  if (!AllowInaccurateRcp)
3811  return false;
3812 
3813  auto NegY = B.buildFNeg(ResTy, Y);
3814  auto One = B.buildFConstant(ResTy, 1.0);
3815 
3816  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3817  .addUse(Y)
3818  .setMIFlags(Flags);
3819 
3820  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3821  R = B.buildFMA(ResTy, Tmp0, R, R);
3822 
3823  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3824  R = B.buildFMA(ResTy, Tmp1, R, R);
3825 
3826  auto Ret = B.buildFMul(ResTy, X, R);
3827  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3828 
3829  B.buildFMA(Res, Tmp2, R, Ret);
3830  MI.eraseFromParent();
3831  return true;
3832 }
3833 
3834 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3835                                          MachineRegisterInfo &MRI,
3836                                          MachineIRBuilder &B) const {
3837  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3838  return true;
3839 
3840  Register Res = MI.getOperand(0).getReg();
3841  Register LHS = MI.getOperand(1).getReg();
3842  Register RHS = MI.getOperand(2).getReg();
3843 
3844  uint16_t Flags = MI.getFlags();
3845 
3846  LLT S16 = LLT::scalar(16);
3847  LLT S32 = LLT::scalar(32);
3848 
3849  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3850  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3851 
3852  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3853  .addUse(RHSExt.getReg(0))
3854  .setMIFlags(Flags);
3855 
3856  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3857  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3858 
3859  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3860  .addUse(RDst.getReg(0))
3861  .addUse(RHS)
3862  .addUse(LHS)
3863  .setMIFlags(Flags);
3864 
3865  MI.eraseFromParent();
3866  return true;
3867 }
3868 
3869 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3870 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
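 // On subtargets with S_DENORM_MODE the 4-bit immediate packs the FP32 control
 // in bits [1:0] and the FP64/FP16 control in bits [3:2] (hence the << 2
 // below); otherwise S_SETREG_IMM32_B32 rewrites only the FP32 denorm bits of
 // the MODE hardware register.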
3871 static void toggleSPDenormMode(bool Enable,
3872                                MachineIRBuilder &B,
3873                                const GCNSubtarget &ST,
3874  AMDGPU::SIModeRegisterDefaults Mode) {
3875  // Set SP denorm mode to this value.
3876  unsigned SPDenormMode =
3877  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3878 
3879  if (ST.hasDenormModeInst()) {
3880  // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3881  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3882 
3883  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3884  B.buildInstr(AMDGPU::S_DENORM_MODE)
3885  .addImm(NewDenormModeValue);
3886 
3887  } else {
3888  // Select FP32 bit field in mode register.
3889  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3890                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3891                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3892 
3893  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3894  .addImm(SPDenormMode)
3895  .addImm(SPDenormModeBitField);
3896  }
3897 }
3898 
3899 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3900                                          MachineRegisterInfo &MRI,
3901                                          MachineIRBuilder &B) const {
3902  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3903  return true;
3904 
3905  Register Res = MI.getOperand(0).getReg();
3906  Register LHS = MI.getOperand(1).getReg();
3907  Register RHS = MI.getOperand(2).getReg();
3908  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3909  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3910 
3911  uint16_t Flags = MI.getFlags();
3912 
3913  LLT S32 = LLT::scalar(32);
3914  LLT S1 = LLT::scalar(1);
3915 
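  // Sequence: div_scale both operands, refine an rcp of the scaled denominator
  // with two FMAs, form the quotient plus two more correction FMAs, feed the
  // pieces to div_fmas (which also consumes the scale's condition output), and
  // finally div_fixup undoes the scaling and handles the special cases.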
3916  auto One = B.buildFConstant(S32, 1.0f);
3917 
3918  auto DenominatorScaled =
3919  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3920  .addUse(LHS)
3921  .addUse(RHS)
3922  .addImm(0)
3923  .setMIFlags(Flags);
3924  auto NumeratorScaled =
3925  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3926  .addUse(LHS)
3927  .addUse(RHS)
3928  .addImm(1)
3929  .setMIFlags(Flags);
3930 
3931  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3932  .addUse(DenominatorScaled.getReg(0))
3933  .setMIFlags(Flags);
3934  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3935 
3936  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3937  // aren't modeled as reading it.
3938  if (!Mode.allFP32Denormals())
3939  toggleSPDenormMode(true, B, ST, Mode);
3940 
3941  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3942  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3943  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3944  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3945  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3946  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3947 
3948  if (!Mode.allFP32Denormals())
3949  toggleSPDenormMode(false, B, ST, Mode);
3950 
3951  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3952  .addUse(Fma4.getReg(0))
3953  .addUse(Fma1.getReg(0))
3954  .addUse(Fma3.getReg(0))
3955  .addUse(NumeratorScaled.getReg(1))
3956  .setMIFlags(Flags);
3957 
3958  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3959  .addUse(Fmas.getReg(0))
3960  .addUse(RHS)
3961  .addUse(LHS)
3962  .setMIFlags(Flags);
3963 
3964  MI.eraseFromParent();
3965  return true;
3966 }
3967 
3968 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3969                                          MachineRegisterInfo &MRI,
3970                                          MachineIRBuilder &B) const {
3971  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3972     return true;
3973 
3974  Register Res = MI.getOperand(0).getReg();
3975  Register LHS = MI.getOperand(1).getReg();
3976  Register RHS = MI.getOperand(2).getReg();
3977 
3978  uint16_t Flags = MI.getFlags();
3979 
3980  LLT S64 = LLT::scalar(64);
3981  LLT S1 = LLT::scalar(1);
3982 
3983  auto One = B.buildFConstant(S64, 1.0);
3984 
3985  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3986  .addUse(LHS)
3987  .addUse(RHS)
3988  .addImm(0)
3989  .setMIFlags(Flags);
3990 
3991  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3992 
3993  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3994  .addUse(DivScale0.getReg(0))
3995  .setMIFlags(Flags);
3996 
3997  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3998  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3999  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4000 
4001  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4002  .addUse(LHS)
4003  .addUse(RHS)
4004  .addImm(1)
4005  .setMIFlags(Flags);
4006 
4007  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4008  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4009  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4010 
4011  Register Scale;
4012  if (!ST.hasUsableDivScaleConditionOutput()) {
4013  // Workaround a hardware bug on SI where the condition output from div_scale
4014  // is not usable.
4015 
4016  LLT S32 = LLT::scalar(32);
4017 
4018  auto NumUnmerge = B.buildUnmerge(S32, LHS);
4019  auto DenUnmerge = B.buildUnmerge(S32, RHS);
4020  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4021  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4022 
4023  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4024  Scale1Unmerge.getReg(1));
4025  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4026  Scale0Unmerge.getReg(1));
4027  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4028  } else {
4029  Scale = DivScale1.getReg(1);
4030  }
4031 
4032  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4033  .addUse(Fma4.getReg(0))
4034  .addUse(Fma3.getReg(0))
4035  .addUse(Mul.getReg(0))
4036  .addUse(Scale)
4037  .setMIFlags(Flags);
4038 
4039  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
4040  .addUse(Fmas.getReg(0))
4041  .addUse(RHS)
4042  .addUse(LHS)
4043  .setMIFlags(Flags);
4044 
4045  MI.eraseFromParent();
4046  return true;
4047 }
4048 
4049 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4050                                                  MachineRegisterInfo &MRI,
4051                                                  MachineIRBuilder &B) const {
4052  Register Res = MI.getOperand(0).getReg();
4053  Register LHS = MI.getOperand(2).getReg();
4054  Register RHS = MI.getOperand(3).getReg();
4055  uint16_t Flags = MI.getFlags();
4056 
4057  LLT S32 = LLT::scalar(32);
4058  LLT S1 = LLT::scalar(1);
4059 
4060  auto Abs = B.buildFAbs(S32, RHS, Flags);
4061  const APFloat C0Val(1.0f);
4062 
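  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as IEEE-754 floats. If |RHS|
  // exceeds 2^96 the denominator is pre-scaled by 2^-32 so that the rcp does
  // not flush to zero, and the final product is multiplied by the same scale
  // to compensate.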
4063  auto C0 = B.buildConstant(S32, 0x6f800000);
4064  auto C1 = B.buildConstant(S32, 0x2f800000);
4065  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4066 
4067  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4068  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4069 
4070  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4071 
4072  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4073  .addUse(Mul0.getReg(0))
4074  .setMIFlags(Flags);
4075 
4076  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4077 
4078  B.buildFMul(Res, Sel, Mul1, Flags);
4079 
4080  MI.eraseFromParent();
4081  return true;
4082 }
4083 
4084 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4085 // FIXME: Why do we handle this one but not other removed instructions?
4086 //
4087 // Reciprocal square root. The clamp prevents infinite results, clamping
4088 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
4089 // +-max_float.
4090 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4091                                                     MachineRegisterInfo &MRI,
4092                                                     MachineIRBuilder &B) const {
4094  return true;
4095 
4096  Register Dst = MI.getOperand(0).getReg();
4097  Register Src = MI.getOperand(2).getReg();
4098  auto Flags = MI.getFlags();
4099 
4100  LLT Ty = MRI.getType(Dst);
4101 
4102  const fltSemantics *FltSemantics;
4103  if (Ty == LLT::scalar(32))
4104  FltSemantics = &APFloat::IEEEsingle();
4105  else if (Ty == LLT::scalar(64))
4106  FltSemantics = &APFloat::IEEEdouble();
4107  else
4108  return false;
4109 
4110  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4111  .addUse(Src)
4112  .setMIFlags(Flags);
4113 
4114  // We don't need to concern ourselves with the snan handling difference, since
4115  // the rsq has already quieted it (or not); use whichever min/max form selects directly.
4116  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4117  const bool UseIEEE = MFI->getMode().IEEE;
4118 
4119  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4120  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4121  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4122 
4123  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4124 
4125  if (UseIEEE)
4126  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4127  else
4128  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4129  MI.eraseFromParent();
4130  return true;
4131 }
4132 
4133 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4134  switch (IID) {
4135  case Intrinsic::amdgcn_ds_fadd:
4136  return AMDGPU::G_ATOMICRMW_FADD;
4137  case Intrinsic::amdgcn_ds_fmin:
4138  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4139  case Intrinsic::amdgcn_ds_fmax:
4140  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4141  default:
4142  llvm_unreachable("not a DS FP intrinsic");
4143  }
4144 }
4145 
4146 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4147                                                       MachineInstr &MI,
4148  Intrinsic::ID IID) const {
4149  GISelChangeObserver &Observer = Helper.Observer;
4150  Observer.changingInstr(MI);
4151 
4152  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4153 
4154  // The remaining operands were used to set fields in the MemOperand on
4155  // construction.
4156  for (int I = 6; I > 3; --I)
4157  MI.removeOperand(I);
4158 
4159  MI.removeOperand(1); // Remove the intrinsic ID.
4160  Observer.changedInstr(MI);
4161  return true;
4162 }
4163 
4164 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4165                                             MachineRegisterInfo &MRI,
4166                                             MachineIRBuilder &B) const {
4167  uint64_t Offset =
4168      ST.getTargetLowering()->getImplicitParameterOffset(
4169          B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4170  LLT DstTy = MRI.getType(DstReg);
4171  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4172 
4173  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4174  if (!loadInputValue(KernargPtrReg, B,
4175                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4176     return false;
4177 
4178  // FIXME: This should be nuw
4179  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4180  return true;
4181 }
4182 
4183 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4184                                                  MachineRegisterInfo &MRI,
4185                                                  MachineIRBuilder &B) const {
4186  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4187  if (!MFI->isEntryFunction()) {
4188     return legalizePreloadedArgIntrin(MI, MRI, B,
4189                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4190   }
4191 
4192  Register DstReg = MI.getOperand(0).getReg();
4193  if (!getImplicitArgPtr(DstReg, MRI, B))
4194  return false;
4195 
4196  MI.eraseFromParent();
4197  return true;
4198 }
4199 
4200 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4201                                          MachineRegisterInfo &MRI,
4202                                          MachineIRBuilder &B) const {
4203  Function &F = B.getMF().getFunction();
4204  Optional<uint32_t> KnownSize =
4205      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4206   if (KnownSize.has_value())
4207  B.buildConstant(DstReg, KnownSize.value());
4208  return false;
4209 }
4210 
4211 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4212                                               MachineRegisterInfo &MRI,
4213                                               MachineIRBuilder &B) const {
4214 
4215  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4216  if (!MFI->isEntryFunction()) {
4217     return legalizePreloadedArgIntrin(MI, MRI, B,
4218                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4219   }
4220 
4221  Register DstReg = MI.getOperand(0).getReg();
4222  if (!getLDSKernelId(DstReg, MRI, B))
4223  return false;
4224 
4225  MI.eraseFromParent();
4226  return true;
4227 }
4228 
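 // A flat pointer lies in the queried segment exactly when its high 32 bits
 // match that segment's aperture base, so compare the high half of the pointer
 // against the aperture register.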
4229 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4230                                               MachineRegisterInfo &MRI,
4231                                               MachineIRBuilder &B,
4232                                               unsigned AddrSpace) const {
4233  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4234  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4235  Register Hi32 = Unmerge.getReg(1);
4236 
4237  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4238  MI.eraseFromParent();
4239  return true;
4240 }
4241 
4242 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4243 // offset (the offset that is included in bounds checking and swizzling, to be
4244 // split between the instruction's voffset and immoffset fields) and soffset
4245 // (the offset that is excluded from bounds checking and swizzling, to go in
4246 // the instruction's soffset field). This function takes the first kind of
4247 // offset and figures out how to split it between voffset and immoffset.
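 // For example, a combined offset of 8200 comes back as ImmOffset = 8 with
 // 8192 folded into the voffset register, keeping the vgpr part a multiple of
 // 4096 so it stands a better chance of being CSEd across nearby accesses.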
4248 std::pair<Register, unsigned>
4249 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4250                                         Register OrigOffset) const {
4251  const unsigned MaxImm = 4095;
4252  Register BaseReg;
4253  unsigned ImmOffset;
4254  const LLT S32 = LLT::scalar(32);
4255  MachineRegisterInfo &MRI = *B.getMRI();
4256 
4257  std::tie(BaseReg, ImmOffset) =
4258      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4259 
4260  // If BaseReg is a pointer, convert it to int.
4261  if (MRI.getType(BaseReg).isPointer())
4262  BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4263 
4264  // If the immediate value is too big for the immoffset field, put the value
4265  // and -4096 into the immoffset field so that the value that is copied/added
4266  // for the voffset field is a multiple of 4096, and it stands more chance
4267  // of being CSEd with the copy/add for another similar load/store.
4268  // However, do not do that rounding down to a multiple of 4096 if that is a
4269  // negative number, as it appears to be illegal to have a negative offset
4270  // in the vgpr, even if adding the immediate offset makes it positive.
4271  unsigned Overflow = ImmOffset & ~MaxImm;
4272  ImmOffset -= Overflow;
4273  if ((int32_t)Overflow < 0) {
4274  Overflow += ImmOffset;
4275  ImmOffset = 0;
4276  }
4277 
4278  if (Overflow != 0) {
4279  if (!BaseReg) {
4280  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4281  } else {
4282  auto OverflowVal = B.buildConstant(S32, Overflow);
4283  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4284  }
4285  }
4286 
4287  if (!BaseReg)
4288  BaseReg = B.buildConstant(S32, 0).getReg(0);
4289 
4290  return std::make_pair(BaseReg, ImmOffset);
4291 }
4292 
4293 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4294 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4295                                           Register VOffset, Register SOffset,
4296  unsigned ImmOffset, Register VIndex,
4297  MachineRegisterInfo &MRI) const {
4298  Optional<ValueAndVReg> MaybeVOffsetVal =
4299      getIConstantVRegValWithLookThrough(VOffset, MRI);
4300  Optional<ValueAndVReg> MaybeSOffsetVal =
4301      getIConstantVRegValWithLookThrough(SOffset, MRI);
4302  Optional<ValueAndVReg> MaybeVIndexVal =
4303      getIConstantVRegValWithLookThrough(VIndex, MRI);
4304  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4305  // update the MMO with that offset. The stride is unknown so we can only do
4306  // this if VIndex is constant 0.
4307  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4308  MaybeVIndexVal->Value == 0) {
4309  uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4310  MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4311  MMO->setOffset(TotalOffset);
4312  } else {
4313  // We don't have a constant combined offset to use in the MMO. Give up.
4314  MMO->setValue((Value *)nullptr);
4315  }
4316 }
4317 
4318 /// Handle register layout difference for f16 images for some subtargets.
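 // On subtargets with unpacked D16 each 16-bit element lives in the low half
 // of its own 32-bit register, so the vector is widened element-wise with
 // anyext. Subtargets with the image-store D16 bug instead need the packed
 // data padded out with undef to a full dword count, and <3 x s16> is padded
 // to <4 x s16> in the ordinary packed case.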
4319 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4320                                              MachineRegisterInfo &MRI,
4321                                              Register Reg,
4322  bool ImageStore) const {
4323  const LLT S16 = LLT::scalar(16);
4324  const LLT S32 = LLT::scalar(32);
4325  LLT StoreVT = MRI.getType(Reg);
4326  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4327 
4328  if (ST.hasUnpackedD16VMem()) {
4329  auto Unmerge = B.buildUnmerge(S16, Reg);
4330 
4331  SmallVector<Register, 4> WideRegs;
4332  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4333  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4334 
4335  int NumElts = StoreVT.getNumElements();
4336 
4337  return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4338  .getReg(0);
4339  }
4340 
4341  if (ImageStore && ST.hasImageStoreD16Bug()) {
4342  if (StoreVT.getNumElements() == 2) {
4343  SmallVector<Register, 4> PackedRegs;
4344  Reg = B.buildBitcast(S32, Reg).getReg(0);
4345  PackedRegs.push_back(Reg);
4346  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4347  return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4348  .getReg(0);
4349  }
4350 
4351  if (StoreVT.getNumElements() == 3) {
4352  SmallVector<Register, 4> PackedRegs;
4353  auto Unmerge = B.buildUnmerge(S16, Reg);
4354  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4355  PackedRegs.push_back(Unmerge.getReg(I));
4356  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4357  Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4358  return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4359  }
4360 
4361  if (StoreVT.getNumElements() == 4) {
4362  SmallVector<Register, 4> PackedRegs;
4363  Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4364  auto Unmerge = B.buildUnmerge(S32, Reg);
4365  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4366  PackedRegs.push_back(Unmerge.getReg(I));
4367  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4368  return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4369  .getReg(0);
4370  }
4371 
4372  llvm_unreachable("invalid data type");
4373  }
4374 
4375  if (StoreVT == LLT::fixed_vector(3, S16)) {
4376  Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4377  .getReg(0);
4378  }
4379  return Reg;
4380 }
4381 
4382 Register AMDGPULegalizerInfo::fixStoreSourceType(
4383     MachineIRBuilder &B, Register VData, bool IsFormat) const {
4384  MachineRegisterInfo *MRI = B.getMRI();
4385  LLT Ty = MRI->getType(VData);
4386 
4387  const LLT S16 = LLT::scalar(16);
4388 
4389  // Fixup illegal register types for i8 stores.
4390  if (Ty == LLT::scalar(8) || Ty == S16) {
4391  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4392  return AnyExt;
4393  }
4394 
4395  if (Ty.isVector()) {
4396  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4397  if (IsFormat)
4398  return handleD16VData(B, *MRI, VData);
4399  }
4400  }
4401 
4402  return VData;
4403 }
4404 
4405 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4406                                               MachineRegisterInfo &MRI,
4407                                               MachineIRBuilder &B,
4408                                               bool IsTyped,
4409  bool IsFormat) const {
4410  Register VData = MI.getOperand(1).getReg();
4411  LLT Ty = MRI.getType(VData);
4412  LLT EltTy = Ty.getScalarType();
4413  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4414  const LLT S32 = LLT::scalar(32);
4415 
4416  VData = fixStoreSourceType(B, VData, IsFormat);
4417  Register RSrc = MI.getOperand(2).getReg();
4418 
4419  MachineMemOperand *MMO = *MI.memoperands_begin();
4420  const int MemSize = MMO->getSize();
4421 
4422  unsigned ImmOffset;
4423 
4424  // The typed intrinsics add an immediate after the registers.
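  // Operand layout after the intrinsic ID: vdata, rsrc, [vindex for the struct
  // variants], voffset, soffset, [format for the typed variants], then the
  // aux/cachepolicy immediate; OpOffset tracks the optional operands.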
4425  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4426 
4427  // The struct intrinsic variants add one additional operand over raw.
4428  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4429  Register VIndex;
4430  int OpOffset = 0;
4431  if (HasVIndex) {
4432  VIndex = MI.getOperand(3).getReg();
4433  OpOffset = 1;
4434  } else {
4435  VIndex = B.buildConstant(S32, 0).getReg(0);
4436  }
4437 
4438  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4439  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4440 
4441  unsigned Format = 0;
4442  if (IsTyped) {
4443  Format = MI.getOperand(5 + OpOffset).getImm();
4444  ++OpOffset;
4445  }
4446 
4447  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4448 
4449  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4450  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4451 
4452  unsigned Opc;
4453  if (IsTyped) {
4454  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4455  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4456  } else if (IsFormat) {
4457  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4458  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4459  } else {
4460  switch (MemSize) {
4461  case 1:
4462  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4463  break;
4464  case 2:
4465  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4466  break;
4467  default:
4468  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4469  break;
4470  }
4471  }
4472 
4473  auto MIB = B.buildInstr(Opc)
4474  .addUse(VData) // vdata
4475  .addUse(RSrc) // rsrc
4476  .addUse(VIndex) // vindex
4477  .addUse(VOffset) // voffset
4478  .addUse(SOffset) // soffset
4479  .addImm(ImmOffset); // offset(imm)
4480 
4481  if (IsTyped)
4482  MIB.addImm(Format);
4483 
4484  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4485  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4486  .addMemOperand(MMO);
4487 
4488  MI.eraseFromParent();
4489  return true;
4490 }
4491 
4492 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4493                                              MachineRegisterInfo &MRI,
4494                                              MachineIRBuilder &B,
4495                                              bool IsFormat,
4496  bool IsTyped) const {
4497  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4498  MachineMemOperand *MMO = *MI.memoperands_begin();
4499  const LLT MemTy = MMO->getMemoryType();
4500  const LLT S32 = LLT::scalar(32);
4501 
4502  Register Dst = MI.getOperand(0).getReg();
4503  Register RSrc = MI.getOperand(2).getReg();
4504 
4505  // The typed intrinsics add an immediate after the registers.
4506  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4507 
4508  // The struct intrinsic variants add one additional operand over raw.
4509  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4510  Register VIndex;
4511  int OpOffset = 0;
4512  if (HasVIndex) {
4513  VIndex = MI.getOperand(3).getReg();
4514  OpOffset = 1;
4515  } else {
4516  VIndex = B.buildConstant(S32, 0).getReg(0);
4517  }
4518 
4519  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4520  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4521 
4522  unsigned Format = 0;
4523  if (IsTyped) {
4524  Format = MI.getOperand(5 + OpOffset).getImm();
4525  ++OpOffset;
4526  }
4527 
4528  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4529  unsigned ImmOffset;
4530 
4531  LLT Ty = MRI.getType(Dst);
4532  LLT EltTy = Ty.getScalarType();
4533  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4534  const bool Unpacked = ST.hasUnpackedD16VMem();
4535 
4536  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4537  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4538 
4539  unsigned Opc;
4540 
4541  if (IsTyped) {
4542  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4543  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4544  } else if (IsFormat) {
4545  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4546  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4547  } else {
4548  switch (MemTy.getSizeInBits()) {
4549  case 8:
4550  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4551  break;
4552  case 16:
4553  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4554  break;
4555  default:
4556  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4557  break;
4558  }
4559  }
4560 
4561  Register LoadDstReg;
4562 
4563  bool IsExtLoad =
4564  (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4565  LLT UnpackedTy = Ty.changeElementSize(32);
4566 
4567  if (IsExtLoad)
4568  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4569  else if (Unpacked && IsD16 && Ty.isVector())
4570  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4571  else
4572  LoadDstReg = Dst;
4573 
4574  auto MIB = B.buildInstr(Opc)
4575  .addDef(LoadDstReg) // vdata
4576  .addUse(RSrc) // rsrc
4577  .addUse(VIndex) // vindex
4578  .addUse(VOffset) // voffset
4579  .addUse(SOffset) // soffset
4580  .addImm(ImmOffset); // offset(imm)
4581 
4582  if (IsTyped)
4583  MIB.addImm(Format);
4584 
4585  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4586  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4587  .addMemOperand(MMO);
4588 
4589  if (LoadDstReg != Dst) {
4590  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4591 
4592     // The result was widened for an extending load; truncate it back down.
4593  if (IsExtLoad)
4594  B.buildTrunc(Dst, LoadDstReg);
4595  else {
4596  // Repack to original 16-bit vector result
4597  // FIXME: G_TRUNC should work, but legalization currently fails
4598  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4599  SmallVector<Register, 4> Repack;
4600  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4601  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4602  B.buildMerge(Dst, Repack);
4603  }
4604  }
4605 
4606  MI.eraseFromParent();
4607  return true;
4608 }
4609 
4610 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4611                                                MachineIRBuilder &B,
4612                                                bool IsInc) const {
4613  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4614  AMDGPU::G_AMDGPU_ATOMIC_DEC;
4615  B.buildInstr(Opc)
4616  .addDef(MI.getOperand(0).getReg())
4617  .addUse(MI.getOperand(2).getReg())
4618  .addUse(MI.getOperand(3).getReg())
4619  .cloneMemRefs(MI);
4620  MI.eraseFromParent();
4621  return true;
4622 }
4623 
4624 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4625  switch (IntrID) {
4626  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4627  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4628  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4629  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4630  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4631  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4632  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4633  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4634  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4635  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4636  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4637  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4638  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4639  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4640  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4641  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4642  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4643  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4644  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4645  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4646  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4647  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4648  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4649  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4650  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4651  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4652  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4653  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4654  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4655  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4656  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4657  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4658  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4659  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4660  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4661  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4662  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4663  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4664  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4665  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4666  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4667  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4668  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4669  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4670  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4671  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4672  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4673  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4674  default:
4675  llvm_unreachable("unhandled atomic opcode");
4676  }
4677 }
4678 
4679 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4680                                                MachineIRBuilder &B,
4681                                                Intrinsic::ID IID) const {
4682  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4683  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4684  const bool HasReturn = MI.getNumExplicitDefs() != 0;
4685 
4686  Register Dst;
4687 
4688  int OpOffset = 0;
4689  if (HasReturn) {
4690  // A few FP atomics do not support return values.
4691  Dst = MI.getOperand(0).getReg();
4692  } else {
4693  OpOffset = -1;
4694  }
4695 
4696  Register VData = MI.getOperand(2 + OpOffset).getReg();
4697  Register CmpVal;
4698 
4699  if (IsCmpSwap) {
4700  CmpVal = MI.getOperand(3 + OpOffset).getReg();
4701  ++OpOffset;
4702  }
4703 
4704  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4705  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4706 
4707  // The struct intrinsic variants add one additional operand over raw.
4708  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4709  Register VIndex;
4710  if (HasVIndex) {
4711  VIndex = MI.getOperand(4 + OpOffset).getReg();
4712  ++OpOffset;
4713  } else {
4714  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4715  }
4716 
4717  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4718  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4719  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4720 
4721  MachineMemOperand *MMO = *MI.memoperands_begin();
4722 
4723  unsigned ImmOffset;
4724  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4725  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4726 
4727  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4728 
4729  if (HasReturn)
4730  MIB.addDef(Dst);
4731 
4732  MIB.addUse(VData); // vdata
4733 
4734  if (IsCmpSwap)
4735  MIB.addReg(CmpVal);
4736 
4737  MIB.addUse(RSrc) // rsrc
4738  .addUse(VIndex) // vindex
4739  .addUse(VOffset) // voffset
4740  .addUse(SOffset) // soffset
4741  .addImm(ImmOffset) // offset(imm)
4742  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4743  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4744  .addMemOperand(MMO);
4745 
4746  MI.eraseFromParent();
4747  return true;
4748 }
4749 
4750 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4751 /// vector with s16 typed elements.
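 // Pairs of 16-bit gradient/coordinate components are packed into <2 x s16>
 // registers; any 32-bit component (and the bias argument when A16 is on) is
 // widened to occupy a full dword, and an odd trailing component is padded
 // with undef.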
4752 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4753                                       SmallVectorImpl<Register> &PackedAddrs,
4754  unsigned ArgOffset,
4755  const AMDGPU::ImageDimIntrinsicInfo *Intr,
4756  bool IsA16, bool IsG16) {
4757  const LLT S16 = LLT::scalar(16);
4758  const LLT V2S16 = LLT::fixed_vector(2, 16);
4759  auto EndIdx = Intr->VAddrEnd;
4760 
4761  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4762  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4763  if (!SrcOp.isReg())
4764  continue; // _L to _LZ may have eliminated this.
4765 
4766  Register AddrReg = SrcOp.getReg();
4767 
4768  if ((I < Intr->GradientStart) ||
4769  (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4770  (I >= Intr->CoordStart && !IsA16)) {
4771  if ((I < Intr->GradientStart) && IsA16 &&
4772  (B.getMRI()->getType(AddrReg) == S16)) {
4773  assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4774  // Special handling of bias when A16 is on. Bias is of type half but
4775  // occupies full 32-bit.
4776  PackedAddrs.push_back(
4777  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4778  .getReg(0));
4779  } else {
4780  assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4781  "Bias needs to be converted to 16 bit in A16 mode");
4782  // Handle any gradient or coordinate operands that should not be packed
4783  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4784  PackedAddrs.push_back(AddrReg);
4785  }
4786  } else {
4787  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4788  // derivatives dx/dh and dx/dv are packed with undef.
4789  if (((I + 1) >= EndIdx) ||
4790  ((Intr->NumGradients / 2) % 2 == 1 &&
4791  (I == static_cast<unsigned>(Intr->GradientStart +
4792  (Intr->NumGradients / 2) - 1) ||
4793  I == static_cast<unsigned>(Intr->GradientStart +
4794  Intr->NumGradients - 1))) ||
4795  // Check for _L to _LZ optimization
4796  !MI.getOperand(ArgOffset + I + 1).isReg()) {
4797  PackedAddrs.push_back(
4798  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4799  .getReg(0));
4800  } else {
4801  PackedAddrs.push_back(
4802  B.buildBuildVector(
4803  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4804  .getReg(0));
4805  ++I;
4806  }
4807  }
4808  }
4809 }
4810 
4811 /// Convert from separate vaddr components to a single vector address register,
4812 /// and replace the remaining operands with $noreg.
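 // The s32 address components are gathered into a single build_vector placed
 // in the first vaddr operand (rounding more than 8 components up to the next
 // power of two with undef), and the remaining vaddr operands are cleared to
 // $noreg.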
4813 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4814                                      int DimIdx, int NumVAddrs) {
4815  const LLT S32 = LLT::scalar(32);
4816 
4817  SmallVector<Register, 8> AddrRegs;
4818  for (int I = 0; I != NumVAddrs; ++I) {
4819  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4820  if (SrcOp.isReg()) {
4821  AddrRegs.push_back(SrcOp.getReg());
4822  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4823  }
4824  }
4825 
4826  int NumAddrRegs = AddrRegs.size();
4827  if (NumAddrRegs != 1) {
4828  // Above 8 elements round up to next power of 2 (i.e. 16).
4829  if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4830  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4831  auto Undef = B.buildUndef(S32);
4832  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4833  NumAddrRegs = RoundedNumRegs;
4834  }
4835 
4836  auto VAddr =
4837  B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4838  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4839  }
4840 
4841  for (int I = 1; I != NumVAddrs; ++I) {
4842  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4843  if (SrcOp.isReg())
4844  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4845  }
4846 }
4847 
4848 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4849 ///
4850 /// Depending on the subtarget, load/store with 16-bit element data need to be
4851 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4852 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4853 /// registers.
4854 ///
4855 /// We don't want to directly select image instructions just yet, but also want
4856 /// to expose all register repacking to the legalizer/combiners. We also don't
4857 /// want a selected instruction entering RegBankSelect. In order to avoid
4858 /// defining a multitude of intermediate image instructions, directly hack on
4859 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4860 /// padding now unnecessary arguments with $noreg.
4861 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4862     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4863     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4864 
4865  const unsigned NumDefs = MI.getNumExplicitDefs();
4866  const unsigned ArgOffset = NumDefs + 1;
4867  bool IsTFE = NumDefs == 2;
4868  // We are only processing the operands of d16 image operations on subtargets
4869  // that use the unpacked register layout, or need to repack the TFE result.
4870 
4871  // TODO: Do we need to guard against already legalized intrinsics?
4872  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4873  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4874 
4875  MachineRegisterInfo *MRI = B.getMRI();
4876  const LLT S32 = LLT::scalar(32);
4877  const LLT S16 = LLT::scalar(16);
4878  const LLT V2S16 = LLT::fixed_vector(2, 16);
4879 
4880  unsigned DMask = 0;
4881  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4882  LLT Ty = MRI->getType(VData);
4883 
4884  // Check for 16 bit addresses and pack if true.
4885  LLT GradTy =
4886  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4887  LLT AddrTy =
4888  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4889  const bool IsG16 = GradTy == S16;
4890  const bool IsA16 = AddrTy == S16;
4891  const bool IsD16 = Ty.getScalarType() == S16;
4892 
4893  int DMaskLanes = 0;
4894  if (!BaseOpcode->Atomic) {
4895  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4896  if (BaseOpcode->Gather4) {
4897  DMaskLanes = 4;
4898  } else if (DMask != 0) {
4899  DMaskLanes = countPopulation(DMask);
4900  } else if (!IsTFE && !BaseOpcode->Store) {
4901  // If dmask is 0, this is a no-op load. This can be eliminated.
4902  B.buildUndef(MI.getOperand(0));
4903  MI.eraseFromParent();
4904  return true;
4905  }
4906  }
4907 
4908  Observer.changingInstr(MI);
4909  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4910 
4911  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4912  : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4913  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4914  : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4915  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4916 
4917  // Track that we legalized this
4918  MI.setDesc(B.getTII().get(NewOpcode));
4919 
4920  // Expecting to get an error flag since TFC is on and dmask is 0. Force
4921  // dmask to be at least 1, otherwise the instruction will fail.
4922  if (IsTFE && DMask == 0) {
4923  DMask = 0x1;
4924  DMaskLanes = 1;
4925  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4926  }
4927 
4928  if (BaseOpcode->Atomic) {
4929  Register VData0 = MI.getOperand(2).getReg();
4930  LLT Ty = MRI->getType(VData0);
4931 
4932  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4933  if (Ty.isVector())
4934  return false;
4935 
4936  if (BaseOpcode->AtomicX2) {
4937  Register VData1 = MI.getOperand(3).getReg();
4938  // The two values are packed in one register.
4939  LLT PackedTy = LLT::fixed_vector(2, Ty);
4940  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4941  MI.getOperand(2).setReg(Concat.getReg(0));
4942  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4943  }
4944  }
4945 
4946  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4947 
4948  // Rewrite the addressing register layout before doing anything else.
4949  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4950  // 16 bit gradients are supported, but are tied to the A16 control
4951  // so both gradients and addresses must be 16 bit
4952  return false;
4953  }
4954 
4955  if (IsA16 && !ST.hasA16()) {
4956  // A16 not supported
4957  return false;
4958  }
4959 
4960  if (IsA16 || IsG16) {
4961  if (Intr->NumVAddrs > 1) {
4962  SmallVector<Register, 4> PackedRegs;
4963 
4964  packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4965  IsG16);
4966 
4967  // See also below in the non-a16 branch
4968  const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4969  PackedRegs.size() <= ST.getNSAMaxSize();
4970 
4971  if (!UseNSA && PackedRegs.size() > 1) {
4972  LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4973  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4974  PackedRegs[0] = Concat.getReg(0);
4975  PackedRegs.resize(1);
4976  }
4977 
4978  const unsigned NumPacked = PackedRegs.size();
4979  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4980  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4981  if (!SrcOp.isReg()) {
4982  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4983  continue;
4984  }
4985 
4986  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4987 
4988  if (I - Intr->VAddrStart < NumPacked)
4989  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4990  else
4991  SrcOp.setReg(AMDGPU::NoRegister);
4992  }
4993  }
4994  } else {
4995  // If the register allocator cannot place the address registers contiguously
4996  // without introducing moves, then using the non-sequential address encoding
4997  // is always preferable, since it saves VALU instructions and is usually a
4998  // wash in terms of code size or even better.
4999  //
5000  // However, we currently have no way of hinting to the register allocator
5001  // that MIMG addresses should be placed contiguously when it is possible to
5002  // do so, so force non-NSA for the common 2-address case as a heuristic.
5003  //
5004  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5005  // allocation when possible.
5006  //
5007  // TODO: we can actually allow partial NSA where the final register is a
5008  // contiguous set of the remaining addresses.
5009  // This could help where there are more addresses than supported.
5010  const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
5011  CorrectedNumVAddrs <= ST.getNSAMaxSize();
5012 
5013  if (!UseNSA && Intr->NumVAddrs > 1)
5014  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5015  Intr->NumVAddrs);
5016  }
5017 
5018  int Flags = 0;
5019  if (IsA16)
5020  Flags |= 1;
5021  if (IsG16)
5022  Flags |= 2;
5023  MI.addOperand(MachineOperand::CreateImm(Flags));
5024 
5025  if (BaseOpcode->Store) { // No TFE for stores?
5026  // TODO: Handle dmask trim
5027  if (!Ty.isVector() || !IsD16)
5028  return true;
5029 
5030  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5031  if (RepackedReg != VData) {
5032  MI.getOperand(1).setReg(RepackedReg);
5033  }
5034 
5035  return true;
5036  }
5037 
5038  Register DstReg = MI.getOperand(0).getReg();
5039  const LLT EltTy = Ty.getScalarType();
5040  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5041 
5042  // Confirm that the return type is large enough for the dmask specified
5043  if (NumElts < DMaskLanes)
5044  return false;
5045 
5046  if (NumElts > 4 || DMaskLanes > 4)
5047  return false;
5048 
5049  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5050  const LLT AdjustedTy =
5051  Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5052 
5053  // The raw dword aligned data component of the load. The only legal cases
5054  // where this matters should be when using the packed D16 format, for
5055  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
5056  LLT RoundedTy;
5057 
5058  // S32 vector to cover all data, plus TFE result element.
5059  LLT TFETy;
5060 
5061  // Register type to use for each loaded component. Will be S32 or V2S16.
5062  LLT RegTy;
5063 
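  // Worked example (packed D16): a <3 x s16> result with a 3-lane dmask has
  // an AdjustedTy of 48 bits, so RoundedElts = (48 + 31) / 32 = 2 and
  // RoundedSize = 64, giving RoundedTy = <4 x s16>, TFETy = <3 x s32>, and
  // RegTy = <2 x s16> (or s32 when TFE is enabled).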
5064  if (IsD16 && ST.hasUnpackedD16VMem()) {
5065  RoundedTy =
5066  LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5067  TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5068  RegTy = S32;
5069  } else {
5070  unsigned EltSize = EltTy.getSizeInBits();
5071  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5072  unsigned RoundedSize = 32 * RoundedElts;
5073  RoundedTy = LLT::scalarOrVector(
5074  ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5075  TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5076  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5077  }
5078 
5079  // The return type does not need adjustment.
5080  // TODO: Should we change s16 case to s32 or <2 x s16>?
5081  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5082  return true;
5083 
5084  Register Dst1Reg;
5085 
5086  // Insert after the instruction.
5087  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5088 
5089  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5090  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5091  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5092  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5093 
5094  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5095 
5096  MI.getOperand(0).setReg(NewResultReg);
5097 
5098  // In the IR, TFE is supposed to be used with a 2 element struct return
5099  // type. The instruction really returns these two values in one contiguous
5100  // register, with one additional dword beyond the loaded data. Rewrite the
5101  // return type to use a single register result.
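  //
  // For example, a TFE load of <4 x s32> is rewritten to define a single
  // <5 x s32> register; the trailing dword is unmerged below into the
  // original second (status) destination.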
5102 
5103  if (IsTFE) {
5104  Dst1Reg = MI.getOperand(1).getReg();
5105  if (MRI->getType(Dst1Reg) != S32)
5106  return false;
5107 
5108  // TODO: Make sure the TFE operand bit is set.
5109  MI.removeOperand(1);
5110 
5111  // Handle the easy case that requires no repack instructions.
5112  if (Ty == S32) {
5113  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5114  return true;
5115  }
5116  }
5117 
5118  // Now figure out how to copy the new result register back into the old
5119  // result.
5120  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5121 
5122  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5123 
5124  if (ResultNumRegs == 1) {
5125  assert(!IsTFE);
5126  ResultRegs[0] = NewResultReg;
5127  } else {
5128  // We have to repack into a new vector of some kind.
5129  for (int I = 0; I != NumDataRegs; ++I)
5130  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5131  B.buildUnmerge(ResultRegs, NewResultReg);
5132 
5133  // Drop the final TFE element to get the data part. The TFE result is
5134  // directly written to the right place already.
5135  if (IsTFE)
5136  ResultRegs.resize(NumDataRegs);
5137  }
5138 
5139  // For an s16 scalar result, the load produces an s32 which we truncate
5140  // back to s16, regardless of packed vs. unpacked.
5141  if (IsD16 && !Ty.isVector()) {
5142  B.buildTrunc(DstReg, ResultRegs[0]);
5143  return true;
5144  }
5145 
5146  // Avoid a build/concat_vector of 1 entry.
5147  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5148  B.buildBitcast(DstReg, ResultRegs[0]);
5149  return true;
5150  }
5151 
5152  assert(Ty.isVector());
5153 
5154  if (IsD16) {
5155  // For packed D16 results with TFE enabled, all the data components are
5156  // S32. Cast back to the expected type.
5157  //
5158  // TODO: We don't really need to load s32 elements. We would only need one
5159  // cast for the TFE result if a multiple of v2s16 was used.
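  //
  // For example, a packed-D16 <4 x s16> TFE load arrives here as two s32
  // data dwords, which are bitcast back to two <2 x s16> registers.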
5160  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5161  for (Register &Reg : ResultRegs)
5162  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5163  } else if (ST.hasUnpackedD16VMem()) {
5164  for (Register &Reg : ResultRegs)
5165  Reg = B.buildTrunc(S16, Reg).getReg(0);
5166  }
5167  }
5168 
5169  auto padWithUndef = [&](LLT Ty, int NumElts) {
5170  if (NumElts == 0)
5171  return;
5172  Register Undef = B.buildUndef(Ty).getReg(0);
5173  for (int I = 0; I != NumElts; ++I)
5174  ResultRegs.push_back(Undef);
5175  };
5176 
5177  // Pad out any elements eliminated due to the dmask.
5178  LLT ResTy = MRI->getType(ResultRegs[0]);
5179  if (!ResTy.isVector()) {
5180  padWithUndef(ResTy, NumElts - ResultRegs.size());
5181  B.buildBuildVector(DstReg, ResultRegs);
5182  return true;
5183  }
5184 
5185  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5186  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5187 
5188  // Deal with the one annoying legal case.
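  // For example, a packed-D16 <3 x s16> load materializes as <4 x s16> (with
  // TFE, as two <2 x s16> pieces concatenated to <4 x s16>); the trailing
  // element is then dropped to match the <3 x s16> destination.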
5189  const LLT V3S16 = LLT::fixed_vector(3, 16);
5190  if (Ty == V3S16) {
5191  if (IsTFE) {
5192  if (ResultRegs.size() == 1) {
5193  NewResultReg = ResultRegs[0];
5194  } else if (ResultRegs.size() == 2) {
5195  LLT V4S16 = LLT::fixed_vector(4, 16);
5196  NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5197  } else {
5198  return false;
5199  }
5200  }
5201 
5202  if (MRI->getType(DstReg).getNumElements() <
5203  MRI->getType(NewResultReg).getNumElements()) {
5204  B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5205  } else {
5206  B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5207  }
5208  return true;
5209  }
5210 
5211  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5212  B.buildConcatVectors(DstReg, ResultRegs);
5213  return true;
5214 }
5215 
5216 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5217  LegalizerHelper &Helper, MachineInstr &MI) const {
5218  MachineIRBuilder &B = Helper.MIRBuilder;
5219  GISelChangeObserver &Observer = Helper.Observer;
5220 
5221  Register Dst = MI.getOperand(0).getReg();
5222  LLT Ty = B.getMRI()->getType(Dst);
5223  unsigned Size = Ty.getSizeInBits();
5224  MachineFunction &MF = B.getMF();
5225 
5226  Observer.changingInstr(MI);
5227 
5228  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5229  Ty = getBitcastRegisterType(Ty);
5230  Helper.bitcastDst(MI, Ty, 0);
5231  Dst = MI.getOperand(0).getReg();
5232  B.setInsertPt(B.getMBB(), MI);
5233  }
5234 
5235  // FIXME: We don't really need this intermediate instruction. The intrinsic
5236  // should be fixed to have a memory operand. Since it's readnone, we're not
5237  // allowed to add one.
5238  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5239  MI.removeOperand(1); // Remove intrinsic ID
5240 
5241  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5242  // TODO: Should this use datalayout alignment?
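  // For example, a 96-bit result is recorded below as a 12-byte access with
  // 4-byte alignment.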
5243  const unsigned MemSize = (Size + 7) / 8;
5244  const Align MemAlign(4);
5245  MachineMemOperand *MMO = MF.getMachineMemOperand(
5246  MachinePointerInfo(),
5247  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5248  MachineMemOperand::MOInvariant,
5249  MemSize, MemAlign);
5250  MI.addMemOperand(MF, MMO);
5251 
5252  // There are no 96-bit result scalar loads, but widening to 128-bit should
5253  // always be legal. We may need to restore this to a 96-bit result if it turns
5254  // out this needs to be converted to a vector load during RegBankSelect.
5255  if (!isPowerOf2_32(Size)) {
5256