LLVM 15.0.0git
AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use "
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
50 static LLT getPow2VectorType(const LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55 
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(const LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60  return LLT::scalar(Pow2Bits);
61 }
62 
63 /// \returns true if this is an odd sized vector which should widen by adding an
64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65 /// excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67  return [=](const LegalityQuery &Query) {
68  const LLT Ty = Query.Types[TypeIdx];
69  if (!Ty.isVector())
70  return false;
71 
72  const LLT EltTy = Ty.getElementType();
73  const unsigned EltSize = EltTy.getSizeInBits();
74  return Ty.getNumElements() % 2 != 0 &&
75  EltSize > 1 && EltSize < 32 &&
76  Ty.getSizeInBits() % 32 != 0;
77  };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81  return [=](const LegalityQuery &Query) {
82  const LLT Ty = Query.Types[TypeIdx];
83  return Ty.getSizeInBits() % 32 == 0;
84  };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88  return [=](const LegalityQuery &Query) {
89  const LLT Ty = Query.Types[TypeIdx];
90  const LLT EltTy = Ty.getScalarType();
91  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92  };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96  return [=](const LegalityQuery &Query) {
97  const LLT Ty = Query.Types[TypeIdx];
98  const LLT EltTy = Ty.getElementType();
99  return std::make_pair(TypeIdx,
100  LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101  };
102 }
103 
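// Worked example, derived from the mutation below: a <5 x s32> operand is
// 160 bits, so Pieces = (160 + 63) / 64 = 3 and the type is reduced to
// <2 x s32> chunks for fewerElements to split into.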
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105  return [=](const LegalityQuery &Query) {
106  const LLT Ty = Query.Types[TypeIdx];
107  const LLT EltTy = Ty.getElementType();
108  unsigned Size = Ty.getSizeInBits();
109  unsigned Pieces = (Size + 63) / 64;
110  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111  return std::make_pair(
112  TypeIdx,
113  LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114  };
115 }
116 
117 // Increase the number of vector elements to reach the next multiple of 32-bit
118 // type.
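// e.g. <3 x s8> is 24 bits, which rounds up to one 32-bit chunk, so
// NewNumElts = (32 + 7) / 8 = 4 and the result is <4 x s8>.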
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120  return [=](const LegalityQuery &Query) {
121  const LLT Ty = Query.Types[TypeIdx];
122 
123  const LLT EltTy = Ty.getElementType();
124  const int Size = Ty.getSizeInBits();
125  const int EltSize = EltTy.getSizeInBits();
126  const int NextMul32 = (Size + 31) / 32;
127 
128  assert(EltSize < 32);
129 
130  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131  return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132  };
133 }
134 
135 static LLT getBitcastRegisterType(const LLT Ty) {
136  const unsigned Size = Ty.getSizeInBits();
137 
138  if (Size <= 32) {
139  // <2 x s8> -> s16
140  // <4 x s8> -> s32
141  return LLT::scalar(Size);
142  }
143 
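  // For wider register-sized types: <4 x s16> and <8 x s8> (64 bits) become
  // <2 x s32>, and s96 becomes <3 x s32>.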
144  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148  return [=](const LegalityQuery &Query) {
149  const LLT Ty = Query.Types[TypeIdx];
150  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151  };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155  return [=](const LegalityQuery &Query) {
156  const LLT Ty = Query.Types[TypeIdx];
157  unsigned Size = Ty.getSizeInBits();
158  assert(Size % 32 == 0);
159  return std::make_pair(
160  TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161  };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165  return [=](const LegalityQuery &Query) {
166  const LLT QueryTy = Query.Types[TypeIdx];
167  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168  };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172  return [=](const LegalityQuery &Query) {
173  const LLT QueryTy = Query.Types[TypeIdx];
174  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175  };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179  return [=](const LegalityQuery &Query) {
180  const LLT QueryTy = Query.Types[TypeIdx];
181  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182  };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186  return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190  const int EltSize = EltTy.getSizeInBits();
191  return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195  const int EltSize = Ty.getElementType().getSizeInBits();
196  return EltSize == 32 || EltSize == 64 ||
197  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198  EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202  if (!isRegisterSize(Ty.getSizeInBits()))
203  return false;
204 
205  if (Ty.isVector())
206  return isRegisterVectorType(Ty);
207 
208  return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
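// e.g. s32, s64, v2s16, v4s16 and v2s32 up to 1024 bits are register types,
// while <3 x s16> (48 bits) is not, since 48 is not a multiple of 32.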
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214  return [=](const LegalityQuery &Query) {
215  return isRegisterType(Query.Types[TypeIdx]);
216  };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220  return [=](const LegalityQuery &Query) {
221  const LLT QueryTy = Query.Types[TypeIdx];
222  if (!QueryTy.isVector())
223  return false;
224  const LLT EltTy = QueryTy.getElementType();
225  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226  };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233  const LLT Ty = Query.Types[TypeIdx];
234  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236  };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243  bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254  // global loads (ideally constant address space should be eliminated)
255  // depending on the context. Legality cannot be context dependent, but
256  // RegBankSelect can split the load as necessary depending on the pointer
257  // register bank/uniformity and if the memory is invariant or not written in a
258  // kernel.
259  return IsLoad ? 512 : 128;
260  default:
261  // Flat addresses may contextually need to be split to 32-bit parts if they
262  // may alias scratch depending on the subtarget.
263  return 128;
264  }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269  const LLT Ty = Query.Types[0];
270 
271  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274  unsigned RegSize = Ty.getSizeInBits();
275  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277  unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283  // Do not handle extending vector loads.
284  if (Ty.isVector() && MemSize != RegSize)
285  return false;
286 
287  // TODO: We should be able to widen loads if the alignment is high enough, but
288  // we also need to modify the memory access size.
289 #if 0
290  // Accept widening loads based on alignment.
291  if (IsLoad && MemSize < Size)
292  MemSize = std::max(MemSize, Align);
293 #endif
294 
295  // Only 1-byte and 2-byte to 32-bit extloads are valid.
296  if (MemSize != RegSize && RegSize != 32)
297  return false;
298 
299  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300  return false;
301 
302  switch (MemSize) {
303  case 8:
304  case 16:
305  case 32:
306  case 64:
307  case 128:
308  break;
309  case 96:
310  if (!ST.hasDwordx3LoadStores())
311  return false;
312  break;
313  case 256:
314  case 512:
315  // These may contextually need to be broken down.
316  break;
317  default:
318  return false;
319  }
320 
321  assert(RegSize >= MemSize);
322 
323  if (AlignBits < MemSize) {
324  const SITargetLowering *TLI = ST.getTargetLowering();
325  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326  Align(AlignBits / 8)))
327  return false;
328  }
329 
330  return true;
331 }
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
334 // work around this. Eventually it should ignore the type for loads and only
335 // care about the size. Return true in cases where we will work around this for
336 // now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338  if (EnableNewLegality)
339  return false;
340 
341  const unsigned Size = Ty.getSizeInBits();
342  if (Size <= 64)
343  return false;
344  if (!Ty.isVector())
345  return true;
346 
347  LLT EltTy = Ty.getElementType();
348  if (EltTy.isPointer())
349  return true;
350 
351  unsigned EltSize = EltTy.getSizeInBits();
352  return EltSize != 32 && EltSize != 64;
353 }
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356  const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364  const LLT MemTy) {
365  const unsigned MemSizeInBits = MemTy.getSizeInBits();
366  const unsigned Size = Ty.getSizeInBits();
367  if (Size != MemSizeInBits)
368  return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373  // Don't try to handle bitcasting vector ext loads for now.
374  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note this is the case where the memory access
381 /// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383  uint64_t AlignInBits, unsigned AddrSpace,
384  unsigned Opcode) {
385  unsigned SizeInBits = MemoryTy.getSizeInBits();
386  // We don't want to widen cases that are naturally legal.
387  if (isPowerOf2_32(SizeInBits))
388  return false;
389 
390  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391  // end up widening these for a scalar load during RegBankSelect, since there
392  // aren't 96-bit scalar loads.
393  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394  return false;
395 
396  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397  return false;
398 
399  // A load is known dereferenceable up to the alignment, so it's legal to widen
400  // to it.
401  //
402  // TODO: Could check dereferenceable for less aligned cases.
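  // e.g. an s96 load that is 128-bit aligned may be widened to an s128 load on
  // subtargets without 96-bit (dwordx3) memory operations, provided the target
  // lowering reports the widened access as fast.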
403  unsigned RoundedSize = NextPowerOf2(SizeInBits);
404  if (AlignInBits < RoundedSize)
405  return false;
406 
407  // Do not widen if it would introduce a slow unaligned load.
408  const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411       RoundedSize, AddrSpace, Align(AlignInBits / 8),
412  MachineMemOperand::MOLoad, &Fast) &&
413  Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417  unsigned Opcode) {
418  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419  return false;
420 
421  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422  Query.MMODescrs[0].AlignInBits,
423  Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428  : ST(ST_) {
429  using namespace TargetOpcode;
430 
431  auto GetAddrSpacePtr = [&TM](unsigned AS) {
432  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433  };
434 
435  const LLT S1 = LLT::scalar(1);
436  const LLT S8 = LLT::scalar(8);
437  const LLT S16 = LLT::scalar(16);
438  const LLT S32 = LLT::scalar(32);
439  const LLT S64 = LLT::scalar(64);
440  const LLT S128 = LLT::scalar(128);
441  const LLT S256 = LLT::scalar(256);
442  const LLT S512 = LLT::scalar(512);
443  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445  const LLT V2S8 = LLT::fixed_vector(2, 8);
446  const LLT V2S16 = LLT::fixed_vector(2, 16);
447  const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449  const LLT V2S32 = LLT::fixed_vector(2, 32);
450  const LLT V3S32 = LLT::fixed_vector(3, 32);
451  const LLT V4S32 = LLT::fixed_vector(4, 32);
452  const LLT V5S32 = LLT::fixed_vector(5, 32);
453  const LLT V6S32 = LLT::fixed_vector(6, 32);
454  const LLT V7S32 = LLT::fixed_vector(7, 32);
455  const LLT V8S32 = LLT::fixed_vector(8, 32);
456  const LLT V9S32 = LLT::fixed_vector(9, 32);
457  const LLT V10S32 = LLT::fixed_vector(10, 32);
458  const LLT V11S32 = LLT::fixed_vector(11, 32);
459  const LLT V12S32 = LLT::fixed_vector(12, 32);
460  const LLT V13S32 = LLT::fixed_vector(13, 32);
461  const LLT V14S32 = LLT::fixed_vector(14, 32);
462  const LLT V15S32 = LLT::fixed_vector(15, 32);
463  const LLT V16S32 = LLT::fixed_vector(16, 32);
464  const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466  const LLT V2S64 = LLT::fixed_vector(2, 64);
467  const LLT V3S64 = LLT::fixed_vector(3, 64);
468  const LLT V4S64 = LLT::fixed_vector(4, 64);
469  const LLT V5S64 = LLT::fixed_vector(5, 64);
470  const LLT V6S64 = LLT::fixed_vector(6, 64);
471  const LLT V7S64 = LLT::fixed_vector(7, 64);
472  const LLT V8S64 = LLT::fixed_vector(8, 64);
473  const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475  std::initializer_list<LLT> AllS32Vectors =
476  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478  std::initializer_list<LLT> AllS64Vectors =
479  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489  const LLT CodePtr = FlatPtr;
490 
491  const std::initializer_list<LLT> AddrSpaces64 = {
492  GlobalPtr, ConstantPtr, FlatPtr
493  };
494 
495  const std::initializer_list<LLT> AddrSpaces32 = {
496  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497  };
498 
499  const std::initializer_list<LLT> FPTypesBase = {
500  S32, S64
501  };
502 
503  const std::initializer_list<LLT> FPTypes16 = {
504  S32, S64, S16
505  };
506 
507  const std::initializer_list<LLT> FPTypesPK16 = {
508  S32, S64, S16, V2S16
509  };
510 
511  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513  // s1 for VCC branches, s32 for SCC branches.
514  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520  .legalFor(AllS32Vectors)
521  .legalFor(AllS64Vectors)
522  .legalFor(AddrSpaces64)
523  .legalFor(AddrSpaces32)
524  .legalIf(isPointer(0))
525  .clampScalar(0, S16, S256)
526  .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532  // Full set of gfx9 features.
533  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534  .legalFor({S32, S16, V2S16})
535  .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539  .scalarize(0);
540 
541  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542  .legalFor({S32, S16, V2S16}) // Clamp modifier
543  .minScalarOrElt(0, S16)
544  .clampMaxNumElementsStrict(0, S16, 2)
545  .scalarize(0)
546  .widenScalarToNextPow2(0, 32)
547  .lower();
548  } else if (ST.has16BitInsts()) {
549  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550  .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554  .scalarize(0);
555 
556  // Technically the saturating operations require clamp bit support, but this
557  // was introduced at the same time as 16-bit operations.
558  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559  .legalFor({S32, S16}) // Clamp modifier
560  .minScalar(0, S16)
561  .scalarize(0)
562  .widenScalarToNextPow2(0, 16)
563  .lower();
564 
565  // We're just lowering this, but it helps get a better result to try to
566  // coerce to the desired type first.
567  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568  .minScalar(0, S16)
569  .scalarize(0)
570  .lower();
571  } else {
572  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573  .legalFor({S32})
574  .widenScalarToNextMultipleOf(0, 32)
575  .clampScalar(0, S32, S32)
576  .scalarize(0);
577 
578  if (ST.hasIntClamp()) {
579  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580  .legalFor({S32}) // Clamp modifier.
581  .scalarize(0)
582  .minScalarOrElt(0, S32)
583  .lower();
584  } else {
585  // Clamp bit support was added in VI, along with 16-bit operations.
586  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587  .minScalar(0, S32)
588  .scalarize(0)
589  .lower();
590  }
591 
592  // FIXME: DAG expansion gets better results. The widening uses the smaller
593  // range values and goes for the min/max lowering directly.
594  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595  .minScalar(0, S32)
596  .scalarize(0)
597  .lower();
598  }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602  .customFor({S32, S64})
603  .clampScalar(0, S32, S64)
604  .widenScalarToNextPow2(0, 32)
605  .scalarize(0);
606 
607  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608  .legalFor({S32})
609  .maxScalar(0, S32);
610 
611  if (ST.hasVOP3PInsts()) {
612  Mulh
613  .clampMaxNumElements(0, S8, 2)
614  .lowerFor({V2S8});
615  }
616 
617  Mulh
618  .scalarize(0)
619  .lower();
620 
621  // Report legal for any types we can handle anywhere. For the cases only legal
622  // on the SALU, RegBankSelect will be able to re-legalize.
623  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633  .legalFor({{S32, S1}, {S32, S32}})
634  .minScalar(0, S32)
635  .scalarize(0)
636  .lower();
637 
638  getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644  getActionDefinitionsBuilder(G_CONSTANT)
645  .legalFor({S1, S32, S64, S16, GlobalPtr,
646  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647  .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651  getActionDefinitionsBuilder(G_FCONSTANT)
652  .legalFor({S32, S64, S16})
653  .clampScalar(0, S16, S64);
654 
655  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656  .legalIf(isRegisterType(0))
657  // s1 and s16 are special cases because they have legal operations on
658  // them, but don't really occupy registers in the normal way.
659  .legalFor({S1, S16})
660  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661  .clampScalarOrElt(0, S32, MaxScalar)
662  .widenScalarToNextPow2(0, 32)
663  .clampMaxNumElements(0, S32, 16);
664 
665  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667  // If the amount is divergent, we have to do a wave reduction to get the
668  // maximum value, so this is expanded during RegBankSelect.
669  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670  .legalFor({{PrivatePtr, S32}});
671 
672  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673  .customIf(typeIsNot(0, PrivatePtr));
674 
675  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677  auto &FPOpActions = getActionDefinitionsBuilder(
678  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679  .legalFor({S32, S64});
680  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681  .customFor({S32, S64});
682  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683  .customFor({S32, S64});
684 
685  if (ST.has16BitInsts()) {
686  if (ST.hasVOP3PInsts())
687  FPOpActions.legalFor({S16, V2S16});
688  else
689  FPOpActions.legalFor({S16});
690 
691  TrigActions.customFor({S16});
692  FDIVActions.customFor({S16});
693  }
694 
695  auto &MinNumMaxNum = getActionDefinitionsBuilder({
696  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698  if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702  .clampScalar(0, S16, S64)
703  .scalarize(0);
704  } else if (ST.has16BitInsts()) {
705  MinNumMaxNum.customFor(FPTypes16)
706  .clampScalar(0, S16, S64)
707  .scalarize(0);
708  } else {
709  MinNumMaxNum.customFor(FPTypesBase)
710  .clampScalar(0, S32, S64)
711  .scalarize(0);
712  }
713 
714  if (ST.hasVOP3PInsts())
715  FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717  FPOpActions
718  .scalarize(0)
719  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721  TrigActions
722  .scalarize(0)
723  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725  FDIVActions
726  .scalarize(0)
727  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729  getActionDefinitionsBuilder({G_FNEG, G_FABS})
730  .legalFor(FPTypesPK16)
731  .clampMaxNumElementsStrict(0, S16, 2)
732  .scalarize(0)
733  .clampScalar(0, S16, S64);
734 
735  if (ST.has16BitInsts()) {
736  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737  .legalFor({S32, S64, S16})
738  .scalarize(0)
739  .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743  .scalarize(0)
744  .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749  .legalFor({S32, S64})
750  .scalarize(0)
751  .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755  .scalarize(0)
756  .clampScalar(0, S32, S64);
757  }
758  }
759 
760  getActionDefinitionsBuilder(G_FPTRUNC)
761  .legalFor({{S32, S64}, {S16, S32}})
762  .scalarize(0)
763  .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768  .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771     // Use actual fsub instruction
772  .legalFor({S32})
773  // Must use fadd + fneg
774  .lowerFor({S64, S16, V2S16})
775  .scalarize(0)
776  .clampScalar(0, S32, S64);
777 
778  // Whether this is legal depends on the floating point mode for the function.
779  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781  FMad.customFor({S32, S16});
782  else if (ST.hasMadMacF32Insts())
783  FMad.customFor({S32});
784  else if (ST.hasMadF16())
785  FMad.customFor({S16});
786  FMad.scalarize(0)
787  .lower();
788 
789  auto &FRem = getActionDefinitionsBuilder(G_FREM);
790  if (ST.has16BitInsts()) {
791  FRem.customFor({S16, S32, S64});
792  } else {
793  FRem.minScalar(0, S32)
794  .customFor({S32, S64});
795  }
796  FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801  .legalFor({{V2S16, V2S32}})
802  .clampMaxNumElements(0, S16, 2)
803  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804  // situations (like an invalid implicit use), we don't want to infinite loop
805  // in the legalizer.
807  .alwaysLegal();
808 
809  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811  {S32, S1}, {S64, S1}, {S16, S1}})
812  .scalarize(0)
813  .clampScalar(0, S32, S64)
814  .widenScalarToNextPow2(1, 32);
815 
816  // TODO: Split s1->s64 during regbankselect for VALU.
817  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819  .lowerIf(typeIs(1, S1))
820  .customFor({{S32, S64}, {S64, S64}});
821  if (ST.has16BitInsts())
822  IToFP.legalFor({{S16, S16}});
823  IToFP.clampScalar(1, S32, S64)
824  .minScalar(0, S32)
825     .scalarize(0)
826     .widenScalarToNextPow2(1);
827 
828  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830  .customFor({{S64, S32}, {S64, S64}})
831  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832  if (ST.has16BitInsts())
833  FPToI.legalFor({{S16, S16}});
834  else
835  FPToI.minScalar(1, S32);
836 
837  FPToI.minScalar(0, S32)
838  .widenScalarToNextPow2(0, 32)
839  .scalarize(0)
840  .lower();
841 
842  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
843  .customFor({S16, S32})
844  .scalarize(0)
845  .lower();
846 
847  // Lower roundeven into G_FRINT
848  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
849  .scalarize(0)
850  .lower();
851 
852  if (ST.has16BitInsts()) {
853  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854  .legalFor({S16, S32, S64})
855  .clampScalar(0, S16, S64)
856  .scalarize(0);
857  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
858  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859  .legalFor({S32, S64})
860  .clampScalar(0, S32, S64)
861  .scalarize(0);
862  } else {
863  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
864  .legalFor({S32})
865  .customFor({S64})
866  .clampScalar(0, S32, S64)
867  .scalarize(0);
868  }
869 
870  getActionDefinitionsBuilder(G_PTR_ADD)
871  .legalIf(all(isPointer(0), sameSize(0, 1)))
872  .scalarize(0)
873  .scalarSameSizeAs(1, 0);
874 
875  getActionDefinitionsBuilder(G_PTRMASK)
876  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
877  .scalarSameSizeAs(1, 0)
878  .scalarize(0);
879 
880  auto &CmpBuilder =
881  getActionDefinitionsBuilder(G_ICMP)
882  // The compare output type differs based on the register bank of the output,
883  // so make both s1 and s32 legal.
884  //
885  // Scalar compares producing output in scc will be promoted to s32, as that
886  // is the allocatable register type that will be needed for the copy from
887  // scc. This will be promoted during RegBankSelect, and we assume something
888  // before that won't try to use s32 result types.
889  //
890  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
891  // bank.
892  .legalForCartesianProduct(
893  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
894  .legalForCartesianProduct(
895  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
896  if (ST.has16BitInsts()) {
897  CmpBuilder.legalFor({{S1, S16}});
898  }
899 
900  CmpBuilder
901  .widenScalarToNextPow2(1)
902  .clampScalar(1, S32, S64)
903  .scalarize(0)
904  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
905 
906  getActionDefinitionsBuilder(G_FCMP)
907  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
908  .widenScalarToNextPow2(1)
909  .clampScalar(1, S32, S64)
910  .scalarize(0);
911 
912  // FIXME: fpow has a selection pattern that should move to custom lowering.
913  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
914  if (ST.has16BitInsts())
915  Exp2Ops.legalFor({S32, S16});
916  else
917  Exp2Ops.legalFor({S32});
918  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
919  Exp2Ops.scalarize(0);
920 
921  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
922  if (ST.has16BitInsts())
923  ExpOps.customFor({{S32}, {S16}});
924  else
925  ExpOps.customFor({S32});
926  ExpOps.clampScalar(0, MinScalarFPTy, S32)
927  .scalarize(0);
928 
929  getActionDefinitionsBuilder(G_FPOWI)
930  .clampScalar(0, MinScalarFPTy, S32)
931  .lower();
932 
933  // The 64-bit versions produce 32-bit results, but only on the SALU.
934  getActionDefinitionsBuilder(G_CTPOP)
935  .legalFor({{S32, S32}, {S32, S64}})
936  .clampScalar(0, S32, S32)
937  .widenScalarToNextPow2(1, 32)
938  .clampScalar(1, S32, S64)
939  .scalarize(0)
940  .widenScalarToNextPow2(0, 32);
941 
942 
943  // The hardware instructions return a different result on 0 than the generic
944  // instructions expect. The hardware produces -1, but these produce the
945  // bitwidth.
946  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
947  .scalarize(0)
948  .clampScalar(0, S32, S32)
949  .clampScalar(1, S32, S64)
950  .widenScalarToNextPow2(0, 32)
951  .widenScalarToNextPow2(1, 32)
952  .custom();
953 
954  // The 64-bit versions produce 32-bit results, but only on the SALU.
955  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
956  .legalFor({{S32, S32}, {S32, S64}})
957  .clampScalar(0, S32, S32)
958  .clampScalar(1, S32, S64)
959  .scalarize(0)
960  .widenScalarToNextPow2(0, 32)
961  .widenScalarToNextPow2(1, 32);
962 
963  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
964  // RegBankSelect.
965  getActionDefinitionsBuilder(G_BITREVERSE)
966  .legalFor({S32, S64})
967  .clampScalar(0, S32, S64)
968  .scalarize(0)
969  .widenScalarToNextPow2(0);
970 
971  if (ST.has16BitInsts()) {
972  getActionDefinitionsBuilder(G_BSWAP)
973  .legalFor({S16, S32, V2S16})
974  .clampMaxNumElementsStrict(0, S16, 2)
975  // FIXME: Fixing non-power-of-2 before clamp is workaround for
976  // narrowScalar limitation.
977  .widenScalarToNextPow2(0)
978  .clampScalar(0, S16, S32)
979  .scalarize(0);
980 
981  if (ST.hasVOP3PInsts()) {
982  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983  .legalFor({S32, S16, V2S16})
984  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
985  .clampMaxNumElements(0, S16, 2)
986  .minScalar(0, S16)
987  .widenScalarToNextPow2(0)
988  .scalarize(0)
989  .lower();
990  } else {
991  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
992  .legalFor({S32, S16})
993  .widenScalarToNextPow2(0)
994  .minScalar(0, S16)
995  .scalarize(0)
996  .lower();
997  }
998  } else {
999  // TODO: Should have same legality without v_perm_b32
1000  getActionDefinitionsBuilder(G_BSWAP)
1001  .legalFor({S32})
1002  .lowerIf(scalarNarrowerThan(0, 32))
1003  // FIXME: Fixing non-power-of-2 before clamp is workaround for
1004  // narrowScalar limitation.
1005  .widenScalarToNextPow2(0)
1006  .maxScalar(0, S32)
1007  .scalarize(0)
1008  .lower();
1009 
1010  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1011  .legalFor({S32})
1012  .minScalar(0, S32)
1013  .widenScalarToNextPow2(0)
1014  .scalarize(0)
1015  .lower();
1016  }
1017 
1018  getActionDefinitionsBuilder(G_INTTOPTR)
1019  // List the common cases
1020  .legalForCartesianProduct(AddrSpaces64, {S64})
1021  .legalForCartesianProduct(AddrSpaces32, {S32})
1022  .scalarize(0)
1023  // Accept any address space as long as the size matches
1024  .legalIf(sameSize(0, 1))
1025  .widenScalarIf(smallerThan(1, 0),
1026  [](const LegalityQuery &Query) {
1027  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1028  })
1029  .narrowScalarIf(largerThan(1, 0),
1030  [](const LegalityQuery &Query) {
1031  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1032  });
1033 
1034  getActionDefinitionsBuilder(G_PTRTOINT)
1035  // List the common cases
1036  .legalForCartesianProduct(AddrSpaces64, {S64})
1037  .legalForCartesianProduct(AddrSpaces32, {S32})
1038  .scalarize(0)
1039  // Accept any address space as long as the size matches
1040  .legalIf(sameSize(0, 1))
1041  .widenScalarIf(smallerThan(0, 1),
1042  [](const LegalityQuery &Query) {
1043  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1044  })
1045  .narrowScalarIf(
1046  largerThan(0, 1),
1047  [](const LegalityQuery &Query) {
1048  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1049  });
1050 
1051  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1052  .scalarize(0)
1053  .custom();
1054 
1055  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1056  bool IsLoad) -> bool {
1057  const LLT DstTy = Query.Types[0];
1058 
1059  // Split vector extloads.
1060  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1061 
1062  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1063  return true;
1064 
1065  const LLT PtrTy = Query.Types[1];
1066  unsigned AS = PtrTy.getAddressSpace();
1067  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1068  return true;
1069 
1070  // Catch weird sized loads that don't evenly divide into the access sizes
1071  // TODO: May be able to widen depending on alignment etc.
1072  unsigned NumRegs = (MemSize + 31) / 32;
1073  if (NumRegs == 3) {
1074  if (!ST.hasDwordx3LoadStores())
1075  return true;
1076  } else {
1077  // If the alignment allows, these should have been widened.
1078  if (!isPowerOf2_32(NumRegs))
1079  return true;
1080  }
1081 
1082  return false;
1083  };
1084 
1085  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1086  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1087  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1088 
1089  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1090  // LDS
1091  // TODO: Unsupported flat for SI.
1092 
1093  for (unsigned Op : {G_LOAD, G_STORE}) {
1094  const bool IsStore = Op == G_STORE;
1095 
1096  auto &Actions = getActionDefinitionsBuilder(Op);
1097  // Explicitly list some common cases.
1098  // TODO: Does this help compile time at all?
1099  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1100  {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1101  {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1102  {S64, GlobalPtr, S64, GlobalAlign32},
1103  {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1104  {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1105  {S32, GlobalPtr, S8, GlobalAlign8},
1106  {S32, GlobalPtr, S16, GlobalAlign16},
1107 
1108  {S32, LocalPtr, S32, 32},
1109  {S64, LocalPtr, S64, 32},
1110  {V2S32, LocalPtr, V2S32, 32},
1111  {S32, LocalPtr, S8, 8},
1112  {S32, LocalPtr, S16, 16},
1113  {V2S16, LocalPtr, S32, 32},
1114 
1115  {S32, PrivatePtr, S32, 32},
1116  {S32, PrivatePtr, S8, 8},
1117  {S32, PrivatePtr, S16, 16},
1118  {V2S16, PrivatePtr, S32, 32},
1119 
1120  {S32, ConstantPtr, S32, GlobalAlign32},
1121  {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1122  {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1123  {S64, ConstantPtr, S64, GlobalAlign32},
1124  {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1125  Actions.legalIf(
1126  [=](const LegalityQuery &Query) -> bool {
1127  return isLoadStoreLegal(ST, Query);
1128  });
1129 
1130  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1131  // 64-bits.
1132  //
1133  // TODO: Should generalize bitcast action into coerce, which will also cover
1134  // inserting addrspacecasts.
1135  Actions.customIf(typeIs(1, Constant32Ptr));
1136 
1137  // Turn any illegal element vectors into something easier to deal
1138  // with. These will ultimately produce 32-bit scalar shifts to extract the
1139  // parts anyway.
1140  //
1141  // For odd 16-bit element vectors, prefer to split those into pieces with
1142  // 16-bit vector parts.
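  // e.g. a <4 x s8> load is bitcast to s32, and a <6 x s16> load to <3 x s32>
  // (see getBitcastRegisterType above).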
1143  Actions.bitcastIf(
1144  [=](const LegalityQuery &Query) -> bool {
1145  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1146  Query.MMODescrs[0].MemoryTy);
1147  }, bitcastToRegisterType(0));
1148 
1149  if (!IsStore) {
1150  // Widen suitably aligned loads by loading extra bytes. The standard
1151  // legalization actions can't properly express widening memory operands.
1152  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1153  return shouldWidenLoad(ST, Query, G_LOAD);
1154  });
1155  }
1156 
1157  // FIXME: load/store narrowing should be moved to lower action
1158  Actions
1159  .narrowScalarIf(
1160  [=](const LegalityQuery &Query) -> bool {
1161  return !Query.Types[0].isVector() &&
1162  needToSplitMemOp(Query, Op == G_LOAD);
1163  },
1164  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1165  const LLT DstTy = Query.Types[0];
1166  const LLT PtrTy = Query.Types[1];
1167 
1168  const unsigned DstSize = DstTy.getSizeInBits();
1169  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1170 
1171  // Split extloads.
1172  if (DstSize > MemSize)
1173  return std::make_pair(0, LLT::scalar(MemSize));
1174 
1175  unsigned MaxSize = maxSizeForAddrSpace(ST,
1176  PtrTy.getAddressSpace(),
1177  Op == G_LOAD);
1178  if (MemSize > MaxSize)
1179  return std::make_pair(0, LLT::scalar(MaxSize));
1180 
1181  uint64_t Align = Query.MMODescrs[0].AlignInBits;
1182  return std::make_pair(0, LLT::scalar(Align));
1183  })
1184  .fewerElementsIf(
1185  [=](const LegalityQuery &Query) -> bool {
1186  return Query.Types[0].isVector() &&
1187  needToSplitMemOp(Query, Op == G_LOAD);
1188  },
1189  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1190  const LLT DstTy = Query.Types[0];
1191  const LLT PtrTy = Query.Types[1];
1192 
1193  LLT EltTy = DstTy.getElementType();
1194  unsigned MaxSize = maxSizeForAddrSpace(ST,
1195  PtrTy.getAddressSpace(),
1196  Op == G_LOAD);
1197 
1198  // FIXME: Handle widened to power of 2 results better. This ends
1199  // up scalarizing.
1200  // FIXME: 3 element stores scalarized on SI
1201 
1202  // Split if it's too large for the address space.
1203  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1204  if (MemSize > MaxSize) {
1205  unsigned NumElts = DstTy.getNumElements();
1206  unsigned EltSize = EltTy.getSizeInBits();
1207 
1208  if (MaxSize % EltSize == 0) {
1209  return std::make_pair(
1210      0, LLT::scalarOrVector(
1211          ElementCount::getFixed(MaxSize / EltSize), EltTy));
1212  }
1213 
1214  unsigned NumPieces = MemSize / MaxSize;
1215 
1216  // FIXME: Refine when odd breakdowns handled
1217  // The scalars will need to be re-legalized.
1218  if (NumPieces == 1 || NumPieces >= NumElts ||
1219  NumElts % NumPieces != 0)
1220  return std::make_pair(0, EltTy);
1221 
1222  return std::make_pair(
1223  0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1224  }
1225 
1226  // FIXME: We could probably handle weird extending loads better.
1227  if (DstTy.getSizeInBits() > MemSize)
1228  return std::make_pair(0, EltTy);
1229 
1230  unsigned EltSize = EltTy.getSizeInBits();
1231  unsigned DstSize = DstTy.getSizeInBits();
1232  if (!isPowerOf2_32(DstSize)) {
1233  // We're probably decomposing an odd sized store. Try to split
1234  // to the widest type. TODO: Account for alignment. As-is it
1235  // should be OK, since the new parts will be further legalized.
1236  unsigned FloorSize = PowerOf2Floor(DstSize);
1237  return std::make_pair(
1238      0, LLT::scalarOrVector(
1239          ElementCount::getFixed(FloorSize / EltSize), EltTy));
1240  }
1241 
1242  // May need relegalization for the scalars.
1243  return std::make_pair(0, EltTy);
1244  })
1245  .minScalar(0, S32)
1246  .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1247  .widenScalarToNextPow2(0)
1248  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1249  .lower();
1250  }
1251 
1252  // FIXME: Unaligned accesses not lowered.
1253  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1254  .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1255  {S32, GlobalPtr, S16, 2 * 8},
1256  {S32, LocalPtr, S8, 8},
1257  {S32, LocalPtr, S16, 16},
1258  {S32, PrivatePtr, S8, 8},
1259  {S32, PrivatePtr, S16, 16},
1260  {S32, ConstantPtr, S8, 8},
1261  {S32, ConstantPtr, S16, 2 * 8}})
1262  .legalIf(
1263  [=](const LegalityQuery &Query) -> bool {
1264  return isLoadStoreLegal(ST, Query);
1265  });
1266 
1267  if (ST.hasFlatAddressSpace()) {
1268  ExtLoads.legalForTypesWithMemDesc(
1269  {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1270  }
1271 
1272  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1273  // 64-bits.
1274  //
1275  // TODO: Should generalize bitcast action into coerce, which will also cover
1276  // inserting addrspacecasts.
1277  ExtLoads.customIf(typeIs(1, Constant32Ptr));
1278 
1279  ExtLoads.clampScalar(0, S32, S32)
1280  .widenScalarToNextPow2(0)
1281  .lower();
1282 
1283  auto &Atomics = getActionDefinitionsBuilder(
1284  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287  G_ATOMICRMW_UMIN})
1288  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289  {S64, GlobalPtr}, {S64, LocalPtr},
1290  {S32, RegionPtr}, {S64, RegionPtr}});
1291  if (ST.hasFlatAddressSpace()) {
1292  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293  }
1294 
1295  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296  if (ST.hasLDSFPAtomicAdd()) {
1297  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298  if (ST.hasGFX90AInsts())
1299  Atomic.legalFor({{S64, LocalPtr}});
1300  if (ST.hasGFX940Insts())
1301  Atomic.legalFor({{V2S16, LocalPtr}});
1302  }
1303  if (ST.hasAtomicFaddInsts())
1304  Atomic.legalFor({{S32, GlobalPtr}});
1305 
1306  if (ST.hasGFX90AInsts()) {
1307  // These are legal with some caveats, and should have undergone expansion in
1308  // the IR in most situations
1309  // TODO: Move atomic expansion into legalizer
1310  // TODO: Also supports <2 x f16>
1311  Atomic.legalFor({
1312  {S32, GlobalPtr},
1313  {S64, GlobalPtr},
1314  {S64, FlatPtr}
1315  });
1316  }
1317 
1318  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1319  // demarshalling
1320  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1321  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1322  {S32, FlatPtr}, {S64, FlatPtr}})
1323  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1324  {S32, RegionPtr}, {S64, RegionPtr}});
1325  // TODO: Pointer types, any 32-bit or 64-bit vector
1326 
1327  // Condition should be s32 for scalar, s1 for vector.
1328  getActionDefinitionsBuilder(G_SELECT)
1329  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1330  LocalPtr, FlatPtr, PrivatePtr,
1331  LLT::fixed_vector(2, LocalPtr),
1332  LLT::fixed_vector(2, PrivatePtr)},
1333  {S1, S32})
1334  .clampScalar(0, S16, S64)
1335  .scalarize(1)
1336  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1337  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1338  .clampMaxNumElements(0, S32, 2)
1339  .clampMaxNumElements(0, LocalPtr, 2)
1340  .clampMaxNumElements(0, PrivatePtr, 2)
1341  .scalarize(0)
1342  .widenScalarToNextPow2(0)
1343  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1344 
1345  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1346  // be more flexible with the shift amount type.
1347  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1348  .legalFor({{S32, S32}, {S64, S32}});
1349  if (ST.has16BitInsts()) {
1350  if (ST.hasVOP3PInsts()) {
1351  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1352  .clampMaxNumElements(0, S16, 2);
1353  } else
1354  Shifts.legalFor({{S16, S16}});
1355 
1356  // TODO: Support 16-bit shift amounts for all types
1357  Shifts.widenScalarIf(
1358  [=](const LegalityQuery &Query) {
1359  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1360  // 32-bit amount.
1361  const LLT ValTy = Query.Types[0];
1362  const LLT AmountTy = Query.Types[1];
1363  return ValTy.getSizeInBits() <= 16 &&
1364  AmountTy.getSizeInBits() < 16;
1365  }, changeTo(1, S16));
1366  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1367  Shifts.clampScalar(1, S32, S32);
1368  Shifts.widenScalarToNextPow2(0, 16);
1369  Shifts.clampScalar(0, S16, S64);
1370 
1371  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1372  .minScalar(0, S16)
1373  .scalarize(0)
1374  .lower();
1375  } else {
1376  // Make sure we legalize the shift amount type first, as the general
1377  // expansion for the shifted type will produce much worse code if it hasn't
1378  // been truncated already.
1379  Shifts.clampScalar(1, S32, S32);
1380  Shifts.widenScalarToNextPow2(0, 32);
1381  Shifts.clampScalar(0, S32, S64);
1382 
1383  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1384  .minScalar(0, S32)
1385  .scalarize(0)
1386  .lower();
1387  }
1388  Shifts.scalarize(0);
1389 
1390  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1391  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1392  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1393  unsigned IdxTypeIdx = 2;
1394 
1395  getActionDefinitionsBuilder(Op)
1396  .customIf([=](const LegalityQuery &Query) {
1397  const LLT EltTy = Query.Types[EltTypeIdx];
1398  const LLT VecTy = Query.Types[VecTypeIdx];
1399  const LLT IdxTy = Query.Types[IdxTypeIdx];
1400  const unsigned EltSize = EltTy.getSizeInBits();
1401  return (EltSize == 32 || EltSize == 64) &&
1402  VecTy.getSizeInBits() % 32 == 0 &&
1403  VecTy.getSizeInBits() <= MaxRegisterSize &&
1404  IdxTy.getSizeInBits() == 32;
1405  })
1406  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1407  bitcastToVectorElement32(VecTypeIdx))
1408  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1409  .bitcastIf(
1410  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1411  [=](const LegalityQuery &Query) {
1412  // For > 64-bit element types, try to turn this into a 64-bit
1413  // element vector since we may be able to do better indexing
1414  // if this is scalar. If not, fall back to 32.
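  // e.g. <2 x s128> (256 bits) becomes <4 x s64>, while <3 x s96> (288 bits)
  // falls back to <9 x s32>.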
1415  const LLT EltTy = Query.Types[EltTypeIdx];
1416  const LLT VecTy = Query.Types[VecTypeIdx];
1417  const unsigned DstEltSize = EltTy.getSizeInBits();
1418  const unsigned VecSize = VecTy.getSizeInBits();
1419 
1420  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1421  return std::make_pair(
1422  VecTypeIdx,
1423  LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1424  })
1425  .clampScalar(EltTypeIdx, S32, S64)
1426  .clampScalar(VecTypeIdx, S32, S64)
1427  .clampScalar(IdxTypeIdx, S32, S32)
1428  .clampMaxNumElements(VecTypeIdx, S32, 32)
1429  // TODO: Clamp elements for 64-bit vectors?
1430  // It should only be necessary with variable indexes.
1431  // As a last resort, lower to the stack
1432  .lower();
1433  }
1434 
1435  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436  .unsupportedIf([=](const LegalityQuery &Query) {
1437  const LLT &EltTy = Query.Types[1].getElementType();
1438  return Query.Types[0] != EltTy;
1439  });
1440 
1441  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444 
1445  // FIXME: Doesn't handle extract of illegal sizes.
1446  getActionDefinitionsBuilder(Op)
1447  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448  .lowerIf([=](const LegalityQuery &Query) {
1449  // Sub-vector (or single element) insert and extract.
1450  // TODO: verify immediate offset here since lower only works with
1451  // whole elements.
1452  const LLT BigTy = Query.Types[BigTyIdx];
1453  return BigTy.isVector();
1454  })
1455  // FIXME: Multiples of 16 should not be legal.
1456  .legalIf([=](const LegalityQuery &Query) {
1457  const LLT BigTy = Query.Types[BigTyIdx];
1458  const LLT LitTy = Query.Types[LitTyIdx];
1459  return (BigTy.getSizeInBits() % 32 == 0) &&
1460  (LitTy.getSizeInBits() % 16 == 0);
1461  })
1462  .widenScalarIf(
1463  [=](const LegalityQuery &Query) {
1464  const LLT BigTy = Query.Types[BigTyIdx];
1465  return (BigTy.getScalarSizeInBits() < 16);
1466  },
1467  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1468  .widenScalarIf(
1469  [=](const LegalityQuery &Query) {
1470  const LLT LitTy = Query.Types[LitTyIdx];
1471  return (LitTy.getScalarSizeInBits() < 16);
1472  },
1473  LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1474  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1475  .widenScalarToNextPow2(BigTyIdx, 32);
1476 
1477  }
1478 
1479  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1480  .legalForCartesianProduct(AllS32Vectors, {S32})
1481  .legalForCartesianProduct(AllS64Vectors, {S64})
1482  .clampNumElements(0, V16S32, V32S32)
1483  .clampNumElements(0, V2S64, V16S64)
1484  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1485 
1486  if (ST.hasScalarPackInsts()) {
1487  BuildVector
1488  // FIXME: Should probably widen s1 vectors straight to s32
1489  .minScalarOrElt(0, S16)
1490  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1491  .minScalar(1, S32);
1492 
1493  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1494  .legalFor({V2S16, S32})
1495  .lower();
1496  BuildVector.minScalarOrElt(0, S32);
1497  } else {
1498  BuildVector.customFor({V2S16, S16});
1499  BuildVector.minScalarOrElt(0, S32);
1500 
1501  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1502  .customFor({V2S16, S32})
1503  .lower();
1504  }
1505 
1506  BuildVector.legalIf(isRegisterType(0));
1507 
1508  // FIXME: Clamp maximum size
1509  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1510  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1511  .clampMaxNumElements(0, S32, 32)
1512  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1513  .clampMaxNumElements(0, S16, 64);
1514 
1515  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1516  // pre-legalize.
1517  if (ST.hasVOP3PInsts()) {
1518  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1519  .customFor({V2S16, V2S16})
1520  .lower();
1521  } else
1522  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1523 
1524  // Merge/Unmerge
1525  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1526  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1527  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1528 
1529  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1530  const LLT Ty = Query.Types[TypeIdx];
1531  if (Ty.isVector()) {
1532  const LLT &EltTy = Ty.getElementType();
1533  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1534  return true;
1535  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1536  return true;
1537  }
1538  return false;
1539  };
1540 
1541  auto &Builder = getActionDefinitionsBuilder(Op)
1542  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1543  .lowerFor({{S16, V2S16}})
1544  .lowerIf([=](const LegalityQuery &Query) {
1545  const LLT BigTy = Query.Types[BigTyIdx];
1546  return BigTy.getSizeInBits() == 32;
1547  })
1548  // Try to widen to s16 first for small types.
1549  // TODO: Only do this on targets with legal s16 shifts
1550  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1551  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1552  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1553  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1554  elementTypeIs(1, S16)),
1555  changeTo(1, V2S16))
1556  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1557  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1558  // valid.
1559  .clampScalar(LitTyIdx, S32, S512)
1560  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1561  // Break up vectors with weird elements into scalars
1562  .fewerElementsIf(
1563  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1564  scalarize(0))
1565  .fewerElementsIf(
1566  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1567  scalarize(1))
1568  .clampScalar(BigTyIdx, S32, MaxScalar);
1569 
1570  if (Op == G_MERGE_VALUES) {
1571  Builder.widenScalarIf(
1572  // TODO: Use 16-bit shifts if legal for 8-bit values?
1573  [=](const LegalityQuery &Query) {
1574  const LLT Ty = Query.Types[LitTyIdx];
1575  return Ty.getSizeInBits() < 32;
1576  },
1577  changeTo(LitTyIdx, S32));
1578  }
1579 
1580  Builder.widenScalarIf(
1581  [=](const LegalityQuery &Query) {
1582  const LLT Ty = Query.Types[BigTyIdx];
1583  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1584  Ty.getSizeInBits() % 16 != 0;
1585  },
1586  [=](const LegalityQuery &Query) {
1587  // Pick the next power of 2, or a multiple of 64 for sizes over 128,
1588  // whichever is smaller.
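  // e.g. s24 widens to s32, while s300 widens to s320 (alignTo<64>(301) = 320,
  // which beats the next power of 2, s512).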
1589  const LLT &Ty = Query.Types[BigTyIdx];
1590  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1591  if (NewSizeInBits >= 256) {
1592  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1593  if (RoundedTo < NewSizeInBits)
1594  NewSizeInBits = RoundedTo;
1595  }
1596  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1597  })
1598  // Any vectors left are the wrong size. Scalarize them.
1599  .scalarize(0)
1600  .scalarize(1);
1601  }
1602 
1603  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1604  // RegBankSelect.
1605  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1606  .legalFor({{S32}, {S64}});
1607 
1608  if (ST.hasVOP3PInsts()) {
1609  SextInReg.lowerFor({{V2S16}})
1610  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1611  // get more vector shift opportunities, since we'll get those when
1612  // expanded.
1613  .clampMaxNumElementsStrict(0, S16, 2);
1614  } else if (ST.has16BitInsts()) {
1615  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1616  } else {
1617  // Prefer to promote to s32 before lowering if we don't have 16-bit
1618  // shifts. This avoids a lot of intermediate truncate and extend operations.
1619  SextInReg.lowerFor({{S32}, {S64}});
1620  }
1621 
1622  SextInReg
1623  .scalarize(0)
1624  .clampScalar(0, S32, S64)
1625  .lower();
1626 
1627  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1628  .scalarize(0)
1629  .lower();
1630 
1631  // TODO: Only try to form v2s16 with legal packed instructions.
1632  getActionDefinitionsBuilder(G_FSHR)
1633  .legalFor({{S32, S32}})
1634  .lowerFor({{V2S16, V2S16}})
1635  .clampMaxNumElementsStrict(0, S16, 2)
1636  .scalarize(0)
1637  .lower();
1638 
1639  if (ST.hasVOP3PInsts()) {
1640  getActionDefinitionsBuilder(G_FSHL)
1641  .lowerFor({{V2S16, V2S16}})
1642  .clampMaxNumElementsStrict(0, S16, 2)
1643  .scalarize(0)
1644  .lower();
1645  } else {
1646  getActionDefinitionsBuilder(G_FSHL)
1647  .scalarize(0)
1648  .lower();
1649  }
1650 
1651  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1652  .legalFor({S64});
1653 
1654  getActionDefinitionsBuilder(G_FENCE)
1655  .alwaysLegal();
1656 
1657  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1658  .scalarize(0)
1659  .minScalar(0, S32)
1660  .lower();
1661 
1662  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1663  .legalFor({{S32, S32}, {S64, S32}})
1664  .clampScalar(1, S32, S32)
1665  .clampScalar(0, S32, S64)
1666  .widenScalarToNextPow2(0)
1667  .scalarize(0);
1668 
1669  getActionDefinitionsBuilder({
1670  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1671  G_FCOPYSIGN,
1672 
1673  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1674  G_ATOMICRMW_NAND,
1675  G_ATOMICRMW_FSUB,
1676  G_READ_REGISTER,
1677  G_WRITE_REGISTER,
1678 
1679  G_SADDO, G_SSUBO,
1680 
1681  // TODO: Implement
1682  G_FMINIMUM, G_FMAXIMUM}).lower();
1683 
1684  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1685  .lower();
1686 
1687  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1688  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1689  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1690  .unsupported();
1691 
1692  getLegacyLegalizerInfo().computeTables();
1693  verify(*ST.getInstrInfo());
1694 }
1695 
1696 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1697  MachineInstr &MI) const {
1698  MachineIRBuilder &B = Helper.MIRBuilder;
1699  MachineRegisterInfo &MRI = *B.getMRI();
1700 
1701  switch (MI.getOpcode()) {
1702  case TargetOpcode::G_ADDRSPACE_CAST:
1703  return legalizeAddrSpaceCast(MI, MRI, B);
1704  case TargetOpcode::G_FRINT:
1705  return legalizeFrint(MI, MRI, B);
1706  case TargetOpcode::G_FCEIL:
1707  return legalizeFceil(MI, MRI, B);
1708  case TargetOpcode::G_FREM:
1709  return legalizeFrem(MI, MRI, B);
1710  case TargetOpcode::G_INTRINSIC_TRUNC:
1711  return legalizeIntrinsicTrunc(MI, MRI, B);
1712  case TargetOpcode::G_SITOFP:
1713  return legalizeITOFP(MI, MRI, B, true);
1714  case TargetOpcode::G_UITOFP:
1715  return legalizeITOFP(MI, MRI, B, false);
1716  case TargetOpcode::G_FPTOSI:
1717  return legalizeFPTOI(MI, MRI, B, true);
1718  case TargetOpcode::G_FPTOUI:
1719  return legalizeFPTOI(MI, MRI, B, false);
1720  case TargetOpcode::G_FMINNUM:
1721  case TargetOpcode::G_FMAXNUM:
1722  case TargetOpcode::G_FMINNUM_IEEE:
1723  case TargetOpcode::G_FMAXNUM_IEEE:
1724  return legalizeMinNumMaxNum(Helper, MI);
1725  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1726  return legalizeExtractVectorElt(MI, MRI, B);
1727  case TargetOpcode::G_INSERT_VECTOR_ELT:
1728  return legalizeInsertVectorElt(MI, MRI, B);
1729  case TargetOpcode::G_SHUFFLE_VECTOR:
1730  return legalizeShuffleVector(MI, MRI, B);
1731  case TargetOpcode::G_FSIN:
1732  case TargetOpcode::G_FCOS:
1733  return legalizeSinCos(MI, MRI, B);
1734  case TargetOpcode::G_GLOBAL_VALUE:
1735  return legalizeGlobalValue(MI, MRI, B);
1736  case TargetOpcode::G_LOAD:
1737  case TargetOpcode::G_SEXTLOAD:
1738  case TargetOpcode::G_ZEXTLOAD:
1739  return legalizeLoad(Helper, MI);
1740  case TargetOpcode::G_FMAD:
1741  return legalizeFMad(MI, MRI, B);
1742  case TargetOpcode::G_FDIV:
1743  return legalizeFDIV(MI, MRI, B);
1744  case TargetOpcode::G_UDIV:
1745  case TargetOpcode::G_UREM:
1746  case TargetOpcode::G_UDIVREM:
1747  return legalizeUnsignedDIV_REM(MI, MRI, B);
1748  case TargetOpcode::G_SDIV:
1749  case TargetOpcode::G_SREM:
1750  case TargetOpcode::G_SDIVREM:
1751  return legalizeSignedDIV_REM(MI, MRI, B);
1752  case TargetOpcode::G_ATOMIC_CMPXCHG:
1753  return legalizeAtomicCmpXChg(MI, MRI, B);
1754  case TargetOpcode::G_FLOG:
1755  return legalizeFlog(MI, B, numbers::ln2f);
 1756  case TargetOpcode::G_FLOG10:
 1757  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
 1758  case TargetOpcode::G_FEXP:
1759  return legalizeFExp(MI, B);
1760  case TargetOpcode::G_FPOW:
1761  return legalizeFPow(MI, B);
1762  case TargetOpcode::G_FFLOOR:
1763  return legalizeFFloor(MI, MRI, B);
1764  case TargetOpcode::G_BUILD_VECTOR:
1765  return legalizeBuildVector(MI, MRI, B);
1766  case TargetOpcode::G_CTLZ:
1767  case TargetOpcode::G_CTTZ:
1768  return legalizeCTLZ_CTTZ(MI, MRI, B);
1769  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1770  return legalizeFPTruncRound(MI, B);
1771  default:
1772  return false;
1773  }
1774 
1775  llvm_unreachable("expected switch to return");
1776 }
1777 
 1778 Register AMDGPULegalizerInfo::getSegmentAperture(
 1779  unsigned AS,
 1780  MachineRegisterInfo &MRI,
 1781  MachineIRBuilder &B) const {
1782  MachineFunction &MF = B.getMF();
1783  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1784  const LLT S32 = LLT::scalar(32);
 1785 
 1786  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
 1787 
1788  if (ST.hasApertureRegs()) {
1789  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1790  // getreg.
 1791  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
 1792  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
 1793  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
 1794  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
 1795  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
 1796  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
 1797  unsigned Encoding =
 1798  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1799  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1800  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1801 
1802  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1803 
1804  B.buildInstr(AMDGPU::S_GETREG_B32)
1805  .addDef(GetReg)
1806  .addImm(Encoding);
1807  MRI.setType(GetReg, S32);
1808 
1809  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1810  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1811  }
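// The S_GETREG_B32 above reads the 16-bit aperture base out of the
// memory-bases hardware register (the shared and private bases occupy
// different bit ranges of it), and the shift by WidthM1 + 1 == 16 moves that
// base into the high half of the 32-bit aperture value used below.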
1812 
1813  // TODO: can we be smarter about machine pointer info?
1817  // For code object version 5, private_base and shared_base are passed through
1818  // implicit kernargs.
1823  uint64_t Offset =
1824  ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1825 
1826  Register KernargPtrReg = MRI.createGenericVirtualRegister(
1828 
1829  if (!loadInputValue(KernargPtrReg, B,
1831  return Register();
1832 
1834  PtrInfo,
1837  LLT::scalar(32), commonAlignment(Align(64), Offset));
1838 
1839  // Pointer address
1840  B.buildPtrAdd(LoadAddr, KernargPtrReg,
1841  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1842  // Load address
1843  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1844  }
1845 
1848 
1850  return Register();
1851 
1852  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1853  // private_segment_aperture_base_hi.
1854  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1855 
1857  PtrInfo,
1860  LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1861 
1862  B.buildPtrAdd(LoadAddr, QueuePtr,
1863  B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1864  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1865 }
1866 
1867 /// Return true if the value is a known valid address, such that a null check is
1868 /// not necessary.
 1869 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
 1870  const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1871  MachineInstr *Def = MRI.getVRegDef(Val);
1872  switch (Def->getOpcode()) {
1873  case AMDGPU::G_FRAME_INDEX:
1874  case AMDGPU::G_GLOBAL_VALUE:
1875  case AMDGPU::G_BLOCK_ADDR:
1876  return true;
1877  case AMDGPU::G_CONSTANT: {
1878  const ConstantInt *CI = Def->getOperand(1).getCImm();
1879  return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1880  }
1881  default:
1882  return false;
1883  }
1884 
1885  return false;
1886 }
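// For example, a frame index or global address can never equal the null value
// of its address space (on AMDGPU, -1 for the private and local segments), so
// addrspacecasts of such values can skip the compare-and-select against null
// in the lowering below.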
1887 
 1888 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
 1889  MachineInstr &MI, MachineRegisterInfo &MRI,
 1890  MachineIRBuilder &B) const {
1891  MachineFunction &MF = B.getMF();
1892 
1893  const LLT S32 = LLT::scalar(32);
1894  Register Dst = MI.getOperand(0).getReg();
1895  Register Src = MI.getOperand(1).getReg();
1896 
1897  LLT DstTy = MRI.getType(Dst);
1898  LLT SrcTy = MRI.getType(Src);
1899  unsigned DestAS = DstTy.getAddressSpace();
1900  unsigned SrcAS = SrcTy.getAddressSpace();
1901 
1902  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1903  // vector element.
1904  assert(!DstTy.isVector());
1905 
1906  const AMDGPUTargetMachine &TM
1907  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1908 
1909  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1910  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1911  return true;
1912  }
1913 
1914  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1915  (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1916  DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1917  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1918  // Extract low 32-bits of the pointer.
1919  B.buildExtract(Dst, Src, 0);
1920  MI.eraseFromParent();
1921  return true;
1922  }
1923 
1924  unsigned NullVal = TM.getNullPointerValue(DestAS);
1925 
1926  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1927  auto FlatNull = B.buildConstant(SrcTy, 0);
1928 
1929  // Extract low 32-bits of the pointer.
1930  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1931 
1932  auto CmpRes =
1933  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1934  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1935 
1936  MI.eraseFromParent();
1937  return true;
1938  }
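// In pseudocode, the general flat -> local/private case above is:
//   lo  = extract(src, 0)              ; low 32 bits of the flat pointer
//   dst = (src != 0) ? lo : segment_null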
1939 
1940  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1941  (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1942  SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1943  if (!ST.hasFlatAddressSpace())
1944  return false;
1945 
1946  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1947  if (!ApertureReg.isValid())
1948  return false;
1949 
1950  // Coerce the type of the low half of the result so we can use merge_values.
1951  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1952 
1953  // TODO: Should we allow mismatched types but matching sizes in merges to
1954  // avoid the ptrtoint?
1955  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1956 
1957  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1958  B.buildCopy(Dst, BuildPtr);
1959  MI.eraseFromParent();
1960  return true;
1961  }
1962 
1963  auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1964  auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1965 
1966  auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
1967  SegmentNull.getReg(0));
1968 
1969  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1970 
1971  MI.eraseFromParent();
1972  return true;
1973  }
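// In pseudocode, the general local/private -> flat case above is:
//   dst = (src != segment_null) ? merge(src_as_i32, aperture) : 0
// where 'aperture' supplies the high 32 bits of the segment's flat range.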
1974 
1975  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
1976  SrcTy.getSizeInBits() == 64) {
1977  // Truncate.
1978  B.buildExtract(Dst, Src, 0);
1979  MI.eraseFromParent();
1980  return true;
1981  }
1982 
1983  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
 1984  DstTy.getSizeInBits() == 64) {
 1985  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 1986  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1987 
1988  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1989  // another. Merge operands are required to be the same type, but creating an
1990  // extra ptrtoint would be kind of pointless.
 1991  auto HighAddr = B.buildConstant(
 1992  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
 1993  B.buildMerge(Dst, {Src, HighAddr});
1994  MI.eraseFromParent();
1995  return true;
1996  }
1997 
1998  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1999  MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2000 
2001  LLVMContext &Ctx = MF.getFunction().getContext();
2002  Ctx.diagnose(InvalidAddrSpaceCast);
2003  B.buildUndef(Dst);
2004  MI.eraseFromParent();
2005  return true;
2006 }
2007 
2010  MachineIRBuilder &B) const {
2011  Register Src = MI.getOperand(1).getReg();
2012  LLT Ty = MRI.getType(Src);
2013  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2014 
2015  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2016  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2017 
2018  auto C1 = B.buildFConstant(Ty, C1Val);
2019  auto CopySign = B.buildFCopysign(Ty, C1, Src);
2020 
2021  // TODO: Should this propagate fast-math-flags?
2022  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2023  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2024 
2025  auto C2 = B.buildFConstant(Ty, C2Val);
2026  auto Fabs = B.buildFAbs(Ty, Src);
2027 
2028  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2029  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2030  MI.eraseFromParent();
2031  return true;
2032 }
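// The 0x1.0p+52 add/sub trick: for |x| < 2^52 a double cannot keep any
// fraction bits once 2^52 is added (the 52-bit mantissa is consumed by the
// integer part), so the add rounds x to an integer and the subtract recovers
// that integer. Magnitudes above the 0x1.fffffffffffffp+51 threshold are
// already integral and are passed through by the final select.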
2033 
2036  MachineIRBuilder &B) const {
2037 
2038  const LLT S1 = LLT::scalar(1);
2039  const LLT S64 = LLT::scalar(64);
2040 
2041  Register Src = MI.getOperand(1).getReg();
2042  assert(MRI.getType(Src) == S64);
2043 
2044  // result = trunc(src)
2045  // if (src > 0.0 && src != result)
2046  // result += 1.0
2047 
2048  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2049 
2050  const auto Zero = B.buildFConstant(S64, 0.0);
2051  const auto One = B.buildFConstant(S64, 1.0);
2052  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2053  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2054  auto And = B.buildAnd(S1, Lt0, NeTrunc);
2055  auto Add = B.buildSelect(S64, And, One, Zero);
2056 
2057  // TODO: Should this propagate fast-math-flags?
2058  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2059  MI.eraseFromParent();
2060  return true;
2061 }
2062 
2065  MachineIRBuilder &B) const {
2066  Register DstReg = MI.getOperand(0).getReg();
2067  Register Src0Reg = MI.getOperand(1).getReg();
2068  Register Src1Reg = MI.getOperand(2).getReg();
2069  auto Flags = MI.getFlags();
2070  LLT Ty = MRI.getType(DstReg);
2071 
2072  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2073  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2074  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2075  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2076  MI.eraseFromParent();
2077  return true;
2078 }
2079 
2081  MachineIRBuilder &B) {
2082  const unsigned FractBits = 52;
2083  const unsigned ExpBits = 11;
2084  LLT S32 = LLT::scalar(32);
2085 
2086  auto Const0 = B.buildConstant(S32, FractBits - 32);
2087  auto Const1 = B.buildConstant(S32, ExpBits);
2088 
2089  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2090  .addUse(Hi)
2091  .addUse(Const0.getReg(0))
2092  .addUse(Const1.getReg(0));
2093 
2094  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2095 }
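// The ubfe above pulls the 11 exponent bits out of the high dword: they start
// at bit 52 - 32 == 20 of Hi, and subtracting 1023 removes the IEEE-754
// double-precision exponent bias.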
2096 
2099  MachineIRBuilder &B) const {
2100  const LLT S1 = LLT::scalar(1);
2101  const LLT S32 = LLT::scalar(32);
2102  const LLT S64 = LLT::scalar(64);
2103 
2104  Register Src = MI.getOperand(1).getReg();
2105  assert(MRI.getType(Src) == S64);
2106 
2107  // TODO: Should this use extract since the low half is unused?
2108  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2109  Register Hi = Unmerge.getReg(1);
2110 
2111  // Extract the upper half, since this is where we will find the sign and
2112  // exponent.
2113  auto Exp = extractF64Exponent(Hi, B);
2114 
2115  const unsigned FractBits = 52;
2116 
2117  // Extract the sign bit.
2118  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2119  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2120 
2121  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2122 
2123  const auto Zero32 = B.buildConstant(S32, 0);
2124 
2125  // Extend back to 64-bits.
2126  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2127 
2128  auto Shr = B.buildAShr(S64, FractMask, Exp);
2129  auto Not = B.buildNot(S64, Shr);
2130  auto Tmp0 = B.buildAnd(S64, Src, Not);
2131  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2132 
2133  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2134  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2135 
2136  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2137  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2138  MI.eraseFromParent();
2139  return true;
2140 }
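// Roughly: an exponent below 0 means |src| < 1, so only the sign survives
// (SignBit64); an exponent above 51 means the value is already integral and
// src is returned unchanged; otherwise the low (52 - exp) fraction bits are
// cleared via the shifted FractMask.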
2141 
2144  MachineIRBuilder &B, bool Signed) const {
2145 
2146  Register Dst = MI.getOperand(0).getReg();
2147  Register Src = MI.getOperand(1).getReg();
2148 
2149  const LLT S64 = LLT::scalar(64);
2150  const LLT S32 = LLT::scalar(32);
2151 
2152  assert(MRI.getType(Src) == S64);
2153 
2154  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2155  auto ThirtyTwo = B.buildConstant(S32, 32);
2156 
2157  if (MRI.getType(Dst) == S64) {
2158  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2159  : B.buildUITOFP(S64, Unmerge.getReg(1));
2160 
2161  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2162  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2163  .addUse(CvtHi.getReg(0))
2164  .addUse(ThirtyTwo.getReg(0));
2165 
2166  // TODO: Should this propagate fast-math-flags?
2167  B.buildFAdd(Dst, LdExp, CvtLo);
2168  MI.eraseFromParent();
2169  return true;
2170  }
2171 
2172  assert(MRI.getType(Dst) == S32);
2173 
2174  auto One = B.buildConstant(S32, 1);
2175 
2176  MachineInstrBuilder ShAmt;
2177  if (Signed) {
2178  auto ThirtyOne = B.buildConstant(S32, 31);
2179  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2180  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2181  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2182  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2183  /*HasSideEffects=*/false)
2184  .addUse(Unmerge.getReg(1));
2185  auto LS2 = B.buildSub(S32, LS, One);
2186  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2187  } else
2188  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2189  auto Norm = B.buildShl(S64, Src, ShAmt);
2190  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2191  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2192  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2193  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2194  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2195  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2196  /*HasSideEffects=*/false)
2197  .addUse(FVal.getReg(0))
2198  .addUse(Scale.getReg(0));
2199  MI.eraseFromParent();
2200  return true;
2201 }
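// For the s64 -> s32 path above: the source is shifted left so its leading
// significant bit lands in the high word, a sticky bit from the discarded low
// word (the umin with 1) is OR'd in so rounding still sees dropped bits, the
// high word is converted, and ldexp rescales the result by 32 - shift.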
2202 
2203 // TODO: Copied from DAG implementation. Verify logic and document how this
2204 // actually works.
2208  bool Signed) const {
2209 
2210  Register Dst = MI.getOperand(0).getReg();
2211  Register Src = MI.getOperand(1).getReg();
2212 
2213  const LLT S64 = LLT::scalar(64);
2214  const LLT S32 = LLT::scalar(32);
2215 
2216  const LLT SrcLT = MRI.getType(Src);
2217  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2218 
2219  unsigned Flags = MI.getFlags();
2220 
2221  // The basic idea of converting a floating point number into a pair of 32-bit
2222  // integers is illustrated as follows:
2223  //
2224  // tf := trunc(val);
2225  // hif := floor(tf * 2^-32);
2226  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2227  // hi := fptoi(hif);
2228  // lo := fptoi(lof);
2229  //
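// For example, with val = 2^33 + 7:
//   tf  = 2^33 + 7
//   hif = floor(tf * 2^-32) = 2,  lof = tf - 2 * 2^32 = 7
//   hi  = 2, lo = 7, and merging {lo, hi} reassembles the original value.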
2230  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2231  MachineInstrBuilder Sign;
2232  if (Signed && SrcLT == S32) {
 2233  // However, a 32-bit floating point number has only a 23-bit mantissa, which
 2234  // is not enough to hold all the significant bits of `lof` if val is
 2235  // negative. To avoid the loss of precision, we need to take the absolute
2236  // value after truncating and flip the result back based on the original
2237  // signedness.
2238  Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2239  Trunc = B.buildFAbs(S32, Trunc, Flags);
2240  }
2241  MachineInstrBuilder K0, K1;
2242  if (SrcLT == S64) {
2243  K0 = B.buildFConstant(S64,
2244  BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2245  K1 = B.buildFConstant(S64,
2246  BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2247  } else {
2248  K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2249  K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2250  }
2251 
2252  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2253  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2254  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2255 
2256  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2257  : B.buildFPTOUI(S32, FloorMul);
2258  auto Lo = B.buildFPTOUI(S32, Fma);
2259 
2260  if (Signed && SrcLT == S32) {
2261  // Flip the result based on the signedness, which is either all 0s or 1s.
2262  Sign = B.buildMerge(S64, {Sign, Sign});
2263  // r := xor({lo, hi}, sign) - sign;
2264  B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2265  } else
2266  B.buildMerge(Dst, {Lo, Hi});
2267  MI.eraseFromParent();
2268 
2269  return true;
2270 }
2271 
2273  MachineInstr &MI) const {
2274  MachineFunction &MF = Helper.MIRBuilder.getMF();
 2275  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 2276 
2277  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2278  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2279 
2280  // With ieee_mode disabled, the instructions have the correct behavior
2281  // already for G_FMINNUM/G_FMAXNUM
2282  if (!MFI->getMode().IEEE)
2283  return !IsIEEEOp;
2284 
2285  if (IsIEEEOp)
2286  return true;
 2287 
 2288  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
 2289 }
2290 
2293  MachineIRBuilder &B) const {
2294  // TODO: Should move some of this into LegalizerHelper.
2295 
2296  // TODO: Promote dynamic indexing of s16 to s32
2297 
2298  // FIXME: Artifact combiner probably should have replaced the truncated
2299  // constant before this, so we shouldn't need
2300  // getIConstantVRegValWithLookThrough.
2301  Optional<ValueAndVReg> MaybeIdxVal =
2302  getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2303  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2304  return true;
2305  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2306 
2307  Register Dst = MI.getOperand(0).getReg();
2308  Register Vec = MI.getOperand(1).getReg();
2309 
2310  LLT VecTy = MRI.getType(Vec);
2311  LLT EltTy = VecTy.getElementType();
2312  assert(EltTy == MRI.getType(Dst));
2313 
2314  if (IdxVal < VecTy.getNumElements()) {
2315  auto Unmerge = B.buildUnmerge(EltTy, Vec);
2316  B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2317  } else {
2318  B.buildUndef(Dst);
2319  }
2320 
2321  MI.eraseFromParent();
2322  return true;
2323 }
2324 
2327  MachineIRBuilder &B) const {
2328  // TODO: Should move some of this into LegalizerHelper.
2329 
2330  // TODO: Promote dynamic indexing of s16 to s32
2331 
2332  // FIXME: Artifact combiner probably should have replaced the truncated
2333  // constant before this, so we shouldn't need
2334  // getIConstantVRegValWithLookThrough.
2335  Optional<ValueAndVReg> MaybeIdxVal =
2336  getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2337  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2338  return true;
2339 
2340  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2341  Register Dst = MI.getOperand(0).getReg();
2342  Register Vec = MI.getOperand(1).getReg();
2343  Register Ins = MI.getOperand(2).getReg();
2344 
2345  LLT VecTy = MRI.getType(Vec);
2346  LLT EltTy = VecTy.getElementType();
2347  assert(EltTy == MRI.getType(Ins));
2348  (void)Ins;
2349 
2350  unsigned NumElts = VecTy.getNumElements();
2351  if (IdxVal < NumElts) {
2352  SmallVector<Register, 8> SrcRegs;
2353  for (unsigned i = 0; i < NumElts; ++i)
2354  SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2355  B.buildUnmerge(SrcRegs, Vec);
2356 
2357  SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2358  B.buildMerge(Dst, SrcRegs);
2359  } else {
2360  B.buildUndef(Dst);
2361  }
2362 
2363  MI.eraseFromParent();
2364  return true;
2365 }
2366 
2369  MachineIRBuilder &B) const {
2370  const LLT V2S16 = LLT::fixed_vector(2, 16);
2371 
2372  Register Dst = MI.getOperand(0).getReg();
2373  Register Src0 = MI.getOperand(1).getReg();
2374  LLT DstTy = MRI.getType(Dst);
2375  LLT SrcTy = MRI.getType(Src0);
2376 
2377  if (SrcTy == V2S16 && DstTy == V2S16 &&
2378  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2379  return true;
2380 
2381  MachineIRBuilder HelperBuilder(MI);
2382  GISelObserverWrapper DummyObserver;
2383  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
 2384  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
 2385 }
2386 
2389  MachineIRBuilder &B) const {
2390 
2391  Register DstReg = MI.getOperand(0).getReg();
2392  Register SrcReg = MI.getOperand(1).getReg();
2393  LLT Ty = MRI.getType(DstReg);
2394  unsigned Flags = MI.getFlags();
2395 
2396  Register TrigVal;
2397  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2398  if (ST.hasTrigReducedRange()) {
2399  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2400  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2401  .addUse(MulVal.getReg(0))
2402  .setMIFlags(Flags).getReg(0);
2403  } else
2404  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2405 
2406  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2407  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2408  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2409  .addUse(TrigVal)
2410  .setMIFlags(Flags);
2411  MI.eraseFromParent();
2412  return true;
2413 }
2414 
2417  const GlobalValue *GV,
2418  int64_t Offset,
2419  unsigned GAFlags) const {
2420  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2421  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2422  // to the following code sequence:
2423  //
2424  // For constant address space:
2425  // s_getpc_b64 s[0:1]
2426  // s_add_u32 s0, s0, $symbol
2427  // s_addc_u32 s1, s1, 0
2428  //
2429  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2430  // a fixup or relocation is emitted to replace $symbol with a literal
2431  // constant, which is a pc-relative offset from the encoding of the $symbol
2432  // operand to the global variable.
2433  //
2434  // For global address space:
2435  // s_getpc_b64 s[0:1]
2436  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2437  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2438  //
2439  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2440  // fixups or relocations are emitted to replace $symbol@*@lo and
2441  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2442  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2443  // operand to the global variable.
2444  //
2445  // What we want here is an offset from the value returned by s_getpc
2446  // (which is the address of the s_add_u32 instruction) to the global
2447  // variable, but since the encoding of $symbol starts 4 bytes after the start
2448  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2449  // small. This requires us to add 4 to the global variable offset in order to
2450  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2451  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2452  // instruction.
2453 
2454  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2455 
2456  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2457  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2458 
2459  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2460  .addDef(PCReg);
2461 
2462  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2463  if (GAFlags == SIInstrInfo::MO_NONE)
2464  MIB.addImm(0);
2465  else
2466  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2467 
2468  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2469 
2470  if (PtrTy.getSizeInBits() == 32)
2471  B.buildExtract(DstReg, PCReg, 0);
2472  return true;
2473  }
2474 
2477  MachineIRBuilder &B) const {
2478  Register DstReg = MI.getOperand(0).getReg();
2479  LLT Ty = MRI.getType(DstReg);
2480  unsigned AS = Ty.getAddressSpace();
2481 
2482  const GlobalValue *GV = MI.getOperand(1).getGlobal();
 2483  MachineFunction &MF = B.getMF();
 2484  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 2485 
 2486  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
 2487  if (!MFI->isModuleEntryFunction() &&
2488  !GV->getName().equals("llvm.amdgcn.module.lds")) {
2489  const Function &Fn = MF.getFunction();
2490  DiagnosticInfoUnsupported BadLDSDecl(
2491  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2492  DS_Warning);
2493  Fn.getContext().diagnose(BadLDSDecl);
2494 
2495  // We currently don't have a way to correctly allocate LDS objects that
2496  // aren't directly associated with a kernel. We do force inlining of
2497  // functions that use local objects. However, if these dead functions are
2498  // not eliminated, we don't want a compile time error. Just emit a warning
2499  // and a trap, since there should be no callable path here.
2500  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2501  B.buildUndef(DstReg);
2502  MI.eraseFromParent();
2503  return true;
2504  }
2505 
2506  // TODO: We could emit code to handle the initialization somewhere.
2507  // We ignore the initializer for now and legalize it to allow selection.
2508  // The initializer will anyway get errored out during assembly emission.
2509  const SITargetLowering *TLI = ST.getTargetLowering();
2510  if (!TLI->shouldUseLDSConstAddress(GV)) {
2511  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2512  return true; // Leave in place;
2513  }
2514 
2515  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2516  Type *Ty = GV->getValueType();
2517  // HIP uses an unsized array `extern __shared__ T s[]` or similar
2518  // zero-sized type in other languages to declare the dynamic shared
 2519  // memory whose size is not known at compile time. They will be
 2520  // allocated by the runtime and placed directly after the statically
 2521  // allocated ones. They all share the same offset.
2522  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2523  // Adjust alignment for that dynamic shared memory array.
2524  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2525  LLT S32 = LLT::scalar(32);
2526  auto Sz =
2527  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2528  B.buildIntToPtr(DstReg, Sz);
2529  MI.eraseFromParent();
2530  return true;
2531  }
2532  }
2533 
2534  B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2535  *cast<GlobalVariable>(GV)));
2536  MI.eraseFromParent();
2537  return true;
2538  }
2539 
2540  const SITargetLowering *TLI = ST.getTargetLowering();
2541 
2542  if (TLI->shouldEmitFixup(GV)) {
2543  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2544  MI.eraseFromParent();
2545  return true;
2546  }
2547 
2548  if (TLI->shouldEmitPCReloc(GV)) {
2549  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2550  MI.eraseFromParent();
2551  return true;
2552  }
 2553 
 2554  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
 2555  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2556 
2557  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2562  LoadTy, Align(8));
2563 
2564  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2565 
2566  if (Ty.getSizeInBits() == 32) {
2567  // Truncate if this is a 32-bit constant address.
2568  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2569  B.buildExtract(DstReg, Load, 0);
2570  } else
2571  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2572 
2573  MI.eraseFromParent();
2574  return true;
2575 }
2576 
2578  if (Ty.isVector())
2579  return Ty.changeElementCount(
2581  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2582 }
2583 
2585  MachineInstr &MI) const {
2586  MachineIRBuilder &B = Helper.MIRBuilder;
2587  MachineRegisterInfo &MRI = *B.getMRI();
2588  GISelChangeObserver &Observer = Helper.Observer;
2589 
2590  Register PtrReg = MI.getOperand(1).getReg();
2591  LLT PtrTy = MRI.getType(PtrReg);
2592  unsigned AddrSpace = PtrTy.getAddressSpace();
2593 
 2594  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
 2595  const LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
 2596  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2597  Observer.changingInstr(MI);
2598  MI.getOperand(1).setReg(Cast.getReg(0));
2599  Observer.changedInstr(MI);
2600  return true;
2601  }
2602 
2603  if (MI.getOpcode() != AMDGPU::G_LOAD)
2604  return false;
2605 
2606  Register ValReg = MI.getOperand(0).getReg();
2607  LLT ValTy = MRI.getType(ValReg);
2608 
2609  MachineMemOperand *MMO = *MI.memoperands_begin();
2610  const unsigned ValSize = ValTy.getSizeInBits();
2611  const LLT MemTy = MMO->getMemoryType();
2612  const Align MemAlign = MMO->getAlign();
2613  const unsigned MemSize = MemTy.getSizeInBits();
2614  const uint64_t AlignInBits = 8 * MemAlign.value();
2615 
2616  // Widen non-power-of-2 loads to the alignment if needed
2617  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2618  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2619 
2620  // This was already the correct extending load result type, so just adjust
2621  // the memory type.
2622  if (WideMemSize == ValSize) {
2623  MachineFunction &MF = B.getMF();
2624 
2625  MachineMemOperand *WideMMO =
2626  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2627  Observer.changingInstr(MI);
2628  MI.setMemRefs(MF, {WideMMO});
2629  Observer.changedInstr(MI);
2630  return true;
2631  }
2632 
2633  // Don't bother handling edge case that should probably never be produced.
2634  if (ValSize > WideMemSize)
2635  return false;
2636 
2637  LLT WideTy = widenToNextPowerOf2(ValTy);
2638 
2639  Register WideLoad;
2640  if (!WideTy.isVector()) {
2641  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2642  B.buildTrunc(ValReg, WideLoad).getReg(0);
2643  } else {
2644  // Extract the subvector.
2645 
2646  if (isRegisterType(ValTy)) {
 2647  // If this is a case where G_EXTRACT is legal, use it.
2648  // (e.g. <3 x s32> -> <4 x s32>)
2649  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2650  B.buildExtract(ValReg, WideLoad, 0);
2651  } else {
2652  // For cases where the widened type isn't a nice register value, unmerge
2653  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2654  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2655  B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2656  }
2657  }
2658 
2659  MI.eraseFromParent();
2660  return true;
2661  }
2662 
2663  return false;
2664 }
2665 
2668  MachineIRBuilder &B) const {
2669  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2670  assert(Ty.isScalar());
2671 
2672  MachineFunction &MF = B.getMF();
2674 
2675  // TODO: Always legal with future ftz flag.
2676  // FIXME: Do we need just output?
2677  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2678  return true;
2679  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2680  return true;
2681 
2682  MachineIRBuilder HelperBuilder(MI);
2683  GISelObserverWrapper DummyObserver;
2684  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2685  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2686 }
2687 
2690  Register DstReg = MI.getOperand(0).getReg();
2691  Register PtrReg = MI.getOperand(1).getReg();
2692  Register CmpVal = MI.getOperand(2).getReg();
2693  Register NewVal = MI.getOperand(3).getReg();
 2694 
 2695  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
 2696  "this should not have been custom lowered");
2697 
2698  LLT ValTy = MRI.getType(CmpVal);
2699  LLT VecTy = LLT::fixed_vector(2, ValTy);
2700 
2701  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2702 
2703  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2704  .addDef(DstReg)
2705  .addUse(PtrReg)
2706  .addUse(PackedVal)
2707  .setMemRefs(MI.memoperands());
2708 
2709  MI.eraseFromParent();
2710  return true;
2711 }
2712 
2714  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2715  Register Dst = MI.getOperand(0).getReg();
2716  Register Src = MI.getOperand(1).getReg();
2717  LLT Ty = B.getMRI()->getType(Dst);
2718  unsigned Flags = MI.getFlags();
2719 
2720  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2721  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2722 
2723  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2724  MI.eraseFromParent();
2725  return true;
2726 }
2727 
2729  MachineIRBuilder &B) const {
2730  Register Dst = MI.getOperand(0).getReg();
2731  Register Src = MI.getOperand(1).getReg();
2732  unsigned Flags = MI.getFlags();
2733  LLT Ty = B.getMRI()->getType(Dst);
2734 
2735  auto K = B.buildFConstant(Ty, numbers::log2e);
2736  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2737  B.buildFExp2(Dst, Mul, Flags);
2738  MI.eraseFromParent();
2739  return true;
2740 }
2741 
2743  MachineIRBuilder &B) const {
2744  Register Dst = MI.getOperand(0).getReg();
2745  Register Src0 = MI.getOperand(1).getReg();
2746  Register Src1 = MI.getOperand(2).getReg();
2747  unsigned Flags = MI.getFlags();
2748  LLT Ty = B.getMRI()->getType(Dst);
2749  const LLT S16 = LLT::scalar(16);
2750  const LLT S32 = LLT::scalar(32);
2751 
2752  if (Ty == S32) {
2753  auto Log = B.buildFLog2(S32, Src0, Flags);
2754  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2755  .addUse(Log.getReg(0))
2756  .addUse(Src1)
2757  .setMIFlags(Flags);
2758  B.buildFExp2(Dst, Mul, Flags);
2759  } else if (Ty == S16) {
2760  // There's no f16 fmul_legacy, so we need to convert for it.
2761  auto Log = B.buildFLog2(S16, Src0, Flags);
2762  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2763  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2764  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2765  .addUse(Ext0.getReg(0))
2766  .addUse(Ext1.getReg(0))
2767  .setMIFlags(Flags);
2768 
2769  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2770  } else
2771  return false;
2772 
2773  MI.eraseFromParent();
2774  return true;
2775 }
2776 
2777 // Find a source register, ignoring any possible source modifiers.
2779  Register ModSrc = OrigSrc;
2780  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2781  ModSrc = SrcFNeg->getOperand(1).getReg();
2782  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2783  ModSrc = SrcFAbs->getOperand(1).getReg();
2784  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2785  ModSrc = SrcFAbs->getOperand(1).getReg();
2786  return ModSrc;
2787 }
2788 
2791  MachineIRBuilder &B) const {
2792 
2793  const LLT S1 = LLT::scalar(1);
2794  const LLT S64 = LLT::scalar(64);
2795  Register Dst = MI.getOperand(0).getReg();
2796  Register OrigSrc = MI.getOperand(1).getReg();
2797  unsigned Flags = MI.getFlags();
2798  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2799  "this should not have been custom lowered");
2800 
2801  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2802  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2803  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2804  // V_FRACT bug is:
2805  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2806  //
2807  // Convert floor(x) to (x - fract(x))
2808 
2809  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2810  .addUse(OrigSrc)
2811  .setMIFlags(Flags);
2812 
2813  // Give source modifier matching some assistance before obscuring a foldable
2814  // pattern.
2815 
2816  // TODO: We can avoid the neg on the fract? The input sign to fract
2817  // shouldn't matter?
2818  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2819 
2820  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2821 
2823 
2824  // We don't need to concern ourselves with the snan handling difference, so
2825  // use the one which will directly select.
2826  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2827  if (MFI->getMode().IEEE)
2828  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2829  else
2830  B.buildFMinNum(Min, Fract, Const, Flags);
2831 
2832  Register CorrectedFract = Min;
2833  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2834  auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2835  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2836  }
2837 
2838  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2839  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2840 
2841  MI.eraseFromParent();
2842  return true;
2843 }
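// For example, floor(-0.25): fract(-0.25) = 0.75 and -0.25 - 0.75 = -1.0,
// matching floor. The min with 0x3fefffffffffffff (the largest double below
// 1.0) only matters for the buggy V_FRACT inputs described in the comment
// above.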
2844 
2845 // Turn an illegal packed v2s16 build vector into bit operations.
2846 // TODO: This should probably be a bitcast action in LegalizerHelper.
2849  Register Dst = MI.getOperand(0).getReg();
2850  const LLT S32 = LLT::scalar(32);
2851  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2852 
2853  Register Src0 = MI.getOperand(1).getReg();
2854  Register Src1 = MI.getOperand(2).getReg();
2855  assert(MRI.getType(Src0) == LLT::scalar(16));
2856 
2857  auto Merge = B.buildMerge(S32, {Src0, Src1});
2858  B.buildBitcast(Dst, Merge);
2859 
2860  MI.eraseFromParent();
2861  return true;
2862 }
2863 
2864 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2865 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2866 // case with a single min instruction instead of a compare+select.
2869  MachineIRBuilder &B) const {
2870  Register Dst = MI.getOperand(0).getReg();
2871  Register Src = MI.getOperand(1).getReg();
2872  LLT DstTy = MRI.getType(Dst);
2873  LLT SrcTy = MRI.getType(Src);
2874 
2875  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2876  ? AMDGPU::G_AMDGPU_FFBH_U32
2877  : AMDGPU::G_AMDGPU_FFBL_B32;
2878  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2879  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2880 
2881  MI.eraseFromParent();
2882  return true;
2883 }
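// For a zero input the hardware ffbh/ffbl result is all ones, so the unsigned
// min with the bit width (e.g. 32) folds that case to the value G_CTLZ/G_CTTZ
// are defined to produce, without a separate compare+select.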
2884 
2885 // Check that this is a G_XOR x, -1
2886 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2887  if (MI.getOpcode() != TargetOpcode::G_XOR)
2888  return false;
2889  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2890  return ConstVal && *ConstVal == -1;
2891 }
2892 
2893 // Return the use branch instruction, otherwise null if the usage is invalid.
2894 static MachineInstr *
2896  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2897  Register CondDef = MI.getOperand(0).getReg();
2898  if (!MRI.hasOneNonDBGUse(CondDef))
2899  return nullptr;
2900 
2901  MachineBasicBlock *Parent = MI.getParent();
2903 
2904  if (isNot(MRI, *UseMI)) {
2905  Register NegatedCond = UseMI->getOperand(0).getReg();
2906  if (!MRI.hasOneNonDBGUse(NegatedCond))
2907  return nullptr;
2908 
2909  // We're deleting the def of this value, so we need to remove it.
2910  eraseInstr(*UseMI, MRI);
2911 
2912  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2913  Negated = true;
2914  }
2915 
2916  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2917  return nullptr;
2918 
2919  // Make sure the cond br is followed by a G_BR, or is the last instruction.
2920  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2921  if (Next == Parent->end()) {
2922  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2923  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2924  return nullptr;
2925  UncondBrTarget = &*NextMBB;
2926  } else {
2927  if (Next->getOpcode() != AMDGPU::G_BR)
2928  return nullptr;
2929  Br = &*Next;
2930  UncondBrTarget = Br->getOperand(0).getMBB();
2931  }
2932 
2933  return UseMI;
2934 }
2935 
2937  const ArgDescriptor *Arg,
2938  const TargetRegisterClass *ArgRC,
2939  LLT ArgTy) const {
2940  MCRegister SrcReg = Arg->getRegister();
2941  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2942  assert(DstReg.isVirtual() && "Virtual register expected");
2943 
2944  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2945  *ArgRC, B.getDebugLoc(), ArgTy);
2946  if (Arg->isMasked()) {
2947  // TODO: Should we try to emit this once in the entry block?
2948  const LLT S32 = LLT::scalar(32);
2949  const unsigned Mask = Arg->getMask();
2950  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2951 
2952  Register AndMaskSrc = LiveIn;
2953 
2954  // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2955  // 0.
2956  if (Shift != 0) {
2957  auto ShiftAmt = B.buildConstant(S32, Shift);
2958  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2959  }
2960 
2961  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2962  } else {
2963  B.buildCopy(DstReg, LiveIn);
2964  }
2965 
2966  return true;
2967 }
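// With packed work-item IDs, for example, the Y id typically occupies bits
// [10, 20) of the packed register, so Arg->getMask() would be 0x3ff << 10 and
// the code above shifts right by 10 before masking with 0x3ff.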
2968 
2970  Register DstReg, MachineIRBuilder &B,
2971  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2972  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2973  const ArgDescriptor *Arg;
2974  const TargetRegisterClass *ArgRC;
2975  LLT ArgTy;
2976  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2977 
 2978  if (!Arg) {
 2979  if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
 2980  // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2981  // case the pointer argument may be missing and we use null.
2982  B.buildConstant(DstReg, 0);
2983  return true;
2984  }
2985 
2986  // It's undefined behavior if a function marked with the amdgpu-no-*
2987  // attributes uses the corresponding intrinsic.
2988  B.buildUndef(DstReg);
2989  return true;
2990  }
2991 
2992  if (!Arg->isRegister() || !Arg->getRegister().isValid())
2993  return false; // TODO: Handle these
2994  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2995 }
2996 
2999  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3000  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3001  return false;
3002 
3003  MI.eraseFromParent();
3004  return true;
3005 }
3006 
3008  int64_t C) {
3009  B.buildConstant(MI.getOperand(0).getReg(), C);
3010  MI.eraseFromParent();
3011  return true;
3012 }
3013 
3016  unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3017  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3018  if (MaxID == 0)
3019  return replaceWithConstant(B, MI, 0);
3020 
3021  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3022  const ArgDescriptor *Arg;
3023  const TargetRegisterClass *ArgRC;
3024  LLT ArgTy;
3025  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3026 
3027  Register DstReg = MI.getOperand(0).getReg();
3028  if (!Arg) {
3029  // It's undefined behavior if a function marked with the amdgpu-no-*
3030  // attributes uses the corresponding intrinsic.
3031  B.buildUndef(DstReg);
3032  MI.eraseFromParent();
3033  return true;
3034  }
3035 
3036  if (Arg->isMasked()) {
3037  // Don't bother inserting AssertZext for packed IDs since we're emitting the
3038  // masking operations anyway.
3039  //
3040  // TODO: We could assert the top bit is 0 for the source copy.
3041  if (!loadInputValue(DstReg, B, ArgType))
3042  return false;
 3043  } else {
 3044  Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
 3045  if (!loadInputValue(TmpReg, B, ArgType))
3046  return false;
3047  B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
3048  }
3049 
3050  MI.eraseFromParent();
3051  return true;
3052 }
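// The assert width is 32 - countLeadingZeros(MaxID): e.g. a maximum work-item
// id of 1023 gives an AssertZExt to 10 bits, telling later passes the value
// fits in the low 10 bits.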
3053 
3055  int64_t Offset) const {
3057  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3058 
3059  // TODO: If we passed in the base kernel offset we could have a better
3060  // alignment than 4, but we don't really need it.
3061  if (!loadInputValue(KernArgReg, B,
3063  llvm_unreachable("failed to find kernarg segment ptr");
3064 
3065  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3066  // TODO: Should get nuw
3067  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3068 }
3069 
3070 /// Legalize a value that's loaded from kernel arguments. This is only used by
3071 /// legacy intrinsics.
3074  uint64_t Offset,
3075  Align Alignment) const {
3076  Register DstReg = MI.getOperand(0).getReg();
3077 
3078  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3079  "unexpected kernarg parameter type");
3080 
3081  Register Ptr = getKernargParameterPtr(B, Offset);
3083  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3086  MI.eraseFromParent();
3087  return true;
3088 }
3089 
3092  MachineIRBuilder &B) const {
3093  Register Dst = MI.getOperand(0).getReg();
3094  LLT DstTy = MRI.getType(Dst);
3095  LLT S16 = LLT::scalar(16);
3096  LLT S32 = LLT::scalar(32);
3097  LLT S64 = LLT::scalar(64);
3098 
3099  if (DstTy == S16)
3100  return legalizeFDIV16(MI, MRI, B);
3101  if (DstTy == S32)
3102  return legalizeFDIV32(MI, MRI, B);
3103  if (DstTy == S64)
3104  return legalizeFDIV64(MI, MRI, B);
3105 
3106  return false;
3107 }
3108 
3110  Register DstDivReg,
3111  Register DstRemReg,
3112  Register X,
3113  Register Y) const {
3114  const LLT S1 = LLT::scalar(1);
3115  const LLT S32 = LLT::scalar(32);
3116 
3117  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3118  // algorithm used here.
3119 
3120  // Initial estimate of inv(y).
3121  auto FloatY = B.buildUITOFP(S32, Y);
3122  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3123  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3124  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3125  auto Z = B.buildFPTOUI(S32, ScaledY);
3126 
3127  // One round of UNR.
3128  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3129  auto NegYZ = B.buildMul(S32, NegY, Z);
3130  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3131 
3132  // Quotient/remainder estimate.
3133  auto Q = B.buildUMulH(S32, X, Z);
3134  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3135 
3136  // First quotient/remainder refinement.
3137  auto One = B.buildConstant(S32, 1);
3138  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3139  if (DstDivReg)
3140  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3141  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3142 
3143  // Second quotient/remainder refinement.
3144  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3145  if (DstDivReg)
3146  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3147 
3148  if (DstRemReg)
3149  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3150 }
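// The sequence above is a fixed-point Newton-Raphson reciprocal: z starts as
// roughly 2^32/y (float rcp scaled by 0x4f7ffffe, just below 2^32), one
// refinement step tightens it, and the two conditional corrections at the end
// account for the quotient estimate being low by at most two.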
3151 
3152 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3153 //
3154 // Return lo, hi of result
3155 //
3156 // %cvt.lo = G_UITOFP Val.lo
3157 // %cvt.hi = G_UITOFP Val.hi
3158 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3159 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3160 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3161 // %mul2 = G_FMUL %mul1, 2**(-32)
3162 // %trunc = G_INTRINSIC_TRUNC %mul2
3163 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3164 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3165 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3166  Register Val) {
3167  const LLT S32 = LLT::scalar(32);
3168  auto Unmerge = B.buildUnmerge(S32, Val);
3169 
3170  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3171  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3172 
3173  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3174  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3175 
3176  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3177  auto Mul1 =
3178  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3179 
3180  // 2**(-32)
3181  auto Mul2 =
3182  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3183  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3184 
3185  // -(2**32)
3186  auto Mad2 = B.buildFMAD(S32, Trunc,
3187  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3188 
3189  auto ResultLo = B.buildFPTOUI(S32, Mad2);
3190  auto ResultHi = B.buildFPTOUI(S32, Trunc);
3191 
3192  return {ResultLo.getReg(0), ResultHi.getReg(0)};
3193 }
3194 
3196  Register DstDivReg,
3197  Register DstRemReg,
3198  Register Numer,
3199  Register Denom) const {
3200  const LLT S32 = LLT::scalar(32);
3201  const LLT S64 = LLT::scalar(64);
3202  const LLT S1 = LLT::scalar(1);
3203  Register RcpLo, RcpHi;
3204 
3205  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3206 
3207  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3208 
3209  auto Zero64 = B.buildConstant(S64, 0);
3210  auto NegDenom = B.buildSub(S64, Zero64, Denom);
3211 
3212  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3213  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3214 
3215  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3216  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3217  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3218 
3219  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3220  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3221  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3222 
3223  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3224  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3225  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3226  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3227  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3228 
3229  auto Zero32 = B.buildConstant(S32, 0);
3230  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3231  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3232  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3233 
3234  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3235  Register NumerLo = UnmergeNumer.getReg(0);
3236  Register NumerHi = UnmergeNumer.getReg(1);
3237 
3238  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3239  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3240  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3241  Register Mul3_Lo = UnmergeMul3.getReg(0);
3242  Register Mul3_Hi = UnmergeMul3.getReg(1);
3243  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3244  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3245  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3246  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3247 
3248  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3249  Register DenomLo = UnmergeDenom.getReg(0);
3250  Register DenomHi = UnmergeDenom.getReg(1);
3251 
3252  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3253  auto C1 = B.buildSExt(S32, CmpHi);
3254 
3255  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3256  auto C2 = B.buildSExt(S32, CmpLo);
3257 
3258  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3259  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3260 
3261  // TODO: Here and below portions of the code can be enclosed into if/endif.
3262  // Currently control flow is unconditional and we have 4 selects after
3263  // potential endif to substitute PHIs.
3264 
3265  // if C3 != 0 ...
3266  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3267  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3268  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3269  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3270 
3271  auto One64 = B.buildConstant(S64, 1);
3272  auto Add3 = B.buildAdd(S64, MulHi3, One64);
3273 
3274  auto C4 =
3275  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3276  auto C5 =
3277  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3278  auto C6 = B.buildSelect(
3279  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3280 
3281  // if (C6 != 0)
3282  auto Add4 = B.buildAdd(S64, Add3, One64);
3283  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3284 
3285  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3286  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3287  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3288 
3289  // endif C6
3290  // endif C3
3291 
3292  if (DstDivReg) {
3293  auto Sel1 = B.buildSelect(
3294  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3295  B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3296  Sel1, MulHi3);
3297  }
3298 
3299  if (DstRemReg) {
3300  auto Sel2 = B.buildSelect(
3301  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3302  B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3303  Sel2, Sub1);
3304  }
3305 }
3306 
3309  MachineIRBuilder &B) const {
3310  Register DstDivReg, DstRemReg;
3311  switch (MI.getOpcode()) {
3312  default:
3313  llvm_unreachable("Unexpected opcode!");
3314  case AMDGPU::G_UDIV: {
3315  DstDivReg = MI.getOperand(0).getReg();
3316  break;
3317  }
3318  case AMDGPU::G_UREM: {
3319  DstRemReg = MI.getOperand(0).getReg();
3320  break;
3321  }
3322  case AMDGPU::G_UDIVREM: {
3323  DstDivReg = MI.getOperand(0).getReg();
3324  DstRemReg = MI.getOperand(1).getReg();
3325  break;
3326  }
3327  }
3328 
3329  const LLT S64 = LLT::scalar(64);
3330  const LLT S32 = LLT::scalar(32);
3331  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3332  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3333  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3334  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3335 
3336  if (Ty == S32)
3337  legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3338  else if (Ty == S64)
3339  legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3340  else
3341  return false;
3342 
3343  MI.eraseFromParent();
3344  return true;
3345 }
3346 
3349  MachineIRBuilder &B) const {
3350  const LLT S64 = LLT::scalar(64);
3351  const LLT S32 = LLT::scalar(32);
3352 
3353  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3354  if (Ty != S32 && Ty != S64)
3355  return false;
3356 
3357  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3358  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3359  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3360 
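  // Take absolute values of both operands: with S = x >> (BitWidth - 1)
  // (all ones for negative x, zero otherwise), |x| = (x + S) ^ S.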
3361  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3362  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3363  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3364 
3365  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3366  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3367 
3368  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3369  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3370 
3371  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3372  switch (MI.getOpcode()) {
3373  default:
3374  llvm_unreachable("Unexpected opcode!");
3375  case AMDGPU::G_SDIV: {
3376  DstDivReg = MI.getOperand(0).getReg();
3377  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3378  break;
3379  }
3380  case AMDGPU::G_SREM: {
3381  DstRemReg = MI.getOperand(0).getReg();
3382  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3383  break;
3384  }
3385  case AMDGPU::G_SDIVREM: {
3386  DstDivReg = MI.getOperand(0).getReg();
3387  DstRemReg = MI.getOperand(1).getReg();
3388  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3389  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3390  break;
3391  }
3392  }
3393 
3394  if (Ty == S32)
3395  legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3396  else
3397  legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3398 
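  // Reapply the signs: (v ^ S) - S negates v when S is all ones. The quotient
  // is negative iff exactly one operand was negative; the remainder takes the
  // sign of the dividend.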
3399  if (DstDivReg) {
3400  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3401  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3402  B.buildSub(DstDivReg, SignXor, Sign);
3403  }
3404 
3405  if (DstRemReg) {
3406  auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3407  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3408  B.buildSub(DstRemReg, SignXor, Sign);
3409  }
3410 
3411  MI.eraseFromParent();
3412  return true;
3413 }
3414 
3415 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3416  MachineRegisterInfo &MRI,
3417  MachineIRBuilder &B) const {
3418  Register Res = MI.getOperand(0).getReg();
3419  Register LHS = MI.getOperand(1).getReg();
3420  Register RHS = MI.getOperand(2).getReg();
3421  uint16_t Flags = MI.getFlags();
3422  LLT ResTy = MRI.getType(Res);
3423 
3424  const MachineFunction &MF = B.getMF();
3425  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3426  MI.getFlag(MachineInstr::FmAfn);
3427 
3428  if (!AllowInaccurateRcp)
3429  return false;
3430 
3431  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3432  // 1 / x -> RCP(x)
3433  if (CLHS->isExactlyValue(1.0)) {
3434  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3435  .addUse(RHS)
3436  .setMIFlags(Flags);
3437 
3438  MI.eraseFromParent();
3439  return true;
3440  }
3441 
3442  // -1 / x -> RCP( FNEG(x) )
3443  if (CLHS->isExactlyValue(-1.0)) {
3444  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3445  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3446  .addUse(FNeg.getReg(0))
3447  .setMIFlags(Flags);
3448 
3449  MI.eraseFromParent();
3450  return true;
3451  }
3452  }
3453 
3454  // x / y -> x * (1.0 / y)
3455  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3456  .addUse(RHS)
3457  .setMIFlags(Flags);
3458  B.buildFMul(Res, LHS, RCP, Flags);
3459 
3460  MI.eraseFromParent();
3461  return true;
3462 }
3463 
3464 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3465  MachineRegisterInfo &MRI,
3466  MachineIRBuilder &B) const {
3467  Register Res = MI.getOperand(0).getReg();
3468  Register X = MI.getOperand(1).getReg();
3469  Register Y = MI.getOperand(2).getReg();
3470  uint16_t Flags = MI.getFlags();
3471  LLT ResTy = MRI.getType(Res);
3472 
3473  const MachineFunction &MF = B.getMF();
3474  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3475  MI.getFlag(MachineInstr::FmAfn);
3476 
3477  if (!AllowInaccurateRcp)
3478  return false;
3479 
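  // Refine the hardware reciprocal estimate of Y with two Newton-Raphson
  // iterations, then form X * (1/Y) and apply one final FMA correction.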
3480  auto NegY = B.buildFNeg(ResTy, Y);
3481  auto One = B.buildFConstant(ResTy, 1.0);
3482 
3483  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3484  .addUse(Y)
3485  .setMIFlags(Flags);
3486 
3487  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3488  R = B.buildFMA(ResTy, Tmp0, R, R);
3489 
3490  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3491  R = B.buildFMA(ResTy, Tmp1, R, R);
3492 
3493  auto Ret = B.buildFMul(ResTy, X, R);
3494  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3495 
3496  B.buildFMA(Res, Tmp2, R, Ret);
3497  MI.eraseFromParent();
3498  return true;
3499 }
3500 
3501 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3502  MachineRegisterInfo &MRI,
3503  MachineIRBuilder &B) const {
3504  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3505  return true;
3506 
3507  Register Res = MI.getOperand(0).getReg();
3508  Register LHS = MI.getOperand(1).getReg();
3509  Register RHS = MI.getOperand(2).getReg();
3510 
3511  uint16_t Flags = MI.getFlags();
3512 
3513  LLT S16 = LLT::scalar(16);
3514  LLT S32 = LLT::scalar(32);
3515 
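  // Lower f16 fdiv by extending to f32, multiplying by the f32 reciprocal,
  // truncating back to f16, and fixing up the result with amdgcn.div.fixup.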
3516  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3517  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3518 
3519  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3520  .addUse(RHSExt.getReg(0))
3521  .setMIFlags(Flags);
3522 
3523  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3524  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3525 
3526  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3527  .addUse(RDst.getReg(0))
3528  .addUse(RHS)
3529  .addUse(LHS)
3530  .setMIFlags(Flags);
3531 
3532  MI.eraseFromParent();
3533  return true;
3534 }
3535 
3536 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3537 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3538 static void toggleSPDenormMode(bool Enable,
3539  MachineIRBuilder &B,
3540  const GCNSubtarget &ST,
3541  AMDGPU::SIModeRegisterDefaults Mode) {
3542  // Set SP denorm mode to this value.
3543  unsigned SPDenormMode =
3544  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3545 
3546  if (ST.hasDenormModeInst()) {
3547  // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3548  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3549 
3550  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3551  B.buildInstr(AMDGPU::S_DENORM_MODE)
3552  .addImm(NewDenormModeValue);
3553 
3554  } else {
3555  // Select FP32 bit field in mode register.
3556  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3557  (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3558  (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3559 
3560  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3561  .addImm(SPDenormMode)
3562  .addImm(SPDenormModeBitField);
3563  }
3564 }
3565 
3566 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3567  MachineRegisterInfo &MRI,
3568  MachineIRBuilder &B) const {
3569  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3570  return true;
3571 
3572  Register Res = MI.getOperand(0).getReg();
3573  Register LHS = MI.getOperand(1).getReg();
3574  Register RHS = MI.getOperand(2).getReg();
3575  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3576  const AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3577 
3578  uint16_t Flags = MI.getFlags();
3579 
3580  LLT S32 = LLT::scalar(32);
3581  LLT S1 = LLT::scalar(1);
3582 
3583  auto One = B.buildFConstant(S32, 1.0f);
3584 
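  // amdgcn.div.scale pre-scales the numerator and denominator so the
  // reciprocal refinement below stays in range; div.fmas and div.fixup
  // account for the scaling and special cases afterwards.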
3585  auto DenominatorScaled =
3586  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3587  .addUse(LHS)
3588  .addUse(RHS)
3589  .addImm(0)
3590  .setMIFlags(Flags);
3591  auto NumeratorScaled =
3592  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3593  .addUse(LHS)
3594  .addUse(RHS)
3595  .addImm(1)
3596  .setMIFlags(Flags);
3597 
3598  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3599  .addUse(DenominatorScaled.getReg(0))
3600  .setMIFlags(Flags);
3601  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3602 
3603  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3604  // aren't modeled as reading it.
3605  if (!Mode.allFP32Denormals())
3606  toggleSPDenormMode(true, B, ST, Mode);
3607 
3608  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3609  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3610  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3611  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3612  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3613  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3614 
3615  if (!Mode.allFP32Denormals())
3616  toggleSPDenormMode(false, B, ST, Mode);
3617 
3618  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3619  .addUse(Fma4.getReg(0))
3620  .addUse(Fma1.getReg(0))
3621  .addUse(Fma3.getReg(0))
3622  .addUse(NumeratorScaled.getReg(1))
3623  .setMIFlags(Flags);
3624 
3625  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3626  .addUse(Fmas.getReg(0))
3627  .addUse(RHS)
3628  .addUse(LHS)
3629  .setMIFlags(Flags);
3630 
3631  MI.eraseFromParent();
3632  return true;
3633 }
3634 
3635 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3636  MachineRegisterInfo &MRI,
3637  MachineIRBuilder &B) const {
3638  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3639  return true;
3640 
3641  Register Res = MI.getOperand(0).getReg();
3642  Register LHS = MI.getOperand(1).getReg();
3643  Register RHS = MI.getOperand(2).getReg();
3644 
3645  uint16_t Flags = MI.getFlags();
3646 
3647  LLT S64 = LLT::scalar(64);
3648  LLT S1 = LLT::scalar(1);
3649 
3650  auto One = B.buildFConstant(S64, 1.0);
3651 
3652  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3653  .addUse(LHS)
3654  .addUse(RHS)
3655  .addImm(0)
3656  .setMIFlags(Flags);
3657 
3658  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3659 
3660  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3661  .addUse(DivScale0.getReg(0))
3662  .setMIFlags(Flags);
3663 
3664  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3665  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3666  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3667 
3668  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3669  .addUse(LHS)
3670  .addUse(RHS)
3671  .addImm(1)
3672  .setMIFlags(Flags);
3673 
3674  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3675  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3676  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3677 
3678  Register Scale;
3679  if (!ST.hasUsableDivScaleConditionOutput()) {
3680  // Workaround a hardware bug on SI where the condition output from div_scale
3681  // is not usable.
3682 
3683  LLT S32 = LLT::scalar(32);
3684 
3685  auto NumUnmerge = B.buildUnmerge(S32, LHS);
3686  auto DenUnmerge = B.buildUnmerge(S32, RHS);
3687  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3688  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3689 
3690  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3691  Scale1Unmerge.getReg(1));
3692  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3693  Scale0Unmerge.getReg(1));
3694  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3695  } else {
3696  Scale = DivScale1.getReg(1);
3697  }
3698 
3699  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3700  .addUse(Fma4.getReg(0))
3701  .addUse(Fma3.getReg(0))
3702  .addUse(Mul.getReg(0))
3703  .addUse(Scale)
3704  .setMIFlags(Flags);
3705 
3706  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3707  .addUse(Fmas.getReg(0))
3708  .addUse(RHS)
3709  .addUse(LHS)
3710  .setMIFlags(Flags);
3711 
3712  MI.eraseFromParent();
3713  return true;
3714 }
3715 
3716 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3717  MachineRegisterInfo &MRI,
3718  MachineIRBuilder &B) const {
3719  Register Res = MI.getOperand(0).getReg();
3720  Register LHS = MI.getOperand(2).getReg();
3721  Register RHS = MI.getOperand(3).getReg();
3722  uint16_t Flags = MI.getFlags();
3723 
3724  LLT S32 = LLT::scalar(32);
3725  LLT S1 = LLT::scalar(1);
3726 
3727  auto Abs = B.buildFAbs(S32, RHS, Flags);
3728  const APFloat C0Val(1.0f);
3729 
3730  auto C0 = B.buildConstant(S32, 0x6f800000);
3731  auto C1 = B.buildConstant(S32, 0x2f800000);
3732  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3733 
3734  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3735  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3736 
3737  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3738 
3739  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3740  .addUse(Mul0.getReg(0))
3741  .setMIFlags(Flags);
3742 
3743  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3744 
3745  B.buildFMul(Res, Sel, Mul1, Flags);
3746 
3747  MI.eraseFromParent();
3748  return true;
3749 }
3750 
3751 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3752 // FIXME: Why do we handle this one but not other removed instructions?
3753 //
3754 // Reciprocal square root. The clamp prevents infinite results, clamping
3755 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3756 // +-max_float.
3757 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3758  MachineRegisterInfo &MRI,
3759  MachineIRBuilder &B) const {
3760  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3761  return true;
3762 
3763  Register Dst = MI.getOperand(0).getReg();
3764  Register Src = MI.getOperand(2).getReg();
3765  auto Flags = MI.getFlags();
3766 
3767  LLT Ty = MRI.getType(Dst);
3768 
3769  const fltSemantics *FltSemantics;
3770  if (Ty == LLT::scalar(32))
3771  FltSemantics = &APFloat::IEEEsingle();
3772  else if (Ty == LLT::scalar(64))
3773  FltSemantics = &APFloat::IEEEdouble();
3774  else
3775  return false;
3776 
3777  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3778  .addUse(Src)
3779  .setMIFlags(Flags);
3780 
3781  // We don't need to concern ourselves with the snan handling difference, since
3782  // the rsq result is already quieted (or not); use the variant that will select directly.
3783  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3784  const bool UseIEEE = MFI->getMode().IEEE;
3785 
3786  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3787  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3788  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3789 
3790  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3791 
3792  if (UseIEEE)
3793  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3794  else
3795  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3796  MI.eraseFromParent();
3797  return true;
3798 }
3799 
3800 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3801  switch (IID) {
3802  case Intrinsic::amdgcn_ds_fadd:
3803  return AMDGPU::G_ATOMICRMW_FADD;
3804  case Intrinsic::amdgcn_ds_fmin:
3805  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3806  case Intrinsic::amdgcn_ds_fmax:
3807  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3808  default:
3809  llvm_unreachable("not a DS FP intrinsic");
3810  }
3811 }
3812 
3813 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3814  MachineInstr &MI,
3815  Intrinsic::ID IID) const {
3816  GISelChangeObserver &Observer = Helper.Observer;
3817  Observer.changingInstr(MI);
3818 
3819  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3820 
3821  // The remaining operands were used to set fields in the MemOperand on
3822  // construction.
3823  for (int I = 6; I > 3; --I)
3824  MI.removeOperand(I);
3825 
3826  MI.removeOperand(1); // Remove the intrinsic ID.
3827  Observer.changedInstr(MI);
3828  return true;
3829 }
3830 
3831 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3832  MachineRegisterInfo &MRI,
3833  MachineIRBuilder &B) const {
3834  uint64_t Offset =
3835  ST.getTargetLowering()->getImplicitParameterOffset(
3836  B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3837  LLT DstTy = MRI.getType(DstReg);
3838  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3839 
3840  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3841  if (!loadInputValue(KernargPtrReg, B,
3842  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3843  return false;
3844 
3845  // FIXME: This should be nuw
3846  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3847  return true;
3848 }
3849 
3850 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3851  MachineRegisterInfo &MRI,
3852  MachineIRBuilder &B) const {
3853  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3854  if (!MFI->isEntryFunction()) {
3855  return legalizePreloadedArgIntrin(MI, MRI, B,
3856  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3857  }
3858 
3859  Register DstReg = MI.getOperand(0).getReg();
3860  if (!getImplicitArgPtr(DstReg, MRI, B))
3861  return false;
3862 
3863  MI.eraseFromParent();
3864  return true;
3865 }
3866 
3867 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3868  MachineRegisterInfo &MRI,
3869  MachineIRBuilder &B,
3870  unsigned AddrSpace) const {
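  // A flat pointer lies in the queried address space iff the high 32 bits of
  // its address equal that segment's aperture base.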
3871  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3872  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3873  Register Hi32 = Unmerge.getReg(1);
3874 
3875  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3876  MI.eraseFromParent();
3877  return true;
3878 }
3879 
3880 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3881 // offset (the offset that is included in bounds checking and swizzling, to be
3882 // split between the instruction's voffset and immoffset fields) and soffset
3883 // (the offset that is excluded from bounds checking and swizzling, to go in
3884 // the instruction's soffset field). This function takes the first kind of
3885 // offset and figures out how to split it between voffset and immoffset.
3886 std::pair<Register, unsigned>
3887 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3888  Register OrigOffset) const {
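  // The buffer instruction immediate offset field is 12 bits, so 4095 is the
  // largest immediate that can be encoded directly.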
3889  const unsigned MaxImm = 4095;
3890  Register BaseReg;
3891  unsigned ImmOffset;
3892  const LLT S32 = LLT::scalar(32);
3893  MachineRegisterInfo &MRI = *B.getMRI();
3894 
3895  std::tie(BaseReg, ImmOffset) =
3896  AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3897 
3898  // If BaseReg is a pointer, convert it to int.
3899  if (MRI.getType(BaseReg).isPointer())
3900  BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3901 
3902  // If the immediate value is too big for the immoffset field, put the value
3903  // and -4096 into the immoffset field so that the value that is copied/added
3904  // for the voffset field is a multiple of 4096, and it stands a better chance
3905  // of being CSEd with the copy/add for another similar load/store.
3906  // However, do not do that rounding down to a multiple of 4096 if that is a
3907  // negative number, as it appears to be illegal to have a negative offset
3908  // in the vgpr, even if adding the immediate offset makes it positive.
3909  unsigned Overflow = ImmOffset & ~MaxImm;
3910  ImmOffset -= Overflow;
3911  if ((int32_t)Overflow < 0) {
3912  Overflow += ImmOffset;
3913  ImmOffset = 0;
3914  }
3915 
3916  if (Overflow != 0) {
3917  if (!BaseReg) {
3918  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3919  } else {
3920  auto OverflowVal = B.buildConstant(S32, Overflow);
3921  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3922  }
3923  }
3924 
3925  if (!BaseReg)
3926  BaseReg = B.buildConstant(S32, 0).getReg(0);
3927 
3928  return std::make_pair(BaseReg, ImmOffset);
3929 }
3930 
3931 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3932 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3933  Register VOffset, Register SOffset,
3934  unsigned ImmOffset, Register VIndex,
3935  MachineRegisterInfo &MRI) const {
3936  Optional<ValueAndVReg> MaybeVOffsetVal =
3937  getIConstantVRegValWithLookThrough(VOffset, MRI);
3938  Optional<ValueAndVReg> MaybeSOffsetVal =
3939  getIConstantVRegValWithLookThrough(SOffset, MRI);
3940  Optional<ValueAndVReg> MaybeVIndexVal =
3941  getIConstantVRegValWithLookThrough(VIndex, MRI);
3942  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3943  // update the MMO with that offset. The stride is unknown so we can only do
3944  // this if VIndex is constant 0.
3945  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3946  MaybeVIndexVal->Value == 0) {
3947  uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3948  MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3949  MMO->setOffset(TotalOffset);
3950  } else {
3951  // We don't have a constant combined offset to use in the MMO. Give up.
3952  MMO->setValue((Value *)nullptr);
3953  }
3954 }
3955 
3956 /// Handle register layout difference for f16 images for some subtargets.
3957 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3958  MachineRegisterInfo &MRI,
3959  Register Reg,
3960  bool ImageStore) const {
3961  const LLT S16 = LLT::scalar(16);
3962  const LLT S32 = LLT::scalar(32);
3963  LLT StoreVT = MRI.getType(Reg);
3964  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3965 
3966  if (ST.hasUnpackedD16VMem()) {
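    // With unpacked D16, each 16-bit element lives in the low half of its own
    // 32-bit register, so any-extend every element to 32 bits.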
3967  auto Unmerge = B.buildUnmerge(S16, Reg);
3968 
3969  SmallVector<Register, 4> WideRegs;
3970  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3971  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3972 
3973  int NumElts = StoreVT.getNumElements();
3974 
3975  return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3976  .getReg(0);
3977  }
3978 
3979  if (ImageStore && ST.hasImageStoreD16Bug()) {
3980  if (StoreVT.getNumElements() == 2) {
3981  SmallVector<Register, 4> PackedRegs;
3982  Reg = B.buildBitcast(S32, Reg).getReg(0);
3983  PackedRegs.push_back(Reg);
3984  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3985  return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3986  .getReg(0);
3987  }
3988 
3989  if (StoreVT.getNumElements() == 3) {
3990  SmallVector<Register, 4> PackedRegs;
3991  auto Unmerge = B.buildUnmerge(S16, Reg);
3992  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3993  PackedRegs.push_back(Unmerge.getReg(I));
3994  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3995  Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3996  return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3997  }
3998 
3999  if (StoreVT.getNumElements() == 4) {
4000  SmallVector<Register, 4> PackedRegs;
4001  Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4002  auto Unmerge = B.buildUnmerge(S32, Reg);
4003  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4004  PackedRegs.push_back(Unmerge.getReg(I));
4005  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4006  return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4007  .getReg(0);
4008  }
4009 
4010  llvm_unreachable("invalid data type");
4011  }
4012 
4013  if (StoreVT == LLT::fixed_vector(3, S16)) {
4014  Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4015  .getReg(0);
4016  }
4017  return Reg;
4018 }
4019 
4020 Register AMDGPULegalizerInfo::fixStoreSourceType(
4021  MachineIRBuilder &B, Register VData, bool IsFormat) const {
4022  MachineRegisterInfo *MRI = B.getMRI();
4023  LLT Ty = MRI->getType(VData);
4024 
4025  const LLT S16 = LLT::scalar(16);
4026 
4027  // Fixup illegal register types for i8 stores.
4028  if (Ty == LLT::scalar(8) || Ty == S16) {
4029  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4030  return AnyExt;
4031  }
4032 
4033  if (Ty.isVector()) {
4034  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4035  if (IsFormat)
4036  return handleD16VData(B, *MRI, VData);
4037  }
4038  }
4039 
4040  return VData;
4041 }
4042 
4043 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4044  MachineRegisterInfo &MRI,
4045  MachineIRBuilder &B,
4046  bool IsTyped,
4047  bool IsFormat) const {
4048  Register VData = MI.getOperand(1).getReg();
4049  LLT Ty = MRI.getType(VData);
4050  LLT EltTy = Ty.getScalarType();
4051  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4052  const LLT S32 = LLT::scalar(32);
4053 
4054  VData = fixStoreSourceType(B, VData, IsFormat);
4055  Register RSrc = MI.getOperand(2).getReg();
4056 
4057  MachineMemOperand *MMO = *MI.memoperands_begin();
4058  const int MemSize = MMO->getSize();
4059 
4060  unsigned ImmOffset;
4061 
4062  // The typed intrinsics add an immediate after the registers.
4063  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4064 
4065  // The struct intrinsic variants add one additional operand over raw.
4066  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4067  Register VIndex;
4068  int OpOffset = 0;
4069  if (HasVIndex) {
4070  VIndex = MI.getOperand(3).getReg();
4071  OpOffset = 1;
4072  } else {
4073  VIndex = B.buildConstant(S32, 0).getReg(0);
4074  }
4075 
4076  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4077  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4078 
4079  unsigned Format = 0;
4080  if (IsTyped) {
4081  Format = MI.getOperand(5 + OpOffset).getImm();
4082  ++OpOffset;
4083  }
4084 
4085  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4086 
4087  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4088  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4089 
4090  unsigned Opc;
4091  if (IsTyped) {
4092  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4093  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4094  } else if (IsFormat) {
4095  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4096  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4097  } else {
4098  switch (MemSize) {
4099  case 1:
4100  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4101  break;
4102  case 2:
4103  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4104  break;
4105  default:
4106  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4107  break;
4108  }
4109  }
4110 
4111  auto MIB = B.buildInstr(Opc)
4112  .addUse(VData) // vdata
4113  .addUse(RSrc) // rsrc
4114  .addUse(VIndex) // vindex
4115  .addUse(VOffset) // voffset
4116  .addUse(SOffset) // soffset
4117  .addImm(ImmOffset); // offset(imm)
4118 
4119  if (IsTyped)
4120  MIB.addImm(Format);
4121 
4122  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4123  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4124  .addMemOperand(MMO);
4125 
4126  MI.eraseFromParent();
4127  return true;
4128 }
4129 
4130 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4131  MachineRegisterInfo &MRI,
4132  MachineIRBuilder &B,
4133  bool IsFormat,
4134  bool IsTyped) const {
4135  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4136  MachineMemOperand *MMO = *MI.memoperands_begin();
4137  const LLT MemTy = MMO->getMemoryType();
4138  const LLT S32 = LLT::scalar(32);
4139 
4140  Register Dst = MI.getOperand(0).getReg();
4141  Register RSrc = MI.getOperand(2).getReg();
4142 
4143  // The typed intrinsics add an immediate after the registers.
4144  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4145 
4146  // The struct intrinsic variants add one additional operand over raw.
4147  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4148  Register VIndex;
4149  int OpOffset = 0;
4150  if (HasVIndex) {
4151  VIndex = MI.getOperand(3).getReg();
4152  OpOffset = 1;
4153  } else {
4154  VIndex = B.buildConstant(S32, 0).getReg(0);
4155  }
4156 
4157  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4158  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4159 
4160  unsigned Format = 0;
4161  if (IsTyped) {
4162  Format = MI.getOperand(5 + OpOffset).getImm();
4163  ++OpOffset;
4164  }
4165 
4166  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4167  unsigned ImmOffset;
4168 
4169  LLT Ty = MRI.getType(Dst);
4170  LLT EltTy = Ty.getScalarType();
4171  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4172  const bool Unpacked = ST.hasUnpackedD16VMem();
4173 
4174  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4175  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4176 
4177  unsigned Opc;
4178 
4179  if (IsTyped) {
4180  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4181  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4182  } else if (IsFormat) {
4183  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4184  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4185  } else {
4186  switch (MemTy.getSizeInBits()) {
4187  case 8:
4188  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4189  break;
4190  case 16:
4191  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4192  break;
4193  default:
4194  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4195  break;
4196  }
4197  }
4198 
4199  Register LoadDstReg;
4200 
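  // Sub-dword loads and scalar d16 results come back widened to 32 bits;
  // load into a temporary register and truncate afterwards.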
4201  bool IsExtLoad =
4202  (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4203  LLT UnpackedTy = Ty.changeElementSize(32);
4204 
4205  if (IsExtLoad)
4206  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4207  else if (Unpacked && IsD16 && Ty.isVector())
4208  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4209  else
4210  LoadDstReg = Dst;
4211 
4212  auto MIB = B.buildInstr(Opc)
4213  .addDef(LoadDstReg) // vdata
4214  .addUse(RSrc) // rsrc
4215  .addUse(VIndex) // vindex
4216  .addUse(VOffset) // voffset
4217  .addUse(SOffset) // soffset
4218  .addImm(ImmOffset); // offset(imm)
4219 
4220  if (IsTyped)
4221  MIB.addImm(Format);
4222 
4223  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4224  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4225  .addMemOperand(MMO);
4226 
4227  if (LoadDstReg != Dst) {
4228  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4229 
4230  // The result of an extending load was widened to 32 bits; truncate it back.
4231  if (IsExtLoad)
4232  B.buildTrunc(Dst, LoadDstReg);
4233  else {
4234  // Repack to original 16-bit vector result
4235  // FIXME: G_TRUNC should work, but legalization currently fails
4236  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4237  SmallVector<Register, 4> Repack;
4238  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4239  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4240  B.buildMerge(Dst, Repack);
4241  }
4242  }
4243 
4244  MI.eraseFromParent();
4245  return true;
4246 }
4247 
4248 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4249  MachineIRBuilder &B,
4250  bool IsInc) const {
4251  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4252  AMDGPU::G_AMDGPU_ATOMIC_DEC;
4253  B.buildInstr(Opc)
4254  .addDef(MI.getOperand(0).getReg())
4255  .addUse(MI.getOperand(2).getReg())
4256  .addUse(MI.getOperand(3).getReg())
4257  .cloneMemRefs(MI);
4258  MI.eraseFromParent();
4259  return true;
4260 }
4261 
4262 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4263  switch (IntrID) {
4264  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4265  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4266  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4267  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4268  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4269  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4270  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4271  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4272  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4273  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4274  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4275  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4276  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4277  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4278  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4279  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4280  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4281  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4282  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4283  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4284  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4285  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4286  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4287  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4288  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4289  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4290  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4291  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4292  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4293  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4294  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4295  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4296  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4297  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4298  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4299  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4300  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4301  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4302  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4303  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4304  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4305  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4306  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4307  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4308  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4309  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4310  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4311  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4312  default:
4313  llvm_unreachable("unhandled atomic opcode");
4314  }
4315 }
4316 
4317 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4318  MachineIRBuilder &B,
4319  Intrinsic::ID IID) const {
4320  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4321  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4322  const bool HasReturn = MI.getNumExplicitDefs() != 0;
4323 
4324  Register Dst;
4325 
4326  int OpOffset = 0;
4327  if (HasReturn) {
4328  // A few FP atomics do not support return values.
4329  Dst = MI.getOperand(0).getReg();
4330  } else {
4331  OpOffset = -1;
4332  }
4333 
4334  Register VData = MI.getOperand(2 + OpOffset).getReg();
4335  Register CmpVal;
4336 
4337  if (IsCmpSwap) {
4338  CmpVal = MI.getOperand(3 + OpOffset).getReg();
4339  ++OpOffset;
4340  }
4341 
4342  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4343  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4344 
4345  // The struct intrinsic variants add one additional operand over raw.
4346  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4347  Register VIndex;
4348  if (HasVIndex) {
4349  VIndex = MI.getOperand(4 + OpOffset).getReg();
4350  ++OpOffset;
4351  } else {
4352  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4353  }
4354 
4355  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4356  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4357  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4358 
4359  MachineMemOperand *MMO = *MI.memoperands_begin();
4360 
4361  unsigned ImmOffset;
4362  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4363  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4364 
4365  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4366 
4367  if (HasReturn)
4368  MIB.addDef(Dst);
4369 
4370  MIB.addUse(VData); // vdata
4371 
4372  if (IsCmpSwap)
4373  MIB.addReg(CmpVal);
4374 
4375  MIB.addUse(RSrc) // rsrc
4376  .addUse(VIndex) // vindex
4377  .addUse(VOffset) // voffset
4378  .addUse(SOffset) // soffset
4379  .addImm(ImmOffset) // offset(imm)
4380  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4381  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4382  .addMemOperand(MMO);
4383 
4384  MI.eraseFromParent();
4385  return true;
4386 }
4387 
4388 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4389 /// vector with s16 typed elements.
4390 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4391  SmallVectorImpl<Register> &PackedAddrs,
4392  unsigned ArgOffset,
4393  const AMDGPU::ImageDimIntrinsicInfo *Intr,
4394  bool IsA16, bool IsG16) {
4395  const LLT S16 = LLT::scalar(16);
4396  const LLT V2S16 = LLT::fixed_vector(2, 16);
4397  auto EndIdx = Intr->VAddrEnd;
4398 
4399  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4400  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4401  if (!SrcOp.isReg())
4402  continue; // _L to _LZ may have eliminated this.
4403 
4404  Register AddrReg = SrcOp.getReg();
4405 
4406  if ((I < Intr->GradientStart) ||
4407  (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4408  (I >= Intr->CoordStart && !IsA16)) {
4409  if ((I < Intr->GradientStart) && IsA16 &&
4410  (B.getMRI()->getType(AddrReg) == S16)) {
4411  assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4412  // Special handling of bias when A16 is on. Bias is of type half but
4413  // occupies full 32-bit.
4414  PackedAddrs.push_back(
4415  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4416  .getReg(0));
4417  } else {
4418  assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4419  "Bias needs to be converted to 16 bit in A16 mode");
4420  // Handle any gradient or coordinate operands that should not be packed
4421  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4422  PackedAddrs.push_back(AddrReg);
4423  }
4424  } else {
4425  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4426  // derivatives dx/dh and dx/dv are packed with undef.
4427  if (((I + 1) >= EndIdx) ||
4428  ((Intr->NumGradients / 2) % 2 == 1 &&
4429  (I == static_cast<unsigned>(Intr->GradientStart +
4430  (Intr->NumGradients / 2) - 1) ||
4431  I == static_cast<unsigned>(Intr->GradientStart +
4432  Intr->NumGradients - 1))) ||
4433  // Check for _L to _LZ optimization
4434  !MI.getOperand(ArgOffset + I + 1).isReg()) {
4435  PackedAddrs.push_back(
4436  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4437  .getReg(0));
4438  } else {
4439  PackedAddrs.push_back(
4440  B.buildBuildVector(
4441  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4442  .getReg(0));
4443  ++I;
4444  }
4445  }
4446  }
4447 }
4448 
4449 /// Convert from separate vaddr components to a single vector address register,
4450 /// and replace the remaining operands with $noreg.
4451 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4452  int DimIdx, int NumVAddrs) {
4453  const LLT S32 = LLT::scalar(32);
4454 
4455  SmallVector<Register, 8> AddrRegs;
4456  for (int I = 0; I != NumVAddrs; ++I) {
4457  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4458  if (SrcOp.isReg()) {
4459  AddrRegs.push_back(SrcOp.getReg());
4460  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4461  }
4462  }
4463 
4464  int NumAddrRegs = AddrRegs.size();
4465  if (NumAddrRegs != 1) {
4466  // Above 8 elements round up to next power of 2 (i.e. 16).
4467  if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4468  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4469  auto Undef = B.buildUndef(S32);
4470  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4471  NumAddrRegs = RoundedNumRegs;
4472  }
4473 
4474  auto VAddr =
4475  B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4476  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4477  }
4478 
4479  for (int I = 1; I != NumVAddrs; ++I) {
4480  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4481  if (SrcOp.isReg())
4482  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4483  }
4484 }
4485 
4486 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4487 ///
4488 /// Depending on the subtarget, load/store with 16-bit element data need to be
4489 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4490 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4491 /// registers.
4492 ///
4493 /// We don't want to directly select image instructions just yet, but also want
4494 /// to expose all register repacking to the legalizer/combiners. We also don't
4495 /// want a selected instruction entering RegBankSelect. In order to avoid
4496 /// defining a multitude of intermediate image instructions, directly hack on
4497 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4498 /// padding now unnecessary arguments with $noreg.
4499 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4500  MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4501  const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4502 
4503  const unsigned NumDefs = MI.getNumExplicitDefs();
4504  const unsigned ArgOffset = NumDefs + 1;
4505  bool IsTFE = NumDefs == 2;
4506  // We are only processing the operands of d16 image operations on subtargets
4507  // that use the unpacked register layout, or need to repack the TFE result.
4508 
4509  // TODO: Do we need to guard against already legalized intrinsics?
4510  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4511  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4512 
4513  MachineRegisterInfo *MRI = B.getMRI();
4514  const LLT S32 = LLT::scalar(32);
4515  const LLT S16 = LLT::scalar(16);
4516  const LLT V2S16 = LLT::fixed_vector(2, 16);
4517 
4518  unsigned DMask = 0;
4519  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4520  LLT Ty = MRI->getType(VData);
4521 
4522  // Check for 16 bit addresses and pack if true.
4523  LLT GradTy =
4524  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4525  LLT AddrTy =
4526  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4527  const bool IsG16 = GradTy == S16;
4528  const bool IsA16 = AddrTy == S16;
4529  const bool IsD16 = Ty.getScalarType() == S16;
4530 
4531  int DMaskLanes = 0;
4532  if (!BaseOpcode->Atomic) {
4533  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4534  if (BaseOpcode->Gather4) {
4535  DMaskLanes = 4;
4536  } else if (DMask != 0) {
4537  DMaskLanes = countPopulation(DMask);
4538  } else if (!IsTFE && !BaseOpcode->Store) {
4539  // If dmask is 0, this is a no-op load. This can be eliminated.
4540  B.buildUndef(MI.getOperand(0));
4541  MI.eraseFromParent();
4542  return true;
4543  }
4544  }
4545 
4546  Observer.changingInstr(MI);
4547  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4548 
4549  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4550  : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4551  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4552  : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4553  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4554 
4555  // Track that we legalized this
4556  MI.setDesc(B.getTII().get(NewOpcode));
4557 
4558  // Expecting to get an error flag since TFC is on and dmask is 0. Force
4559  // dmask to be at least 1, otherwise the instruction will fail.
4560  if (IsTFE && DMask == 0) {
4561  DMask = 0x1;
4562  DMaskLanes = 1;
4563  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4564  }
4565 
4566  if (BaseOpcode->Atomic) {
4567  Register VData0 = MI.getOperand(2).getReg();
4568  LLT Ty = MRI->getType(VData0);
4569 
4570  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4571  if (Ty.isVector())
4572  return false;
4573 
4574  if (BaseOpcode->AtomicX2) {
4575  Register VData1 = MI.getOperand(3).getReg();
4576  // The two values are packed in one register.
4577  LLT PackedTy = LLT::fixed_vector(2, Ty);
4578  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4579  MI.getOperand(2).setReg(Concat.getReg(0));
4580  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4581  }
4582  }
4583 
4584  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4585 
4586  // Rewrite the addressing register layout before doing anything else.
4587  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4588  // 16 bit gradients are supported, but are tied to the A16 control
4589  // so both gradients and addresses must be 16 bit
4590  return false;
4591  }
4592 
4593  if (IsA16 && !ST.hasA16()) {
4594  // A16 not supported
4595  return false;
4596  }
4597 
4598  if (IsA16 || IsG16) {
4599  if (Intr->NumVAddrs > 1) {
4600  SmallVector<Register, 4> PackedRegs;
4601 
4602  packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4603  IsG16);
4604 
4605  // See also below in the non-a16 branch
4606  const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4607  PackedRegs.size() <= ST.getNSAMaxSize();
4608 
4609  if (!UseNSA && PackedRegs.size() > 1) {
4610  LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4611  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4612  PackedRegs[0] = Concat.getReg(0);
4613  PackedRegs.resize(1);
4614  }
4615 
4616  const unsigned NumPacked = PackedRegs.size();
4617  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4618  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4619  if (!SrcOp.isReg()) {
4620  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4621  continue;
4622  }
4623 
4624  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4625 
4626  if (I - Intr->VAddrStart < NumPacked)
4627  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4628  else
4629  SrcOp.setReg(AMDGPU::NoRegister);
4630  }
4631  }
4632  } else {
4633  // If the register allocator cannot place the address registers contiguously
4634  // without introducing moves, then using the non-sequential address encoding
4635  // is always preferable, since it saves VALU instructions and is usually a
4636  // wash in terms of code size or even better.
4637  //
4638  // However, we currently have no way of hinting to the register allocator
4639  // that MIMG addresses should be placed contiguously when it is possible to
4640  // do so, so force non-NSA for the common 2-address case as a heuristic.
4641  //
4642  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4643  // allocation when possible.
4644  const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4645  CorrectedNumVAddrs <= ST.getNSAMaxSize();
4646 
4647  if (!UseNSA && Intr->NumVAddrs > 1)
4648  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4649  Intr->NumVAddrs);
4650  }
4651 
4652  int Flags = 0;
4653  if (IsA16)
4654  Flags |= 1;
4655  if (IsG16)
4656  Flags |= 2;
4657  MI.addOperand(MachineOperand::CreateImm(Flags));
4658 
4659  if (BaseOpcode->Store) { // No TFE for stores?
4660  // TODO: Handle dmask trim
4661  if (!Ty.isVector() || !IsD16)
4662  return true;
4663 
4664  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4665  if (RepackedReg != VData) {
4666  MI.getOperand(1).setReg(RepackedReg);
4667  }
4668 
4669  return true;
4670  }
4671 
4672  Register DstReg = MI.getOperand(0).getReg();
4673  const LLT EltTy = Ty.getScalarType();
4674  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4675 
4676  // Confirm that the return type is large enough for the dmask specified
4677  if (NumElts < DMaskLanes)
4678  return false;
4679 
4680  if (NumElts > 4 || DMaskLanes > 4)
4681  return false;
4682 
4683  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4684  const LLT AdjustedTy =
4685  Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4686 
4687  // The raw dword aligned data component of the load. The only legal cases
4688  // where this matters should be when using the packed D16 format, for
4689  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4690  LLT RoundedTy;
4691 
4692  // S32 vector to cover all data, plus TFE result element.
4693  LLT TFETy;
4694 
4695  // Register type to use for each loaded component. Will be S32 or V2S16.
4696  LLT RegTy;
4697 
4698  if (IsD16 && ST.hasUnpackedD16VMem()) {
4699  RoundedTy =
4700  LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4701  TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4702  RegTy = S32;
4703  } else {
4704  unsigned EltSize = EltTy.getSizeInBits();
4705  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4706  unsigned RoundedSize = 32 * RoundedElts;
4707  RoundedTy = LLT::scalarOrVector(
4708  ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4709  TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4710  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4711  }
4712 
4713  // The return type does not need adjustment.
4714  // TODO: Should we change s16 case to s32 or <2 x s16>?
4715  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4716  return true;
4717 
4718  Register Dst1Reg;
4719 
4720  // Insert after the instruction.
4721  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4722 
4723  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4724  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4725  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4726  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4727 
4728  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4729 
4730  MI.getOperand(0).setReg(NewResultReg);
4731 
4732  // In the IR, TFE is supposed to be used with a 2 element struct return
4733  // type. The instruction really returns these two values in one contiguous
4734  // register, with one additional dword beyond the loaded data. Rewrite the
4735  // return type to use a single register result.
4736 
4737  if (IsTFE) {
4738  Dst1Reg = MI.getOperand(1).getReg();
4739  if (MRI->getType(Dst1Reg) != S32)
4740  return false;
4741 
4742  // TODO: Make sure the TFE operand bit is set.
4743  MI.removeOperand(1);
4744 
4745  // Handle the easy case that requires no repack instructions.
4746  if (Ty == S32) {
4747  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4748  return true;
4749  }
4750  }
4751 
4752  // Now figure out how to copy the new result register back into the old
4753  // result.
4754  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4755 
4756  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4757 
4758  if (ResultNumRegs == 1) {
4759  assert(!IsTFE);
4760  ResultRegs[0] = NewResultReg;
4761  } else {
4762  // We have to repack into a new vector of some kind.
4763  for (int I = 0; I != NumDataRegs; ++I)
4764  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4765  B.buildUnmerge(ResultRegs, NewResultReg);
4766 
4767  // Drop the final TFE element to get the data part. The TFE result is
4768  // directly written to the right place already.
4769  if (IsTFE)
4770  ResultRegs.resize(NumDataRegs);
4771  }
4772 
4773  // For an s16 scalar result, we form an s32 result with a truncate regardless
4774  // of packed vs. unpacked.
4775  if (IsD16 && !Ty.isVector()) {
4776  B.buildTrunc(DstReg, ResultRegs[0]);
4777  return true;
4778  }
4779 
4780  // Avoid a build/concat_vector of 1 entry.
4781  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4782  B.buildBitcast(DstReg, ResultRegs[0]);
4783  return true;
4784  }
4785 
4786  assert(Ty.isVector());
4787 
4788  if (IsD16) {
4789  // For packed D16 results with TFE enabled, all the data components are
4790  // S32. Cast back to the expected type.
4791  //
4792  // TODO: We don't really need to load s32 elements. We would only need one
4793  // cast for the TFE result if a multiple of v2s16 was used.
4794  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4795  for (Register &Reg : ResultRegs)
4796  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4797  } else if (ST.hasUnpackedD16VMem()) {
4798  for (Register &Reg : ResultRegs)
4799  Reg = B.buildTrunc(S16, Reg).getReg(0);
4800  }
4801  }
4802 
4803  auto padWithUndef = [&](LLT Ty, int NumElts) {
4804  if (NumElts == 0)
4805  return;
4806  Register Undef = B.buildUndef(Ty).getReg(0);
4807  for (int I = 0; I != NumElts; ++I)
4808  ResultRegs.push_back(Undef);
4809  };
4810 
4811  // Pad out any elements eliminated due to the dmask.
4812  LLT ResTy = MRI->getType(ResultRegs[0]);
4813  if (!ResTy.isVector()) {
4814  padWithUndef(ResTy, NumElts - ResultRegs.size());
4815  B.buildBuildVector(DstReg, ResultRegs);
4816  return true;
4817  }
4818 
4819  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4820  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4821 
4822  // Deal with the one annoying legal case.
4823  const LLT V3S16 = LLT::fixed_vector(3, 16);
4824  if (Ty == V3S16) {
4825  if (IsTFE) {
4826  if (ResultRegs.size() == 1) {
4827  NewResultReg = ResultRegs[0];
4828  } else if (ResultRegs.size() == 2) {
4829  LLT V4S16 = LLT::fixed_vector(4, 16);
4830  NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4831  } else {
4832  return false;
4833  }
4834  }
4835 
4836  if (MRI->getType(DstReg).getNumElements() <
4837  MRI->getType(NewResultReg).getNumElements()) {
4838  B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4839  } else {
4840  B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4841  }
4842  return true;
4843  }
4844 
4845  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4846  B.buildConcatVectors(DstReg, ResultRegs);
4847  return true;
4848 }
4849 
4850 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4851  LegalizerHelper &Helper, MachineInstr &MI) const {
4852  MachineIRBuilder &B = Helper.MIRBuilder;
4853  GISelChangeObserver &Observer = Helper.Observer;
4854 
4855  Register Dst = MI.getOperand(0).getReg();
4856  LLT Ty = B.getMRI()->getType(Dst);
4857  unsigned Size = Ty.getSizeInBits();
4858  MachineFunction &MF = B.getMF();
4859 
4860  Observer.changingInstr(MI);
4861 
4862  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4863  Ty = getBitcastRegisterType(Ty);
4864  Helper.bitcastDst(MI, Ty, 0);
4865  Dst = MI.getOperand(0).getReg();
4866  B.setInsertPt(B.getMBB(), MI);
4867  }
4868 
4869  // FIXME: We don't really need this intermediate instruction. The intrinsic
4870  // should be fixed to have a memory operand. Since it's readnone, we're not
4871  // allowed to add one.
4872  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4873  MI.removeOperand(1); // Remove intrinsic ID
4874 
4875  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4876  // TODO: Should this use datalayout alignment?
4877  const unsigned MemSize = (Size + 7) / 8;
4878  const Align MemAlign(4);
4879  MachineMemOperand *MMO = MF.getMachineMemOperand(
4880  MachinePointerInfo(),
4881  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4882  MachineMemOperand::MOInvariant,
4883  MemSize, MemAlign);
4884  MI.addMemOperand(MF, MMO);
4885 
4886  // There are no 96-bit result scalar loads, but widening to 128-bit should
4887  // always be legal. We may need to restore this to a 96-bit result if it turns
4888  // out this needs to be converted to a vector load during RegBankSelect.
4889  if (!isPowerOf2_32(Size)) {
4890  if (Ty.isVector())
4891  Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4892  else
4893  Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4894  }
4895 
4896  Observer.changedInstr(MI);
4897  return true;
4898 }
4899 
4900 // TODO: Move to selection
4901 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4902  MachineRegisterInfo &MRI,
4903  MachineIRBuilder &B) const {
4904  if (!ST.isTrapHandlerEnabled() ||
4905  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4906  return legalizeTrapEndpgm(MI, MRI, B);
4907 
4908  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4909  switch (*HsaAbiVer) {
4910  case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4911  case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4912  return legalizeTrapHsaQueuePtr(MI, MRI, B);
4913  case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4914  case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
4915  return ST.supportsGetDoorbellID() ?
4916  legalizeTrapHsa(MI, MRI, B) :
4917  legalizeTrapHsaQueuePtr(MI, MRI, B);
4918  }
4919  }
4920 
4921  llvm_unreachable("Unknown trap handler");
4922 }
4923 
4924 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4925  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4926  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4927  MI.eraseFromParent();
4928  return true;
4929 }
4930 
4931 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4932  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4933  MachineFunction &MF = B.getMF();
4934  const LLT S64 = LLT::scalar(64);
4935 
4936  Register SGPR01(AMDGPU::SGPR0_SGPR1);
4937  // For code object version 5, queue_ptr is passed through implicit kernarg.
4938  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
4939  AMDGPUTargetLowering::ImplicitParameter Param =
4940  AMDGPUTargetLowering::QUEUE_PTR;
4941  uint64_t Offset =
4942  ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
4943 
4944  Register KernargPtrReg = MRI.createGenericVirtualRegister(
4945  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4946 
4947  if (!loadInputValue(KernargPtrReg, B,
4948  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4949  return false;
4950 
4951  // TODO: can we be smarter about machine pointer info?
4952  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4953  MachineMemOperand *MMO = MF.getMachineMemOperand(
4954  PtrInfo,
4955  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4956  MachineMemOperand::MOInvariant,
4957  LLT::scalar(64), commonAlignment(Align(64), Offset));
4958 
4959  // Pointer address
4960  Register LoadAddr = MRI.createGenericVirtualRegister(
4961  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4962  B.buildPtrAdd(LoadAddr, KernargPtrReg,
4963  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
4964  // Load address
4965  Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
4966  B.buildCopy(SGPR01, Temp);
4967  B.buildInstr(AMDGPU::S_TRAP)
4968  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4969  .addReg(SGPR01, RegState::Implicit);
4970  MI.eraseFromParent();
4971  return true;
4972  }
4973 
4974  // Pass queue pointer to trap handler as input, and insert trap instruction
4975  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4976  Register LiveIn =
4977  MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4978  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4979  return false;
4980 
4981  B.buildCopy(SGPR01, LiveIn);
4982  B.buildInstr(AMDGPU::S_TRAP)
4983  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4984  .addReg(SGPR01, RegState::Implicit);
4985 
4986  MI.eraseFromParent();
4987  return true;
4988 }
4989 
4990 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4991  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4992  B.buildInstr(AMDGPU::S_TRAP)
4993  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4994  MI.eraseFromParent();
4995  return true;
4996 }
4997 
4998 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4999  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5000  // Is non-HSA path or trap-handler disabled? Then, report a warning
5001  // accordingly
5002  if (!ST.isTrapHandlerEnabled() ||
5003  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5004  DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5005  "debugtrap handler not supported",
5006  MI.getDebugLoc(), DS_Warning);
5007  LLVMContext &Ctx = B.getMF().getFunction().getContext();
5008  Ctx.diagnose(NoTrap);
5009  } else {
5010  // Insert debug-trap instruction
5011  B.buildInstr(AMDGPU::S_TRAP)
5012  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5013  }
5014 
5015  MI.eraseFromParent();
5016  return true;
5017 }
5018 
5019 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5020  MachineIRBuilder &B) const {
5021  MachineRegisterInfo &MRI = *B.getMRI();
5022  const LLT S16 = LLT::scalar(16);
5023  const LLT S32 = LLT::scalar(32);
5024 
5025  Register DstReg = MI.getOperand(0).getReg();
5026  Register NodePtr = MI.getOperand(2).getReg();
5027  Register RayExtent = MI.getOperand(3).getReg();
5028  Register RayOrigin = MI.getOperand(4).getReg();
5029  Register RayDir = MI.getOperand(5).getReg();
5030  Register RayInvDir = MI.getOperand(6).getReg();
5031  Register TDescr = MI.getOperand(7).getReg();
5032 
5033  if (!ST.hasGFX10_AEncoding()) {
5034  DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5035  "intrinsic not supported on subtarget",
5036  MI.getDebugLoc());
5037  B.getMF().getFunction().getContext().diagnose(BadIntrin);
5038  return false;
5039  }
5040 
5041  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5042  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5043  const unsigned NumVDataDwords = 4;
5044  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
5045  const bool UseNSA =
5046  ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
5047  const unsigned BaseOpcodes[2][2] = {
5048  {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5049  {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5050  AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5051  int Opcode;
5052  if (UseNSA) {
5053  Opcode =
5054  AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
5055  NumVDataDwords, NumVAddrDwords);
5056  } else {
5057  Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5058  AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
5059  PowerOf2Ceil(NumVAddrDwords));
5060  }
5061  assert(Opcode != -1);
5062 
5063  SmallVector<Register, 12> Ops;
5064  if (Is64) {
5065  auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5066  Ops.push_back(Unmerge.getReg(0));
5067  Ops.push_back(Unmerge.getReg(1));
5068  } else {
5069  Ops.push_back(NodePtr);
5070  }
5071  Ops.push_back(RayExtent);
5072 
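  // Split a <3 x s32> operand into its three scalar dwords and append them
  // to the address operand list.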
5073  auto packLanes = [&Ops, &S32, &B](Register Src) {
5074  auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5075  Ops.push_back(Unmerge.getReg(0));
5076  Ops.push_back(Unmerge.getReg(1));
5077  Ops.push_back(Unmerge.getReg(2));
5078  };
5079 
5080  packLanes(RayOrigin);
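  // With A16, the three 16-bit direction and inverse-direction components
  // are repacked in pairs into three 32-bit registers.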
5081  if (IsA16) {
5082  auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5083  auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5084  Register R1 = MRI.createGenericVirtualRegister(S32);
5085  Register R2 = MRI.createGenericVirtualRegister(S32);
5086  Register R3 = MRI.createGenericVirtualRegister(S32);
5087  B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5088  B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5089  B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5090  Ops.push_back(R1);
5091  Ops.push_back(R2);
5092  Ops.push_back(R3);
5093  } else {
5094  packLanes(RayDir);
5095  packLanes(RayInvDir);
5096  }
5097 
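  // Without NSA encoding, the address operands must be passed as one
  // contiguous register tuple rather than as separate registers.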
5098  if (!UseNSA) {
5099  // Build a single vector containing all the operands so far prepared.
5100  LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5101  Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5102  Ops.clear();
5103  Ops.push_back(MergedOps);
5104  }
5105 
5106  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5107  .addDef(DstReg)
5108  .addImm(Opcode);
5109 
5110  for (Register R : Ops) {
5111  MIB.addUse(R);
5112  }
5113 
5114  MIB.addUse(TDescr)
5115  .addImm(IsA16 ? 1 : 0)
5116  .cloneMemRefs(MI);
5117 
5118  MI.eraseFromParent();
5119  return true;
5120 }
5121 
5122 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5123  MachineIRBuilder &B) const {
5124  unsigned Opc;
5125  int RoundMode = MI.getOperand(2).getImm();
5126 
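  // Only round-upward and round-downward have dedicated pseudo opcodes;
  // other rounding modes are not handled here.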
5127  if (RoundMode == (int)RoundingMode::TowardPositive)
5128  Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5129  else if (RoundMode == (int)RoundingMode::TowardNegative)
5130  Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5131  else
5132  return false;
5133 
5134  B.buildInstr(Opc)
5135  .addDef(MI.getOperand(0).getReg())
5136  .addUse(MI.getOperand(1).getReg());
5137 
5138  MI.eraseFromParent();
5139 
5140  return true;
5141 }
5142 
5143 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5144  MachineInstr &MI) const {
5145  MachineIRBuilder &B = Helper.MIRBuilder;
5146  MachineRegisterInfo &MRI = *B.getMRI();
5147 
5148  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
5149  auto IntrID = MI.getIntrinsicID();
5150  switch (IntrID) {
5151  case Intrinsic::amdgcn_if:
5152  case Intrinsic::amdgcn_else: {
5153  MachineInstr *Br = nullptr;
5154  MachineBasicBlock *UncondBrTarget = nullptr;
5155  bool Negated = false;
5156  if (MachineInstr *BrCond =
5157  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5158  const SIRegisterInfo *TRI
5159  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5160 
5161  Register Def = MI.getOperand(1).getReg();
5162  Register Use = MI.getOperand(3).getReg();
5163 
5164  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5165 
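  // A negated condition means the conditional and unconditional branch
  // targets trade places.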
5166  if (Negated)
5167  std::swap(CondBrTarget, UncondBrTarget);
5168 
5169  B.setInsertPt(B.getMBB(), BrCond->getIterator());
5170  if (IntrID == Intrinsic::amdgcn_if) {
5171  B.buildInstr(AMDGPU::SI_IF)
5172  .addDef(Def)
5173  .addUse(Use)
5174  .addMBB(UncondBrTarget);
5175  } else {
5176  B.buildInstr(AMDGPU::SI_ELSE)
5177  .addDef(Def)
5178  .addUse(Use)
5179  .addMBB(UncondBrTarget);
5180  }
5181 
5182  if (Br) {
5183  Br->getOperand(0).setMBB(CondBrTarget);
5184  } else {
5185  // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5186  // since we're swapping branch targets it needs to be reinserted.
5187  // FIXME: IRTranslator should probably not do this
5188  B.buildBr(*CondBrTarget);
5189  }
5190 
5191  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5192  MRI.setRegClass(Use,