LLVM  14.0.0git
AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40  "amdgpu-global-isel-new-legality",
41  cl::desc("Use GlobalISel desired legality, rather than try to use "
42  "rules compatible with selection patterns"),
43  cl::init(false),
44  cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(LLT Ty) {
50  unsigned NElts = Ty.getNumElements();
51  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
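// For example, <3 x s16> has Log2_32_Ceil(3) == 2, so it is rounded up to
// 1 << 2 == 4 elements, i.e. <4 x s16>.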
54 
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(LLT Ty) {
57  unsigned Bits = Ty.getSizeInBits();
58  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59  return LLT::scalar(Pow2Bits);
60 }
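// For example, s48 rounds up to s64 and s96 rounds up to s128.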
61 
62 /// \returns true if this is an odd sized vector which should be widened by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66  return [=](const LegalityQuery &Query) {
67  const LLT Ty = Query.Types[TypeIdx];
68  if (!Ty.isVector())
69  return false;
70 
71  const LLT EltTy = Ty.getElementType();
72  const unsigned EltSize = EltTy.getSizeInBits();
73  return Ty.getNumElements() % 2 != 0 &&
74  EltSize > 1 && EltSize < 32 &&
75  Ty.getSizeInBits() % 32 != 0;
76  };
77 }
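// For example, <3 x s16> (48 bits, odd element count, 16-bit elements) is a
// small odd vector, while <3 x s32> is not, since its 96-bit size is already a
// multiple of 32 bits.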
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80  return [=](const LegalityQuery &Query) {
81  const LLT Ty = Query.Types[TypeIdx];
82  return Ty.getSizeInBits() % 32 == 0;
83  };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87  return [=](const LegalityQuery &Query) {
88  const LLT Ty = Query.Types[TypeIdx];
89  const LLT EltTy = Ty.getScalarType();
90  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91  };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95  return [=](const LegalityQuery &Query) {
96  const LLT Ty = Query.Types[TypeIdx];
97  const LLT EltTy = Ty.getElementType();
98  return std::make_pair(TypeIdx,
99  LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100  };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104  return [=](const LegalityQuery &Query) {
105  const LLT Ty = Query.Types[TypeIdx];
106  const LLT EltTy = Ty.getElementType();
107  unsigned Size = Ty.getSizeInBits();
108  unsigned Pieces = (Size + 63) / 64;
109  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110  return std::make_pair(
111  TypeIdx,
112  LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113  };
114 }
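// For example, <5 x s32> (160 bits) needs Pieces == 3, so NewNumElts ==
// (5 + 1) / 3 == 2 and the result is <2 x s32>.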
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119  return [=](const LegalityQuery &Query) {
120  const LLT Ty = Query.Types[TypeIdx];
121 
122  const LLT EltTy = Ty.getElementType();
123  const int Size = Ty.getSizeInBits();
124  const int EltSize = EltTy.getSizeInBits();
125  const int NextMul32 = (Size + 31) / 32;
126 
127  assert(EltSize < 32);
128 
129  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130  return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131  };
132 }
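// For example, <3 x s8> (24 bits) rounds up to one 32-bit word, giving
// NewNumElts == (32 + 7) / 8 == 4, i.e. <4 x s8>.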
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135  const unsigned Size = Ty.getSizeInBits();
136 
137  LLT CoercedTy;
138  if (Size <= 32) {
139  // <2 x s8> -> s16
140  // <4 x s8> -> s32
141  return LLT::scalar(Size);
142  }
143 
144  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148  return [=](const LegalityQuery &Query) {
149  const LLT Ty = Query.Types[TypeIdx];
150  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151  };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155  return [=](const LegalityQuery &Query) {
156  const LLT Ty = Query.Types[TypeIdx];
157  unsigned Size = Ty.getSizeInBits();
158  assert(Size % 32 == 0);
159  return std::make_pair(
160  TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161  };
162 }
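// For example, a 96-bit type becomes <3 x s32>, and a 32-bit type becomes a
// plain s32 (scalarOrVector collapses a single-element count to a scalar).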
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165  return [=](const LegalityQuery &Query) {
166  const LLT QueryTy = Query.Types[TypeIdx];
167  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168  };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172  return [=](const LegalityQuery &Query) {
173  const LLT QueryTy = Query.Types[TypeIdx];
174  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175  };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179  return [=](const LegalityQuery &Query) {
180  const LLT QueryTy = Query.Types[TypeIdx];
181  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182  };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186  return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190  const int EltSize = EltTy.getSizeInBits();
191  return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195  const int EltSize = Ty.getElementType().getSizeInBits();
196  return EltSize == 32 || EltSize == 64 ||
197  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198  EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202  if (!isRegisterSize(Ty.getSizeInBits()))
203  return false;
204 
205  if (Ty.isVector())
206  return isRegisterVectorType(Ty);
207 
208  return true;
209 }
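// For example, <3 x s32> (96 bits) and <2 x s16> (32 bits) are register
// types, but <3 x s16> is not because 48 bits is not a multiple of 32.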
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214  return [=](const LegalityQuery &Query) {
215  return isRegisterType(Query.Types[TypeIdx]);
216  };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220  return [=](const LegalityQuery &Query) {
221  const LLT QueryTy = Query.Types[TypeIdx];
222  if (!QueryTy.isVector())
223  return false;
224  const LLT EltTy = QueryTy.getElementType();
225  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226  };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232  return [=](const LegalityQuery &Query) {
233  const LLT Ty = Query.Types[TypeIdx];
234  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236  };
237 }
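// For example, an s64 value with a 32-bit memory type matches; the load/store
// rules below use this to narrow such cases to s32 first.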
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243  bool IsLoad) {
244  switch (AS) {
245  case AMDGPUAS::PRIVATE_ADDRESS:
246  // FIXME: Private element size.
247  return ST.enableFlatScratch() ? 128 : 32;
248  case AMDGPUAS::LOCAL_ADDRESS:
249  return ST.useDS128() ? 128 : 64;
250  case AMDGPUAS::GLOBAL_ADDRESS:
251  case AMDGPUAS::CONSTANT_ADDRESS:
252  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253  // Treat constant and global as identical. SMRD loads are sometimes usable for
254  // global loads (ideally constant address space should be eliminated)
255  // depending on the context. Legality cannot be context dependent, but
256  // RegBankSelect can split the load as necessary depending on the pointer
257  // register bank/uniformity and if the memory is invariant or not written in a
258  // kernel.
259  return IsLoad ? 512 : 128;
260  default:
261  // Flat addresses may contextually need to be split to 32-bit parts if they
262  // may alias scratch depending on the subtarget.
263  return 128;
264  }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268  const LegalityQuery &Query) {
269  const LLT Ty = Query.Types[0];
270 
271  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274  unsigned RegSize = Ty.getSizeInBits();
275  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277  unsigned AS = Query.Types[1].getAddressSpace();
278 
279  // All of these need to be custom lowered to cast the pointer operand.
280  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281  return false;
282 
283  // Do not handle extending vector loads.
284  if (Ty.isVector() && MemSize != RegSize)
285  return false;
286 
287  // TODO: We should be able to widen loads if the alignment is high enough, but
288  // we also need to modify the memory access size.
289 #if 0
290  // Accept widening loads based on alignment.
291  if (IsLoad && MemSize < Size)
292  MemSize = std::max(MemSize, Align);
293 #endif
294 
295  // Only 1-byte and 2-byte to 32-bit extloads are valid.
296  if (MemSize != RegSize && RegSize != 32)
297  return false;
298 
299  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300  return false;
301 
302  switch (MemSize) {
303  case 8:
304  case 16:
305  case 32:
306  case 64:
307  case 128:
308  break;
309  case 96:
310  if (!ST.hasDwordx3LoadStores())
311  return false;
312  break;
313  case 256:
314  case 512:
315  // These may contextually need to be broken down.
316  break;
317  default:
318  return false;
319  }
320 
321  assert(RegSize >= MemSize);
322 
323  if (AlignBits < MemSize) {
324  const SITargetLowering *TLI = ST.getTargetLowering();
325  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326  Align(AlignBits / 8)))
327  return false;
328  }
329 
330  return true;
331 }
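// For example, an extending load from 16-bit memory into an s64 register is
// rejected here (extloads are only allowed into 32-bit registers), as is a
// 96-bit access on subtargets without dwordx3 load/stores.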
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
334 // work around this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will workaround this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338  if (EnableNewLegality)
339  return false;
340 
341  const unsigned Size = Ty.getSizeInBits();
342  if (Size <= 64)
343  return false;
344  if (!Ty.isVector())
345  return true;
346 
347  LLT EltTy = Ty.getElementType();
348  if (EltTy.isPointer())
349  return true;
350 
351  unsigned EltSize = EltTy.getSizeInBits();
352  return EltSize != 32 && EltSize != 64;
353 }
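// For example, with the default legality rules <8 x s16> (128 bits) hits this
// workaround and is later bitcast to <4 x s32> by the load/store rules.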
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356  const LLT Ty = Query.Types[0];
357  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358  !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364  const LLT MemTy) {
365  const unsigned MemSizeInBits = MemTy.getSizeInBits();
366  const unsigned Size = Ty.getSizeInBits();
367  if (Size != MemSizeInBits)
368  return Size <= 32 && Ty.isVector();
369 
370  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371  return true;
372 
373  // Don't try to handle bitcasting vector ext loads for now.
374  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375  (Size <= 32 || isRegisterSize(Size)) &&
376  !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note that in this case the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383  unsigned AlignInBits, unsigned AddrSpace,
384  unsigned Opcode) {
385  unsigned SizeInBits = MemoryTy.getSizeInBits();
386  // We don't want to widen cases that are naturally legal.
387  if (isPowerOf2_32(SizeInBits))
388  return false;
389 
390  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391  // end up widening these for a scalar load during RegBankSelect, since there
392  // aren't 96-bit scalar loads.
393  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394  return false;
395 
396  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397  return false;
398 
399  // A load is known dereferenceable up to the alignment, so it's legal to widen
400  // to it.
401  //
402  // TODO: Could check dereferenceable for less aligned cases.
403  unsigned RoundedSize = NextPowerOf2(SizeInBits);
404  if (AlignInBits < RoundedSize)
405  return false;
406 
407  // Do not widen if it would introduce a slow unaligned load.
408  const SITargetLowering *TLI = ST.getTargetLowering();
409  bool Fast = false;
410  return TLI->allowsMisalignedMemoryAccessesImpl(
411  RoundedSize, AddrSpace, Align(AlignInBits / 8),
412  MachineMemOperand::MOLoad, &Fast) &&
413  Fast;
414 }
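// For example, a 96-bit load known to be 128-bit aligned can be widened to a
// 128-bit load on subtargets without 96-bit (dwordx3) load/stores, provided
// the target reports the widened access as fast.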
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417  unsigned Opcode) {
418  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419  return false;
420 
421  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422  Query.MMODescrs[0].AlignInBits,
423  Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427  const GCNTargetMachine &TM)
428  : ST(ST_) {
429  using namespace TargetOpcode;
430 
431  auto GetAddrSpacePtr = [&TM](unsigned AS) {
432  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433  };
434 
435  const LLT S1 = LLT::scalar(1);
436  const LLT S8 = LLT::scalar(8);
437  const LLT S16 = LLT::scalar(16);
438  const LLT S32 = LLT::scalar(32);
439  const LLT S64 = LLT::scalar(64);
440  const LLT S128 = LLT::scalar(128);
441  const LLT S256 = LLT::scalar(256);
442  const LLT S512 = LLT::scalar(512);
443  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445  const LLT V2S8 = LLT::fixed_vector(2, 8);
446  const LLT V2S16 = LLT::fixed_vector(2, 16);
447  const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449  const LLT V2S32 = LLT::fixed_vector(2, 32);
450  const LLT V3S32 = LLT::fixed_vector(3, 32);
451  const LLT V4S32 = LLT::fixed_vector(4, 32);
452  const LLT V5S32 = LLT::fixed_vector(5, 32);
453  const LLT V6S32 = LLT::fixed_vector(6, 32);
454  const LLT V7S32 = LLT::fixed_vector(7, 32);
455  const LLT V8S32 = LLT::fixed_vector(8, 32);
456  const LLT V9S32 = LLT::fixed_vector(9, 32);
457  const LLT V10S32 = LLT::fixed_vector(10, 32);
458  const LLT V11S32 = LLT::fixed_vector(11, 32);
459  const LLT V12S32 = LLT::fixed_vector(12, 32);
460  const LLT V13S32 = LLT::fixed_vector(13, 32);
461  const LLT V14S32 = LLT::fixed_vector(14, 32);
462  const LLT V15S32 = LLT::fixed_vector(15, 32);
463  const LLT V16S32 = LLT::fixed_vector(16, 32);
464  const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466  const LLT V2S64 = LLT::fixed_vector(2, 64);
467  const LLT V3S64 = LLT::fixed_vector(3, 64);
468  const LLT V4S64 = LLT::fixed_vector(4, 64);
469  const LLT V5S64 = LLT::fixed_vector(5, 64);
470  const LLT V6S64 = LLT::fixed_vector(6, 64);
471  const LLT V7S64 = LLT::fixed_vector(7, 64);
472  const LLT V8S64 = LLT::fixed_vector(8, 64);
473  const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475  std::initializer_list<LLT> AllS32Vectors =
476  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478  std::initializer_list<LLT> AllS64Vectors =
479  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489  const LLT CodePtr = FlatPtr;
490 
491  const std::initializer_list<LLT> AddrSpaces64 = {
492  GlobalPtr, ConstantPtr, FlatPtr
493  };
494 
495  const std::initializer_list<LLT> AddrSpaces32 = {
496  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497  };
498 
499  const std::initializer_list<LLT> FPTypesBase = {
500  S32, S64
501  };
502 
503  const std::initializer_list<LLT> FPTypes16 = {
504  S32, S64, S16
505  };
506 
507  const std::initializer_list<LLT> FPTypesPK16 = {
508  S32, S64, S16, V2S16
509  };
510 
511  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513  // s1 for VCC branches, s32 for SCC branches.
514  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517  // elements for v3s16
518  getActionDefinitionsBuilder(G_PHI)
519  .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520  .legalFor(AllS32Vectors)
521  .legalFor(AllS64Vectors)
522  .legalFor(AddrSpaces64)
523  .legalFor(AddrSpaces32)
524  .legalIf(isPointer(0))
525  .clampScalar(0, S16, S256)
526  .widenScalarToNextPow2(0, 32)
527  .clampMaxNumElements(0, S32, 16)
528  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529  .scalarize(0);
530 
531  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532  // Full set of gfx9 features.
533  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534  .legalFor({S32, S16, V2S16})
535  .clampScalar(0, S16, S32)
536  .clampMaxNumElements(0, S16, 2)
537  .scalarize(0)
538  .widenScalarToNextPow2(0, 32);
539 
540  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541  .legalFor({S32, S16, V2S16}) // Clamp modifier
542  .minScalarOrElt(0, S16)
543  .clampMaxNumElements(0, S16, 2)
544  .scalarize(0)
545  .widenScalarToNextPow2(0, 32)
546  .lower();
547  } else if (ST.has16BitInsts()) {
548  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549  .legalFor({S32, S16})
550  .clampScalar(0, S16, S32)
551  .scalarize(0)
552  .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553 
554  // Technically the saturating operations require clamp bit support, but this
555  // was introduced at the same time as 16-bit operations.
556  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557  .legalFor({S32, S16}) // Clamp modifier
558  .minScalar(0, S16)
559  .scalarize(0)
560  .widenScalarToNextPow2(0, 16)
561  .lower();
562 
563  // We're just lowering this, but it helps get a better result to try to
564  // coerce to the desired type first.
565  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566  .minScalar(0, S16)
567  .scalarize(0)
568  .lower();
569  } else {
570  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571  .legalFor({S32})
572  .clampScalar(0, S32, S32)
573  .scalarize(0);
574 
575  if (ST.hasIntClamp()) {
576  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577  .legalFor({S32}) // Clamp modifier.
578  .scalarize(0)
579  .minScalarOrElt(0, S32)
580  .lower();
581  } else {
582  // Clamp bit support was added in VI, along with 16-bit operations.
583  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584  .minScalar(0, S32)
585  .scalarize(0)
586  .lower();
587  }
588 
589  // FIXME: DAG expansion gets better results. The widening uses the smaller
590  // range values and goes for the min/max lowering directly.
591  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592  .minScalar(0, S32)
593  .scalarize(0)
594  .lower();
595  }
596 
597  getActionDefinitionsBuilder(
598  {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599  .customFor({S32, S64})
600  .clampScalar(0, S32, S64)
601  .widenScalarToNextPow2(0, 32)
602  .scalarize(0);
603 
604  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605  .legalFor({S32})
606  .maxScalarOrElt(0, S32);
607 
608  if (ST.hasVOP3PInsts()) {
609  Mulh
610  .clampMaxNumElements(0, S8, 2)
611  .lowerFor({V2S8});
612  }
613 
614  Mulh
615  .scalarize(0)
616  .lower();
617 
618  // Report legal for any types we can handle anywhere. For the cases only legal
619  // on the SALU, RegBankSelect will be able to re-legalize.
620  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622  .clampScalar(0, S32, S64)
623  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624  .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625  .widenScalarToNextPow2(0)
626  .scalarize(0);
627 
628  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630  .legalFor({{S32, S1}, {S32, S32}})
631  .minScalar(0, S32)
632  // TODO: .scalarize(0)
633  .lower();
634 
635  getActionDefinitionsBuilder(G_BITCAST)
636  // Don't worry about the size constraint.
637  .legalIf(all(isRegisterType(0), isRegisterType(1)))
638  .lower();
639 
640 
641  getActionDefinitionsBuilder(G_CONSTANT)
642  .legalFor({S1, S32, S64, S16, GlobalPtr,
643  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644  .legalIf(isPointer(0))
645  .clampScalar(0, S32, S64)
646  .widenScalarToNextPow2(0);
647 
648  getActionDefinitionsBuilder(G_FCONSTANT)
649  .legalFor({S32, S64, S16})
650  .clampScalar(0, S16, S64);
651 
652  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653  .legalIf(isRegisterType(0))
654  // s1 and s16 are special cases because they have legal operations on
655  // them, but don't really occupy registers in the normal way.
656  .legalFor({S1, S16})
657  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658  .clampScalarOrElt(0, S32, MaxScalar)
659  .widenScalarToNextPow2(0, 32)
660  .clampMaxNumElements(0, S32, 16);
661 
662  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663 
664  // If the amount is divergent, we have to do a wave reduction to get the
665  // maximum value, so this is expanded during RegBankSelect.
666  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667  .legalFor({{PrivatePtr, S32}});
668 
669  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670  .customIf(typeIsNot(0, PrivatePtr));
671 
672  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673 
674  auto &FPOpActions = getActionDefinitionsBuilder(
675  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676  .legalFor({S32, S64});
677  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678  .customFor({S32, S64});
679  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680  .customFor({S32, S64});
681 
682  if (ST.has16BitInsts()) {
683  if (ST.hasVOP3PInsts())
684  FPOpActions.legalFor({S16, V2S16});
685  else
686  FPOpActions.legalFor({S16});
687 
688  TrigActions.customFor({S16});
689  FDIVActions.customFor({S16});
690  }
691 
692  auto &MinNumMaxNum = getActionDefinitionsBuilder({
693  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694 
695  if (ST.hasVOP3PInsts()) {
696  MinNumMaxNum.customFor(FPTypesPK16)
697  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698  .clampMaxNumElements(0, S16, 2)
699  .clampScalar(0, S16, S64)
700  .scalarize(0);
701  } else if (ST.has16BitInsts()) {
702  MinNumMaxNum.customFor(FPTypes16)
703  .clampScalar(0, S16, S64)
704  .scalarize(0);
705  } else {
706  MinNumMaxNum.customFor(FPTypesBase)
707  .clampScalar(0, S32, S64)
708  .scalarize(0);
709  }
710 
711  if (ST.hasVOP3PInsts())
712  FPOpActions.clampMaxNumElements(0, S16, 2);
713 
714  FPOpActions
715  .scalarize(0)
716  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717 
718  TrigActions
719  .scalarize(0)
720  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721 
722  FDIVActions
723  .scalarize(0)
724  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725 
726  getActionDefinitionsBuilder({G_FNEG, G_FABS})
727  .legalFor(FPTypesPK16)
728  .clampMaxNumElements(0, S16, 2)
729  .scalarize(0)
730  .clampScalar(0, S16, S64);
731 
732  if (ST.has16BitInsts()) {
733  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734  .legalFor({S32, S64, S16})
735  .scalarize(0)
736  .clampScalar(0, S16, S64);
737  } else {
738  getActionDefinitionsBuilder(G_FSQRT)
739  .legalFor({S32, S64})
740  .scalarize(0)
741  .clampScalar(0, S32, S64);
742 
743  if (ST.hasFractBug()) {
744  getActionDefinitionsBuilder(G_FFLOOR)
745  .customFor({S64})
746  .legalFor({S32, S64})
747  .scalarize(0)
748  .clampScalar(0, S32, S64);
749  } else {
750  getActionDefinitionsBuilder(G_FFLOOR)
751  .legalFor({S32, S64})
752  .scalarize(0)
753  .clampScalar(0, S32, S64);
754  }
755  }
756 
757  getActionDefinitionsBuilder(G_FPTRUNC)
758  .legalFor({{S32, S64}, {S16, S32}})
759  .scalarize(0)
760  .lower();
761 
762  getActionDefinitionsBuilder(G_FPEXT)
763  .legalFor({{S64, S32}, {S32, S16}})
764  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765  .scalarize(0);
766 
767  getActionDefinitionsBuilder(G_FSUB)
768  // Use actual fsub instruction
769  .legalFor({S32})
770  // Must use fadd + fneg
771  .lowerFor({S64, S16, V2S16})
772  .scalarize(0)
773  .clampScalar(0, S32, S64);
774 
775  // Whether this is legal depends on the floating point mode for the function.
776  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778  FMad.customFor({S32, S16});
779  else if (ST.hasMadMacF32Insts())
780  FMad.customFor({S32});
781  else if (ST.hasMadF16())
782  FMad.customFor({S16});
783  FMad.scalarize(0)
784  .lower();
785 
786  auto &FRem = getActionDefinitionsBuilder(G_FREM);
787  if (ST.has16BitInsts()) {
788  FRem.customFor({S16, S32, S64});
789  } else {
790  FRem.minScalar(0, S32)
791  .customFor({S32, S64});
792  }
793  FRem.scalarize(0);
794 
795  // TODO: Do we need to clamp maximum bitwidth?
796  getActionDefinitionsBuilder(G_TRUNC)
797  .legalIf(isScalar(0))
798  .legalFor({{V2S16, V2S32}})
799  .clampMaxNumElements(0, S16, 2)
800  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801  // situations (like an invalid implicit use), we don't want to infinite loop
802  // in the legalizer.
803  .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804  .alwaysLegal();
805 
806  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808  {S32, S1}, {S64, S1}, {S16, S1}})
809  .scalarize(0)
810  .clampScalar(0, S32, S64)
811  .widenScalarToNextPow2(1, 32);
812 
813  // TODO: Split s1->s64 during regbankselect for VALU.
814  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816  .lowerFor({{S32, S64}})
817  .lowerIf(typeIs(1, S1))
818  .customFor({{S64, S64}});
819  if (ST.has16BitInsts())
820  IToFP.legalFor({{S16, S16}});
821  IToFP.clampScalar(1, S32, S64)
822  .minScalar(0, S32)
823  .scalarize(0)
824  .widenScalarToNextPow2(1);
825 
826  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
827  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
828  .customFor({{S64, S32}, {S64, S64}})
829  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
830  if (ST.has16BitInsts())
831  FPToI.legalFor({{S16, S16}});
832  else
833  FPToI.minScalar(1, S32);
834 
835  FPToI.minScalar(0, S32)
836  .widenScalarToNextPow2(0, 32)
837  .scalarize(0)
838  .lower();
839 
840  // Lower roundeven into G_FRINT
841  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
842  .scalarize(0)
843  .lower();
844 
845  if (ST.has16BitInsts()) {
846  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847  .legalFor({S16, S32, S64})
848  .clampScalar(0, S16, S64)
849  .scalarize(0);
850  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
851  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852  .legalFor({S32, S64})
853  .clampScalar(0, S32, S64)
854  .scalarize(0);
855  } else {
856  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
857  .legalFor({S32})
858  .customFor({S64})
859  .clampScalar(0, S32, S64)
860  .scalarize(0);
861  }
862 
863  getActionDefinitionsBuilder(G_PTR_ADD)
864  .legalIf(all(isPointer(0), sameSize(0, 1)))
865  .scalarize(0)
866  .scalarSameSizeAs(1, 0);
867 
868  getActionDefinitionsBuilder(G_PTRMASK)
869  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
870  .scalarSameSizeAs(1, 0)
871  .scalarize(0);
872 
873  auto &CmpBuilder =
874  getActionDefinitionsBuilder(G_ICMP)
875  // The compare output type differs based on the register bank of the output,
876  // so make both s1 and s32 legal.
877  //
878  // Scalar compares producing output in scc will be promoted to s32, as that
879  // is the allocatable register type that will be needed for the copy from
880  // scc. This will be promoted during RegBankSelect, and we assume something
881  // before that won't try to use s32 result types.
882  //
883  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
884  // bank.
885  .legalForCartesianProduct(
886  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
887  .legalForCartesianProduct(
888  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
889  if (ST.has16BitInsts()) {
890  CmpBuilder.legalFor({{S1, S16}});
891  }
892 
893  CmpBuilder
894  .widenScalarToNextPow2(1)
895  .clampScalar(1, S32, S64)
896  .scalarize(0)
897  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
898 
899  getActionDefinitionsBuilder(G_FCMP)
900  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
901  .widenScalarToNextPow2(1)
902  .clampScalar(1, S32, S64)
903  .scalarize(0);
904 
905  // FIXME: fpow has a selection pattern that should move to custom lowering.
906  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
907  if (ST.has16BitInsts())
908  Exp2Ops.legalFor({S32, S16});
909  else
910  Exp2Ops.legalFor({S32});
911  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
912  Exp2Ops.scalarize(0);
913 
914  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
915  if (ST.has16BitInsts())
916  ExpOps.customFor({{S32}, {S16}});
917  else
918  ExpOps.customFor({S32});
919  ExpOps.clampScalar(0, MinScalarFPTy, S32)
920  .scalarize(0);
921 
922  getActionDefinitionsBuilder(G_FPOWI)
923  .clampScalar(0, MinScalarFPTy, S32)
924  .lower();
925 
926  // The 64-bit versions produce 32-bit results, but only on the SALU.
927  getActionDefinitionsBuilder(G_CTPOP)
928  .legalFor({{S32, S32}, {S32, S64}})
929  .clampScalar(0, S32, S32)
930  .clampScalar(1, S32, S64)
931  .scalarize(0)
932  .widenScalarToNextPow2(0, 32)
933  .widenScalarToNextPow2(1, 32);
934 
935  // The hardware instructions return a different result on 0 than the generic
936  // instructions expect. The hardware produces -1, but these produce the
937  // bitwidth.
938  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
939  .scalarize(0)
940  .clampScalar(0, S32, S32)
941  .clampScalar(1, S32, S64)
942  .widenScalarToNextPow2(0, 32)
943  .widenScalarToNextPow2(1, 32)
944  .lower();
945 
946  // The 64-bit versions produce 32-bit results, but only on the SALU.
947  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
948  .legalFor({{S32, S32}, {S32, S64}})
949  .clampScalar(0, S32, S32)
950  .clampScalar(1, S32, S64)
951  .scalarize(0)
952  .widenScalarToNextPow2(0, 32)
953  .widenScalarToNextPow2(1, 32);
954 
955  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
956  // RegBankSelect.
957  getActionDefinitionsBuilder(G_BITREVERSE)
958  .legalFor({S32, S64})
959  .clampScalar(0, S32, S64)
960  .scalarize(0)
961  .widenScalarToNextPow2(0);
962 
963  if (ST.has16BitInsts()) {
964  getActionDefinitionsBuilder(G_BSWAP)
965  .legalFor({S16, S32, V2S16})
966  .clampMaxNumElements(0, S16, 2)
967  // FIXME: Fixing non-power-of-2 before clamp is workaround for
968  // narrowScalar limitation.
969  .widenScalarToNextPow2(0)
970  .clampScalar(0, S16, S32)
971  .scalarize(0);
972 
973  if (ST.hasVOP3PInsts()) {
974  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
975  .legalFor({S32, S16, V2S16})
976  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
977  .clampMaxNumElements(0, S16, 2)
978  .minScalar(0, S16)
979  .widenScalarToNextPow2(0)
980  .scalarize(0)
981  .lower();
982  } else {
983  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
984  .legalFor({S32, S16})
985  .widenScalarToNextPow2(0)
986  .minScalar(0, S16)
987  .scalarize(0)
988  .lower();
989  }
990  } else {
991  // TODO: Should have same legality without v_perm_b32
992  getActionDefinitionsBuilder(G_BSWAP)
993  .legalFor({S32})
994  .lowerIf(scalarNarrowerThan(0, 32))
995  // FIXME: Fixing non-power-of-2 before clamp is workaround for
996  // narrowScalar limitation.
997  .widenScalarToNextPow2(0)
998  .maxScalar(0, S32)
999  .scalarize(0)
1000  .lower();
1001 
1002  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1003  .legalFor({S32})
1004  .minScalar(0, S32)
1005  .widenScalarToNextPow2(0)
1006  .scalarize(0)
1007  .lower();
1008  }
1009 
1010  getActionDefinitionsBuilder(G_INTTOPTR)
1011  // List the common cases
1012  .legalForCartesianProduct(AddrSpaces64, {S64})
1013  .legalForCartesianProduct(AddrSpaces32, {S32})
1014  .scalarize(0)
1015  // Accept any address space as long as the size matches
1016  .legalIf(sameSize(0, 1))
1017  .widenScalarIf(smallerThan(1, 0),
1018  [](const LegalityQuery &Query) {
1019  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1020  })
1021  .narrowScalarIf(largerThan(1, 0),
1022  [](const LegalityQuery &Query) {
1023  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1024  });
1025 
1026  getActionDefinitionsBuilder(G_PTRTOINT)
1027  // List the common cases
1028  .legalForCartesianProduct(AddrSpaces64, {S64})
1029  .legalForCartesianProduct(AddrSpaces32, {S32})
1030  .scalarize(0)
1031  // Accept any address space as long as the size matches
1032  .legalIf(sameSize(0, 1))
1033  .widenScalarIf(smallerThan(0, 1),
1034  [](const LegalityQuery &Query) {
1035  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036  })
1037  .narrowScalarIf(
1038  largerThan(0, 1),
1039  [](const LegalityQuery &Query) {
1040  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1041  });
1042 
1043  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1044  .scalarize(0)
1045  .custom();
1046 
1047  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1048  bool IsLoad) -> bool {
1049  const LLT DstTy = Query.Types[0];
1050 
1051  // Split vector extloads.
1052  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1053  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1054 
1055  if (MemSize < DstTy.getSizeInBits())
1056  MemSize = std::max(MemSize, AlignBits);
1057 
1058  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1059  return true;
1060 
1061  const LLT PtrTy = Query.Types[1];
1062  unsigned AS = PtrTy.getAddressSpace();
1063  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1064  return true;
1065 
1066  // Catch weird sized loads that don't evenly divide into the access sizes
1067  // TODO: May be able to widen depending on alignment etc.
1068  unsigned NumRegs = (MemSize + 31) / 32;
1069  if (NumRegs == 3) {
1070  if (!ST.hasDwordx3LoadStores())
1071  return true;
1072  } else {
1073  // If the alignment allows, these should have been widened.
1074  if (!isPowerOf2_32(NumRegs))
1075  return true;
1076  }
1077 
1078  if (AlignBits < MemSize) {
1079  const SITargetLowering *TLI = ST.getTargetLowering();
1080  return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1081  Align(AlignBits / 8));
1082  }
1083 
1084  return false;
1085  };
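// For example, a 96-bit scalar load must be split on subtargets without
// dwordx3 load/stores, and an access that exceeds maxSizeForAddrSpace for its
// address space is always split.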
1086 
1087  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1088  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1089  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1090 
1091  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1092  // LDS
1093  // TODO: Unsupported flat for SI.
1094 
1095  for (unsigned Op : {G_LOAD, G_STORE}) {
1096  const bool IsStore = Op == G_STORE;
1097 
1098  auto &Actions = getActionDefinitionsBuilder(Op);
1099  // Explicitly list some common cases.
1100  // TODO: Does this help compile time at all?
1101  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1102  {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1103  {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1104  {S64, GlobalPtr, S64, GlobalAlign32},
1105  {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1106  {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1107  {S32, GlobalPtr, S8, GlobalAlign8},
1108  {S32, GlobalPtr, S16, GlobalAlign16},
1109 
1110  {S32, LocalPtr, S32, 32},
1111  {S64, LocalPtr, S64, 32},
1112  {V2S32, LocalPtr, V2S32, 32},
1113  {S32, LocalPtr, S8, 8},
1114  {S32, LocalPtr, S16, 16},
1115  {V2S16, LocalPtr, S32, 32},
1116 
1117  {S32, PrivatePtr, S32, 32},
1118  {S32, PrivatePtr, S8, 8},
1119  {S32, PrivatePtr, S16, 16},
1120  {V2S16, PrivatePtr, S32, 32},
1121 
1122  {S32, ConstantPtr, S32, GlobalAlign32},
1123  {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1124  {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1125  {S64, ConstantPtr, S64, GlobalAlign32},
1126  {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1127  Actions.legalIf(
1128  [=](const LegalityQuery &Query) -> bool {
1129  return isLoadStoreLegal(ST, Query);
1130  });
1131 
1132  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1133  // 64-bits.
1134  //
1135  // TODO: Should generalize bitcast action into coerce, which will also cover
1136  // inserting addrspacecasts.
1137  Actions.customIf(typeIs(1, Constant32Ptr));
1138 
1139  // Turn any illegal element vectors into something easier to deal
1140  // with. These will ultimately produce 32-bit scalar shifts to extract the
1141  // parts anyway.
1142  //
1143  // For odd 16-bit element vectors, prefer to split those into pieces with
1144  // 16-bit vector parts.
1145  Actions.bitcastIf(
1146  [=](const LegalityQuery &Query) -> bool {
1147  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1148  Query.MMODescrs[0].MemoryTy);
1149  }, bitcastToRegisterType(0));
1150 
1151  if (!IsStore) {
1152  // Widen suitably aligned loads by loading extra bytes. The standard
1153  // legalization actions can't properly express widening memory operands.
1154  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1155  return shouldWidenLoad(ST, Query, G_LOAD);
1156  });
1157  }
1158 
1159  // FIXME: load/store narrowing should be moved to lower action
1160  Actions
1161  .narrowScalarIf(
1162  [=](const LegalityQuery &Query) -> bool {
1163  return !Query.Types[0].isVector() &&
1164  needToSplitMemOp(Query, Op == G_LOAD);
1165  },
1166  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1167  const LLT DstTy = Query.Types[0];
1168  const LLT PtrTy = Query.Types[1];
1169 
1170  const unsigned DstSize = DstTy.getSizeInBits();
1171  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1172 
1173  // Split extloads.
1174  if (DstSize > MemSize)
1175  return std::make_pair(0, LLT::scalar(MemSize));
1176 
1177  if (!isPowerOf2_32(DstSize)) {
1178  // We're probably decomposing an odd sized store. Try to split
1179  // to the widest type. TODO: Account for alignment. As-is it
1180  // should be OK, since the new parts will be further legalized.
1181  unsigned FloorSize = PowerOf2Floor(DstSize);
1182  return std::make_pair(0, LLT::scalar(FloorSize));
1183  }
1184 
1185  if (DstSize > 32 && (DstSize % 32 != 0)) {
1186  // FIXME: Need a way to specify non-extload of larger size if
1187  // suitably aligned.
1188  return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1189  }
1190 
1191  unsigned MaxSize = maxSizeForAddrSpace(ST,
1192  PtrTy.getAddressSpace(),
1193  Op == G_LOAD);
1194  if (MemSize > MaxSize)
1195  return std::make_pair(0, LLT::scalar(MaxSize));
1196 
1197  unsigned Align = Query.MMODescrs[0].AlignInBits;
1198  return std::make_pair(0, LLT::scalar(Align));
1199  })
1200  .fewerElementsIf(
1201  [=](const LegalityQuery &Query) -> bool {
1202  return Query.Types[0].isVector() &&
1203  needToSplitMemOp(Query, Op == G_LOAD);
1204  },
1205  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1206  const LLT DstTy = Query.Types[0];
1207  const LLT PtrTy = Query.Types[1];
1208 
1209  LLT EltTy = DstTy.getElementType();
1210  unsigned MaxSize = maxSizeForAddrSpace(ST,
1211  PtrTy.getAddressSpace(),
1212  Op == G_LOAD);
1213 
1214  // FIXME: Handle widened to power of 2 results better. This ends
1215  // up scalarizing.
1216  // FIXME: 3 element stores scalarized on SI
1217 
1218  // Split if it's too large for the address space.
1219  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1220  if (MemSize > MaxSize) {
1221  unsigned NumElts = DstTy.getNumElements();
1222  unsigned EltSize = EltTy.getSizeInBits();
1223 
1224  if (MaxSize % EltSize == 0) {
1225  return std::make_pair(
1226  0, LLT::scalarOrVector(
1227  ElementCount::getFixed(MaxSize / EltSize), EltTy));
1228  }
1229 
1230  unsigned NumPieces = MemSize / MaxSize;
1231 
1232  // FIXME: Refine when odd breakdowns handled
1233  // The scalars will need to be re-legalized.
1234  if (NumPieces == 1 || NumPieces >= NumElts ||
1235  NumElts % NumPieces != 0)
1236  return std::make_pair(0, EltTy);
1237 
1238  return std::make_pair(
1239  0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1240  }
1241 
1242  // FIXME: We could probably handle weird extending loads better.
1243  if (DstTy.getSizeInBits() > MemSize)
1244  return std::make_pair(0, EltTy);
1245 
1246  unsigned EltSize = EltTy.getSizeInBits();
1247  unsigned DstSize = DstTy.getSizeInBits();
1248  if (!isPowerOf2_32(DstSize)) {
1249  // We're probably decomposing an odd sized store. Try to split
1250  // to the widest type. TODO: Account for alignment. As-is it
1251  // should be OK, since the new parts will be further legalized.
1252  unsigned FloorSize = PowerOf2Floor(DstSize);
1253  return std::make_pair(
1254  0, LLT::scalarOrVector(
1255  ElementCount::getFixed(FloorSize / EltSize), EltTy));
1256  }
1257 
1258  // Need to split because of alignment.
1259  unsigned Align = Query.MMODescrs[0].AlignInBits;
1260  if (EltSize > Align &&
1261  (EltSize / Align < DstTy.getNumElements())) {
1262  return std::make_pair(
1263  0, LLT::fixed_vector(EltSize / Align, EltTy));
1264  }
1265 
1266  // May need relegalization for the scalars.
1267  return std::make_pair(0, EltTy);
1268  })
1269  .minScalar(0, S32)
1270  .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1271  .widenScalarToNextPow2(0)
1272  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1273  .lower();
1274  }
1275 
1276  // FIXME: Unaligned accesses not lowered.
1277  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1278  .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1279  {S32, GlobalPtr, S16, 2 * 8},
1280  {S32, LocalPtr, S8, 8},
1281  {S32, LocalPtr, S16, 16},
1282  {S32, PrivatePtr, S8, 8},
1283  {S32, PrivatePtr, S16, 16},
1284  {S32, ConstantPtr, S8, 8},
1285  {S32, ConstantPtr, S16, 2 * 8}})
1286  .legalIf(
1287  [=](const LegalityQuery &Query) -> bool {
1288  return isLoadStoreLegal(ST, Query);
1289  });
1290 
1291  if (ST.hasFlatAddressSpace()) {
1292  ExtLoads.legalForTypesWithMemDesc(
1293  {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1294  }
1295 
1296  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1297  // 64-bits.
1298  //
1299  // TODO: Should generalize bitcast action into coerce, which will also cover
1300  // inserting addrspacecasts.
1301  ExtLoads.customIf(typeIs(1, Constant32Ptr));
1302 
1303  ExtLoads.clampScalar(0, S32, S32)
1304  .widenScalarToNextPow2(0)
1305  .lower();
1306 
1307  auto &Atomics = getActionDefinitionsBuilder(
1308  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1309  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1310  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1311  G_ATOMICRMW_UMIN})
1312  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1313  {S64, GlobalPtr}, {S64, LocalPtr},
1314  {S32, RegionPtr}, {S64, RegionPtr}});
1315  if (ST.hasFlatAddressSpace()) {
1316  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1317  }
1318 
1319  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1320  if (ST.hasLDSFPAtomics()) {
1321  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1322  if (ST.hasGFX90AInsts())
1323  Atomic.legalFor({{S64, LocalPtr}});
1324  }
1325  if (ST.hasAtomicFaddInsts())
1326  Atomic.legalFor({{S32, GlobalPtr}});
1327 
1328  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1329  // demarshalling
1330  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1331  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1332  {S32, FlatPtr}, {S64, FlatPtr}})
1333  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1334  {S32, RegionPtr}, {S64, RegionPtr}});
1335  // TODO: Pointer types, any 32-bit or 64-bit vector
1336 
1337  // Condition should be s32 for scalar, s1 for vector.
1338  getActionDefinitionsBuilder(G_SELECT)
1339  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1340  LocalPtr, FlatPtr, PrivatePtr,
1341  LLT::fixed_vector(2, LocalPtr),
1342  LLT::fixed_vector(2, PrivatePtr)},
1343  {S1, S32})
1344  .clampScalar(0, S16, S64)
1345  .scalarize(1)
1346  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1347  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1348  .clampMaxNumElements(0, S32, 2)
1349  .clampMaxNumElements(0, LocalPtr, 2)
1350  .clampMaxNumElements(0, PrivatePtr, 2)
1351  .scalarize(0)
1352  .widenScalarToNextPow2(0)
1353  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1354 
1355  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1356  // be more flexible with the shift amount type.
1357  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1358  .legalFor({{S32, S32}, {S64, S32}});
1359  if (ST.has16BitInsts()) {
1360  if (ST.hasVOP3PInsts()) {
1361  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1362  .clampMaxNumElements(0, S16, 2);
1363  } else
1364  Shifts.legalFor({{S16, S16}});
1365 
1366  // TODO: Support 16-bit shift amounts for all types
1367  Shifts.widenScalarIf(
1368  [=](const LegalityQuery &Query) {
1369  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1370  // 32-bit amount.
1371  const LLT ValTy = Query.Types[0];
1372  const LLT AmountTy = Query.Types[1];
1373  return ValTy.getSizeInBits() <= 16 &&
1374  AmountTy.getSizeInBits() < 16;
1375  }, changeTo(1, S16));
1376  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1377  Shifts.clampScalar(1, S32, S32);
1378  Shifts.clampScalar(0, S16, S64);
1379  Shifts.widenScalarToNextPow2(0, 16);
1380 
1381  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1382  .minScalar(0, S16)
1383  .scalarize(0)
1384  .lower();
1385  } else {
1386  // Make sure we legalize the shift amount type first, as the general
1387  // expansion for the shifted type will produce much worse code if it hasn't
1388  // been truncated already.
1389  Shifts.clampScalar(1, S32, S32);
1390  Shifts.clampScalar(0, S32, S64);
1391  Shifts.widenScalarToNextPow2(0, 32);
1392 
1393  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1394  .minScalar(0, S32)
1395  .scalarize(0)
1396  .lower();
1397  }
1398  Shifts.scalarize(0);
1399 
1400  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1401  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1402  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1403  unsigned IdxTypeIdx = 2;
1404 
1405  getActionDefinitionsBuilder(Op)
1406  .customIf([=](const LegalityQuery &Query) {
1407  const LLT EltTy = Query.Types[EltTypeIdx];
1408  const LLT VecTy = Query.Types[VecTypeIdx];
1409  const LLT IdxTy = Query.Types[IdxTypeIdx];
1410  const unsigned EltSize = EltTy.getSizeInBits();
1411  return (EltSize == 32 || EltSize == 64) &&
1412  VecTy.getSizeInBits() % 32 == 0 &&
1413  VecTy.getSizeInBits() <= MaxRegisterSize &&
1414  IdxTy.getSizeInBits() == 32;
1415  })
1416  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1417  bitcastToVectorElement32(VecTypeIdx))
1418  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1419  .bitcastIf(
1420  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1421  [=](const LegalityQuery &Query) {
1422  // For > 64-bit element types, try to turn this into a 64-bit
1423  // element vector since we may be able to do better indexing
1424  // if this is scalar. If not, fall back to 32.
1425  const LLT EltTy = Query.Types[EltTypeIdx];
1426  const LLT VecTy = Query.Types[VecTypeIdx];
1427  const unsigned DstEltSize = EltTy.getSizeInBits();
1428  const unsigned VecSize = VecTy.getSizeInBits();
1429 
1430  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1431  return std::make_pair(
1432  VecTypeIdx,
1433  LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1434  })
1435  .clampScalar(EltTypeIdx, S32, S64)
1436  .clampScalar(VecTypeIdx, S32, S64)
1437  .clampScalar(IdxTypeIdx, S32, S32)
1438  .clampMaxNumElements(VecTypeIdx, S32, 32)
1439  // TODO: Clamp elements for 64-bit vectors?
1440  // It should only be necessary with variable indexes.
1441  // As a last resort, lower to the stack
1442  .lower();
1443  }
1444 
1445  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1446  .unsupportedIf([=](const LegalityQuery &Query) {
1447  const LLT &EltTy = Query.Types[1].getElementType();
1448  return Query.Types[0] != EltTy;
1449  });
1450 
1451  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1452  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1453  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1454 
1455  // FIXME: Doesn't handle extract of illegal sizes.
1456  getActionDefinitionsBuilder(Op)
1457  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1458  // FIXME: Multiples of 16 should not be legal.
1459  .legalIf([=](const LegalityQuery &Query) {
1460  const LLT BigTy = Query.Types[BigTyIdx];
1461  const LLT LitTy = Query.Types[LitTyIdx];
1462  return (BigTy.getSizeInBits() % 32 == 0) &&
1463  (LitTy.getSizeInBits() % 16 == 0);
1464  })
1465  .widenScalarIf(
1466  [=](const LegalityQuery &Query) {
1467  const LLT BigTy = Query.Types[BigTyIdx];
1468  return (BigTy.getScalarSizeInBits() < 16);
1469  },
1470  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1471  .widenScalarIf(
1472  [=](const LegalityQuery &Query) {
1473  const LLT LitTy = Query.Types[LitTyIdx];
1474  return (LitTy.getScalarSizeInBits() < 16);
1475  },
1476  LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1477  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1478  .widenScalarToNextPow2(BigTyIdx, 32);
1479 
1480  }
1481 
1482  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1483  .legalForCartesianProduct(AllS32Vectors, {S32})
1484  .legalForCartesianProduct(AllS64Vectors, {S64})
1485  .clampNumElements(0, V16S32, V32S32)
1486  .clampNumElements(0, V2S64, V16S64)
1487  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1488 
1489  if (ST.hasScalarPackInsts()) {
1490  BuildVector
1491  // FIXME: Should probably widen s1 vectors straight to s32
1492  .minScalarOrElt(0, S16)
1493  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1494  .minScalar(1, S32);
1495 
1496  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1497  .legalFor({V2S16, S32})
1498  .lower();
1499  BuildVector.minScalarOrElt(0, S32);
1500  } else {
1501  BuildVector.customFor({V2S16, S16});
1502  BuildVector.minScalarOrElt(0, S32);
1503 
1504  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1505  .customFor({V2S16, S32})
1506  .lower();
1507  }
1508 
1509  BuildVector.legalIf(isRegisterType(0));
1510 
1511  // FIXME: Clamp maximum size
1512  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1513  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1514  .clampMaxNumElements(0, S32, 32)
1515  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1516  .clampMaxNumElements(0, S16, 64);
1517 
1518 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1519  // pre-legalize.
1520  if (ST.hasVOP3PInsts()) {
1521  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1522  .customFor({V2S16, V2S16})
1523  .lower();
1524  } else
1525  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1526 
1527  // Merge/Unmerge
1528  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1529  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1530  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1531 
1532  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1533  const LLT Ty = Query.Types[TypeIdx];
1534  if (Ty.isVector()) {
1535  const LLT &EltTy = Ty.getElementType();
1536  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1537  return true;
1538  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1539  return true;
1540  }
1541  return false;
1542  };
1543 
1544  auto &Builder = getActionDefinitionsBuilder(Op)
1545  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1546  .lowerFor({{S16, V2S16}})
1547  .lowerIf([=](const LegalityQuery &Query) {
1548  const LLT BigTy = Query.Types[BigTyIdx];
1549  return BigTy.getSizeInBits() == 32;
1550  })
1551  // Try to widen to s16 first for small types.
1552  // TODO: Only do this on targets with legal s16 shifts
1553  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1554  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1555  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1556  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1557  elementTypeIs(1, S16)),
1558  changeTo(1, V2S16))
1559  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1560  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1561  // valid.
1562  .clampScalar(LitTyIdx, S32, S512)
1563  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1564  // Break up vectors with weird elements into scalars
1565  .fewerElementsIf(
1566  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1567  scalarize(0))
1568  .fewerElementsIf(
1569  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1570  scalarize(1))
1571  .clampScalar(BigTyIdx, S32, MaxScalar);
1572 
1573  if (Op == G_MERGE_VALUES) {
1574  Builder.widenScalarIf(
1575  // TODO: Use 16-bit shifts if legal for 8-bit values?
1576  [=](const LegalityQuery &Query) {
1577  const LLT Ty = Query.Types[LitTyIdx];
1578  return Ty.getSizeInBits() < 32;
1579  },
1580  changeTo(LitTyIdx, S32));
1581  }
1582 
1583  Builder.widenScalarIf(
1584  [=](const LegalityQuery &Query) {
1585  const LLT Ty = Query.Types[BigTyIdx];
1586  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1587  Ty.getSizeInBits() % 16 != 0;
1588  },
1589  [=](const LegalityQuery &Query) {
1590  // Pick the next power of 2, or a multiple of 64 over 128.
1591  // Whichever is smaller.
1592  const LLT &Ty = Query.Types[BigTyIdx];
1593  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1594  if (NewSizeInBits >= 256) {
1595  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1596  if (RoundedTo < NewSizeInBits)
1597  NewSizeInBits = RoundedTo;
1598  }
1599  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1600  })
1601  // Any vectors left are the wrong size. Scalarize them.
1602  .scalarize(0)
1603  .scalarize(1);
1604  }
1605 
1606  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1607  // RegBankSelect.
1608  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1609  .legalFor({{S32}, {S64}});
1610 
1611  if (ST.hasVOP3PInsts()) {
1612  SextInReg.lowerFor({{V2S16}})
1613  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1614  // get more vector shift opportunities, since we'll get those when
1615  // expanded.
1616  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1617  } else if (ST.has16BitInsts()) {
1618  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1619  } else {
1620  // Prefer to promote to s32 before lowering if we don't have 16-bit
1621 // shifts. This avoids a lot of intermediate truncate and extend operations.
1622  SextInReg.lowerFor({{S32}, {S64}});
1623  }
1624 
1625  SextInReg
1626  .scalarize(0)
1627  .clampScalar(0, S32, S64)
1628  .lower();
1629 
1630 // TODO: Only try to form v2s16 with legal packed instructions.
1631  getActionDefinitionsBuilder(G_FSHR)
1632  .legalFor({{S32, S32}})
1633  .lowerFor({{V2S16, V2S16}})
1634  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1635  .scalarize(0)
1636  .lower();
1637 
1638  if (ST.hasVOP3PInsts()) {
1639  getActionDefinitionsBuilder(G_FSHL)
1640  .lowerFor({{V2S16, V2S16}})
1641  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1642  .scalarize(0)
1643  .lower();
1644  } else {
1645  getActionDefinitionsBuilder(G_FSHL)
1646  .scalarize(0)
1647  .lower();
1648  }
1649 
1650  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1651  .legalFor({S64});
1652 
1653  getActionDefinitionsBuilder(G_FENCE)
1654  .alwaysLegal();
1655 
1656  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1657  .scalarize(0)
1658  .minScalar(0, S32)
1659  .lower();
1660 
1661  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1662  .legalFor({{S32, S32}, {S64, S32}})
1663  .clampScalar(1, S32, S32)
1664  .clampScalar(0, S32, S64)
1665  .widenScalarToNextPow2(0)
1666  .scalarize(0);
1667 
1668  getActionDefinitionsBuilder({
1669  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1670  G_FCOPYSIGN,
1671 
1672  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1673  G_ATOMICRMW_NAND,
1674  G_ATOMICRMW_FSUB,
1675  G_READ_REGISTER,
1676  G_WRITE_REGISTER,
1677 
1678  G_SADDO, G_SSUBO,
1679 
1680  // TODO: Implement
1681  G_FMINIMUM, G_FMAXIMUM}).lower();
1682 
1683  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1684  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1685  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1686  .unsupported();
1687 
1688  getLegacyLegalizerInfo().computeTables();
1689  verify(*ST.getInstrInfo());
1690 }
1691 
1692 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1693  MachineInstr &MI) const {
1694  MachineIRBuilder &B = Helper.MIRBuilder;
1695  MachineRegisterInfo &MRI = *B.getMRI();
1696 
1697  switch (MI.getOpcode()) {
1698  case TargetOpcode::G_ADDRSPACE_CAST:
1699  return legalizeAddrSpaceCast(MI, MRI, B);
1700  case TargetOpcode::G_FRINT:
1701  return legalizeFrint(MI, MRI, B);
1702  case TargetOpcode::G_FCEIL:
1703  return legalizeFceil(MI, MRI, B);
1704  case TargetOpcode::G_FREM:
1705  return legalizeFrem(MI, MRI, B);
1706  case TargetOpcode::G_INTRINSIC_TRUNC:
1707  return legalizeIntrinsicTrunc(MI, MRI, B);
1708  case TargetOpcode::G_SITOFP:
1709  return legalizeITOFP(MI, MRI, B, true);
1710  case TargetOpcode::G_UITOFP:
1711  return legalizeITOFP(MI, MRI, B, false);
1712  case TargetOpcode::G_FPTOSI:
1713  return legalizeFPTOI(MI, MRI, B, true);
1714  case TargetOpcode::G_FPTOUI:
1715  return legalizeFPTOI(MI, MRI, B, false);
1716  case TargetOpcode::G_FMINNUM:
1717  case TargetOpcode::G_FMAXNUM:
1718  case TargetOpcode::G_FMINNUM_IEEE:
1719  case TargetOpcode::G_FMAXNUM_IEEE:
1720  return legalizeMinNumMaxNum(Helper, MI);
1721  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1722  return legalizeExtractVectorElt(MI, MRI, B);
1723  case TargetOpcode::G_INSERT_VECTOR_ELT:
1724  return legalizeInsertVectorElt(MI, MRI, B);
1725  case TargetOpcode::G_SHUFFLE_VECTOR:
1726  return legalizeShuffleVector(MI, MRI, B);
1727  case TargetOpcode::G_FSIN:
1728  case TargetOpcode::G_FCOS:
1729  return legalizeSinCos(MI, MRI, B);
1730  case TargetOpcode::G_GLOBAL_VALUE:
1731  return legalizeGlobalValue(MI, MRI, B);
1732  case TargetOpcode::G_LOAD:
1733  case TargetOpcode::G_SEXTLOAD:
1734  case TargetOpcode::G_ZEXTLOAD:
1735  return legalizeLoad(Helper, MI);
1736  case TargetOpcode::G_FMAD:
1737  return legalizeFMad(MI, MRI, B);
1738  case TargetOpcode::G_FDIV:
1739  return legalizeFDIV(MI, MRI, B);
1740  case TargetOpcode::G_UDIV:
1741  case TargetOpcode::G_UREM:
1742  case TargetOpcode::G_UDIVREM:
1743  return legalizeUnsignedDIV_REM(MI, MRI, B);
1744  case TargetOpcode::G_SDIV:
1745  case TargetOpcode::G_SREM:
1746  case TargetOpcode::G_SDIVREM:
1747  return legalizeSignedDIV_REM(MI, MRI, B);
1748  case TargetOpcode::G_ATOMIC_CMPXCHG:
1749  return legalizeAtomicCmpXChg(MI, MRI, B);
1750  case TargetOpcode::G_FLOG:
1751  return legalizeFlog(MI, B, numbers::ln2f);
1752  case TargetOpcode::G_FLOG10:
1753  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1754  case TargetOpcode::G_FEXP:
1755  return legalizeFExp(MI, B);
1756  case TargetOpcode::G_FPOW:
1757  return legalizeFPow(MI, B);
1758  case TargetOpcode::G_FFLOOR:
1759  return legalizeFFloor(MI, MRI, B);
1760  case TargetOpcode::G_BUILD_VECTOR:
1761  return legalizeBuildVector(MI, MRI, B);
1762  default:
1763  return false;
1764  }
1765 
1766  llvm_unreachable("expected switch to return");
1767 }
1768 
1769 Register AMDGPULegalizerInfo::getSegmentAperture(
1770  unsigned AS,
1771  MachineRegisterInfo &MRI,
1772  MachineIRBuilder &B) const {
1773  MachineFunction &MF = B.getMF();
1774  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1775  const LLT S32 = LLT::scalar(32);
1776 
1777  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1778 
1779  if (ST.hasApertureRegs()) {
1780  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1781  // getreg.
1782  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1783  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1784  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1785  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1786  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1787  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1788  unsigned Encoding =
1789  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1790  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1791  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1792 
1793  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1794 
1795  B.buildInstr(AMDGPU::S_GETREG_B32)
1796  .addDef(GetReg)
1797  .addImm(Encoding);
1798  MRI.setType(GetReg, S32);
1799 
1800  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1801  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1802  }
1803 
1804  Register QueuePtr = MRI.createGenericVirtualRegister(
1805  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1806 
1807  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1808  return Register();
1809 
1810  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1811  // private_segment_aperture_base_hi.
1812  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1813 
1814  // TODO: can we be smarter about machine pointer info?
1815  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1816  MachineMemOperand *MMO = MF.getMachineMemOperand(
1817  PtrInfo,
1818  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1819  MachineMemOperand::MOInvariant,
1820  LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1821 
1822  Register LoadAddr;
1823 
1824  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1825  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1826 }
1827 
1828 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1829  MachineInstr &MI, MachineRegisterInfo &MRI,
1830  MachineIRBuilder &B) const {
1831  MachineFunction &MF = B.getMF();
1832 
1833  const LLT S32 = LLT::scalar(32);
1834  Register Dst = MI.getOperand(0).getReg();
1835  Register Src = MI.getOperand(1).getReg();
1836 
1837  LLT DstTy = MRI.getType(Dst);
1838  LLT SrcTy = MRI.getType(Src);
1839  unsigned DestAS = DstTy.getAddressSpace();
1840  unsigned SrcAS = SrcTy.getAddressSpace();
1841 
1842  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1843  // vector element.
1844  assert(!DstTy.isVector());
1845 
1846  const AMDGPUTargetMachine &TM
1847  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1848 
1849  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1850  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1851  return true;
1852  }
1853 
1854  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1855  // Truncate.
1856  B.buildExtract(Dst, Src, 0);
1857  MI.eraseFromParent();
1858  return true;
1859  }
1860 
1861  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1862  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1863  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1864 
1865  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1866  // another. Merge operands are required to be the same type, but creating an
1867  // extra ptrtoint would be kind of pointless.
1868  auto HighAddr = B.buildConstant(
1869  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1870  B.buildMerge(Dst, {Src, HighAddr});
1871  MI.eraseFromParent();
1872  return true;
1873  }
1874 
1875  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1876  assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1877  DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1878  unsigned NullVal = TM.getNullPointerValue(DestAS);
1879 
1880  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1881  auto FlatNull = B.buildConstant(SrcTy, 0);
1882 
1883  // Extract low 32-bits of the pointer.
1884  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1885 
1886  auto CmpRes =
1887  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1888  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1889 
1890  MI.eraseFromParent();
1891  return true;
1892  }
1893 
1894  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1895  return false;
1896 
1897  if (!ST.hasFlatAddressSpace())
1898  return false;
1899 
1900  auto SegmentNull =
1901  B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1902  auto FlatNull =
1903  B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1904 
1905  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1906  if (!ApertureReg.isValid())
1907  return false;
1908 
1909  auto CmpRes =
1910  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1911 
1912  // Coerce the type of the low half of the result so we can use merge_values.
1913  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1914 
1915  // TODO: Should we allow mismatched types but matching sizes in merges to
1916  // avoid the ptrtoint?
1917  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1918  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1919 
1920  MI.eraseFromParent();
1921  return true;
1922 }
1923 
1924 bool AMDGPULegalizerInfo::legalizeFrint(
1925  MachineInstr &MI, MachineRegisterInfo &MRI,
1926  MachineIRBuilder &B) const {
1927  Register Src = MI.getOperand(1).getReg();
1928  LLT Ty = MRI.getType(Src);
1929  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1930 
1931  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1932  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1933 
1934  auto C1 = B.buildFConstant(Ty, C1Val);
1935  auto CopySign = B.buildFCopysign(Ty, C1, Src);
1936 
1937  // TODO: Should this propagate fast-math-flags?
1938  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1939  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1940 
1941  auto C2 = B.buildFConstant(Ty, C2Val);
1942  auto Fabs = B.buildFAbs(Ty, Src);
1943 
1944  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1945  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1946  MI.eraseFromParent();
1947  return true;
1948 }
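// Illustrative note, not in the upstream source: adding copysign(0x1.0p+52, x)
// and subtracting it again pushes the fraction bits out of the f64 mantissa,
// so Tmp2 holds x rounded to an integer (assuming the default round-to-nearest
// mode). For example, 3.7 + 2^52 rounds to 2^52 + 4, and subtracting 2^52
// leaves 4.0. Inputs with |x| > 0x1.fffffffffffffp+51 are already integral,
// which is why the final select passes them through unchanged.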
1949 
1950 bool AMDGPULegalizerInfo::legalizeFceil(
1951  MachineInstr &MI, MachineRegisterInfo &MRI,
1952  MachineIRBuilder &B) const {
1953 
1954  const LLT S1 = LLT::scalar(1);
1955  const LLT S64 = LLT::scalar(64);
1956 
1957  Register Src = MI.getOperand(1).getReg();
1958  assert(MRI.getType(Src) == S64);
1959 
1960  // result = trunc(src)
1961  // if (src > 0.0 && src != result)
1962  // result += 1.0
1963 
1964  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1965 
1966  const auto Zero = B.buildFConstant(S64, 0.0);
1967  const auto One = B.buildFConstant(S64, 1.0);
1968  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1969  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1970  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1971  auto Add = B.buildSelect(S64, And, One, Zero);
1972 
1973  // TODO: Should this propagate fast-math-flags?
1974  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1975  return true;
1976 }
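// Worked example, for illustration only: with src = 2.3 the trunc is 2.0;
// since 2.3 > 0.0 and 2.3 != 2.0 the select adds 1.0, giving ceil(2.3) = 3.0.
// With src = -2.3 the trunc is -2.0, the "> 0.0" test fails, and the result
// stays -2.0, which is the correct ceiling for a negative input.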
1977 
1978 bool AMDGPULegalizerInfo::legalizeFrem(
1979  MachineInstr &MI, MachineRegisterInfo &MRI,
1980  MachineIRBuilder &B) const {
1981  Register DstReg = MI.getOperand(0).getReg();
1982  Register Src0Reg = MI.getOperand(1).getReg();
1983  Register Src1Reg = MI.getOperand(2).getReg();
1984  auto Flags = MI.getFlags();
1985  LLT Ty = MRI.getType(DstReg);
1986 
1987  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1988  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1989  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1990  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1991  MI.eraseFromParent();
1992  return true;
1993 }
1994 
1995 static MachineInstrBuilder extractF64Exponent(Register Hi,
1996  MachineIRBuilder &B) {
1997  const unsigned FractBits = 52;
1998  const unsigned ExpBits = 11;
1999  LLT S32 = LLT::scalar(32);
2000 
2001  auto Const0 = B.buildConstant(S32, FractBits - 32);
2002  auto Const1 = B.buildConstant(S32, ExpBits);
2003 
2004  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2005  .addUse(Hi)
2006  .addUse(Const0.getReg(0))
2007  .addUse(Const1.getReg(0));
2008 
2009  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2010 }
2011 
2012 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2013  MachineInstr &MI, MachineRegisterInfo &MRI,
2014  MachineIRBuilder &B) const {
2015  const LLT S1 = LLT::scalar(1);
2016  const LLT S32 = LLT::scalar(32);
2017  const LLT S64 = LLT::scalar(64);
2018 
2019  Register Src = MI.getOperand(1).getReg();
2020  assert(MRI.getType(Src) == S64);
2021 
2022  // TODO: Should this use extract since the low half is unused?
2023  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2024  Register Hi = Unmerge.getReg(1);
2025 
2026  // Extract the upper half, since this is where we will find the sign and
2027  // exponent.
2028  auto Exp = extractF64Exponent(Hi, B);
2029 
2030  const unsigned FractBits = 52;
2031 
2032  // Extract the sign bit.
2033  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2034  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2035 
2036  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2037 
2038  const auto Zero32 = B.buildConstant(S32, 0);
2039 
2040  // Extend back to 64-bits.
2041  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2042 
2043  auto Shr = B.buildAShr(S64, FractMask, Exp);
2044  auto Not = B.buildNot(S64, Shr);
2045  auto Tmp0 = B.buildAnd(S64, Src, Not);
2046  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2047 
2048  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2049  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2050 
2051  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2052  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2053  MI.eraseFromParent();
2054  return true;
2055 }
2056 
2057 bool AMDGPULegalizerInfo::legalizeITOFP(
2058  MachineInstr &MI, MachineRegisterInfo &MRI,
2059  MachineIRBuilder &B, bool Signed) const {
2060 
2061  Register Dst = MI.getOperand(0).getReg();
2062  Register Src = MI.getOperand(1).getReg();
2063 
2064  const LLT S64 = LLT::scalar(64);
2065  const LLT S32 = LLT::scalar(32);
2066 
2067  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2068 
2069  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2070 
2071  auto CvtHi = Signed ?
2072  B.buildSITOFP(S64, Unmerge.getReg(1)) :
2073  B.buildUITOFP(S64, Unmerge.getReg(1));
2074 
2075  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2076 
2077  auto ThirtyTwo = B.buildConstant(S32, 32);
2078  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2079  .addUse(CvtHi.getReg(0))
2080  .addUse(ThirtyTwo.getReg(0));
2081 
2082  // TODO: Should this propagate fast-math-flags?
2083  B.buildFAdd(Dst, LdExp, CvtLo);
2084  MI.eraseFromParent();
2085  return true;
2086 }
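// Illustrative note, not in the upstream source: the 64-bit integer is split
// as val = hi * 2^32 + lo and converted as ldexp((s|u)itofp(hi), 32) +
// uitofp(lo). For example val = 0x100000005 gives hi = 1, lo = 5 and the
// result 1.0 * 2^32 + 5.0 = 4294967301.0.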
2087 
2088 // TODO: Copied from DAG implementation. Verify logic and document how this
2089 // actually works.
2090 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2091  MachineRegisterInfo &MRI,
2092  MachineIRBuilder &B,
2093  bool Signed) const {
2094 
2095  Register Dst = MI.getOperand(0).getReg();
2096  Register Src = MI.getOperand(1).getReg();
2097 
2098  const LLT S64 = LLT::scalar(64);
2099  const LLT S32 = LLT::scalar(32);
2100 
2101  const LLT SrcLT = MRI.getType(Src);
2102  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2103 
2104  unsigned Flags = MI.getFlags();
2105 
2106  // The basic idea of converting a floating point number into a pair of 32-bit
2107  // integers is illustrated as follows:
2108  //
2109  // tf := trunc(val);
2110  // hif := floor(tf * 2^-32);
2111  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2112  // hi := fptoi(hif);
2113  // lo := fptoi(lof);
2114  //
2115  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2116  MachineInstrBuilder Sign;
2117  if (Signed && SrcLT == S32) {
2118  // However, a 32-bit floating point number has only a 23-bit mantissa,
2119  // which is not enough to hold all the significant bits of `lof` if val is
2120  // negative. To avoid the loss of precision, we need to take the absolute
2121  // value after truncating and flip the result back based on the original
2122  // signedness.
2123  Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2124  Trunc = B.buildFAbs(S32, Trunc, Flags);
2125  }
2126  MachineInstrBuilder K0, K1;
2127  if (SrcLT == S64) {
2128  K0 = B.buildFConstant(S64,
2129  BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2130  K1 = B.buildFConstant(S64,
2131  BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2132  } else {
2133  K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2134  K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2135  }
2136 
2137  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2138  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2139  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2140 
2141  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2142  : B.buildFPTOUI(S32, FloorMul);
2143  auto Lo = B.buildFPTOUI(S32, Fma);
2144 
2145  if (Signed && SrcLT == S32) {
2146  // Flip the result based on the signedness, which is either all 0s or 1s.
2147  Sign = B.buildMerge(S64, {Sign, Sign});
2148  // r := xor({lo, hi}, sign) - sign;
2149  B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2150  } else
2151  B.buildMerge(Dst, {Lo, Hi});
2152  MI.eraseFromParent();
2153 
2154  return true;
2155 }
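// Worked example, for illustration only (an exactly representable input):
// for val = 2^33 + 7 as f64, tf = 2^33 + 7, hif = floor(tf * 2^-32) = 2.0 and
// lof = tf - hif * 2^32 = 7.0, so hi = 2 and lo = 7, i.e. 0x200000007. The
// fma keeps lof exact even though the operands differ greatly in magnitude.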
2156 
2157 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2158  MachineInstr &MI) const {
2159  MachineFunction &MF = Helper.MIRBuilder.getMF();
2160  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2161 
2162  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2163  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2164 
2165  // With ieee_mode disabled, the instructions have the correct behavior
2166  // already for G_FMINNUM/G_FMAXNUM
2167  if (!MFI->getMode().IEEE)
2168  return !IsIEEEOp;
2169 
2170  if (IsIEEEOp)
2171  return true;
2172 
2173  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2174 }
2175 
2176 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2177  MachineInstr &MI, MachineRegisterInfo &MRI,
2178  MachineIRBuilder &B) const {
2179  // TODO: Should move some of this into LegalizerHelper.
2180 
2181  // TODO: Promote dynamic indexing of s16 to s32
2182 
2183  // FIXME: Artifact combiner probably should have replaced the truncated
2184  // constant before this, so we shouldn't need
2185  // getConstantVRegValWithLookThrough.
2186  Optional<ValueAndVReg> MaybeIdxVal =
2187  getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2188  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2189  return true;
2190  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2191 
2192  Register Dst = MI.getOperand(0).getReg();
2193  Register Vec = MI.getOperand(1).getReg();
2194 
2195  LLT VecTy = MRI.getType(Vec);
2196  LLT EltTy = VecTy.getElementType();
2197  assert(EltTy == MRI.getType(Dst));
2198 
2199  if (IdxVal < VecTy.getNumElements())
2200  B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2201  else
2202  B.buildUndef(Dst);
2203 
2204  MI.eraseFromParent();
2205  return true;
2206 }
2207 
2208 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2209  MachineInstr &MI, MachineRegisterInfo &MRI,
2210  MachineIRBuilder &B) const {
2211  // TODO: Should move some of this into LegalizerHelper.
2212 
2213  // TODO: Promote dynamic indexing of s16 to s32
2214 
2215  // FIXME: Artifact combiner probably should have replaced the truncated
2216  // constant before this, so we shouldn't need
2217  // getConstantVRegValWithLookThrough.
2218  Optional<ValueAndVReg> MaybeIdxVal =
2219  getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2220  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2221  return true;
2222 
2223  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2224  Register Dst = MI.getOperand(0).getReg();
2225  Register Vec = MI.getOperand(1).getReg();
2226  Register Ins = MI.getOperand(2).getReg();
2227 
2228  LLT VecTy = MRI.getType(Vec);
2229  LLT EltTy = VecTy.getElementType();
2230  assert(EltTy == MRI.getType(Ins));
2231 
2232  if (IdxVal < VecTy.getNumElements())
2233  B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2234  else
2235  B.buildUndef(Dst);
2236 
2237  MI.eraseFromParent();
2238  return true;
2239 }
2240 
2241 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2242  MachineInstr &MI, MachineRegisterInfo &MRI,
2243  MachineIRBuilder &B) const {
2244  const LLT V2S16 = LLT::fixed_vector(2, 16);
2245 
2246  Register Dst = MI.getOperand(0).getReg();
2247  Register Src0 = MI.getOperand(1).getReg();
2248  LLT DstTy = MRI.getType(Dst);
2249  LLT SrcTy = MRI.getType(Src0);
2250 
2251  if (SrcTy == V2S16 && DstTy == V2S16 &&
2252  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2253  return true;
2254 
2255  MachineIRBuilder HelperBuilder(MI);
2256  GISelObserverWrapper DummyObserver;
2257  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2258  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2259 }
2260 
2261 bool AMDGPULegalizerInfo::legalizeSinCos(
2262  MachineInstr &MI, MachineRegisterInfo &MRI,
2263  MachineIRBuilder &B) const {
2264 
2265  Register DstReg = MI.getOperand(0).getReg();
2266  Register SrcReg = MI.getOperand(1).getReg();
2267  LLT Ty = MRI.getType(DstReg);
2268  unsigned Flags = MI.getFlags();
2269 
2270  Register TrigVal;
2271  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2272  if (ST.hasTrigReducedRange()) {
2273  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2274  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2275  .addUse(MulVal.getReg(0))
2276  .setMIFlags(Flags).getReg(0);
2277  } else
2278  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2279 
2280  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2281  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2282  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2283  .addUse(TrigVal)
2284  .setMIFlags(Flags);
2285  MI.eraseFromParent();
2286  return true;
2287 }
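// Illustrative note, not in the upstream source: the amdgcn.sin/amdgcn.cos
// intrinsics effectively evaluate sin/cos of 2*pi times their operand, hence
// the pre-multiplication by 0.5 * (1/pi). On subtargets with the reduced trig
// input range, the scaled value is first wrapped into [0, 1) via amdgcn.fract.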
2288 
2289 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2290  MachineIRBuilder &B,
2291  const GlobalValue *GV,
2292  int64_t Offset,
2293  unsigned GAFlags) const {
2294  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2295  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2296  // to the following code sequence:
2297  //
2298  // For constant address space:
2299  // s_getpc_b64 s[0:1]
2300  // s_add_u32 s0, s0, $symbol
2301  // s_addc_u32 s1, s1, 0
2302  //
2303  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2304  // a fixup or relocation is emitted to replace $symbol with a literal
2305  // constant, which is a pc-relative offset from the encoding of the $symbol
2306  // operand to the global variable.
2307  //
2308  // For global address space:
2309  // s_getpc_b64 s[0:1]
2310  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2311  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2312  //
2313  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2314  // fixups or relocations are emitted to replace $symbol@*@lo and
2315  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2316  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2317  // operand to the global variable.
2318  //
2319  // What we want here is an offset from the value returned by s_getpc
2320  // (which is the address of the s_add_u32 instruction) to the global
2321  // variable, but since the encoding of $symbol starts 4 bytes after the start
2322  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2323  // small. This requires us to add 4 to the global variable offset in order to
2324  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2325  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2326  // instruction.
2327 
2328  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2329 
2330  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2331  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2332 
2333  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2334  .addDef(PCReg);
2335 
2336  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2337  if (GAFlags == SIInstrInfo::MO_NONE)
2338  MIB.addImm(0);
2339  else
2340  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2341 
2342  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2343 
2344  if (PtrTy.getSizeInBits() == 32)
2345  B.buildExtract(DstReg, PCReg, 0);
2346  return true;
2347  }
2348 
2349 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2350  MachineInstr &MI, MachineRegisterInfo &MRI,
2351  MachineIRBuilder &B) const {
2352  Register DstReg = MI.getOperand(0).getReg();
2353  LLT Ty = MRI.getType(DstReg);
2354  unsigned AS = Ty.getAddressSpace();
2355 
2356  const GlobalValue *GV = MI.getOperand(1).getGlobal();
2357  MachineFunction &MF = B.getMF();
2358  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2359 
2360  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2361  if (!MFI->isModuleEntryFunction() &&
2362  !GV->getName().equals("llvm.amdgcn.module.lds")) {
2363  const Function &Fn = MF.getFunction();
2364  DiagnosticInfoUnsupported BadLDSDecl(
2365  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2366  DS_Warning);
2367  Fn.getContext().diagnose(BadLDSDecl);
2368 
2369  // We currently don't have a way to correctly allocate LDS objects that
2370  // aren't directly associated with a kernel. We do force inlining of
2371  // functions that use local objects. However, if these dead functions are
2372  // not eliminated, we don't want a compile time error. Just emit a warning
2373  // and a trap, since there should be no callable path here.
2374  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2375  B.buildUndef(DstReg);
2376  MI.eraseFromParent();
2377  return true;
2378  }
2379 
2380  // TODO: We could emit code to handle the initialization somewhere.
2381  if (!hasDefinedInitializer(GV)) {
2382  const SITargetLowering *TLI = ST.getTargetLowering();
2383  if (!TLI->shouldUseLDSConstAddress(GV)) {
2384  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2385  return true; // Leave in place;
2386  }
2387 
2388  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2389  Type *Ty = GV->getValueType();
2390  // HIP uses an unsized array `extern __shared__ T s[]` or similar
2391  // zero-sized type in other languages to declare the dynamic shared
2392  // memory, whose size is not known at compile time. These arrays will be
2393  // allocated by the runtime and placed directly after the statically
2394  // allocated ones. They all share the same offset.
2395  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2396  // Adjust alignment for that dynamic shared memory array.
2397  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2398  LLT S32 = LLT::scalar(32);
2399  auto Sz =
2400  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2401  B.buildIntToPtr(DstReg, Sz);
2402  MI.eraseFromParent();
2403  return true;
2404  }
2405  }
2406 
2407  B.buildConstant(
2408  DstReg,
2409  MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2410  MI.eraseFromParent();
2411  return true;
2412  }
2413 
2414  const Function &Fn = MF.getFunction();
2415  DiagnosticInfoUnsupported BadInit(
2416  Fn, "unsupported initializer for address space", MI.getDebugLoc());
2417  Fn.getContext().diagnose(BadInit);
2418  return true;
2419  }
2420 
2421  const SITargetLowering *TLI = ST.getTargetLowering();
2422 
2423  if (TLI->shouldEmitFixup(GV)) {
2424  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2425  MI.eraseFromParent();
2426  return true;
2427  }
2428 
2429  if (TLI->shouldEmitPCReloc(GV)) {
2430  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2431  MI.eraseFromParent();
2432  return true;
2433  }
2434 
2435  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2436  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2437 
2438  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2439  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2440  MachinePointerInfo::getGOT(MF),
2441  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2442  MachineMemOperand::MOInvariant,
2443  LoadTy, Align(8));
2444 
2445  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2446 
2447  if (Ty.getSizeInBits() == 32) {
2448  // Truncate if this is a 32-bit constant address.
2449  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2450  B.buildExtract(DstReg, Load, 0);
2451  } else
2452  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2453 
2454  MI.eraseFromParent();
2455  return true;
2456 }
2457 
2458 static LLT widenToNextPowerOf2(LLT Ty) {
2459  if (Ty.isVector())
2460  return Ty.changeElementCount(
2461  ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2462  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2463 }
2464 
2465 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2466  MachineInstr &MI) const {
2467  MachineIRBuilder &B = Helper.MIRBuilder;
2468  MachineRegisterInfo &MRI = *B.getMRI();
2469  GISelChangeObserver &Observer = Helper.Observer;
2470 
2471  Register PtrReg = MI.getOperand(1).getReg();
2472  LLT PtrTy = MRI.getType(PtrReg);
2473  unsigned AddrSpace = PtrTy.getAddressSpace();
2474 
2475  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2476  const LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2477  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2478  Observer.changingInstr(MI);
2479  MI.getOperand(1).setReg(Cast.getReg(0));
2480  Observer.changedInstr(MI);
2481  return true;
2482  }
2483 
2484  if (MI.getOpcode() != AMDGPU::G_LOAD)
2485  return false;
2486 
2487  Register ValReg = MI.getOperand(0).getReg();
2488  LLT ValTy = MRI.getType(ValReg);
2489 
2490  MachineMemOperand *MMO = *MI.memoperands_begin();
2491  const unsigned ValSize = ValTy.getSizeInBits();
2492  const LLT MemTy = MMO->getMemoryType();
2493  const Align MemAlign = MMO->getAlign();
2494  const unsigned MemSize = MemTy.getSizeInBits();
2495  const unsigned AlignInBits = 8 * MemAlign.value();
2496 
2497  // Widen non-power-of-2 loads to the alignment if needed
2498  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2499  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2500 
2501  // This was already the correct extending load result type, so just adjust
2502  // the memory type.
2503  if (WideMemSize == ValSize) {
2504  MachineFunction &MF = B.getMF();
2505 
2506  MachineMemOperand *WideMMO =
2507  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2508  Observer.changingInstr(MI);
2509  MI.setMemRefs(MF, {WideMMO});
2510  Observer.changedInstr(MI);
2511  return true;
2512  }
2513 
2514  // Don't bother handling an edge case that should probably never be produced.
2515  if (ValSize > WideMemSize)
2516  return false;
2517 
2518  LLT WideTy = widenToNextPowerOf2(ValTy);
2519 
2520  Register WideLoad;
2521  if (!WideTy.isVector()) {
2522  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2523  B.buildTrunc(ValReg, WideLoad).getReg(0);
2524  } else {
2525  // Extract the subvector.
2526 
2527  if (isRegisterType(ValTy)) {
2528  // If this a case where G_EXTRACT is legal, use it.
2529  // (e.g. <3 x s32> -> <4 x s32>)
2530  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2531  B.buildExtract(ValReg, WideLoad, 0);
2532  } else {
2533  // For cases where the widened type isn't a nice register value, unmerge
2534  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2535  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2536  WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2537  B.setInsertPt(B.getMBB(), MI.getIterator());
2538  B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2539  }
2540  }
2541 
2542  MI.eraseFromParent();
2543  return true;
2544  }
2545 
2546  return false;
2547 }
2548 
2549 bool AMDGPULegalizerInfo::legalizeFMad(
2550  MachineInstr &MI, MachineRegisterInfo &MRI,
2551  MachineIRBuilder &B) const {
2552  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2553  assert(Ty.isScalar());
2554 
2555  MachineFunction &MF = B.getMF();
2556  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2557 
2558  // TODO: Always legal with future ftz flag.
2559  // FIXME: Do we need just output?
2560  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2561  return true;
2562  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2563  return true;
2564 
2565  MachineIRBuilder HelperBuilder(MI);
2566  GISelObserverWrapper DummyObserver;
2567  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2568  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2569 }
2570 
2571 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2572  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2573  Register DstReg = MI.getOperand(0).getReg();
2574  Register PtrReg = MI.getOperand(1).getReg();
2575  Register CmpVal = MI.getOperand(2).getReg();
2576  Register NewVal = MI.getOperand(3).getReg();
2577 
2579  "this should not have been custom lowered");
2580 
2581  LLT ValTy = MRI.getType(CmpVal);
2582  LLT VecTy = LLT::fixed_vector(2, ValTy);
2583 
2584  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2585 
2586  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2587  .addDef(DstReg)
2588  .addUse(PtrReg)
2589  .addUse(PackedVal)
2590  .setMemRefs(MI.memoperands());
2591 
2592  MI.eraseFromParent();
2593  return true;
2594 }
2595 
2596 bool AMDGPULegalizerInfo::legalizeFlog(
2597  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2598  Register Dst = MI.getOperand(0).getReg();
2599  Register Src = MI.getOperand(1).getReg();
2600  LLT Ty = B.getMRI()->getType(Dst);
2601  unsigned Flags = MI.getFlags();
2602 
2603  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2604  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2605 
2606  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2607  MI.eraseFromParent();
2608  return true;
2609 }
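// Illustrative note, not in the upstream source: this is the change-of-base
// identity log_b(x) = log2(x) * log_b(2). The dispatch above passes ln(2)
// (~0.6931) for G_FLOG and ln(2)/ln(10) (~0.3010) for G_FLOG10 as
// Log2BaseInverted.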
2610 
2611 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2612  MachineIRBuilder &B) const {
2613  Register Dst = MI.getOperand(0).getReg();
2614  Register Src = MI.getOperand(1).getReg();
2615  unsigned Flags = MI.getFlags();
2616  LLT Ty = B.getMRI()->getType(Dst);
2617 
2618  auto K = B.buildFConstant(Ty, numbers::log2e);
2619  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2620  B.buildFExp2(Dst, Mul, Flags);
2621  MI.eraseFromParent();
2622  return true;
2623 }
2624 
2625 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2626  MachineIRBuilder &B) const {
2627  Register Dst = MI.getOperand(0).getReg();
2628  Register Src0 = MI.getOperand(1).getReg();
2629  Register Src1 = MI.getOperand(2).getReg();
2630  unsigned Flags = MI.getFlags();
2631  LLT Ty = B.getMRI()->getType(Dst);
2632  const LLT S16 = LLT::scalar(16);
2633  const LLT S32 = LLT::scalar(32);
2634 
2635  if (Ty == S32) {
2636  auto Log = B.buildFLog2(S32, Src0, Flags);
2637  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2638  .addUse(Log.getReg(0))
2639  .addUse(Src1)
2640  .setMIFlags(Flags);
2641  B.buildFExp2(Dst, Mul, Flags);
2642  } else if (Ty == S16) {
2643  // There's no f16 fmul_legacy, so we need to convert for it.
2644  auto Log = B.buildFLog2(S16, Src0, Flags);
2645  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2646  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2647  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2648  .addUse(Ext0.getReg(0))
2649  .addUse(Ext1.getReg(0))
2650  .setMIFlags(Flags);
2651 
2652  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2653  } else
2654  return false;
2655 
2656  MI.eraseFromParent();
2657  return true;
2658 }
2659 
2660 // Find a source register, ignoring any possible source modifiers.
2661 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2662  Register ModSrc = OrigSrc;
2663  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2664  ModSrc = SrcFNeg->getOperand(1).getReg();
2665  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2666  ModSrc = SrcFAbs->getOperand(1).getReg();
2667  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2668  ModSrc = SrcFAbs->getOperand(1).getReg();
2669  return ModSrc;
2670 }
2671 
2672 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2673  MachineRegisterInfo &MRI,
2674  MachineIRBuilder &B) const {
2675 
2676  const LLT S1 = LLT::scalar(1);
2677  const LLT S64 = LLT::scalar(64);
2678  Register Dst = MI.getOperand(0).getReg();
2679  Register OrigSrc = MI.getOperand(1).getReg();
2680  unsigned Flags = MI.getFlags();
2681  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2682  "this should not have been custom lowered");
2683 
2684  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2685  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2686  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2687  // V_FRACT bug is:
2688  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2689  //
2690  // Convert floor(x) to (x - fract(x))
2691 
2692  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2693  .addUse(OrigSrc)
2694  .setMIFlags(Flags);
2695 
2696  // Give source modifier matching some assistance before obscuring a foldable
2697  // pattern.
2698 
2699  // TODO: We can avoid the neg on the fract? The input sign to fract
2700  // shouldn't matter?
2701  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2702 
2703  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2704 
2705  Register Min = MRI.createGenericVirtualRegister(S64);
2706 
2707  // We don't need to concern ourselves with the snan handling difference, so
2708  // use the one which will directly select.
2709  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2710  if (MFI->getMode().IEEE)
2711  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2712  else
2713  B.buildFMinNum(Min, Fract, Const, Flags);
2714 
2715  Register CorrectedFract = Min;
2716  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2717  auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2718  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2719  }
2720 
2721  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2722  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2723 
2724  MI.eraseFromParent();
2725  return true;
2726 }
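// Worked example, for illustration only: for x = -1.25, amdgcn.fract returns
// 0.75 and x - fract(x) = -2.0 = floor(-1.25). The clamp against
// 0x3fefffffffffffff (the largest double below 1.0) and the NaN select only
// exist to paper over the SI V_FRACT_F64 corner cases described above.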
2727 
2728 // Turn an illegal packed v2s16 build vector into bit operations.
2729 // TODO: This should probably be a bitcast action in LegalizerHelper.
2730 bool AMDGPULegalizerInfo::legalizeBuildVector(
2731  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2732  Register Dst = MI.getOperand(0).getReg();
2733  const LLT S32 = LLT::scalar(32);
2734  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2735 
2736  Register Src0 = MI.getOperand(1).getReg();
2737  Register Src1 = MI.getOperand(2).getReg();
2738  assert(MRI.getType(Src0) == LLT::scalar(16));
2739 
2740  auto Merge = B.buildMerge(S32, {Src0, Src1});
2741  B.buildBitcast(Dst, Merge);
2742 
2743  MI.eraseFromParent();
2744  return true;
2745 }
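// Illustrative note, not in the upstream source: the G_MERGE_VALUES of the
// two s16 sources is equivalent to (zext(Src1) << 16) | zext(Src0) on a
// 32-bit value, and the bitcast then reinterprets that as the packed
// <2 x s16> result.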
2746 
2747 // Check that this is a G_XOR x, -1
2748 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2749  if (MI.getOpcode() != TargetOpcode::G_XOR)
2750  return false;
2751  auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2752  return ConstVal && *ConstVal == -1;
2753 }
2754 
2755 // Return the use branch instruction, otherwise null if the usage is invalid.
2756 static MachineInstr *
2757 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2758  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2759  Register CondDef = MI.getOperand(0).getReg();
2760  if (!MRI.hasOneNonDBGUse(CondDef))
2761  return nullptr;
2762 
2763  MachineBasicBlock *Parent = MI.getParent();
2764  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2765 
2766  if (isNot(MRI, *UseMI)) {
2767  Register NegatedCond = UseMI->getOperand(0).getReg();
2768  if (!MRI.hasOneNonDBGUse(NegatedCond))
2769  return nullptr;
2770 
2771  // We're deleting the def of this value, so we need to remove it.
2773 
2774  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2775  Negated = true;
2776  }
2777 
2778  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2779  return nullptr;
2780 
2781  // Make sure the cond br is followed by a G_BR, or is the last instruction.
2782  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2783  if (Next == Parent->end()) {
2784  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2785  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2786  return nullptr;
2787  UncondBrTarget = &*NextMBB;
2788  } else {
2789  if (Next->getOpcode() != AMDGPU::G_BR)
2790  return nullptr;
2791  Br = &*Next;
2792  UncondBrTarget = Br->getOperand(0).getMBB();
2793  }
2794 
2795  return UseMI;
2796 }
2797 
2798 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2799  const ArgDescriptor *Arg,
2800  const TargetRegisterClass *ArgRC,
2801  LLT ArgTy) const {
2802  MCRegister SrcReg = Arg->getRegister();
2803  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2804  assert(DstReg.isVirtual() && "Virtual register expected");
2805 
2806  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2807  ArgTy);
2808  if (Arg->isMasked()) {
2809  // TODO: Should we try to emit this once in the entry block?
2810  const LLT S32 = LLT::scalar(32);
2811  const unsigned Mask = Arg->getMask();
2812  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2813 
2814  Register AndMaskSrc = LiveIn;
2815 
2816  if (Shift != 0) {
2817  auto ShiftAmt = B.buildConstant(S32, Shift);
2818  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2819  }
2820 
2821  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2822  } else {
2823  B.buildCopy(DstReg, LiveIn);
2824  }
2825 
2826  return true;
2827 }
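// Illustrative note, not in the upstream source: for a masked argument the
// value is recovered as (LiveIn >> countTrailingZeros(Mask)) & (Mask >> Shift).
// With a hypothetical Mask of 0x3ff00000 this emits a right shift by 20 and an
// AND with 0x3ff, extracting a 10-bit field from the packed input register.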
2828 
2829 bool AMDGPULegalizerInfo::loadInputValue(
2830  Register DstReg, MachineIRBuilder &B,
2831  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2832  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2833  const ArgDescriptor *Arg;
2834  const TargetRegisterClass *ArgRC;
2835  LLT ArgTy;
2836  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2837 
2838  if (!Arg->isRegister() || !Arg->getRegister().isValid())
2839  return false; // TODO: Handle these
2840  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2841 }
2842 
2843 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2844  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2845  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2846  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2847  return false;
2848 
2849  MI.eraseFromParent();
2850  return true;
2851 }
2852 
2853 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2854  MachineRegisterInfo &MRI,
2855  MachineIRBuilder &B) const {
2856  Register Dst = MI.getOperand(0).getReg();
2857  LLT DstTy = MRI.getType(Dst);
2858  LLT S16 = LLT::scalar(16);
2859  LLT S32 = LLT::scalar(32);
2860  LLT S64 = LLT::scalar(64);
2861 
2862  if (DstTy == S16)
2863  return legalizeFDIV16(MI, MRI, B);
2864  if (DstTy == S32)
2865  return legalizeFDIV32(MI, MRI, B);
2866  if (DstTy == S64)
2867  return legalizeFDIV64(MI, MRI, B);
2868 
2869  return false;
2870 }
2871 
2872 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2873  Register DstDivReg,
2874  Register DstRemReg,
2875  Register X,
2876  Register Y) const {
2877  const LLT S1 = LLT::scalar(1);
2878  const LLT S32 = LLT::scalar(32);
2879 
2880  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2881  // algorithm used here.
2882 
2883  // Initial estimate of inv(y).
2884  auto FloatY = B.buildUITOFP(S32, Y);
2885  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2886  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2887  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2888  auto Z = B.buildFPTOUI(S32, ScaledY);
2889 
2890  // One round of UNR.
2891  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2892  auto NegYZ = B.buildMul(S32, NegY, Z);
2893  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2894 
2895  // Quotient/remainder estimate.
2896  auto Q = B.buildUMulH(S32, X, Z);
2897  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2898 
2899  // First quotient/remainder refinement.
2900  auto One = B.buildConstant(S32, 1);
2901  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2902  if (DstDivReg)
2903  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2904  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2905 
2906  // Second quotient/remainder refinement.
2907  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2908  if (DstDivReg)
2909  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2910 
2911  if (DstRemReg)
2912  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2913 }
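// Illustrative summary, not in the upstream source: Z starts as a fixed-point
// estimate of 2^32 / Y built from the float reciprocal; the single
// Newton-Raphson style step Z += umulh(Z, -Y * Z) refines it, and the two
// "R >= Y" corrections afterwards absorb the remaining error in the estimate
// for the final quotient and remainder.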
2914 
2915 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2916 //
2917 // Return lo, hi of result
2918 //
2919 // %cvt.lo = G_UITOFP Val.lo
2920 // %cvt.hi = G_UITOFP Val.hi
2921 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2922 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2923 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2924 // %mul2 = G_FMUL %mul1, 2**(-32)
2925 // %trunc = G_INTRINSIC_TRUNC %mul2
2926 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2927 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2928 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2929  Register Val) {
2930  const LLT S32 = LLT::scalar(32);
2931  auto Unmerge = B.buildUnmerge(S32, Val);
2932 
2933  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2934  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2935 
2936  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2937  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2938 
2939  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2940  auto Mul1 =
2941  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2942 
2943  // 2**(-32)
2944  auto Mul2 =
2945  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2946  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2947 
2948  // -(2**32)
2949  auto Mad2 = B.buildFMAD(S32, Trunc,
2950  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2951 
2952  auto ResultLo = B.buildFPTOUI(S32, Mad2);
2953  auto ResultHi = B.buildFPTOUI(S32, Trunc);
2954 
2955  return {ResultLo.getReg(0), ResultHi.getReg(0)};
2956 }
2957 
2958 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2959  Register DstDivReg,
2960  Register DstRemReg,
2961  Register Numer,
2962  Register Denom) const {
2963  const LLT S32 = LLT::scalar(32);
2964  const LLT S64 = LLT::scalar(64);
2965  const LLT S1 = LLT::scalar(1);
2966  Register RcpLo, RcpHi;
2967 
2968  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2969 
2970  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2971 
2972  auto Zero64 = B.buildConstant(S64, 0);
2973  auto NegDenom = B.buildSub(S64, Zero64, Denom);
2974 
2975  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2976  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2977 
2978  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2979  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2980  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2981 
2982  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2983  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2984  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2985  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2986 
2987  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2988  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2989  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2990  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2991  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2992 
2993  auto Zero32 = B.buildConstant(S32, 0);
2994  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2995  auto Add2_HiC =
2996  B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2997  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2998  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2999 
3000  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3001  Register NumerLo = UnmergeNumer.getReg(0);
3002  Register NumerHi = UnmergeNumer.getReg(1);
3003 
3004  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3005  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3006  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3007  Register Mul3_Lo = UnmergeMul3.getReg(0);
3008  Register Mul3_Hi = UnmergeMul3.getReg(1);
3009  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3010  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3011  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3012  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3013 
3014  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3015  Register DenomLo = UnmergeDenom.getReg(0);
3016  Register DenomHi = UnmergeDenom.getReg(1);
3017 
3018  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3019  auto C1 = B.buildSExt(S32, CmpHi);
3020 
3021  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3022  auto C2 = B.buildSExt(S32, CmpLo);
3023 
3024  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3025  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3026 
3027  // TODO: Here and below portions of the code can be enclosed into if/endif.
3028  // Currently control flow is unconditional and we have 4 selects after
3029  // potential endif to substitute PHIs.
3030 
3031  // if C3 != 0 ...
3032  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3033  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3034  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3035  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3036 
3037  auto One64 = B.buildConstant(S64, 1);
3038  auto Add3 = B.buildAdd(S64, MulHi3, One64);
3039 
3040  auto C4 =
3041  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3042  auto C5 =
3043  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3044  auto C6 = B.buildSelect(
3045  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3046 
3047  // if (C6 != 0)
3048  auto Add4 = B.buildAdd(S64, Add3, One64);
3049  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3050 
3051  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3052  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3053  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3054 
3055  // endif C6
3056  // endif C3
3057 
3058  if (DstDivReg) {
3059  auto Sel1 = B.buildSelect(
3060  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3061  B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3062  Sel1, MulHi3);
3063  }
3064 
3065  if (DstRemReg) {
3066  auto Sel2 = B.buildSelect(
3067  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3068  B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3069  Sel2, Sub1);
3070  }
3071 }
3072 
3073 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3074  MachineRegisterInfo &MRI,
3075  MachineIRBuilder &B) const {
3076  Register DstDivReg, DstRemReg;
3077  switch (MI.getOpcode()) {
3078  default:
3079  llvm_unreachable("Unexpected opcode!");
3080  case AMDGPU::G_UDIV: {
3081  DstDivReg = MI.getOperand(0).getReg();
3082  break;
3083  }
3084  case AMDGPU::G_UREM: {
3085  DstRemReg = MI.getOperand(0).getReg();
3086  break;
3087  }
3088  case AMDGPU::G_UDIVREM: {
3089  DstDivReg = MI.getOperand(0).getReg();
3090  DstRemReg = MI.getOperand(1).getReg();
3091  break;
3092  }
3093  }
3094 
3095  const LLT S64 = LLT::scalar(64);
3096  const LLT S32 = LLT::scalar(32);
3097  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3098  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3099  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3100  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3101 
3102  if (Ty == S32)
3103  legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3104  else if (Ty == S64)
3105  legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3106  else
3107  return false;
3108 
3109  MI.eraseFromParent();
3110  return true;
3111 }
3112 
3113 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3114  MachineRegisterInfo &MRI,
3115  MachineIRBuilder &B) const {
3116  const LLT S64 = LLT::scalar(64);
3117  const LLT S32 = LLT::scalar(32);
3118 
3119  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3120  if (Ty != S32 && Ty != S64)
3121  return false;
3122 
3123  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3124  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3125  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3126 
3127  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3128  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3129  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3130 
3131  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3132  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3133 
3134  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3135  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3136 
3137  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3138  switch (MI.getOpcode()) {
3139  default:
3140  llvm_unreachable("Unexpected opcode!");
3141  case AMDGPU::G_SDIV: {
3142  DstDivReg = MI.getOperand(0).getReg();
3143  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3144  break;
3145  }
3146  case AMDGPU::G_SREM: {
3147  DstRemReg = MI.getOperand(0).getReg();
3148  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3149  break;
3150  }
3151  case AMDGPU::G_SDIVREM: {
3152  DstDivReg = MI.getOperand(0).getReg();
3153  DstRemReg = MI.getOperand(1).getReg();
3154  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3155  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3156  break;
3157  }
3158  }
3159 
3160  if (Ty == S32)
3161  legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3162  else
3163  legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3164 
3165  if (DstDivReg) {
3166  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3167  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3168  B.buildSub(DstDivReg, SignXor, Sign);
3169  }
3170 
3171  if (DstRemReg) {
3172  auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3173  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3174  B.buildSub(DstRemReg, SignXor, Sign);
3175  }
3176 
3177  MI.eraseFromParent();
3178  return true;
3179 }
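// Worked example, for illustration only: for LHS = -7 the sign mask
// LHSign = LHS >> 31 is -1, and (-7 + -1) ^ -1 = -8 ^ -1 = 7 = |LHS|. After
// the unsigned divide, the quotient is mapped back through the inverse
// transform (x ^ s) - s with s = LHSign ^ RHSign, and the remainder with
// s = LHSign, since the remainder takes the sign of the dividend.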
3180 
3181 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3182  MachineRegisterInfo &MRI,
3183  MachineIRBuilder &B) const {
3184  Register Res = MI.getOperand(0).getReg();
3185  Register LHS = MI.getOperand(1).getReg();
3186  Register RHS = MI.getOperand(2).getReg();
3187  uint16_t Flags = MI.getFlags();
3188  LLT ResTy = MRI.getType(Res);
3189 
3190  const MachineFunction &MF = B.getMF();
3191  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3192  MI.getFlag(MachineInstr::FmAfn);
3193 
3194  if (!AllowInaccurateRcp)
3195  return false;
3196 
3197  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3198  // 1 / x -> RCP(x)
3199  if (CLHS->isExactlyValue(1.0)) {
3200  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3201  .addUse(RHS)
3202  .setMIFlags(Flags);
3203 
3204  MI.eraseFromParent();
3205  return true;
3206  }
3207 
3208  // -1 / x -> RCP( FNEG(x) )
3209  if (CLHS->isExactlyValue(-1.0)) {
3210  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3211  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3212  .addUse(FNeg.getReg(0))
3213  .setMIFlags(Flags);
3214 
3215  MI.eraseFromParent();
3216  return true;
3217  }
3218  }
3219 
3220  // x / y -> x * (1.0 / y)
3221  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3222  .addUse(RHS)
3223  .setMIFlags(Flags);
3224  B.buildFMul(Res, LHS, RCP, Flags);
3225 
3226  MI.eraseFromParent();
3227  return true;
3228 }
3229 
3230 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3231  MachineRegisterInfo &MRI,
3232  MachineIRBuilder &B) const {
3233  Register Res = MI.getOperand(0).getReg();
3234  Register X = MI.getOperand(1).getReg();
3235  Register Y = MI.getOperand(2).getReg();
3236  uint16_t Flags = MI.getFlags();
3237  LLT ResTy = MRI.getType(Res);
3238 
3239  const MachineFunction &MF = B.getMF();
3240  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3241  MI.getFlag(MachineInstr::FmAfn);
3242 
3243  if (!AllowInaccurateRcp)
3244  return false;
3245 
3246  auto NegY = B.buildFNeg(ResTy, Y);
3247  auto One = B.buildFConstant(ResTy, 1.0);
3248 
3249  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3250  .addUse(Y)
3251  .setMIFlags(Flags);
3252 
3253  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3254  R = B.buildFMA(ResTy, Tmp0, R, R);
3255 
3256  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3257  R = B.buildFMA(ResTy, Tmp1, R, R);
3258 
3259  auto Ret = B.buildFMul(ResTy, X, R);
3260  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3261 
3262  B.buildFMA(Res, Tmp2, R, Ret);
3263  MI.eraseFromParent();
3264  return true;
3265 }
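// Illustrative note, not in the upstream source: each fma pair above is one
// Newton-Raphson refinement of the reciprocal, r' = r + r * (1 - y*r),
// written as fma(fma(-y, r, 1), r, r); the final fma then corrects x * r by
// the residual x - y * (x * r). Accuracy here relies on the afn/unsafe-math
// check at the top of the function.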
3266 
3267 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3268  MachineRegisterInfo &MRI,
3269  MachineIRBuilder &B) const {
3270  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3271  return true;
3272 
3273  Register Res = MI.getOperand(0).getReg();
3274  Register LHS = MI.getOperand(1).getReg();
3275  Register RHS = MI.getOperand(2).getReg();
3276 
3277  uint16_t Flags = MI.getFlags();
3278 
3279  LLT S16 = LLT::scalar(16);
3280  LLT S32 = LLT::scalar(32);
3281 
3282  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3283  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3284 
3285  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3286  .addUse(RHSExt.getReg(0))
3287  .setMIFlags(Flags);
3288 
3289  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3290  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3291 
3292  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3293  .addUse(RDst.getReg(0))
3294  .addUse(RHS)
3295  .addUse(LHS)
3296  .setMIFlags(Flags);
3297 
3298  MI.eraseFromParent();
3299  return true;
3300 }
3301 
3302 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3303 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3304 static void toggleSPDenormMode(bool Enable,
3305  MachineIRBuilder &B,
3306  const GCNSubtarget &ST,
3307  AMDGPU::SIModeRegisterDefaults Mode) {
3308  // Set SP denorm mode to this value.
3309  unsigned SPDenormMode =
3310  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3311 
3312  if (ST.hasDenormModeInst()) {
3313  // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3314  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3315 
3316  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3317  B.buildInstr(AMDGPU::S_DENORM_MODE)
3318  .addImm(NewDenormModeValue);
3319 
3320  } else {
3321  // Select FP32 bit field in mode register.
3322  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3323  (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3324  (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3325 
3326  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3327  .addImm(SPDenormMode)
3328  .addImm(SPDenormModeBitField);
3329  }
3330 }
3331 
3332 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3333  MachineRegisterInfo &MRI,
3334  MachineIRBuilder &B) const {
3335  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3336  return true;
3337 
3338  Register Res = MI.getOperand(0).getReg();
3339  Register LHS = MI.getOperand(1).getReg();
3340  Register RHS = MI.getOperand(2).getReg();
3341  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3342  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3343 
3344  uint16_t Flags = MI.getFlags();
3345 
3346  LLT S32 = LLT::scalar(32);
3347  LLT S1 = LLT::scalar(1);
3348 
3349  auto One = B.buildFConstant(S32, 1.0f);
3350 
3351  auto DenominatorScaled =
3352  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3353  .addUse(LHS)
3354  .addUse(RHS)
3355  .addImm(0)
3356  .setMIFlags(Flags);
3357  auto NumeratorScaled =
3358  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3359  .addUse(LHS)
3360  .addUse(RHS)
3361  .addImm(1)
3362  .setMIFlags(Flags);
3363 
3364  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3365  .addUse(DenominatorScaled.getReg(0))
3366  .setMIFlags(Flags);
3367  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3368 
3369  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3370  // aren't modeled as reading it.
3371  if (!Mode.allFP32Denormals())
3372  toggleSPDenormMode(true, B, ST, Mode);
3373 
3374  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3375  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3376  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3377  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3378  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3379  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3380 
3381  if (!Mode.allFP32Denormals())
3382  toggleSPDenormMode(false, B, ST, Mode);
3383 
3384  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3385  .addUse(Fma4.getReg(0))
3386  .addUse(Fma1.getReg(0))
3387  .addUse(Fma3.getReg(0))
3388  .addUse(NumeratorScaled.getReg(1))
3389  .setMIFlags(Flags);
3390 
3391  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3392  .addUse(Fmas.getReg(0))
3393  .addUse(RHS)
3394  .addUse(LHS)
3395  .setMIFlags(Flags);
3396 
3397  MI.eraseFromParent();
3398  return true;
3399 }
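// Note (illustrative, not from the upstream source): the div_scale / div_fmas
// / div_fixup chain above is the usual AMDGPU f32 division expansion. The
// scaled intermediates of the FMA chain can be denormal even when the inputs
// are not, so when the function's default FP32 mode flushes denormals the mode
// is temporarily switched to "flush none" around the chain and restored
// afterwards; the FIXME notes that this mode switch is not modeled as a
// dependency of the FP operations themselves.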
3400 
3401 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3402  MachineRegisterInfo &MRI,
3403  MachineIRBuilder &B) const {
3404  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3405  return true;
3406 
3407  Register Res = MI.getOperand(0).getReg();
3408  Register LHS = MI.getOperand(1).getReg();
3409  Register RHS = MI.getOperand(2).getReg();
3410 
3411  uint16_t Flags = MI.getFlags();
3412 
3413  LLT S64 = LLT::scalar(64);
3414  LLT S1 = LLT::scalar(1);
3415 
3416  auto One = B.buildFConstant(S64, 1.0);
3417 
3418  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3419  .addUse(LHS)
3420  .addUse(RHS)
3421  .addImm(0)
3422  .setMIFlags(Flags);
3423 
3424  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3425 
3426  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3427  .addUse(DivScale0.getReg(0))
3428  .setMIFlags(Flags);
3429 
3430  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3431  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3432  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3433 
3434  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3435  .addUse(LHS)
3436  .addUse(RHS)
3437  .addImm(1)
3438  .setMIFlags(Flags);
3439 
3440  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3441  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3442  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3443 
3444  Register Scale;
3445  if (!ST.hasUsableDivScaleConditionOutput()) {
3446  // Workaround a hardware bug on SI where the condition output from div_scale
3447  // is not usable.
3448 
3449  LLT S32 = LLT::scalar(32);
3450 
3451  auto NumUnmerge = B.buildUnmerge(S32, LHS);
3452  auto DenUnmerge = B.buildUnmerge(S32, RHS);
3453  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3454  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3455 
3456  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3457  Scale1Unmerge.getReg(1));
3458  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3459  Scale0Unmerge.getReg(1));
3460  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3461  } else {
3462  Scale = DivScale1.getReg(1);
3463  }
3464 
3465  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3466  .addUse(Fma4.getReg(0))
3467  .addUse(Fma3.getReg(0))
3468  .addUse(Mul.getReg(0))
3469  .addUse(Scale)
3470  .setMIFlags(Flags);
3471 
3472  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3473  .addUse(Fmas.getReg(0))
3474  .addUse(RHS)
3475  .addUse(LHS)
3476  .setMIFlags(Flags);
3477 
3478  MI.eraseFromParent();
3479  return true;
3480 }
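// Note (illustrative, not from the upstream source): on subtargets where the
// div_scale condition output is unusable (the SI workaround above), the flag
// normally produced in VCC is reconstructed by hand: the high dwords (which
// carry the exponent bits) of the scaled numerator and denominator are
// compared for equality against the high dwords of the original operands, and
// the XOR of the two comparisons is passed to div_fmas as the scale flag.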
3481 
3482 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3483  MachineRegisterInfo &MRI,
3484  MachineIRBuilder &B) const {
3485  Register Res = MI.getOperand(0).getReg();
3486  Register LHS = MI.getOperand(2).getReg();
3487  Register RHS = MI.getOperand(3).getReg();
3488  uint16_t Flags = MI.getFlags();
3489 
3490  LLT S32 = LLT::scalar(32);
3491  LLT S1 = LLT::scalar(1);
3492 
3493  auto Abs = B.buildFAbs(S32, RHS, Flags);
3494  const APFloat C0Val(1.0f);
3495 
3496  auto C0 = B.buildConstant(S32, 0x6f800000);
3497  auto C1 = B.buildConstant(S32, 0x2f800000);
3498  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3499 
3500  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3501  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3502 
3503  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3504 
3505  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3506  .addUse(Mul0.getReg(0))
3507  .setMIFlags(Flags);
3508 
3509  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3510 
3511  B.buildFMul(Res, Sel, Mul1, Flags);
3512 
3513  MI.eraseFromParent();
3514  return true;
3515 }
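// Note (illustrative, not from the upstream source): the magic constants above
// are plain IEEE-754 single-precision bit patterns: 0x6f800000 is 0x1.0p+96f
// and 0x2f800000 is 0x1.0p-32f. When |RHS| exceeds 2^96 the denominator is
// pre-scaled by 2^-32 so the rcp does not underflow, and the final multiply by
// the same Sel factor cancels the scaling:
//   Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS   (approximately).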
3516 
3517 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3518 // FIXME: Why do we handle this one but not other removed instructions?
3519 //
3520 // Reciprocal square root. The clamp prevents infinite results, clamping
3521 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3522 // +-max_float.
3523 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3524  MachineRegisterInfo &MRI,
3525  MachineIRBuilder &B) const {
3527  return true;
3528 
3529  Register Dst = MI.getOperand(0).getReg();
3530  Register Src = MI.getOperand(2).getReg();
3531  auto Flags = MI.getFlags();
3532 
3533  LLT Ty = MRI.getType(Dst);
3534 
3535  const fltSemantics *FltSemantics;
3536  if (Ty == LLT::scalar(32))
3537  FltSemantics = &APFloat::IEEEsingle();
3538  else if (Ty == LLT::scalar(64))
3539  FltSemantics = &APFloat::IEEEdouble();
3540  else
3541  return false;
3542 
3543  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3544  .addUse(Src)
3545  .setMIFlags(Flags);
3546 
3547  // We don't need to concern ourselves with the snan handling difference, since
3548  // rsq has already quieted (or not) any snan; use the variant that will directly select.
3549  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3550  const bool UseIEEE = MFI->getMode().IEEE;
3551 
3552  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3553  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3554  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3555 
3556  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3557 
3558  if (UseIEEE)
3559  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3560  else
3561  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3562  MI.eraseFromParent();
3563  return true;
3564 }
3565 
3566 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3567  switch (IID) {
3568  case Intrinsic::amdgcn_ds_fadd:
3569  return AMDGPU::G_ATOMICRMW_FADD;
3570  case Intrinsic::amdgcn_ds_fmin:
3571  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3572  case Intrinsic::amdgcn_ds_fmax:
3573  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3574  default:
3575  llvm_unreachable("not a DS FP intrinsic");
3576  }
3577 }
3578 
3579 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3580  MachineInstr &MI,
3581  Intrinsic::ID IID) const {
3582  GISelChangeObserver &Observer = Helper.Observer;
3583  Observer.changingInstr(MI);
3584 
3585  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3586 
3587  // The remaining operands were used to set fields in the MemOperand on
3588  // construction.
3589  for (int I = 6; I > 3; --I)
3590  MI.RemoveOperand(I);
3591 
3592  MI.RemoveOperand(1); // Remove the intrinsic ID.
3593  Observer.changedInstr(MI);
3594  return true;
3595 }
3596 
3597 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3598  MachineRegisterInfo &MRI,
3599  MachineIRBuilder &B) const {
3600  uint64_t Offset =
3601  ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(),
3602  AMDGPUTargetLowering::FIRST_IMPLICIT);
3603  LLT DstTy = MRI.getType(DstReg);
3604  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3605 
3606  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3607  if (!loadInputValue(KernargPtrReg, B,
3608  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3609  return false;
3610 
3611  // FIXME: This should be nuw
3612  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3613  return true;
3614 }
3615 
3616 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3617  MachineRegisterInfo &MRI,
3618  MachineIRBuilder &B) const {
3619  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3620  if (!MFI->isEntryFunction()) {
3621  return legalizePreloadedArgIntrin(MI, MRI, B,
3622  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3623  }
3624 
3625  Register DstReg = MI.getOperand(0).getReg();
3626  if (!getImplicitArgPtr(DstReg, MRI, B))
3627  return false;
3628 
3629  MI.eraseFromParent();
3630  return true;
3631 }
3632 
3633 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3634  MachineRegisterInfo &MRI,
3635  MachineIRBuilder &B,
3636  unsigned AddrSpace) const {
3637  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3638  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3639  Register Hi32 = Unmerge.getReg(1);
3640 
3641  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3642  MI.eraseFromParent();
3643  return true;
3644 }
3645 
3646 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3647 // offset (the offset that is included in bounds checking and swizzling, to be
3648 // split between the instruction's voffset and immoffset fields) and soffset
3649 // (the offset that is excluded from bounds checking and swizzling, to go in
3650 // the instruction's soffset field). This function takes the first kind of
3651 // offset and figures out how to split it between voffset and immoffset.
3652 std::pair<Register, unsigned>
3653 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3654  Register OrigOffset) const {
3655  const unsigned MaxImm = 4095;
3656  Register BaseReg;
3657  unsigned ImmOffset;
3658  const LLT S32 = LLT::scalar(32);
3659  MachineRegisterInfo &MRI = *B.getMRI();
3660 
3661  std::tie(BaseReg, ImmOffset) =
3662  AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3663 
3664  // If BaseReg is a pointer, convert it to int.
3665  if (MRI.getType(BaseReg).isPointer())
3666  BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3667 
3668  // If the immediate value is too big for the immoffset field, keep only the
3669  // low 12 bits in the immoffset field and move the 4096-aligned remainder
3670  // into the voffset, so that the value copied/added for the voffset field is
3671  // a multiple of 4096 and stands more chance of being CSEd with the copy/add
3672  // for another similar load/store. However, do not do that rounding down to a
3673  // multiple of 4096 if that is a negative number, as it appears to be illegal
3674  // to have a negative offset in the vgpr, even if adding the immediate offset makes it positive.
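  // Worked example (illustrative): an incoming constant offset of 4607 is
  // split into ImmOffset = 511 and Overflow = 4096; the 4096 is added to the
  // voffset value below, so nearby accesses whose offsets differ only in the
  // low 12 bits can CSE that add.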
3675  unsigned Overflow = ImmOffset & ~MaxImm;
3676  ImmOffset -= Overflow;
3677  if ((int32_t)Overflow < 0) {
3678  Overflow += ImmOffset;
3679  ImmOffset = 0;
3680  }
3681 
3682  if (Overflow != 0) {
3683  if (!BaseReg) {
3684  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3685  } else {
3686  auto OverflowVal = B.buildConstant(S32, Overflow);
3687  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3688  }
3689  }
3690 
3691  if (!BaseReg)
3692  BaseReg = B.buildConstant(S32, 0).getReg(0);
3693 
3694  return std::make_pair(BaseReg, ImmOffset);
3695 }
3696 
3697 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3698 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3699  Register VOffset, Register SOffset,
3700  unsigned ImmOffset, Register VIndex,
3701  MachineRegisterInfo &MRI) const {
3702  Optional<ValueAndVReg> MaybeVOffsetVal =
3704  Optional<ValueAndVReg> MaybeSOffsetVal =
3706  Optional<ValueAndVReg> MaybeVIndexVal =
3708  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3709  // update the MMO with that offset. The stride is unknown so we can only do
3710  // this if VIndex is constant 0.
3711  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3712  MaybeVIndexVal->Value == 0) {
3713  uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3714  MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3715  MMO->setOffset(TotalOffset);
3716  } else {
3717  // We don't have a constant combined offset to use in the MMO. Give up.
3718  MMO->setValue((Value *)nullptr);
3719  }
3720 }
3721 
3722 /// Handle register layout difference for f16 images for some subtargets.
3723 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3724  MachineRegisterInfo &MRI,
3725  Register Reg,
3726  bool ImageStore) const {
3727  const LLT S16 = LLT::scalar(16);
3728  const LLT S32 = LLT::scalar(32);
3729  LLT StoreVT = MRI.getType(Reg);
3730  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3731 
3732  if (ST.hasUnpackedD16VMem()) {
3733  auto Unmerge = B.buildUnmerge(S16, Reg);
3734 
3735  SmallVector<Register, 4> WideRegs;
3736  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3737  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3738 
3739  int NumElts = StoreVT.getNumElements();
3740 
3741  return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3742  .getReg(0);
3743  }
3744 
3745  if (ImageStore && ST.hasImageStoreD16Bug()) {
3746  if (StoreVT.getNumElements() == 2) {
3747  SmallVector<Register, 4> PackedRegs;
3748  Reg = B.buildBitcast(S32, Reg).getReg(0);
3749  PackedRegs.push_back(Reg);
3750  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3751  return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3752  .getReg(0);
3753  }
3754 
3755  if (StoreVT.getNumElements() == 3) {
3756  SmallVector<Register, 4> PackedRegs;
3757  auto Unmerge = B.buildUnmerge(S16, Reg);
3758  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3759  PackedRegs.push_back(Unmerge.getReg(I));
3760  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3761  Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3762  return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3763  }
3764 
3765  if (StoreVT.getNumElements() == 4) {
3766  SmallVector<Register, 4> PackedRegs;
3767  Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3768  auto Unmerge = B.buildUnmerge(S32, Reg);
3769  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3770  PackedRegs.push_back(Unmerge.getReg(I));
3771  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3772  return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3773  .getReg(0);
3774  }
3775 
3776  llvm_unreachable("invalid data type");
3777  }
3778 
3779  return Reg;
3780 }
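// Note (illustrative, not from the upstream source): on subtargets with the
// unpacked D16 layout each f16 element ends up in the low half of its own
// 32-bit register, so a <4 x s16> value becomes a <4 x s32> of any-extended
// elements. On packed subtargets with the image-store D16 bug the data is
// instead padded with undef halves and bitcast, so that, e.g., a <3 x s16>
// value is widened to six halves and stored as <3 x s32>.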
3781 
3782 Register AMDGPULegalizerInfo::fixStoreSourceType(
3783  MachineIRBuilder &B, Register VData, bool IsFormat) const {
3784  MachineRegisterInfo *MRI = B.getMRI();
3785  LLT Ty = MRI->getType(VData);
3786 
3787  const LLT S16 = LLT::scalar(16);
3788 
3789  // Fixup illegal register types for i8 stores.
3790  // Fixup illegal register types for i8 and i16 stores.
3791  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3792  return AnyExt;
3793  }
3794 
3795  if (Ty.isVector()) {
3796  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3797  if (IsFormat)
3798  return handleD16VData(B, *MRI, VData);
3799  }
3800  }
3801 
3802  return VData;
3803 }
3804 
3805 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3806  MachineRegisterInfo &MRI,
3807  MachineIRBuilder &B,
3808  bool IsTyped,
3809  bool IsFormat) const {
3810  Register VData = MI.getOperand(1).getReg();
3811  LLT Ty = MRI.getType(VData);
3812  LLT EltTy = Ty.getScalarType();
3813  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3814  const LLT S32 = LLT::scalar(32);
3815 
3816  VData = fixStoreSourceType(B, VData, IsFormat);
3817  Register RSrc = MI.getOperand(2).getReg();
3818 
3819  MachineMemOperand *MMO = *MI.memoperands_begin();
3820  const int MemSize = MMO->getSize();
3821 
3822  unsigned ImmOffset;
3823 
3824  // The typed intrinsics add an immediate after the registers.
3825  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3826 
3827  // The struct intrinsic variants add one additional operand over raw.
3828  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3829  Register VIndex;
3830  int OpOffset = 0;
3831  if (HasVIndex) {
3832  VIndex = MI.getOperand(3).getReg();
3833  OpOffset = 1;
3834  } else {
3835  VIndex = B.buildConstant(S32, 0).getReg(0);
3836  }
3837 
3838  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3839  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3840 
3841  unsigned Format = 0;
3842  if (IsTyped) {
3843  Format = MI.getOperand(5 + OpOffset).getImm();
3844  ++OpOffset;
3845  }
3846 
3847  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3848 
3849  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3850  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3851 
3852  unsigned Opc;
3853  if (IsTyped) {
3854  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3855  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3856  } else if (IsFormat) {
3857  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3858  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3859  } else {
3860  switch (MemSize) {
3861  case 1:
3862  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3863  break;
3864  case 2:
3865  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3866  break;
3867  default:
3868  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3869  break;
3870  }
3871  }
3872 
3873  auto MIB = B.buildInstr(Opc)
3874  .addUse(VData) // vdata
3875  .addUse(RSrc) // rsrc
3876  .addUse(VIndex) // vindex
3877  .addUse(VOffset) // voffset
3878  .addUse(SOffset) // soffset
3879  .addImm(ImmOffset); // offset(imm)
3880 
3881  if (IsTyped)
3882  MIB.addImm(Format);
3883 
3884  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3885  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3886  .addMemOperand(MMO);
3887 
3888  MI.eraseFromParent();
3889  return true;
3890 }
3891 
3892 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3893  MachineRegisterInfo &MRI,
3894  MachineIRBuilder &B,
3895  bool IsFormat,
3896  bool IsTyped) const {
3897  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3898  MachineMemOperand *MMO = *MI.memoperands_begin();
3899  const LLT MemTy = MMO->getMemoryType();
3900  const LLT S32 = LLT::scalar(32);
3901 
3902  Register Dst = MI.getOperand(0).getReg();
3903  Register RSrc = MI.getOperand(2).getReg();
3904 
3905  // The typed intrinsics add an immediate after the registers.
3906  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3907 
3908  // The struct intrinsic variants add one additional operand over raw.
3909  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3910  Register VIndex;
3911  int OpOffset = 0;
3912  if (HasVIndex) {
3913  VIndex = MI.getOperand(3).getReg();
3914  OpOffset = 1;
3915  } else {
3916  VIndex = B.buildConstant(S32, 0).getReg(0);
3917  }
3918 
3919  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3920  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3921 
3922  unsigned Format = 0;
3923  if (IsTyped) {
3924  Format = MI.getOperand(5 + OpOffset).getImm();
3925  ++OpOffset;
3926  }
3927 
3928  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3929  unsigned ImmOffset;
3930 
3931  LLT Ty = MRI.getType(Dst);
3932  LLT EltTy = Ty.getScalarType();
3933  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3934  const bool Unpacked = ST.hasUnpackedD16VMem();
3935 
3936  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3937  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3938 
3939  unsigned Opc;
3940 
3941  if (IsTyped) {
3942  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3943  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3944  } else if (IsFormat) {
3945  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3946  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3947  } else {
3948  switch (MemTy.getSizeInBits()) {
3949  case 8:
3950  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3951  break;
3952  case 16:
3953  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3954  break;
3955  default:
3956  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3957  break;
3958  }
3959  }
3960 
3961  Register LoadDstReg;
3962 
3963  bool IsExtLoad =
3964  (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
3965  LLT UnpackedTy = Ty.changeElementSize(32);
3966 
3967  if (IsExtLoad)
3968  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3969  else if (Unpacked && IsD16 && Ty.isVector())
3970  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3971  else
3972  LoadDstReg = Dst;
3973 
3974  auto MIB = B.buildInstr(Opc)
3975  .addDef(LoadDstReg) // vdata
3976  .addUse(RSrc) // rsrc
3977  .addUse(VIndex) // vindex
3978  .addUse(VOffset) // voffset
3979  .addUse(SOffset) // soffset
3980  .addImm(ImmOffset); // offset(imm)
3981 
3982  if (IsTyped)
3983  MIB.addImm(Format);
3984 
3985  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3986  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3987  .addMemOperand(MMO);
3988 
3989  if (LoadDstReg != Dst) {
3990  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3991 
3992  // The result for an extending load was widened; truncate back down.
3993  if (IsExtLoad)
3994  B.buildTrunc(Dst, LoadDstReg);
3995  else {
3996  // Repack to original 16-bit vector result
3997  // FIXME: G_TRUNC should work, but legalization currently fails
3998  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3999  SmallVector<Register, 4> Repack;
4000  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4001  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4002  B.buildMerge(Dst, Repack);
4003  }
4004  }
4005 
4006  MI.eraseFromParent();
4007  return true;
4008 }
4009 
4010 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4011  MachineIRBuilder &B,
4012  bool IsInc) const {
4013  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4014  AMDGPU::G_AMDGPU_ATOMIC_DEC;
4015  B.buildInstr(Opc)
4016  .addDef(MI.getOperand(0).getReg())
4017  .addUse(MI.getOperand(2).getReg())
4018  .addUse(MI.getOperand(3).getReg())
4019  .cloneMemRefs(MI);
4020  MI.eraseFromParent();
4021  return true;
4022 }
4023 
4024 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4025  switch (IntrID) {
4026  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4027  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4028  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4029  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4030  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4031  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4032  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4033  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4034  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4035  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4036  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4037  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4038  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4039  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4040  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4041  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4042  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4043  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4044  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4045  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4046  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4047  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4048  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4049  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4050  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4051  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4052  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4053  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4054  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4055  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4056  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4057  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4058  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4059  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4060  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4061  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4062  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4063  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4064  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4065  case Intrinsic::amdgcn_buffer_atomic_fadd:
4066  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4067  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4068  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4069  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4070  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4071  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4072  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4073  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4074  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4075  default:
4076  llvm_unreachable("unhandled atomic opcode");
4077  }
4078 }
4079 
4080 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4081  MachineIRBuilder &B,
4082  Intrinsic::ID IID) const {
4083  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4084  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4085  const bool HasReturn = MI.getNumExplicitDefs() != 0;
4086 
4087  Register Dst;
4088 
4089  int OpOffset = 0;
4090  if (HasReturn) {
4091  // A few FP atomics do not support return values.
4092  Dst = MI.getOperand(0).getReg();
4093  } else {
4094  OpOffset = -1;
4095  }
4096 
4097  Register VData = MI.getOperand(2 + OpOffset).getReg();
4098  Register CmpVal;
4099 
4100  if (IsCmpSwap) {
4101  CmpVal = MI.getOperand(3 + OpOffset).getReg();
4102  ++OpOffset;
4103  }
4104 
4105  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4106  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4107 
4108  // The struct intrinsic variants add one additional operand over raw.
4109  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4110  Register VIndex;
4111  if (HasVIndex) {
4112  VIndex = MI.getOperand(4 + OpOffset).getReg();
4113  ++OpOffset;
4114  } else {
4115  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4116  }
4117 
4118  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4119  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4120  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4121 
4122  MachineMemOperand *MMO = *MI.memoperands_begin();
4123 
4124  unsigned ImmOffset;
4125  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4126  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4127 
4128  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4129 
4130  if (HasReturn)
4131  MIB.addDef(Dst);
4132 
4133  MIB.addUse(VData); // vdata
4134 
4135  if (IsCmpSwap)
4136  MIB.addReg(CmpVal);
4137 
4138  MIB.addUse(RSrc) // rsrc
4139  .addUse(VIndex) // vindex
4140  .addUse(VOffset) // voffset
4141  .addUse(SOffset) // soffset
4142  .addImm(ImmOffset) // offset(imm)
4143  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4144  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4145  .addMemOperand(MMO);
4146 
4147  MI.eraseFromParent();
4148  return true;
4149 }
4150 
4151 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4152 /// vector with s16 typed elements.
4153 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4154  SmallVectorImpl<Register> &PackedAddrs,
4155  unsigned ArgOffset,
4156  const AMDGPU::ImageDimIntrinsicInfo *Intr,
4157  bool IsA16, bool IsG16) {
4158  const LLT S16 = LLT::scalar(16);
4159  const LLT V2S16 = LLT::fixed_vector(2, 16);
4160  auto EndIdx = Intr->VAddrEnd;
4161 
4162  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4163  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4164  if (!SrcOp.isReg())
4165  continue; // _L to _LZ may have eliminated this.
4166 
4167  Register AddrReg = SrcOp.getReg();
4168 
4169  if ((I < Intr->GradientStart) ||
4170  (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4171  (I >= Intr->CoordStart && !IsA16)) {
4172  // Handle any gradient or coordinate operands that should not be packed
4173  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4174  PackedAddrs.push_back(AddrReg);
4175  } else {
4176  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4177  // derivatives dx/dh and dx/dv are packed with undef.
4178  if (((I + 1) >= EndIdx) ||
4179  ((Intr->NumGradients / 2) % 2 == 1 &&
4180  (I == static_cast<unsigned>(Intr->GradientStart +
4181  (Intr->NumGradients / 2) - 1) ||
4182  I == static_cast<unsigned>(Intr->GradientStart +
4183  Intr->NumGradients - 1))) ||
4184  // Check for _L to _LZ optimization
4185  !MI.getOperand(ArgOffset + I + 1).isReg()) {
4186  PackedAddrs.push_back(
4187  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4188  .getReg(0));
4189  } else {
4190  PackedAddrs.push_back(
4191  B.buildBuildVector(
4192  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4193  .getReg(0));
4194  ++I;
4195  }
4196  }
4197  }
4198 }
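// Note (illustrative, not from the upstream source): with A16/G16 each pair of
// 16-bit address operands is packed into one v2s16 dword, e.g. a 2D sample's
// (u, v) coordinates share one register. A trailing odd coordinate, or the
// leftover derivative of a 1D gradient, is paired with an undef high half so
// every vaddr slot still occupies a full dword.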
4199 
4200 /// Convert from separate vaddr components to a single vector address register,
4201 /// and replace the remaining operands with $noreg.
4202 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4203  int DimIdx, int NumVAddrs) {
4204  const LLT S32 = LLT::scalar(32);
4205 
4206  SmallVector<Register, 8> AddrRegs;
4207  for (int I = 0; I != NumVAddrs; ++I) {
4208  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4209  if (SrcOp.isReg()) {
4210  AddrRegs.push_back(SrcOp.getReg());
4211  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4212  }
4213  }
4214 
4215  int NumAddrRegs = AddrRegs.size();
4216  if (NumAddrRegs != 1) {
4217  // Above 8 elements round up to next power of 2 (i.e. 16).
4218  if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4219  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4220  auto Undef = B.buildUndef(S32);
4221  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4222  NumAddrRegs = RoundedNumRegs;
4223  }
4224 
4225  auto VAddr =
4226  B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4227  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4228  }
4229 
4230  for (int I = 1; I != NumVAddrs; ++I) {
4231  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4232  if (SrcOp.isReg())
4233  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4234  }
4235 }
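// Note (illustrative, not from the upstream source): e.g. three s32 address
// operands are rewritten into a single <3 x s32> build_vector placed in the
// first vaddr slot, and the remaining slots become $noreg; nine address
// registers would first be padded with undefs up to the next power of two
// (sixteen) before being packed.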
4236 
4237 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4238 ///
4239 /// Depending on the subtarget, load/store with 16-bit element data need to be
4240 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4241 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4242 /// registers.
4243 ///
4244 /// We don't want to directly select image instructions just yet, but also want
4245 /// to expose all register repacking to the legalizer/combiners. We also don't
4246 /// want a selected instruction entering RegBankSelect. In order to avoid
4247 /// defining a multitude of intermediate image instructions, directly hack on
4248 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4249 /// now unnecessary arguments with $noreg.
4250 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4251  MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4252  const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4253 
4254  const unsigned NumDefs = MI.getNumExplicitDefs();
4255  const unsigned ArgOffset = NumDefs + 1;
4256  bool IsTFE = NumDefs == 2;
4257  // We are only processing the operands of d16 image operations on subtargets
4258  // that use the unpacked register layout, or need to repack the TFE result.
4259 
4260  // TODO: Do we need to guard against already legalized intrinsics?
4261  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4262  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4263 
4264  MachineRegisterInfo *MRI = B.getMRI();
4265  const LLT S32 = LLT::scalar(32);
4266  const LLT S16 = LLT::scalar(16);
4267  const LLT V2S16 = LLT::fixed_vector(2, 16);
4268 
4269  unsigned DMask = 0;
4270 
4271  // Check for 16 bit addresses and pack if true.
4272  LLT GradTy =
4273  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4274  LLT AddrTy =
4275  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4276  const bool IsG16 = GradTy == S16;
4277  const bool IsA16 = AddrTy == S16;
4278 
4279  int DMaskLanes = 0;
4280  if (!BaseOpcode->Atomic) {
4281  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4282  if (BaseOpcode->Gather4) {
4283  DMaskLanes = 4;
4284  } else if (DMask != 0) {
4285  DMaskLanes = countPopulation(DMask);
4286  } else if (!IsTFE && !BaseOpcode->Store) {
4287  // If dmask is 0, this is a no-op load. This can be eliminated.
4288  B.buildUndef(MI.getOperand(0));
4289  MI.eraseFromParent();
4290  return true;
4291  }
4292  }
4293 
4294  Observer.changingInstr(MI);
4295  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4296 
4297  unsigned NewOpcode = NumDefs == 0 ?
4298  AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4299 
4300  // Track that we legalized this
4301  MI.setDesc(B.getTII().get(NewOpcode));
4302 
4303  // Expecting to get an error flag since TFC is on and dmask is 0. Force
4304  // dmask to be at least 1, otherwise the instruction will fail.
4305  if (IsTFE && DMask == 0) {
4306  DMask = 0x1;
4307  DMaskLanes = 1;
4308  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4309  }
4310 
4311  if (BaseOpcode->Atomic) {
4312  Register VData0 = MI.getOperand(2).getReg();
4313  LLT Ty = MRI->getType(VData0);
4314 
4315  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4316  if (Ty.isVector())
4317  return false;
4318 
4319  if (BaseOpcode->AtomicX2) {
4320  Register VData1 = MI.getOperand(3).getReg();
4321  // The two values are packed in one register.
4322  LLT PackedTy = LLT::fixed_vector(2, Ty);
4323  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4324  MI.getOperand(2).setReg(Concat.getReg(0));
4325  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4326  }
4327  }
4328 
4329  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4330 
4331  // Optimize _L to _LZ when _L is zero
4332  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4333  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4334  const ConstantFP *ConstantLod;
4335 
4336  if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4337  m_GFCst(ConstantLod))) {
4338  if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4339  // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4340  const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4341  AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4342  Intr->Dim);
4343 
4344  // The starting indexes should remain in the same place.
4345  --CorrectedNumVAddrs;
4346 
4347  MI.getOperand(MI.getNumExplicitDefs())
4348  .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4349  MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4350  Intr = NewImageDimIntr;
4351  }
4352  }
4353  }
4354 
4355  // Optimize _mip away when 'lod' is zero.
4356  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4357  int64_t ConstantLod;
4358  if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4359  m_ICst(ConstantLod))) {
4360  if (ConstantLod == 0) {
4361  // TODO: Change the intrinsic opcode and remove the operand instead of
4362  // replacing it with 0, as the _L to _LZ handling is done above.
4363  MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4364  --CorrectedNumVAddrs;
4365  }
4366  }
4367  }
4368 
4369  // Rewrite the addressing register layout before doing anything else.
4370  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4371  // 16 bit gradients are supported, but are tied to the A16 control
4372  // so both gradients and addresses must be 16 bit
4373  return false;
4374  }
4375 
4376  if (IsA16 && !ST.hasA16()) {
4377  // A16 not supported
4378  return false;
4379  }
4380 
4381  if (IsA16 || IsG16) {
4382  if (Intr->NumVAddrs > 1) {
4383  SmallVector<Register, 4> PackedRegs;
4384 
4385  packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4386  IsG16);
4387 
4388  // See also below in the non-a16 branch
4389  const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4390  PackedRegs.size() <= ST.getNSAMaxSize();
4391 
4392  if (!UseNSA && PackedRegs.size() > 1) {
4393  LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4394  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4395  PackedRegs[0] = Concat.getReg(0);
4396  PackedRegs.resize(1);
4397  }
4398 
4399  const unsigned NumPacked = PackedRegs.size();
4400  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4401  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4402  if (!SrcOp.isReg()) {
4403  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4404  continue;
4405  }
4406 
4407  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4408 
4409  if (I - Intr->VAddrStart < NumPacked)
4410  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4411  else
4412  SrcOp.setReg(AMDGPU::NoRegister);
4413  }
4414  }
4415  } else {
4416  // If the register allocator cannot place the address registers contiguously
4417  // without introducing moves, then using the non-sequential address encoding
4418  // is always preferable, since it saves VALU instructions and is usually a
4419  // wash in terms of code size or even better.
4420  //
4421  // However, we currently have no way of hinting to the register allocator
4422  // that MIMG addresses should be placed contiguously when it is possible to
4423  // do so, so force non-NSA for the common 2-address case as a heuristic.
4424  //
4425  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4426  // allocation when possible.
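  // For example (illustrative): a sample with only two address dwords is
  // packed into a single <2 x s32> register even on NSA-capable targets,
  // while three or more (up to getNSAMaxSize()) keep separate registers so
  // the NSA encoding can be used, per the heuristic described above.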
4427  const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4428  CorrectedNumVAddrs <= ST.getNSAMaxSize();
4429 
4430  if (!UseNSA && Intr->NumVAddrs > 1)
4431  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4432  Intr->NumVAddrs);
4433  }
4434 
4435  int Flags = 0;
4436  if (IsA16)
4437  Flags |= 1;
4438  if (IsG16)
4439  Flags |= 2;
4440  MI.addOperand(MachineOperand::CreateImm(Flags));
4441 
4442  if (BaseOpcode->Store) { // No TFE for stores?
4443  // TODO: Handle dmask trim
4444  Register VData = MI.getOperand(1).getReg();
4445  LLT Ty = MRI->getType(VData);
4446  if (!Ty.isVector() || Ty.getElementType() != S16)
4447  return true;
4448 
4449  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4450  if (RepackedReg != VData) {
4451  MI.getOperand(1).setReg(RepackedReg);
4452  }
4453 
4454  return true;
4455  }
4456 
4457  Register DstReg = MI.getOperand(0).getReg();
4458  LLT Ty = MRI->getType(DstReg);
4459  const LLT EltTy = Ty.getScalarType();
4460  const bool IsD16 = Ty.getScalarType() == S16;
4461  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4462 
4463  // Confirm that the return type is large enough for the dmask specified
4464  if (NumElts < DMaskLanes)
4465  return false;
4466 
4467  if (NumElts > 4 || DMaskLanes > 4)
4468  return false;
4469 
4470  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4471  const LLT AdjustedTy =
4472  Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4473 
4474  // The raw dword aligned data component of the load. The only legal cases
4475  // where this matters should be when using the packed D16 format, for
4476  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4477  LLT RoundedTy;
4478 
4479  // S32 vector to cover all data, plus TFE result element.
4480  LLT TFETy;
4481 
4482  // Register type to use for each loaded component. Will be S32 or V2S16.
4483  LLT RegTy;
4484 
4485  if (IsD16 && ST.hasUnpackedD16VMem()) {
4486  RoundedTy =
4487  LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4488  TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4489  RegTy = S32;
4490  } else {
4491  unsigned EltSize = EltTy.getSizeInBits();
4492  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4493  unsigned RoundedSize = 32 * RoundedElts;
4494  RoundedTy = LLT::scalarOrVector(
4495  ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4496  TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4497  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4498  }
4499 
4500  // The return type does not need adjustment.
4501  // TODO: Should we change s16 case to s32 or <2 x s16>?
4502  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4503  return true;
4504 
4505  Register Dst1Reg;
4506 
4507  // Insert after the instruction.
4508  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4509 
4510  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4511  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4512  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4513  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4514 
4515  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4516 
4517  MI.getOperand(0).setReg(NewResultReg);
4518 
4519  // In the IR, TFE is supposed to be used with a 2 element struct return
4520  // type. The instruction really returns these two values in one contiguous
4521  // register, with one additional dword beyond the loaded data. Rewrite the
4522  // return type to use a single register result.
4523 
4524  if (IsTFE) {
4525  Dst1Reg = MI.getOperand(1).getReg();
4526  if (MRI->getType(Dst1Reg) != S32)
4527  return false;
4528 
4529  // TODO: Make sure the TFE operand bit is set.
4530  MI.RemoveOperand(1);
4531 
4532  // Handle the easy case that requires no repack instructions.
4533  if (Ty == S32) {
4534  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4535  return true;
4536  }
4537  }
4538 
4539  // Now figure out how to copy the new result register back into the old
4540  // result.
4541  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4542 
4543  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4544 
4545  if (ResultNumRegs == 1) {
4546  assert(!IsTFE);
4547  ResultRegs[0] = NewResultReg;
4548  } else {
4549  // We have to repack into a new vector of some kind.
4550  for (int I = 0; I != NumDataRegs; ++I)
4551  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4552  B.buildUnmerge(ResultRegs, NewResultReg);
4553 
4554  // Drop the final TFE element to get the data part. The TFE result is
4555  // directly written to the right place already.
4556  if (IsTFE)
4557  ResultRegs.resize(NumDataRegs);
4558  }
4559 
4560  // For an s16 scalar result, we form an s32 result with a truncate regardless
4561  // of packed vs. unpacked.
4562  if (IsD16 && !Ty.isVector()) {
4563  B.buildTrunc(DstReg, ResultRegs[0]);
4564  return true;
4565  }
4566 
4567  // Avoid a build/concat_vector of 1 entry.
4568  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4569  B.buildBitcast(DstReg, ResultRegs[0]);
4570  return true;
4571  }
4572 
4573  assert(Ty.isVector());
4574 
4575  if (IsD16) {
4576  // For packed D16 results with TFE enabled, all the data components are
4577  // S32. Cast back to the expected type.
4578  //
4579  // TODO: We don't really need to load s32 elements. We would only need one
4580  // cast for the TFE result if a multiple of v2s16 was used.
4581  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4582  for (Register &Reg : ResultRegs)
4583  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4584  } else if (ST.hasUnpackedD16VMem()) {
4585  for (Register &Reg : ResultRegs)
4586  Reg = B.buildTrunc(S16, Reg).getReg(0);
4587  }
4588  }
4589 
4590  auto padWithUndef = [&](LLT Ty, int NumElts) {
4591  if (NumElts == 0)
4592  return;
4593  Register Undef = B.buildUndef(Ty).getReg(0);
4594  for (int I = 0; I != NumElts; ++I)
4595  ResultRegs.push_back(Undef);
4596  };
4597 
4598  // Pad out any elements eliminated due to the dmask.
4599  LLT ResTy = MRI->getType(ResultRegs[0]);
4600  if (!ResTy.isVector()) {
4601  padWithUndef(ResTy, NumElts - ResultRegs.size());
4602  B.buildBuildVector(DstReg, ResultRegs);
4603  return true;
4604  }
4605 
4606  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4607  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4608 
4609  // Deal with the one annoying legal case.
4610  const LLT V3S16 = LLT::fixed_vector(3, 16);
4611  if (Ty == V3S16) {
4612  padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4613  auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4614  B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4615  return true;
4616  }
4617 
4618  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4619  B.buildConcatVectors(DstReg, ResultRegs);
4620  return true;
4621 }
4622 
4623 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4624  LegalizerHelper &Helper, MachineInstr &MI) const {
4625  MachineIRBuilder &B = Helper.MIRBuilder;
4626  GISelChangeObserver &Observer = Helper.Observer;
4627 
4628  Register Dst = MI.getOperand(0).getReg();
4629  LLT Ty = B.getMRI()->getType(Dst);
4630  unsigned Size = Ty.getSizeInBits();
4631  MachineFunction &MF = B.getMF();
4632 
4633  Observer.changingInstr(MI);
4634 
4635  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4636  Ty = getBitcastRegisterType(Ty);
4637  Helper.bitcastDst(MI, Ty, 0);
4638  Dst = MI.getOperand(0).getReg();
4639  B.setInsertPt(B.getMBB(), MI);
4640  }
4641 
4642  // FIXME: We don't really need this intermediate instruction. The intrinsic
4643  // should be fixed to have a memory operand. Since it's readnone, we're not
4644  // allowed to add one.
4645  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4646  MI.RemoveOperand(1); // Remove intrinsic ID
4647 
4648  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4649  // TODO: Should this use datalayout alignment?
4650  const unsigned MemSize = (Size + 7) / 8;
4651  const Align MemAlign(4);
4652  MachineMemOperand *MMO = MF.getMachineMemOperand(
4653  MachinePointerInfo(),
4654  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4655  MachineMemOperand::MOInvariant,
4656  MemSize, MemAlign);
4657  MI.addMemOperand(MF, MMO);
4658 
4659  // There are no 96-bit result scalar loads, but widening to 128-bit should
4660  // always be legal. We may need to restore this to a 96-bit result if it turns
4661  // out this needs to be converted to a vector load during RegBankSelect.
4662  if (!isPowerOf2_32(Size)) {
4663  if (Ty.isVector())
4664  Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4665  else
4666  Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4667  }
4668 
4669  Observer.changedInstr(MI);
4670  return true;
4671 }
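// Note (illustrative, not from the upstream source): a non-power-of-two result
// is widened here so the scalar load stays selectable, e.g. an s96 result
// becomes s128 and a <3 x s32> result becomes <4 x s32>; per the comment
// above, it may need to be restored to 96 bits if RegBankSelect later turns
// this into a vector load.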
4672 
4673 // TODO: Move to selection
4674 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4675  MachineRegisterInfo &MRI,
4676  MachineIRBuilder &B) const {
4677  if (!ST.isTrapHandlerEnabled() ||
4678  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4679  return legalizeTrapEndpgm(MI, MRI, B);
4680 
4681  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4682  switch (*HsaAbiVer) {
4683  case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4684  case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4685  return legalizeTrapHsaQueuePtr(MI, MRI, B);
4686  case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4687  return ST.supportsGetDoorbellID() ?
4688  legalizeTrapHsa(MI, MRI, B) :
4689  legalizeTrapHsaQueuePtr(MI, MRI, B);
4690  }
4691  }
4692 
4693  llvm_unreachable("Unknown trap handler");
4694 }
4695 
4696 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4697  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4698  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4699  MI.eraseFromParent();
4700  return true;
4701 }
4702 
4703 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4704  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4705  // Pass queue pointer to trap handler as input, and insert trap instruction
4706  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4707  Register LiveIn =
4708  MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4709  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4710  return false;
4711 
4712  Register SGPR01(AMDGPU::SGPR0_SGPR1);
4713  B.buildCopy(SGPR01, LiveIn);
4714  B.buildInstr(AMDGPU::S_TRAP)
4715  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4716  .addReg(SGPR01, RegState::Implicit);
4717 
4718  MI.eraseFromParent();
4719  return true;
4720 }
4721 
4722 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4723  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4724  B.buildInstr(AMDGPU::S_TRAP)
4725  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4726  MI.eraseFromParent();
4727  return true;
4728 }
4729 
4730 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4731  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4732  // If this is a non-HSA path or the trap handler is disabled, report a
4733  // warning accordingly.
4734  if (!ST.isTrapHandlerEnabled() ||
4735  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4736  DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4737  "debugtrap handler not supported",
4738  MI.getDebugLoc(), DS_Warning);
4739  LLVMContext &Ctx = B.getMF().getFunction().getContext();
4740  Ctx.diagnose(NoTrap);
4741  } else {
4742  // Insert debug-trap instruction
4743  B.buildInstr(AMDGPU::S_TRAP)
4744  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4745  }
4746 
4747  MI.eraseFromParent();
4748  return true;
4749 }
4750 
4751 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4752  MachineIRBuilder &B) const {
4753  MachineRegisterInfo &MRI = *B.getMRI();
4754  const LLT S16 = LLT::scalar(16);
4755  const LLT S32 = LLT::scalar(32);
4756 
4757  Register DstReg = MI.getOperand(0).getReg();
4758  Register NodePtr = MI.getOperand(2).getReg();
4759  Register RayExtent = MI.getOperand(3).getReg();
4760  Register RayOrigin = MI.getOperand(4).getReg();
4761  Register RayDir = MI.getOperand(5).getReg();
4762  Register RayInvDir = MI.getOperand(6).getReg();
4763  Register TDescr = MI.getOperand(7).getReg();
4764 
4765  if (!ST.hasGFX10_AEncoding()) {
4766  DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4767  "intrinsic not supported on subtarget",
4768  MI.getDebugLoc());
4769  B.getMF().getFunction().getContext().diagnose(BadIntrin);
4770  return false;
4771  }
4772 
4773  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4774  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4775  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4776  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4777  : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4778  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4779 
4779 
4780  SmallVector<Register, 12> Ops;
4781  if (Is64) {
4782  auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4783  Ops.push_back(Unmerge.getReg(0));
4784  Ops.push_back(Unmerge.getReg(1));
4785  } else {
4786  Ops.push_back(NodePtr);
4787  }
4788  Ops.push_back(RayExtent);
4789 
4790  auto packLanes = [&Ops, &S32, &B] (Register Src) {
4791  auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4792  Ops.push_back(Unmerge.getReg(0));
4793  Ops.push_back(Unmerge.getReg(1));
4794  Ops.push_back(Unmerge.getReg(2));
4795  };
4796 
4797  packLanes(RayOrigin);
4798  if (IsA16) {
4799  auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4800  auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4801  Register R1 = MRI.createGenericVirtualRegister(S32);
4802  Register R2 = MRI.createGenericVirtualRegister(S32);
4803  Register R3 = MRI.createGenericVirtualRegister(S32);
4804  B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4805  B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4806  B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4807  Ops.push_back(R1);
4808  Ops.push_back(R2);
4809  Ops.push_back(R3);
4810  } else {
4811  packLanes(RayDir);
4812  packLanes(RayInvDir);
4813  }
4814 
4815  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4816  .addDef(DstReg)
4817  .addImm(Opcode);
4818 
4819  for (Register R : Ops) {
4820  MIB.addUse(R);
4821  }
4822 
4823  MIB.addUse(TDescr)
4824  .addImm(IsA16 ? 1 : 0)
4825  .cloneMemRefs(MI);
4826 
4827  MI.eraseFromParent();
4828  return true;
4829 }
4830 
4831 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4832  MachineInstr &MI) const {
4833  MachineIRBuilder &B = Helper.MIRBuilder;
4834  MachineRegisterInfo &MRI = *B.getMRI();
4835 
4836  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4837  auto IntrID = MI.getIntrinsicID();
4838  switch (IntrID) {
4839  case Intrinsic::amdgcn_if:
4840  case Intrinsic::amdgcn_else: {
4841  MachineInstr *Br = nullptr;
4842  MachineBasicBlock *UncondBrTarget = nullptr;
4843  bool Negated = false;
4844  if (MachineInstr *BrCond =
4845  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4846  const SIRegisterInfo *TRI
4847  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4848 
4849  Register Def = MI.getOperand(1).getReg();
4850  Register Use = MI.getOperand(3).getReg();
4851 
4852  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4853 
4854  if (Negated)
4855  std::swap(CondBrTarget, UncondBrTarget);
4856 
4857  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4858  if (IntrID == Intrinsic::amdgcn_if) {
4859  B.buildInstr(AMDGPU::SI_IF)
4860  .addDef(Def)
4861  .addUse(Use)
4862  .addMBB(UncondBrTarget);
4863  } else {
4864  B.buildInstr(AMDGPU::SI_ELSE)
4865  .addDef(Def)
4866  .addUse(Use)
4867  .addMBB(UncondBrTarget);
4868  }
4869 
4870  if (Br) {
4871  Br->getOperand(0).setMBB(CondBrTarget);
4872  } else {
4873  // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4874  // since we're swapping branch targets it needs to be reinserted.
4875  // FIXME: IRTranslator should probably not do this
4876  B.buildBr(*CondBrTarget);
4877  }
4878 
4879  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4880  MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4881  MI.eraseFromParent();
4882  BrCond->eraseFromParent();
4883  return true;
4884  }
4885 
4886  return false;
4887  }
4888  case Intrinsic::amdgcn_loop: {
4889  MachineInstr *Br = nullptr;
4890  MachineBasicBlock *UncondBrTarget = nullptr;
4891  bool Negated = false;
4892  if (MachineInstr *BrCond =
4893  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4894  const SIRegisterInfo *TRI
4895  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4896 
4897  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4898  Register Reg = MI.getOperand(2).getReg();
4899 
4900  if (Negated)
4901  std::swap(CondBrTarget, UncondBrTarget);
4902 
4903  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4904  B.buildInstr(AMDGPU::SI_LOOP)
4905  .addUse(Reg)
4906  .addMBB(UncondBrTarget);
4907 
4908  if (Br)
4909  Br->getOperand(0).setMBB(CondBrTarget);
4910  else
4911  B.buildBr(*CondBrTarget);
4912 
4913  MI.eraseFromParent();
4914  BrCond->eraseFromParent();
4915  MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4916  return true;
4917  }
4918 
4919  return false;
4920  }
4921  case Intrinsic::amdgcn_kernarg_segment_ptr:
4922  if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4923  // This only makes sense to call in a kernel, so just lower to null.
4924  B.buildConstant(MI.getOperand(0).getReg(), 0);
4925  MI.eraseFromParent();
4926  return true;
4927  }
4928 
4929  return legalizePreloadedArgIntrin(MI, MRI, B,
4930  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4931  case Intrinsic::amdgcn_implicitarg_ptr:
4932  return legalizeImplicitArgPtr(MI, MRI, B);
4933  case Intrinsic::amdgcn_workitem_id_x:
4936  case Intrinsic::amdgcn_workitem_id_y:
4939  case Intrinsic::amdgcn_workitem_id_z:
4942  case Intrinsic::amdgcn_workgroup_id_x:
4945  case Intrinsic::amdgcn_workgroup_id_y:
4948  case Intrinsic::amdgcn_workgroup_id_z:
4951  case Intrinsic::amdgcn_dispatch_ptr:
4954  case Intrinsic::amdgcn_queue_ptr:
4957  case Intrinsic::amdgcn_implicit_buffer_ptr:
4960  case Intrinsic::amdgcn_dispatch_id:
4963  case Intrinsic::amdgcn_fdiv_fast:
4964  return legalizeFDIVFastIntrin(MI, MRI, B);
4965  case Intrinsic::amdgcn_is_shared:
4966  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4967  case Intrinsic::amdgcn_is_private:
4968  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4969  case Intrinsic::amdgcn_wavefrontsize: {
4970  B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4971  MI.eraseFromParent();
4972  return true;
4973  }
4974  case Intrinsic::amdgcn_s_buffer_load:
4975  return legalizeSBufferLoad(Helper, MI);
4976  case Intrinsic::amdgcn_raw_buffer_store:
4977  case Intrinsic::amdgcn_struct_buffer_store:
4978  return legalizeBufferStore(MI, MRI, B, false, false);
4979  case Intrinsic::amdgcn_raw_buffer_store_format:
4980  case Intrinsic::amdgcn_struct_buffer_store_format:
4981  return legalizeBufferStore(MI, MRI, B, false, true);
4982  case Intrinsic::amdgcn_raw_tbuffer_store:
4983  case Intrinsic::amdgcn_struct_tbuffer_store:
4984  return legalizeBufferStore(MI, MRI, B, true, true);
4985  case Intrinsic::amdgcn_raw_buffer_load:
4986  case Intrinsic::amdgcn_struct_buffer_load:
4987  return legalizeBufferLoad(MI, MRI, B, false, false);
4988  case Intrinsic::amdgcn_raw_buffer_load_format:
4989  case Intrinsic::amdgcn_struct_buffer_load_format:
4990  return legalizeBufferLoad(MI, MRI, B, true, false);
4991  case Intrinsic::amdgcn_raw_tbuffer_load:
4992  case Intrinsic::amdgcn_struct_tbuffer_load:
4993  return legalizeBufferLoad(MI, MRI, B, true, true);
4994  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4995  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4996  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4997  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4998  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4999  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5000  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5001  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5002  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5003  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5004  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5005  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5006  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5007  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5008  case Intrinsic::amdgcn_raw_buffer_atomic_and:
5009  case Intrinsic::amdgcn_struct_buffer_atomic_and:
5010  case Intrinsic::amdgcn_raw_buffer_atomic_or:
5011  case Intrinsic::amdgcn_struct_buffer_atomic_or:
5012  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5013  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5014  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5015  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5016  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5017  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5018  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5019  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5020  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5021  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5022  case Intrinsic::amdgcn_buffer_atomic_fadd:
5023  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5024  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5025  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5026  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5027  return legalizeBufferAtomic(MI, B, IntrID);
5028  case Intrinsic::amdgcn_atomic_inc:
5029  return legalizeAtomicIncDec(MI, B, true);
5030  case Intrinsic::amdgcn_atomic_dec:
5031  return legalizeAtomicIncDec(MI, B, false);
5032  case Intrinsic::trap:
5033  return legalizeTrapIntrinsic(MI, MRI, B);
5034  case Intrinsic::debugtrap:
5035  return legalizeDebugTrapIntrinsic(MI, MRI, B);
5036  case Intrinsic::amdgcn_rsq_clamp:
5037  return legalizeRsqClampIntrinsic(MI, MRI, B);
5038  case Intrinsic::amdgcn_ds_fadd:
5039  case Intrinsic::amdgcn_ds_fmin:
5040  case Intrinsic::amdgcn_ds_fmax:
5041  return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5042  case Intrinsic::amdgcn_image_bvh_intersect_ray:
5043  return legalizeBVHIntrinsic(MI, B);
5044  default: {
5045  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5046  AMDGPU::getImageDimIntrinsicInfo(IntrID))
5047  return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5048  return true;
5049  }
5050  }
5051 
5052  return true;
5053 }
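
The switch above routes each AMDGPU intrinsic family to a dedicated legalization helper and falls back to the image-dimension intrinsic table in the default case. As a minimal sketch of the constant-folding pattern used by the amdgcn_wavefrontsize case, the hypothetical helper below (not part of this file; it assumes the file's existing includes and the standard MachineIRBuilder API) folds a subtarget query into a G_CONSTANT and erases the original instruction:

// Hypothetical helper, shown only to illustrate the pattern used by the
// amdgcn_wavefrontsize case above; the name and Value parameter are
// illustrative, not part of AMDGPULegalizerInfo.cpp.
static bool legalizeConstantFoldIntrin(MachineInstr &MI, MachineIRBuilder &B,
                                       uint64_t Value) {
  // Build at the intrinsic's position so the debug location is preserved.
  B.setInstrAndDebugLoc(MI);
  // Define the intrinsic's destination register with a G_CONSTANT of Value.
  B.buildConstant(MI.getOperand(0), Value);
  // The original G_INTRINSIC is now dead; remove it and report success.
  MI.eraseFromParent();
  return true;
}

A case in the switch could then simply return legalizeConstantFoldIntrin(MI, B, ST.getWavefrontSize()), mirroring how the existing helpers are invoked.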