AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40  "amdgpu-global-isel-new-legality",
41  cl::desc("Use GlobalISel desired legality, rather than try to use "
42  "rules compatible with selection patterns"),
43  cl::init(false),
44  cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(const LLT Ty) {
50  unsigned NElts = Ty.getNumElements();
51  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(const LLT Ty) {
57  unsigned Bits = Ty.getSizeInBits();
58  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59  return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66  return [=](const LegalityQuery &Query) {
67  const LLT Ty = Query.Types[TypeIdx];
68  if (!Ty.isVector())
69  return false;
70 
71  const LLT EltTy = Ty.getElementType();
72  const unsigned EltSize = EltTy.getSizeInBits();
73  return Ty.getNumElements() % 2 != 0 &&
74  EltSize > 1 && EltSize < 32 &&
75  Ty.getSizeInBits() % 32 != 0;
76  };
77 }
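// For example, <3 x s16> (odd element count, 16-bit elements, 48 bits total)
// matches and is widened to <4 x s16> by oneMoreElement(0); <3 x s32> does not
// match because its elements are already 32 bits wide, and s1 vectors are
// excluded so they get scalarized instead.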
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80  return [=](const LegalityQuery &Query) {
81  const LLT Ty = Query.Types[TypeIdx];
82  return Ty.getSizeInBits() % 32 == 0;
83  };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87  return [=](const LegalityQuery &Query) {
88  const LLT Ty = Query.Types[TypeIdx];
89  const LLT EltTy = Ty.getScalarType();
90  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91  };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95  return [=](const LegalityQuery &Query) {
96  const LLT Ty = Query.Types[TypeIdx];
97  const LLT EltTy = Ty.getElementType();
98  return std::make_pair(TypeIdx,
99  LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100  };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104  return [=](const LegalityQuery &Query) {
105  const LLT Ty = Query.Types[TypeIdx];
106  const LLT EltTy = Ty.getElementType();
107  unsigned Size = Ty.getSizeInBits();
108  unsigned Pieces = (Size + 63) / 64;
109  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110  return std::make_pair(
111  TypeIdx,
112  LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113  };
114 }
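// For example, <4 x s32> (128 bits) splits into two pieces, so the mutation
// returns <2 x s32>; <3 x s32> (96 bits) likewise becomes <2 x s32>, with the
// leftover element handled by the subsequent fewerElements legalization.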
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119  return [=](const LegalityQuery &Query) {
120  const LLT Ty = Query.Types[TypeIdx];
121 
122  const LLT EltTy = Ty.getElementType();
123  const int Size = Ty.getSizeInBits();
124  const int EltSize = EltTy.getSizeInBits();
125  const int NextMul32 = (Size + 31) / 32;
126 
127  assert(EltSize < 32);
128 
129  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130  return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131  };
132 }
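// For example, <3 x s8> (24 bits) rounds up to one 32-bit word and becomes
// <4 x s8>, while <5 x s16> (80 bits) rounds up to 96 bits and becomes
// <6 x s16>.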
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135  const unsigned Size = Ty.getSizeInBits();
136 
137  LLT CoercedTy;
138  if (Size <= 32) {
139  // <2 x s8> -> s16
140  // <4 x s8> -> s32
141  return LLT::scalar(Size);
142  }
143 
144  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
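// For example, <2 x s8> becomes s16, <4 x s8> becomes s32, and larger types
// such as <6 x s16> (96 bits) or <8 x s16> (128 bits) become <3 x s32> and
// <4 x s32> respectively.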
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148  return [=](const LegalityQuery &Query) {
149  const LLT Ty = Query.Types[TypeIdx];
150  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151  };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155  return [=](const LegalityQuery &Query) {
156  const LLT Ty = Query.Types[TypeIdx];
157  unsigned Size = Ty.getSizeInBits();
158  assert(Size % 32 == 0);
159  return std::make_pair(
160  TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161  };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165  return [=](const LegalityQuery &Query) {
166  const LLT QueryTy = Query.Types[TypeIdx];
167  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168  };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172  return [=](const LegalityQuery &Query) {
173  const LLT QueryTy = Query.Types[TypeIdx];
174  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175  };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179  return [=](const LegalityQuery &Query) {
180  const LLT QueryTy = Query.Types[TypeIdx];
181  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182  };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186  return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190  const int EltSize = EltTy.getSizeInBits();
191  return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195  const int EltSize = Ty.getElementType().getSizeInBits();
196  return EltSize == 32 || EltSize == 64 ||
197  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198  EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202  if (!isRegisterSize(Ty.getSizeInBits()))
203  return false;
204 
205  if (Ty.isVector())
206  return isRegisterVectorType(Ty);
207 
208  return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214  return [=](const LegalityQuery &Query) {
215  return isRegisterType(Query.Types[TypeIdx]);
216  };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220  return [=](const LegalityQuery &Query) {
221  const LLT QueryTy = Query.Types[TypeIdx];
222  if (!QueryTy.isVector())
223  return false;
224  const LLT EltTy = QueryTy.getElementType();
225  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226  };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232  return [=](const LegalityQuery &Query) {
233  const LLT Ty = Query.Types[TypeIdx];
234  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236  };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243  bool IsLoad) {
244  switch (AS) {
245  case AMDGPUAS::PRIVATE_ADDRESS:
246  // FIXME: Private element size.
247  return ST.enableFlatScratch() ? 128 : 32;
248  case AMDGPUAS::LOCAL_ADDRESS:
249  return ST.useDS128() ? 128 : 64;
250  case AMDGPUAS::GLOBAL_ADDRESS:
251  case AMDGPUAS::CONSTANT_ADDRESS:
252  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253  // Treat constant and global as identical. SMRD loads are sometimes usable for
254  // global loads (ideally constant address space should be eliminated)
255  // depending on the context. Legality cannot be context dependent, but
256  // RegBankSelect can split the load as necessary depending on the pointer
257  // register bank/uniformity and if the memory is invariant or not written in a
258  // kernel.
259  return IsLoad ? 512 : 128;
260  default:
261  // Flat addresses may contextually need to be split to 32-bit parts if they
262  // may alias scratch depending on the subtarget.
263  return 128;
264  }
265 }
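// For example, with these limits a 256-bit global load fits under the 512-bit
// load cap and may be kept whole, while a 256-bit global store exceeds the
// 128-bit store cap and is split up by the G_LOAD / G_STORE rules further
// down in the constructor.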
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268  const LegalityQuery &Query) {
269  const LLT Ty = Query.Types[0];
270 
271  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274  unsigned RegSize = Ty.getSizeInBits();
275  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277  unsigned AS = Query.Types[1].getAddressSpace();
278 
279  // All of these need to be custom lowered to cast the pointer operand.
280  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281  return false;
282 
283  // Do not handle extending vector loads.
284  if (Ty.isVector() && MemSize != RegSize)
285  return false;
286 
287  // TODO: We should be able to widen loads if the alignment is high enough, but
288  // we also need to modify the memory access size.
289 #if 0
290  // Accept widening loads based on alignment.
291  if (IsLoad && MemSize < Size)
292  MemSize = std::max(MemSize, Align);
293 #endif
294 
295  // Only 1-byte and 2-byte to 32-bit extloads are valid.
296  if (MemSize != RegSize && RegSize != 32)
297  return false;
298 
299  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300  return false;
301 
302  switch (MemSize) {
303  case 8:
304  case 16:
305  case 32:
306  case 64:
307  case 128:
308  break;
309  case 96:
310  if (!ST.hasDwordx3LoadStores())
311  return false;
312  break;
313  case 256:
314  case 512:
315  // These may contextually need to be broken down.
316  break;
317  default:
318  return false;
319  }
320 
321  assert(RegSize >= MemSize);
322 
323  if (AlignBits < MemSize) {
324  const SITargetLowering *TLI = ST.getTargetLowering();
325  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326  Align(AlignBits / 8)))
327  return false;
328  }
329 
330  return true;
331 }
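// For example, an extending load from 16-bit memory into an s64 register is
// rejected here (only 8-bit and 16-bit extloads into 32-bit registers
// qualify); such cases are first narrowed to s32 by the
// narrowScalarIf(isWideScalarExtLoadTruncStore(0), ...) rule in the load/store
// definitions below.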
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334 // workaround this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will workaround this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338  if (EnableNewLegality)
339  return false;
340 
341  const unsigned Size = Ty.getSizeInBits();
342  if (Size <= 64)
343  return false;
344  if (!Ty.isVector())
345  return true;
346 
347  LLT EltTy = Ty.getElementType();
348  if (EltTy.isPointer())
349  return true;
350 
351  unsigned EltSize = EltTy.getSizeInBits();
352  return EltSize != 32 && EltSize != 64;
353 }
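// For example, with the flag at its default value a <6 x s16> or s96 value is
// not reported legal here even though it fits in registers, while <2 x s64>
// (64-bit elements) is unaffected by the workaround.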
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356  const LLT Ty = Query.Types[0];
357  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358  !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364  const LLT MemTy) {
365  const unsigned MemSizeInBits = MemTy.getSizeInBits();
366  const unsigned Size = Ty.getSizeInBits();
367  if (Size != MemSizeInBits)
368  return Size <= 32 && Ty.isVector();
369 
370  if (Size == 96 && ST.hasDwordx3LoadStores())
371  return true;
372 
373  // Don't try to handle bitcasting vector ext loads for now.
374  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375  (Size <= 32 || isRegisterSize(Size)) &&
376  !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note this is the case where the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383  unsigned AlignInBits, unsigned AddrSpace,
384  unsigned Opcode) {
385  unsigned SizeInBits = MemoryTy.getSizeInBits();
386  // We don't want to widen cases that are naturally legal.
387  if (isPowerOf2_32(SizeInBits))
388  return false;
389 
390  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391  // end up widening these for a scalar load during RegBankSelect, since there
392  // aren't 96-bit scalar loads.
393  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394  return false;
395 
396  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397  return false;
398 
399  // A load is known dereferenceable up to the alignment, so it's legal to widen
400  // to it.
401  //
402  // TODO: Could check dereferenceable for less aligned cases.
403  unsigned RoundedSize = NextPowerOf2(SizeInBits);
404  if (AlignInBits < RoundedSize)
405  return false;
406 
407  // Do not widen if it would introduce a slow unaligned load.
408  const SITargetLowering *TLI = ST.getTargetLowering();
409  bool Fast = false;
410  return TLI->allowsMisalignedMemoryAccessesImpl(
411  RoundedSize, AddrSpace, Align(AlignInBits / 8),
412  MachineMemOperand::MOLoad, &Fast) &&
413  Fast;
414 }
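// For example, a dereferenceable 96-bit load that is 128-bit aligned is
// widened to a 128-bit load on subtargets without dwordx3 load/stores,
// provided the target reports the rounded-up access as fast.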
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417  unsigned Opcode) {
418  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419  return false;
420 
421  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422  Query.MMODescrs[0].AlignInBits,
423  Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427  const GCNTargetMachine &TM)
428  : ST(ST_) {
429  using namespace TargetOpcode;
430 
431  auto GetAddrSpacePtr = [&TM](unsigned AS) {
432  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433  };
434 
435  const LLT S1 = LLT::scalar(1);
436  const LLT S8 = LLT::scalar(8);
437  const LLT S16 = LLT::scalar(16);
438  const LLT S32 = LLT::scalar(32);
439  const LLT S64 = LLT::scalar(64);
440  const LLT S128 = LLT::scalar(128);
441  const LLT S256 = LLT::scalar(256);
442  const LLT S512 = LLT::scalar(512);
443  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445  const LLT V2S8 = LLT::fixed_vector(2, 8);
446  const LLT V2S16 = LLT::fixed_vector(2, 16);
447  const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449  const LLT V2S32 = LLT::fixed_vector(2, 32);
450  const LLT V3S32 = LLT::fixed_vector(3, 32);
451  const LLT V4S32 = LLT::fixed_vector(4, 32);
452  const LLT V5S32 = LLT::fixed_vector(5, 32);
453  const LLT V6S32 = LLT::fixed_vector(6, 32);
454  const LLT V7S32 = LLT::fixed_vector(7, 32);
455  const LLT V8S32 = LLT::fixed_vector(8, 32);
456  const LLT V9S32 = LLT::fixed_vector(9, 32);
457  const LLT V10S32 = LLT::fixed_vector(10, 32);
458  const LLT V11S32 = LLT::fixed_vector(11, 32);
459  const LLT V12S32 = LLT::fixed_vector(12, 32);
460  const LLT V13S32 = LLT::fixed_vector(13, 32);
461  const LLT V14S32 = LLT::fixed_vector(14, 32);
462  const LLT V15S32 = LLT::fixed_vector(15, 32);
463  const LLT V16S32 = LLT::fixed_vector(16, 32);
464  const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466  const LLT V2S64 = LLT::fixed_vector(2, 64);
467  const LLT V3S64 = LLT::fixed_vector(3, 64);
468  const LLT V4S64 = LLT::fixed_vector(4, 64);
469  const LLT V5S64 = LLT::fixed_vector(5, 64);
470  const LLT V6S64 = LLT::fixed_vector(6, 64);
471  const LLT V7S64 = LLT::fixed_vector(7, 64);
472  const LLT V8S64 = LLT::fixed_vector(8, 64);
473  const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475  std::initializer_list<LLT> AllS32Vectors =
476  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478  std::initializer_list<LLT> AllS64Vectors =
479  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489  const LLT CodePtr = FlatPtr;
490 
491  const std::initializer_list<LLT> AddrSpaces64 = {
492  GlobalPtr, ConstantPtr, FlatPtr
493  };
494 
495  const std::initializer_list<LLT> AddrSpaces32 = {
496  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497  };
498 
499  const std::initializer_list<LLT> FPTypesBase = {
500  S32, S64
501  };
502 
503  const std::initializer_list<LLT> FPTypes16 = {
504  S32, S64, S16
505  };
506 
507  const std::initializer_list<LLT> FPTypesPK16 = {
508  S32, S64, S16, V2S16
509  };
510 
511  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513  // s1 for VCC branches, s32 for SCC branches.
514  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517  // elements for v3s16
518  getActionDefinitionsBuilder(G_PHI)
519  .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520  .legalFor(AllS32Vectors)
521  .legalFor(AllS64Vectors)
522  .legalFor(AddrSpaces64)
523  .legalFor(AddrSpaces32)
524  .legalIf(isPointer(0))
525  .clampScalar(0, S16, S256)
526  .widenScalarToNextPow2(0, 32)
527  .clampMaxNumElements(0, S32, 16)
528  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529  .scalarize(0);
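// Note: rules in a LegalizeRuleSet are applied in the order they were added,
// so the explicit legalFor lists above take precedence over the clamping,
// widening and scalarizing fallbacks at the end of this chain.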
530 
531  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532  // Full set of gfx9 features.
533  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534  .legalFor({S32, S16, V2S16})
535  .minScalar(0, S16)
536  .clampMaxNumElements(0, S16, 2)
537  .widenScalarToNextMultipleOf(0, 32)
538  .maxScalar(0, S32)
539  .scalarize(0);
540 
541  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542  .legalFor({S32, S16, V2S16}) // Clamp modifier
543  .minScalarOrElt(0, S16)
544  .clampMaxNumElements(0, S16, 2)
545  .scalarize(0)
546  .widenScalarToNextPow2(0, 32)
547  .lower();
548  } else if (ST.has16BitInsts()) {
549  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550  .legalFor({S32, S16})
551  .minScalar(0, S16)
552  .widenScalarToNextMultipleOf(0, 32)
553  .maxScalar(0, S32)
554  .scalarize(0);
555 
556  // Technically the saturating operations require clamp bit support, but this
557  // was introduced at the same time as 16-bit operations.
558  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559  .legalFor({S32, S16}) // Clamp modifier
560  .minScalar(0, S16)
561  .scalarize(0)
562  .widenScalarToNextPow2(0, 16)
563  .lower();
564 
565  // We're just lowering this, but it helps get a better result to try to
566  // coerce to the desired type first.
567  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568  .minScalar(0, S16)
569  .scalarize(0)
570  .lower();
571  } else {
572  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573  .legalFor({S32})
574  .widenScalarToNextMultipleOf(0, 32)
575  .clampScalar(0, S32, S32)
576  .scalarize(0);
577 
578  if (ST.hasIntClamp()) {
579  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580  .legalFor({S32}) // Clamp modifier.
581  .scalarize(0)
582  .minScalarOrElt(0, S32)
583  .lower();
584  } else {
585  // Clamp bit support was added in VI, along with 16-bit operations.
586  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587  .minScalar(0, S32)
588  .scalarize(0)
589  .lower();
590  }
591 
592  // FIXME: DAG expansion gets better results. The widening uses the smaller
593  // range values and goes for the min/max lowering directly.
594  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595  .minScalar(0, S32)
596  .scalarize(0)
597  .lower();
598  }
599 
600  getActionDefinitionsBuilder(
601  {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602  .customFor({S32, S64})
603  .clampScalar(0, S32, S64)
604  .widenScalarToNextPow2(0, 32)
605  .scalarize(0);
606 
607  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608  .legalFor({S32})
609  .maxScalar(0, S32);
610 
611  if (ST.hasVOP3PInsts()) {
612  Mulh
613  .clampMaxNumElements(0, S8, 2)
614  .lowerFor({V2S8});
615  }
616 
617  Mulh
618  .scalarize(0)
619  .lower();
620 
621  // Report legal for any types we can handle anywhere. For the cases only legal
622  // on the SALU, RegBankSelect will be able to re-legalize.
623  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625  .clampScalar(0, S32, S64)
626  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627  .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628  .widenScalarToNextPow2(0)
629  .scalarize(0);
630 
631  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633  .legalFor({{S32, S1}, {S32, S32}})
634  .minScalar(0, S32)
635  // TODO: .scalarize(0)
636  .lower();
637 
638  getActionDefinitionsBuilder(G_BITCAST)
639  // Don't worry about the size constraint.
640  .legalIf(all(isRegisterType(0), isRegisterType(1)))
641  .lower();
642 
643 
644  getActionDefinitionsBuilder(G_CONSTANT)
645  .legalFor({S1, S32, S64, S16, GlobalPtr,
646  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647  .legalIf(isPointer(0))
648  .clampScalar(0, S32, S64)
649  .widenScalarToNextPow2(0);
650 
651  getActionDefinitionsBuilder(G_FCONSTANT)
652  .legalFor({S32, S64, S16})
653  .clampScalar(0, S16, S64);
654 
655  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656  .legalIf(isRegisterType(0))
657  // s1 and s16 are special cases because they have legal operations on
658  // them, but don't really occupy registers in the normal way.
659  .legalFor({S1, S16})
660  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661  .clampScalarOrElt(0, S32, MaxScalar)
662  .widenScalarToNextPow2(0, 32)
663  .clampMaxNumElements(0, S32, 16);
664 
665  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667  // If the amount is divergent, we have to do a wave reduction to get the
668  // maximum value, so this is expanded during RegBankSelect.
669  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670  .legalFor({{PrivatePtr, S32}});
671 
672  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673  .customIf(typeIsNot(0, PrivatePtr));
674 
675  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677  auto &FPOpActions = getActionDefinitionsBuilder(
678  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679  .legalFor({S32, S64});
680  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681  .customFor({S32, S64});
682  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683  .customFor({S32, S64});
684 
685  if (ST.has16BitInsts()) {
686  if (ST.hasVOP3PInsts())
687  FPOpActions.legalFor({S16, V2S16});
688  else
689  FPOpActions.legalFor({S16});
690 
691  TrigActions.customFor({S16});
692  FDIVActions.customFor({S16});
693  }
694 
695  auto &MinNumMaxNum = getActionDefinitionsBuilder({
696  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698  if (ST.hasVOP3PInsts()) {
699  MinNumMaxNum.customFor(FPTypesPK16)
700  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701  .clampMaxNumElements(0, S16, 2)
702  .clampScalar(0, S16, S64)
703  .scalarize(0);
704  } else if (ST.has16BitInsts()) {
705  MinNumMaxNum.customFor(FPTypes16)
706  .clampScalar(0, S16, S64)
707  .scalarize(0);
708  } else {
709  MinNumMaxNum.customFor(FPTypesBase)
710  .clampScalar(0, S32, S64)
711  .scalarize(0);
712  }
713 
714  if (ST.hasVOP3PInsts())
715  FPOpActions.clampMaxNumElements(0, S16, 2);
716 
717  FPOpActions
718  .scalarize(0)
719  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721  TrigActions
722  .scalarize(0)
723  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725  FDIVActions
726  .scalarize(0)
727  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729  getActionDefinitionsBuilder({G_FNEG, G_FABS})
730  .legalFor(FPTypesPK16)
731  .clampMaxNumElements(0, S16, 2)
732  .scalarize(0)
733  .clampScalar(0, S16, S64);
734 
735  if (ST.has16BitInsts()) {
736  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737  .legalFor({S32, S64, S16})
738  .scalarize(0)
739  .clampScalar(0, S16, S64);
740  } else {
741  getActionDefinitionsBuilder(G_FSQRT)
742  .legalFor({S32, S64})
743  .scalarize(0)
744  .clampScalar(0, S32, S64);
745 
746  if (ST.hasFractBug()) {
747  getActionDefinitionsBuilder(G_FFLOOR)
748  .customFor({S64})
749  .legalFor({S32, S64})
750  .scalarize(0)
751  .clampScalar(0, S32, S64);
752  } else {
753  getActionDefinitionsBuilder(G_FFLOOR)
754  .legalFor({S32, S64})
755  .scalarize(0)
756  .clampScalar(0, S32, S64);
757  }
758  }
759 
760  getActionDefinitionsBuilder(G_FPTRUNC)
761  .legalFor({{S32, S64}, {S16, S32}})
762  .scalarize(0)
763  .lower();
764 
765  getActionDefinitionsBuilder(G_FPEXT)
766  .legalFor({{S64, S32}, {S32, S16}})
767  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768  .scalarize(0);
769 
770  getActionDefinitionsBuilder(G_FSUB)
771  // Use actual fsub instruction
772  .legalFor({S32})
773  // Must use fadd + fneg
774  .lowerFor({S64, S16, V2S16})
775  .scalarize(0)
776  .clampScalar(0, S32, S64);
777 
778  // Whether this is legal depends on the floating point mode for the function.
779  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781  FMad.customFor({S32, S16});
782  else if (ST.hasMadMacF32Insts())
783  FMad.customFor({S32});
784  else if (ST.hasMadF16())
785  FMad.customFor({S16});
786  FMad.scalarize(0)
787  .lower();
788 
789  auto &FRem = getActionDefinitionsBuilder(G_FREM);
790  if (ST.has16BitInsts()) {
791  FRem.customFor({S16, S32, S64});
792  } else {
793  FRem.minScalar(0, S32)
794  .customFor({S32, S64});
795  }
796  FRem.scalarize(0);
797 
798  // TODO: Do we need to clamp maximum bitwidth?
799  getActionDefinitionsBuilder(G_TRUNC)
800  .legalIf(isScalar(0))
801  .legalFor({{V2S16, V2S32}})
802  .clampMaxNumElements(0, S16, 2)
803  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804  // situations (like an invalid implicit use), we don't want to infinite loop
805  // in the legalizer.
806  .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807  .alwaysLegal();
808 
809  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811  {S32, S1}, {S64, S1}, {S16, S1}})
812  .scalarize(0)
813  .clampScalar(0, S32, S64)
814  .widenScalarToNextPow2(1, 32);
815 
816  // TODO: Split s1->s64 during regbankselect for VALU.
817  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819  .lowerIf(typeIs(1, S1))
820  .customFor({{S32, S64}, {S64, S64}});
821  if (ST.has16BitInsts())
822  IToFP.legalFor({{S16, S16}});
823  IToFP.clampScalar(1, S32, S64)
824  .minScalar(0, S32)
825  .scalarize(0)
826  .widenScalarToNextPow2(1);
827 
828  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830  .customFor({{S64, S32}, {S64, S64}})
831  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832  if (ST.has16BitInsts())
833  FPToI.legalFor({{S16, S16}});
834  else
835  FPToI.minScalar(1, S32);
836 
837  FPToI.minScalar(0, S32)
838  .widenScalarToNextPow2(0, 32)
839  .scalarize(0)
840  .lower();
841 
842  // Lower roundeven into G_FRINT
843  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844  .scalarize(0)
845  .lower();
846 
847  if (ST.has16BitInsts()) {
848  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849  .legalFor({S16, S32, S64})
850  .clampScalar(0, S16, S64)
851  .scalarize(0);
852  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854  .legalFor({S32, S64})
855  .clampScalar(0, S32, S64)
856  .scalarize(0);
857  } else {
858  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859  .legalFor({S32})
860  .customFor({S64})
861  .clampScalar(0, S32, S64)
862  .scalarize(0);
863  }
864 
865  getActionDefinitionsBuilder(G_PTR_ADD)
866  .legalIf(all(isPointer(0), sameSize(0, 1)))
867  .scalarize(0)
868  .scalarSameSizeAs(1, 0);
869 
870  getActionDefinitionsBuilder(G_PTRMASK)
871  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872  .scalarSameSizeAs(1, 0)
873  .scalarize(0);
874 
875  auto &CmpBuilder =
876  getActionDefinitionsBuilder(G_ICMP)
877  // The compare output type differs based on the register bank of the output,
878  // so make both s1 and s32 legal.
879  //
880  // Scalar compares producing output in scc will be promoted to s32, as that
881  // is the allocatable register type that will be needed for the copy from
882  // scc. This will be promoted during RegBankSelect, and we assume something
883  // before that won't try to use s32 result types.
884  //
885  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886  // bank.
887  .legalForCartesianProduct(
888  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889  .legalForCartesianProduct(
890  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891  if (ST.has16BitInsts()) {
892  CmpBuilder.legalFor({{S1, S16}});
893  }
894 
895  CmpBuilder
896  .widenScalarToNextPow2(1)
897  .clampScalar(1, S32, S64)
898  .scalarize(0)
899  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901  getActionDefinitionsBuilder(G_FCMP)
902  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903  .widenScalarToNextPow2(1)
904  .clampScalar(1, S32, S64)
905  .scalarize(0);
906 
907  // FIXME: fpow has a selection pattern that should move to custom lowering.
908  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909  if (ST.has16BitInsts())
910  Exp2Ops.legalFor({S32, S16});
911  else
912  Exp2Ops.legalFor({S32});
913  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914  Exp2Ops.scalarize(0);
915 
916  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917  if (ST.has16BitInsts())
918  ExpOps.customFor({{S32}, {S16}});
919  else
920  ExpOps.customFor({S32});
921  ExpOps.clampScalar(0, MinScalarFPTy, S32)
922  .scalarize(0);
923 
924  getActionDefinitionsBuilder(G_FPOWI)
925  .clampScalar(0, MinScalarFPTy, S32)
926  .lower();
927 
928  // The 64-bit versions produce 32-bit results, but only on the SALU.
929  getActionDefinitionsBuilder(G_CTPOP)
930  .legalFor({{S32, S32}, {S32, S64}})
931  .clampScalar(0, S32, S32)
932  .clampScalar(1, S32, S64)
933  .scalarize(0)
934  .widenScalarToNextPow2(0, 32)
935  .widenScalarToNextPow2(1, 32);
936 
937  // The hardware instructions return a different result on 0 than the generic
938  // instructions expect. The hardware produces -1, but these produce the
939  // bitwidth.
940  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941  .scalarize(0)
942  .clampScalar(0, S32, S32)
943  .clampScalar(1, S32, S64)
944  .widenScalarToNextPow2(0, 32)
945  .widenScalarToNextPow2(1, 32)
946  .custom();
947 
948  // The 64-bit versions produce 32-bit results, but only on the SALU.
949  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950  .legalFor({{S32, S32}, {S32, S64}})
951  .clampScalar(0, S32, S32)
952  .clampScalar(1, S32, S64)
953  .scalarize(0)
954  .widenScalarToNextPow2(0, 32)
955  .widenScalarToNextPow2(1, 32);
956 
957  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958  // RegBankSelect.
959  getActionDefinitionsBuilder(G_BITREVERSE)
960  .legalFor({S32, S64})
961  .clampScalar(0, S32, S64)
962  .scalarize(0)
963  .widenScalarToNextPow2(0);
964 
965  if (ST.has16BitInsts()) {
966  getActionDefinitionsBuilder(G_BSWAP)
967  .legalFor({S16, S32, V2S16})
968  .clampMaxNumElements(0, S16, 2)
969  // FIXME: Fixing non-power-of-2 before clamp is workaround for
970  // narrowScalar limitation.
971  .widenScalarToNextPow2(0)
972  .clampScalar(0, S16, S32)
973  .scalarize(0);
974 
975  if (ST.hasVOP3PInsts()) {
976  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977  .legalFor({S32, S16, V2S16})
978  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979  .clampMaxNumElements(0, S16, 2)
980  .minScalar(0, S16)
981  .widenScalarToNextPow2(0)
982  .scalarize(0)
983  .lower();
984  } else {
985  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986  .legalFor({S32, S16})
987  .widenScalarToNextPow2(0)
988  .minScalar(0, S16)
989  .scalarize(0)
990  .lower();
991  }
992  } else {
993  // TODO: Should have same legality without v_perm_b32
994  getActionDefinitionsBuilder(G_BSWAP)
995  .legalFor({S32})
996  .lowerIf(scalarNarrowerThan(0, 32))
997  // FIXME: Fixing non-power-of-2 before clamp is workaround for
998  // narrowScalar limitation.
999  .widenScalarToNextPow2(0)
1000  .maxScalar(0, S32)
1001  .scalarize(0)
1002  .lower();
1003 
1004  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005  .legalFor({S32})
1006  .minScalar(0, S32)
1007  .widenScalarToNextPow2(0)
1008  .scalarize(0)
1009  .lower();
1010  }
1011 
1012  getActionDefinitionsBuilder(G_INTTOPTR)
1013  // List the common cases
1014  .legalForCartesianProduct(AddrSpaces64, {S64})
1015  .legalForCartesianProduct(AddrSpaces32, {S32})
1016  .scalarize(0)
1017  // Accept any address space as long as the size matches
1018  .legalIf(sameSize(0, 1))
1019  .widenScalarIf(smallerThan(1, 0),
1020  [](const LegalityQuery &Query) {
1021  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022  })
1023  .narrowScalarIf(largerThan(1, 0),
1024  [](const LegalityQuery &Query) {
1025  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026  });
1027 
1028  getActionDefinitionsBuilder(G_PTRTOINT)
1029  // List the common cases
1030  .legalForCartesianProduct(AddrSpaces64, {S64})
1031  .legalForCartesianProduct(AddrSpaces32, {S32})
1032  .scalarize(0)
1033  // Accept any address space as long as the size matches
1034  .legalIf(sameSize(0, 1))
1035  .widenScalarIf(smallerThan(0, 1),
1036  [](const LegalityQuery &Query) {
1037  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038  })
1039  .narrowScalarIf(
1040  largerThan(0, 1),
1041  [](const LegalityQuery &Query) {
1042  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043  });
1044 
1045  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046  .scalarize(0)
1047  .custom();
1048 
1049  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050  bool IsLoad) -> bool {
1051  const LLT DstTy = Query.Types[0];
1052 
1053  // Split vector extloads.
1054  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1056 
1057  if (MemSize < DstTy.getSizeInBits())
1058  MemSize = std::max(MemSize, AlignBits);
1059 
1060  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1061  return true;
1062 
1063  const LLT PtrTy = Query.Types[1];
1064  unsigned AS = PtrTy.getAddressSpace();
1065  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1066  return true;
1067 
1068  // Catch weird sized loads that don't evenly divide into the access sizes
1069  // TODO: May be able to widen depending on alignment etc.
1070  unsigned NumRegs = (MemSize + 31) / 32;
1071  if (NumRegs == 3) {
1072  if (!ST.hasDwordx3LoadStores())
1073  return true;
1074  } else {
1075  // If the alignment allows, these should have been widened.
1076  if (!isPowerOf2_32(NumRegs))
1077  return true;
1078  }
1079 
1080  if (AlignBits < MemSize) {
1081  const SITargetLowering *TLI = ST.getTargetLowering();
1082  return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1083  Align(AlignBits / 8));
1084  }
1085 
1086  return false;
1087  };
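// For example, a <3 x s32> global access (96 bits, three 32-bit registers) is
// split on subtargets without dwordx3 load/stores, and an under-aligned
// access is split whenever the target does not allow the misaligned access.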
1088 
1089  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1090  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1091  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1092 
1093  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1094  // LDS
1095  // TODO: Unsupported flat for SI.
1096 
1097  for (unsigned Op : {G_LOAD, G_STORE}) {
1098  const bool IsStore = Op == G_STORE;
1099 
1100  auto &Actions = getActionDefinitionsBuilder(Op);
1101  // Explicitly list some common cases.
1102  // TODO: Does this help compile time at all?
1103  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1104  {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1105  {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1106  {S64, GlobalPtr, S64, GlobalAlign32},
1107  {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1108  {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1109  {S32, GlobalPtr, S8, GlobalAlign8},
1110  {S32, GlobalPtr, S16, GlobalAlign16},
1111 
1112  {S32, LocalPtr, S32, 32},
1113  {S64, LocalPtr, S64, 32},
1114  {V2S32, LocalPtr, V2S32, 32},
1115  {S32, LocalPtr, S8, 8},
1116  {S32, LocalPtr, S16, 16},
1117  {V2S16, LocalPtr, S32, 32},
1118 
1119  {S32, PrivatePtr, S32, 32},
1120  {S32, PrivatePtr, S8, 8},
1121  {S32, PrivatePtr, S16, 16},
1122  {V2S16, PrivatePtr, S32, 32},
1123 
1124  {S32, ConstantPtr, S32, GlobalAlign32},
1125  {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1126  {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1127  {S64, ConstantPtr, S64, GlobalAlign32},
1128  {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1129  Actions.legalIf(
1130  [=](const LegalityQuery &Query) -> bool {
1131  return isLoadStoreLegal(ST, Query);
1132  });
1133 
1134  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1135  // 64-bits.
1136  //
1137  // TODO: Should generalize bitcast action into coerce, which will also cover
1138  // inserting addrspacecasts.
1139  Actions.customIf(typeIs(1, Constant32Ptr));
1140 
1141  // Turn any illegal element vectors into something easier to deal
1142  // with. These will ultimately produce 32-bit scalar shifts to extract the
1143  // parts anyway.
1144  //
1145  // For odd 16-bit element vectors, prefer to split those into pieces with
1146  // 16-bit vector parts.
1147  Actions.bitcastIf(
1148  [=](const LegalityQuery &Query) -> bool {
1149  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1150  Query.MMODescrs[0].MemoryTy);
1151  }, bitcastToRegisterType(0));
1152 
1153  if (!IsStore) {
1154  // Widen suitably aligned loads by loading extra bytes. The standard
1155  // legalization actions can't properly express widening memory operands.
1156  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1157  return shouldWidenLoad(ST, Query, G_LOAD);
1158  });
1159  }
1160 
1161  // FIXME: load/store narrowing should be moved to lower action
1162  Actions
1163  .narrowScalarIf(
1164  [=](const LegalityQuery &Query) -> bool {
1165  return !Query.Types[0].isVector() &&
1166  needToSplitMemOp(Query, Op == G_LOAD);
1167  },
1168  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1169  const LLT DstTy = Query.Types[0];
1170  const LLT PtrTy = Query.Types[1];
1171 
1172  const unsigned DstSize = DstTy.getSizeInBits();
1173  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1174 
1175  // Split extloads.
1176  if (DstSize > MemSize)
1177  return std::make_pair(0, LLT::scalar(MemSize));
1178 
1179  if (!isPowerOf2_32(DstSize)) {
1180  // We're probably decomposing an odd sized store. Try to split
1181  // to the widest type. TODO: Account for alignment. As-is it
1182  // should be OK, since the new parts will be further legalized.
1183  unsigned FloorSize = PowerOf2Floor(DstSize);
1184  return std::make_pair(0, LLT::scalar(FloorSize));
1185  }
1186 
1187  if (DstSize > 32 && (DstSize % 32 != 0)) {
1188  // FIXME: Need a way to specify non-extload of larger size if
1189  // suitably aligned.
1190  return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1191  }
1192 
1193  unsigned MaxSize = maxSizeForAddrSpace(ST,
1194  PtrTy.getAddressSpace(),
1195  Op == G_LOAD);
1196  if (MemSize > MaxSize)
1197  return std::make_pair(0, LLT::scalar(MaxSize));
1198 
1199  unsigned Align = Query.MMODescrs[0].AlignInBits;
1200  return std::make_pair(0, LLT::scalar(Align));
1201  })
1202  .fewerElementsIf(
1203  [=](const LegalityQuery &Query) -> bool {
1204  return Query.Types[0].isVector() &&
1205  needToSplitMemOp(Query, Op == G_LOAD);
1206  },
1207  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1208  const LLT DstTy = Query.Types[0];
1209  const LLT PtrTy = Query.Types[1];
1210 
1211  LLT EltTy = DstTy.getElementType();
1212  unsigned MaxSize = maxSizeForAddrSpace(ST,
1213  PtrTy.getAddressSpace(),
1214  Op == G_LOAD);
1215 
1216  // FIXME: Handle widened to power of 2 results better. This ends
1217  // up scalarizing.
1218  // FIXME: 3 element stores scalarized on SI
1219 
1220  // Split if it's too large for the address space.
1221  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1222  if (MemSize > MaxSize) {
1223  unsigned NumElts = DstTy.getNumElements();
1224  unsigned EltSize = EltTy.getSizeInBits();
1225 
1226  if (MaxSize % EltSize == 0) {
1227  return std::make_pair(
1228  0, LLT::scalarOrVector(
1229  ElementCount::getFixed(MaxSize / EltSize), EltTy));
1230  }
1231 
1232  unsigned NumPieces = MemSize / MaxSize;
1233 
1234  // FIXME: Refine when odd breakdowns handled
1235  // The scalars will need to be re-legalized.
1236  if (NumPieces == 1 || NumPieces >= NumElts ||
1237  NumElts % NumPieces != 0)
1238  return std::make_pair(0, EltTy);
1239 
1240  return std::make_pair(
1241  0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1242  }
1243 
1244  // FIXME: We could probably handle weird extending loads better.
1245  if (DstTy.getSizeInBits() > MemSize)
1246  return std::make_pair(0, EltTy);
1247 
1248  unsigned EltSize = EltTy.getSizeInBits();
1249  unsigned DstSize = DstTy.getSizeInBits();
1250  if (!isPowerOf2_32(DstSize)) {
1251  // We're probably decomposing an odd sized store. Try to split
1252  // to the widest type. TODO: Account for alignment. As-is it
1253  // should be OK, since the new parts will be further legalized.
1254  unsigned FloorSize = PowerOf2Floor(DstSize);
1255  return std::make_pair(
1256  0, LLT::scalarOrVector(
1257  ElementCount::getFixed(FloorSize / EltSize), EltTy));
1258  }
1259 
1260  // Need to split because of alignment.
1261  unsigned Align = Query.MMODescrs[0].AlignInBits;
1262  if (EltSize > Align &&
1263  (EltSize / Align < DstTy.getNumElements())) {
1264  return std::make_pair(
1265  0, LLT::fixed_vector(EltSize / Align, EltTy));
1266  }
1267 
1268  // May need relegalization for the scalars.
1269  return std::make_pair(0, EltTy);
1270  })
1271  .minScalar(0, S32)
1272  .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1273  .widenScalarToNextPow2(0)
1274  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1275  .lower();
1276  }
1277 
1278  // FIXME: Unaligned accesses not lowered.
1279  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1280  .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1281  {S32, GlobalPtr, S16, 2 * 8},
1282  {S32, LocalPtr, S8, 8},
1283  {S32, LocalPtr, S16, 16},
1284  {S32, PrivatePtr, S8, 8},
1285  {S32, PrivatePtr, S16, 16},
1286  {S32, ConstantPtr, S8, 8},
1287  {S32, ConstantPtr, S16, 2 * 8}})
1288  .legalIf(
1289  [=](const LegalityQuery &Query) -> bool {
1290  return isLoadStoreLegal(ST, Query);
1291  });
1292 
1293  if (ST.hasFlatAddressSpace()) {
1294  ExtLoads.legalForTypesWithMemDesc(
1295  {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1296  }
1297 
1298  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1299  // 64-bits.
1300  //
1301  // TODO: Should generalize bitcast action into coerce, which will also cover
1302  // inserting addrspacecasts.
1303  ExtLoads.customIf(typeIs(1, Constant32Ptr));
1304 
1305  ExtLoads.clampScalar(0, S32, S32)
1306  .widenScalarToNextPow2(0)
1307  .lower();
1308 
1309  auto &Atomics = getActionDefinitionsBuilder(
1310  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1311  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1312  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1313  G_ATOMICRMW_UMIN})
1314  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1315  {S64, GlobalPtr}, {S64, LocalPtr},
1316  {S32, RegionPtr}, {S64, RegionPtr}});
1317  if (ST.hasFlatAddressSpace()) {
1318  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1319  }
1320 
1321  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1322  if (ST.hasLDSFPAtomicAdd()) {
1323  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1324  if (ST.hasGFX90AInsts())
1325  Atomic.legalFor({{S64, LocalPtr}});
1326  }
1327  if (ST.hasAtomicFaddInsts())
1328  Atomic.legalFor({{S32, GlobalPtr}});
1329 
1330  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1331  // demarshalling
1332  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1333  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1334  {S32, FlatPtr}, {S64, FlatPtr}})
1335  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1336  {S32, RegionPtr}, {S64, RegionPtr}});
1337  // TODO: Pointer types, any 32-bit or 64-bit vector
1338 
1339  // Condition should be s32 for scalar, s1 for vector.
1340  getActionDefinitionsBuilder(G_SELECT)
1341  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1342  LocalPtr, FlatPtr, PrivatePtr,
1343  LLT::fixed_vector(2, LocalPtr),
1344  LLT::fixed_vector(2, PrivatePtr)},
1345  {S1, S32})
1346  .clampScalar(0, S16, S64)
1347  .scalarize(1)
1348  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1349  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1350  .clampMaxNumElements(0, S32, 2)
1351  .clampMaxNumElements(0, LocalPtr, 2)
1352  .clampMaxNumElements(0, PrivatePtr, 2)
1353  .scalarize(0)
1354  .widenScalarToNextPow2(0)
1355  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1356 
1357  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1358  // be more flexible with the shift amount type.
1359  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1360  .legalFor({{S32, S32}, {S64, S32}});
1361  if (ST.has16BitInsts()) {
1362  if (ST.hasVOP3PInsts()) {
1363  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1364  .clampMaxNumElements(0, S16, 2);
1365  } else
1366  Shifts.legalFor({{S16, S16}});
1367 
1368  // TODO: Support 16-bit shift amounts for all types
1369  Shifts.widenScalarIf(
1370  [=](const LegalityQuery &Query) {
1371  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1372  // 32-bit amount.
1373  const LLT ValTy = Query.Types[0];
1374  const LLT AmountTy = Query.Types[1];
1375  return ValTy.getSizeInBits() <= 16 &&
1376  AmountTy.getSizeInBits() < 16;
1377  }, changeTo(1, S16));
1378  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1379  Shifts.clampScalar(1, S32, S32);
1380  Shifts.clampScalar(0, S16, S64);
1381  Shifts.widenScalarToNextPow2(0, 16);
1382 
1383  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1384  .minScalar(0, S16)
1385  .scalarize(0)
1386  .lower();
1387  } else {
1388  // Make sure we legalize the shift amount type first, as the general
1389  // expansion for the shifted type will produce much worse code if it hasn't
1390  // been truncated already.
1391  Shifts.clampScalar(1, S32, S32);
1392  Shifts.clampScalar(0, S32, S64);
1393  Shifts.widenScalarToNextPow2(0, 32);
1394 
1395  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1396  .minScalar(0, S32)
1397  .scalarize(0)
1398  .lower();
1399  }
1400  Shifts.scalarize(0);
1401 
1402  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1403  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1404  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1405  unsigned IdxTypeIdx = 2;
1406 
1407  getActionDefinitionsBuilder(Op)
1408  .customIf([=](const LegalityQuery &Query) {
1409  const LLT EltTy = Query.Types[EltTypeIdx];
1410  const LLT VecTy = Query.Types[VecTypeIdx];
1411  const LLT IdxTy = Query.Types[IdxTypeIdx];
1412  const unsigned EltSize = EltTy.getSizeInBits();
1413  return (EltSize == 32 || EltSize == 64) &&
1414  VecTy.getSizeInBits() % 32 == 0 &&
1415  VecTy.getSizeInBits() <= MaxRegisterSize &&
1416  IdxTy.getSizeInBits() == 32;
1417  })
1418  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1419  bitcastToVectorElement32(VecTypeIdx))
1420  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1421  .bitcastIf(
1422  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1423  [=](const LegalityQuery &Query) {
1424  // For > 64-bit element types, try to turn this into a 64-bit
1425  // element vector since we may be able to do better indexing
1426  // if this is scalar. If not, fall back to 32.
1427  const LLT EltTy = Query.Types[EltTypeIdx];
1428  const LLT VecTy = Query.Types[VecTypeIdx];
1429  const unsigned DstEltSize = EltTy.getSizeInBits();
1430  const unsigned VecSize = VecTy.getSizeInBits();
1431 
1432  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1433  return std::make_pair(
1434  VecTypeIdx,
1435  LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1436  })
1437  .clampScalar(EltTypeIdx, S32, S64)
1438  .clampScalar(VecTypeIdx, S32, S64)
1439  .clampScalar(IdxTypeIdx, S32, S32)
1440  .clampMaxNumElements(VecTypeIdx, S32, 32)
1441  // TODO: Clamp elements for 64-bit vectors?
1442  // It should only be necessary with variable indexes.
1443  // As a last resort, lower to the stack
1444  .lower();
1445  }
1446 
1447  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1448  .unsupportedIf([=](const LegalityQuery &Query) {
1449  const LLT &EltTy = Query.Types[1].getElementType();
1450  return Query.Types[0] != EltTy;
1451  });
1452 
1453  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1454  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1455  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1456 
1457  // FIXME: Doesn't handle extract of illegal sizes.
1458  getActionDefinitionsBuilder(Op)
1459  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1460  // FIXME: Multiples of 16 should not be legal.
1461  .legalIf([=](const LegalityQuery &Query) {
1462  const LLT BigTy = Query.Types[BigTyIdx];
1463  const LLT LitTy = Query.Types[LitTyIdx];
1464  return (BigTy.getSizeInBits() % 32 == 0) &&
1465  (LitTy.getSizeInBits() % 16 == 0);
1466  })
1467  .widenScalarIf(
1468  [=](const LegalityQuery &Query) {
1469  const LLT BigTy = Query.Types[BigTyIdx];
1470  return (BigTy.getScalarSizeInBits() < 16);
1471  },
1472  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1473  .widenScalarIf(
1474  [=](const LegalityQuery &Query) {
1475  const LLT LitTy = Query.Types[LitTyIdx];
1476  return (LitTy.getScalarSizeInBits() < 16);
1477  },
1478  LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1479  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1480  .widenScalarToNextPow2(BigTyIdx, 32);
1481 
1482  }
1483 
1484  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1485  .legalForCartesianProduct(AllS32Vectors, {S32})
1486  .legalForCartesianProduct(AllS64Vectors, {S64})
1487  .clampNumElements(0, V16S32, V32S32)
1488  .clampNumElements(0, V2S64, V16S64)
1489  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1490 
1491  if (ST.hasScalarPackInsts()) {
1492  BuildVector
1493  // FIXME: Should probably widen s1 vectors straight to s32
1494  .minScalarOrElt(0, S16)
1495  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1496  .minScalar(1, S32);
1497 
1498  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1499  .legalFor({V2S16, S32})
1500  .lower();
1501  BuildVector.minScalarOrElt(0, S32);
1502  } else {
1503  BuildVector.customFor({V2S16, S16});
1504  BuildVector.minScalarOrElt(0, S32);
1505 
1506  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1507  .customFor({V2S16, S32})
1508  .lower();
1509  }
1510 
1511  BuildVector.legalIf(isRegisterType(0));
1512 
1513  // FIXME: Clamp maximum size
1514  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1515  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1516  .clampMaxNumElements(0, S32, 32)
1517  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1518  .clampMaxNumElements(0, S16, 64);
1519 
1520 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1521  // pre-legalize.
1522  if (ST.hasVOP3PInsts()) {
1523  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1524  .customFor({V2S16, V2S16})
1525  .lower();
1526  } else
1527  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1528 
1529  // Merge/Unmerge
1530  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1531  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1532  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1533 
1534  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1535  const LLT Ty = Query.Types[TypeIdx];
1536  if (Ty.isVector()) {
1537  const LLT &EltTy = Ty.getElementType();
1538  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1539  return true;
1540  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1541  return true;
1542  }
1543  return false;
1544  };
1545 
1546  auto &Builder = getActionDefinitionsBuilder(Op)
1547  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1548  .lowerFor({{S16, V2S16}})
1549  .lowerIf([=](const LegalityQuery &Query) {
1550  const LLT BigTy = Query.Types[BigTyIdx];
1551  return BigTy.getSizeInBits() == 32;
1552  })
1553  // Try to widen to s16 first for small types.
1554  // TODO: Only do this on targets with legal s16 shifts
1555  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1556  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1557  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1558  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1559  elementTypeIs(1, S16)),
1560  changeTo(1, V2S16))
1561 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1562  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1563  // valid.
1564  .clampScalar(LitTyIdx, S32, S512)
1565  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1566  // Break up vectors with weird elements into scalars
1567  .fewerElementsIf(
1568  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1569  scalarize(0))
1570  .fewerElementsIf(
1571  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1572  scalarize(1))
1573  .clampScalar(BigTyIdx, S32, MaxScalar);
1574 
1575  if (Op == G_MERGE_VALUES) {
1576  Builder.widenScalarIf(
1577  // TODO: Use 16-bit shifts if legal for 8-bit values?
1578  [=](const LegalityQuery &Query) {
1579  const LLT Ty = Query.Types[LitTyIdx];
1580  return Ty.getSizeInBits() < 32;
1581  },
1582  changeTo(LitTyIdx, S32));
1583  }
1584 
1585  Builder.widenScalarIf(
1586  [=](const LegalityQuery &Query) {
1587  const LLT Ty = Query.Types[BigTyIdx];
1588  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1589  Ty.getSizeInBits() % 16 != 0;
1590  },
1591  [=](const LegalityQuery &Query) {
1592  // Pick the next power of 2, or a multiple of 64 over 128.
1593  // Whichever is smaller.
1594  const LLT &Ty = Query.Types[BigTyIdx];
1595  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1596  if (NewSizeInBits >= 256) {
1597  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1598  if (RoundedTo < NewSizeInBits)
1599  NewSizeInBits = RoundedTo;
1600  }
1601  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1602  })
1603  // Any vectors left are the wrong size. Scalarize them.
1604  .scalarize(0)
1605  .scalarize(1);
1606  }
1607 
1608  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1609  // RegBankSelect.
1610  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1611  .legalFor({{S32}, {S64}});
1612 
1613  if (ST.hasVOP3PInsts()) {
1614  SextInReg.lowerFor({{V2S16}})
1615  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1616  // get more vector shift opportunities, since we'll get those when
1617  // expanded.
1618  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1619  } else if (ST.has16BitInsts()) {
1620  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1621  } else {
1622  // Prefer to promote to s32 before lowering if we don't have 16-bit
1623  // shifts. This avoid a lot of intermediate truncate and extend operations.
1624  SextInReg.lowerFor({{S32}, {S64}});
1625  }
1626 
1627  SextInReg
1628  .scalarize(0)
1629  .clampScalar(0, S32, S64)
1630  .lower();
1631 
1632  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1633  .scalarize(0)
1634  .lower();
1635 
1636 // TODO: Only try to form v2s16 with legal packed instructions.
1637  getActionDefinitionsBuilder(G_FSHR)
1638  .legalFor({{S32, S32}})
1639  .lowerFor({{V2S16, V2S16}})
1640  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641  .scalarize(0)
1642  .lower();
1643 
1644  if (ST.hasVOP3PInsts()) {
1645  getActionDefinitionsBuilder(G_FSHL)
1646  .lowerFor({{V2S16, V2S16}})
1647  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1648  .scalarize(0)
1649  .lower();
1650  } else {
1651  getActionDefinitionsBuilder(G_FSHL)
1652  .scalarize(0)
1653  .lower();
1654  }
1655 
1656  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1657  .legalFor({S64});
1658 
1659  getActionDefinitionsBuilder(G_FENCE)
1660  .alwaysLegal();
1661 
1662  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1663  .scalarize(0)
1664  .minScalar(0, S32)
1665  .lower();
1666 
1667  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1668  .legalFor({{S32, S32}, {S64, S32}})
1669  .clampScalar(1, S32, S32)
1670  .clampScalar(0, S32, S64)
1671  .widenScalarToNextPow2(0)
1672  .scalarize(0);
1673 
1674  getActionDefinitionsBuilder({
1675  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1676  G_FCOPYSIGN,
1677 
1678  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1679  G_ATOMICRMW_NAND,
1680  G_ATOMICRMW_FSUB,
1681  G_READ_REGISTER,
1682  G_WRITE_REGISTER,
1683 
1684  G_SADDO, G_SSUBO,
1685 
1686  // TODO: Implement
1687  G_FMINIMUM, G_FMAXIMUM}).lower();
1688 
1689  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1690  .lower();
1691 
1692  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1693  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1694  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1695  .unsupported();
1696 
1697  getLegacyLegalizerInfo().computeTables();
1698  verify(*ST.getInstrInfo());
1699 }
1700 
1701 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1702  MachineInstr &MI) const {
1703  MachineIRBuilder &B = Helper.MIRBuilder;
1704  MachineRegisterInfo &MRI = *B.getMRI();
1705 
1706  switch (MI.getOpcode()) {
1707  case TargetOpcode::G_ADDRSPACE_CAST:
1708  return legalizeAddrSpaceCast(MI, MRI, B);
1709  case TargetOpcode::G_FRINT:
1710  return legalizeFrint(MI, MRI, B);
1711  case TargetOpcode::G_FCEIL:
1712  return legalizeFceil(MI, MRI, B);
1713  case TargetOpcode::G_FREM:
1714  return legalizeFrem(MI, MRI, B);
1715  case TargetOpcode::G_INTRINSIC_TRUNC:
1716  return legalizeIntrinsicTrunc(MI, MRI, B);
1717  case TargetOpcode::G_SITOFP:
1718  return legalizeITOFP(MI, MRI, B, true);
1719  case TargetOpcode::G_UITOFP:
1720  return legalizeITOFP(MI, MRI, B, false);
1721  case TargetOpcode::G_FPTOSI:
1722  return legalizeFPTOI(MI, MRI, B, true);
1723  case TargetOpcode::G_FPTOUI:
1724  return legalizeFPTOI(MI, MRI, B, false);
1725  case TargetOpcode::G_FMINNUM:
1726  case TargetOpcode::G_FMAXNUM:
1727  case TargetOpcode::G_FMINNUM_IEEE:
1728  case TargetOpcode::G_FMAXNUM_IEEE:
1729  return legalizeMinNumMaxNum(Helper, MI);
1730  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1731  return legalizeExtractVectorElt(MI, MRI, B);
1732  case TargetOpcode::G_INSERT_VECTOR_ELT:
1733  return legalizeInsertVectorElt(MI, MRI, B);
1734  case TargetOpcode::G_SHUFFLE_VECTOR:
1735  return legalizeShuffleVector(MI, MRI, B);
1736  case TargetOpcode::G_FSIN:
1737  case TargetOpcode::G_FCOS:
1738  return legalizeSinCos(MI, MRI, B);
1739  case TargetOpcode::G_GLOBAL_VALUE:
1740  return legalizeGlobalValue(MI, MRI, B);
1741  case TargetOpcode::G_LOAD:
1742  case TargetOpcode::G_SEXTLOAD:
1743  case TargetOpcode::G_ZEXTLOAD:
1744  return legalizeLoad(Helper, MI);
1745  case TargetOpcode::G_FMAD:
1746  return legalizeFMad(MI, MRI, B);
1747  case TargetOpcode::G_FDIV:
1748  return legalizeFDIV(MI, MRI, B);
1749  case TargetOpcode::G_UDIV:
1750  case TargetOpcode::G_UREM:
1751  case TargetOpcode::G_UDIVREM:
1752  return legalizeUnsignedDIV_REM(MI, MRI, B);
1753  case TargetOpcode::G_SDIV:
1754  case TargetOpcode::G_SREM:
1755  case TargetOpcode::G_SDIVREM:
1756  return legalizeSignedDIV_REM(MI, MRI, B);
1757  case TargetOpcode::G_ATOMIC_CMPXCHG:
1758  return legalizeAtomicCmpXChg(MI, MRI, B);
1759  case TargetOpcode::G_FLOG:
1760  return legalizeFlog(MI, B, numbers::ln2f);
 1761  case TargetOpcode::G_FLOG10:
 1762  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
 1763  case TargetOpcode::G_FEXP:
1764  return legalizeFExp(MI, B);
1765  case TargetOpcode::G_FPOW:
1766  return legalizeFPow(MI, B);
1767  case TargetOpcode::G_FFLOOR:
1768  return legalizeFFloor(MI, MRI, B);
1769  case TargetOpcode::G_BUILD_VECTOR:
1770  return legalizeBuildVector(MI, MRI, B);
1771  case TargetOpcode::G_CTLZ:
1772  case TargetOpcode::G_CTTZ:
1773  return legalizeCTLZ_CTTZ(MI, MRI, B);
1774  default:
1775  return false;
1776  }
1777 
1778  llvm_unreachable("expected switch to return");
1779 }
1780 
 1781 Register AMDGPULegalizerInfo::getSegmentAperture(
 1782  unsigned AS,
 1783  MachineRegisterInfo &MRI,
 1784  MachineIRBuilder &B) const {
1785  MachineFunction &MF = B.getMF();
1786  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1787  const LLT S32 = LLT::scalar(32);
1788 
1790 
1791  if (ST.hasApertureRegs()) {
1792  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1793  // getreg.
 1794  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
 1795  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
 1796  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
 1797  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
 1798  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
 1799  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
 1800  unsigned Encoding =
 1801  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
 1802  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
 1803  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1804 
1805  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1806 
1807  B.buildInstr(AMDGPU::S_GETREG_B32)
1808  .addDef(GetReg)
1809  .addImm(Encoding);
1810  MRI.setType(GetReg, S32);
1811 
1812  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1813  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1814  }
1815 
 1816  Register QueuePtr = MRI.createGenericVirtualRegister(
 1817  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
 1818 
 1819  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
 1820  return Register();
1821 
1822  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1823  // private_segment_aperture_base_hi.
1824  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1825 
1826  // TODO: can we be smarter about machine pointer info?
 1827  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
 1828  MachineMemOperand *MMO = MF.getMachineMemOperand(
 1829  PtrInfo,
 1830  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
 1831  MachineMemOperand::MOInvariant,
 1832  LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1833 
1834  Register LoadAddr;
1835 
1836  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1837  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1838 }
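// Illustrative note (not in the upstream source): both paths above produce
// the high 32 bits of the 64-bit aperture base for the requested segment.
// legalizeAddrSpaceCast below merges this value with the 32-bit segment
// offset to form a flat pointer, roughly:
//   flat = ((u64)aperture_hi << 32) | (u32)segment_ptr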
1839 
 1840 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
 1841  MachineInstr &MI, MachineRegisterInfo &MRI,
 1842  MachineIRBuilder &B) const {
1843  MachineFunction &MF = B.getMF();
1844 
1845  const LLT S32 = LLT::scalar(32);
1846  Register Dst = MI.getOperand(0).getReg();
1847  Register Src = MI.getOperand(1).getReg();
1848 
1849  LLT DstTy = MRI.getType(Dst);
1850  LLT SrcTy = MRI.getType(Src);
1851  unsigned DestAS = DstTy.getAddressSpace();
1852  unsigned SrcAS = SrcTy.getAddressSpace();
1853 
1854  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1855  // vector element.
1856  assert(!DstTy.isVector());
1857 
1858  const AMDGPUTargetMachine &TM
1859  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1860 
1861  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1862  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1863  return true;
1864  }
1865 
1866  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1867  // Truncate.
1868  B.buildExtract(Dst, Src, 0);
1869  MI.eraseFromParent();
1870  return true;
1871  }
1872 
1873  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
 1874  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 1875  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1876 
1877  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1878  // another. Merge operands are required to be the same type, but creating an
1879  // extra ptrtoint would be kind of pointless.
1880  auto HighAddr = B.buildConstant(
 1881  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
 1882  B.buildMerge(Dst, {Src, HighAddr});
1883  MI.eraseFromParent();
1884  return true;
1885  }
1886 
1887  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1888  assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1889  DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1890  unsigned NullVal = TM.getNullPointerValue(DestAS);
1891 
1892  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1893  auto FlatNull = B.buildConstant(SrcTy, 0);
1894 
1895  // Extract low 32-bits of the pointer.
1896  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1897 
1898  auto CmpRes =
1899  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1900  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1901 
1902  MI.eraseFromParent();
1903  return true;
1904  }
1905 
1906  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1907  return false;
1908 
1909  if (!ST.hasFlatAddressSpace())
1910  return false;
1911 
1912  auto SegmentNull =
1913  B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1914  auto FlatNull =
1915  B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1916 
1917  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1918  if (!ApertureReg.isValid())
1919  return false;
1920 
1921  auto CmpRes =
1922  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1923 
1924  // Coerce the type of the low half of the result so we can use merge_values.
1925  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1926 
1927  // TODO: Should we allow mismatched types but matching sizes in merges to
1928  // avoid the ptrtoint?
1929  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1930  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1931 
1932  MI.eraseFromParent();
1933  return true;
1934 }
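// Rough scalar equivalent of the casts built above (illustrative, not in the
// upstream source):
//   local/private -> flat:
//     flat = (src == segment_null) ? flat_null
//                                  : ((u64)aperture_hi << 32) | (u32)src;
//   flat -> local/private:
//     seg  = (src == flat_null) ? segment_null : (u32)src;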
1935 
 1936 bool AMDGPULegalizerInfo::legalizeFrint(
 1937  MachineInstr &MI, MachineRegisterInfo &MRI,
 1938  MachineIRBuilder &B) const {
1939  Register Src = MI.getOperand(1).getReg();
1940  LLT Ty = MRI.getType(Src);
1941  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1942 
1943  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1944  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1945 
1946  auto C1 = B.buildFConstant(Ty, C1Val);
1947  auto CopySign = B.buildFCopysign(Ty, C1, Src);
1948 
1949  // TODO: Should this propagate fast-math-flags?
1950  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1951  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1952 
1953  auto C2 = B.buildFConstant(Ty, C2Val);
1954  auto Fabs = B.buildFAbs(Ty, Src);
1955 
1956  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1957  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1958  MI.eraseFromParent();
1959  return true;
1960 }
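// How the magic constants above work (illustrative, not in the upstream
// source): adding and then subtracting copysign(2^52, x) flushes the fraction
// bits of a double, so Tmp2 is x rounded to integer with the default
// round-to-nearest-even behavior, e.g. rint(2.5) == 2.0 and rint(3.5) == 4.0.
// Inputs with |x| > 0x1.fffffffffffffp+51 already have no fraction bits and
// are returned unchanged by the final select.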
1961 
 1962 bool AMDGPULegalizerInfo::legalizeFceil(
 1963  MachineInstr &MI, MachineRegisterInfo &MRI,
 1964  MachineIRBuilder &B) const {
1965 
1966  const LLT S1 = LLT::scalar(1);
1967  const LLT S64 = LLT::scalar(64);
1968 
1969  Register Src = MI.getOperand(1).getReg();
1970  assert(MRI.getType(Src) == S64);
1971 
1972  // result = trunc(src)
1973  // if (src > 0.0 && src != result)
1974  // result += 1.0
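  // Worked example (illustrative, not in the upstream source):
  //   src =  2.3: trunc =  2.0, src > 0 && src != trunc -> result =  3.0
  //   src = -2.3: trunc = -2.0, src < 0                 -> result = -2.0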
1975 
1976  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1977 
1978  const auto Zero = B.buildFConstant(S64, 0.0);
1979  const auto One = B.buildFConstant(S64, 1.0);
1980  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1981  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1982  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1983  auto Add = B.buildSelect(S64, And, One, Zero);
1984 
1985  // TODO: Should this propagate fast-math-flags?
1986  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1987  return true;
1988 }
1989 
 1990 bool AMDGPULegalizerInfo::legalizeFrem(
 1991  MachineInstr &MI, MachineRegisterInfo &MRI,
 1992  MachineIRBuilder &B) const {
1993  Register DstReg = MI.getOperand(0).getReg();
1994  Register Src0Reg = MI.getOperand(1).getReg();
1995  Register Src1Reg = MI.getOperand(2).getReg();
1996  auto Flags = MI.getFlags();
1997  LLT Ty = MRI.getType(DstReg);
1998 
1999  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2000  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2001  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2002  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2003  MI.eraseFromParent();
2004  return true;
2005 }
2006 
 2007 static MachineInstrBuilder extractF64Exponent(Register Hi,
 2008  MachineIRBuilder &B) {
2009  const unsigned FractBits = 52;
2010  const unsigned ExpBits = 11;
2011  LLT S32 = LLT::scalar(32);
2012 
2013  auto Const0 = B.buildConstant(S32, FractBits - 32);
2014  auto Const1 = B.buildConstant(S32, ExpBits);
2015 
2016  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2017  .addUse(Hi)
2018  .addUse(Const0.getReg(0))
2019  .addUse(Const1.getReg(0));
2020 
2021  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2022 }
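// Illustrative note (not in the upstream source): the ubfe above extracts the
// 11 exponent bits starting at bit 20 of the high word (bit 52 of the full
// double) and subtracts the bias of 1023; e.g. for 8.0 the stored field is
// 1026 and the returned exponent is 3.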
2023 
 2024 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
 2025  MachineInstr &MI, MachineRegisterInfo &MRI,
 2026  MachineIRBuilder &B) const {
2027  const LLT S1 = LLT::scalar(1);
2028  const LLT S32 = LLT::scalar(32);
2029  const LLT S64 = LLT::scalar(64);
2030 
2031  Register Src = MI.getOperand(1).getReg();
2032  assert(MRI.getType(Src) == S64);
2033 
2034  // TODO: Should this use extract since the low half is unused?
2035  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2036  Register Hi = Unmerge.getReg(1);
2037 
2038  // Extract the upper half, since this is where we will find the sign and
2039  // exponent.
2040  auto Exp = extractF64Exponent(Hi, B);
2041 
2042  const unsigned FractBits = 52;
2043 
2044  // Extract the sign bit.
2045  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2046  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2047 
2048  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2049 
2050  const auto Zero32 = B.buildConstant(S32, 0);
2051 
2052  // Extend back to 64-bits.
2053  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2054 
2055  auto Shr = B.buildAShr(S64, FractMask, Exp);
2056  auto Not = B.buildNot(S64, Shr);
2057  auto Tmp0 = B.buildAnd(S64, Src, Not);
2058  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2059 
2060  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2061  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2062 
2063  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2064  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2065  MI.eraseFromParent();
2066  return true;
2067 }
2068 
 2069 bool AMDGPULegalizerInfo::legalizeITOFP(
 2070  MachineInstr &MI, MachineRegisterInfo &MRI,
 2071  MachineIRBuilder &B, bool Signed) const {
2072 
2073  Register Dst = MI.getOperand(0).getReg();
2074  Register Src = MI.getOperand(1).getReg();
2075 
2076  const LLT S64 = LLT::scalar(64);
2077  const LLT S32 = LLT::scalar(32);
2078 
2079  assert(MRI.getType(Src) == S64);
2080 
2081  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2082  auto ThirtyTwo = B.buildConstant(S32, 32);
2083 
2084  if (MRI.getType(Dst) == S64) {
2085  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2086  : B.buildUITOFP(S64, Unmerge.getReg(1));
2087 
2088  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2089  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2090  .addUse(CvtHi.getReg(0))
2091  .addUse(ThirtyTwo.getReg(0));
2092 
2093  // TODO: Should this propagate fast-math-flags?
2094  B.buildFAdd(Dst, LdExp, CvtLo);
2095  MI.eraseFromParent();
2096  return true;
2097  }
2098 
2099  assert(MRI.getType(Dst) == S32);
2100 
2101  auto One = B.buildConstant(S32, 1);
2102 
2103  MachineInstrBuilder ShAmt;
2104  if (Signed) {
2105  auto ThirtyOne = B.buildConstant(S32, 31);
2106  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2107  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2108  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2109  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2110  /*HasSideEffects=*/false)
2111  .addUse(Unmerge.getReg(1));
2112  auto LS2 = B.buildSub(S32, LS, One);
2113  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2114  } else
2115  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2116  auto Norm = B.buildShl(S64, Src, ShAmt);
2117  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2118  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2119  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2120  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2121  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2122  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2123  /*HasSideEffects=*/false)
2124  .addUse(FVal.getReg(0))
2125  .addUse(Scale.getReg(0));
2126  MI.eraseFromParent();
2127  return true;
2128 }
2129 
2130 // TODO: Copied from DAG implementation. Verify logic and document how this
2131 // actually works.
 2132 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
 2133  MachineRegisterInfo &MRI,
 2134  MachineIRBuilder &B,
 2135  bool Signed) const {
2136 
2137  Register Dst = MI.getOperand(0).getReg();
2138  Register Src = MI.getOperand(1).getReg();
2139 
2140  const LLT S64 = LLT::scalar(64);
2141  const LLT S32 = LLT::scalar(32);
2142 
2143  const LLT SrcLT = MRI.getType(Src);
2144  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2145 
2146  unsigned Flags = MI.getFlags();
2147 
2148  // The basic idea of converting a floating point number into a pair of 32-bit
2149  // integers is illustrated as follows:
2150  //
2151  // tf := trunc(val);
2152  // hif := floor(tf * 2^-32);
2153  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2154  // hi := fptoi(hif);
2155  // lo := fptoi(lof);
2156  //
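  // Worked example (illustrative, not in the upstream source), val = 2^32 + 5:
  //   tf  = 4294967301.0
  //   hif = floor(tf * 2^-32) = 1.0, lof = tf - hif * 2^32 = 5.0
  //   hi  = 1, lo = 5  ->  result = (1 << 32) | 5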
2157  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2158  MachineInstrBuilder Sign;
2159  if (Signed && SrcLT == S32) {
2160  // However, a 32-bit floating point number has only 23 bits mantissa and
2161  // it's not enough to hold all the significant bits of `lof` if val is
 2162  // negative. To avoid the loss of precision, we need to take the absolute
2163  // value after truncating and flip the result back based on the original
2164  // signedness.
2165  Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2166  Trunc = B.buildFAbs(S32, Trunc, Flags);
2167  }
2168  MachineInstrBuilder K0, K1;
2169  if (SrcLT == S64) {
2170  K0 = B.buildFConstant(S64,
2171  BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2172  K1 = B.buildFConstant(S64,
2173  BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2174  } else {
2175  K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2176  K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2177  }
2178 
2179  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2180  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2181  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2182 
2183  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2184  : B.buildFPTOUI(S32, FloorMul);
2185  auto Lo = B.buildFPTOUI(S32, Fma);
2186 
2187  if (Signed && SrcLT == S32) {
2188  // Flip the result based on the signedness, which is either all 0s or 1s.
2189  Sign = B.buildMerge(S64, {Sign, Sign});
2190  // r := xor({lo, hi}, sign) - sign;
2191  B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2192  } else
2193  B.buildMerge(Dst, {Lo, Hi});
2194  MI.eraseFromParent();
2195 
2196  return true;
2197 }
2198 
 2199 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
 2200  MachineInstr &MI) const {
2201  MachineFunction &MF = Helper.MIRBuilder.getMF();
 2202  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 2203 
2204  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2205  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2206 
2207  // With ieee_mode disabled, the instructions have the correct behavior
2208  // already for G_FMINNUM/G_FMAXNUM
2209  if (!MFI->getMode().IEEE)
2210  return !IsIEEEOp;
2211 
2212  if (IsIEEEOp)
2213  return true;
2214 
 2215  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
 2216 }
2217 
 2218 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
 2219  MachineInstr &MI, MachineRegisterInfo &MRI,
 2220  MachineIRBuilder &B) const {
2221  // TODO: Should move some of this into LegalizerHelper.
2222 
2223  // TODO: Promote dynamic indexing of s16 to s32
2224 
2225  // FIXME: Artifact combiner probably should have replaced the truncated
2226  // constant before this, so we shouldn't need
2227  // getIConstantVRegValWithLookThrough.
2228  Optional<ValueAndVReg> MaybeIdxVal =
2229  getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2230  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2231  return true;
2232  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2233 
2234  Register Dst = MI.getOperand(0).getReg();
2235  Register Vec = MI.getOperand(1).getReg();
2236 
2237  LLT VecTy = MRI.getType(Vec);
2238  LLT EltTy = VecTy.getElementType();
2239  assert(EltTy == MRI.getType(Dst));
2240 
2241  if (IdxVal < VecTy.getNumElements())
2242  B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2243  else
2244  B.buildUndef(Dst);
2245 
2246  MI.eraseFromParent();
2247  return true;
2248 }
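// Illustrative example (not in the upstream source): extracting constant
// element 2 from a <4 x s32> vector becomes a G_EXTRACT at bit offset 64,
// while a constant index past the end folds to G_IMPLICIT_DEF.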
2249 
 2250 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
 2251  MachineInstr &MI, MachineRegisterInfo &MRI,
 2252  MachineIRBuilder &B) const {
2253  // TODO: Should move some of this into LegalizerHelper.
2254 
2255  // TODO: Promote dynamic indexing of s16 to s32
2256 
2257  // FIXME: Artifact combiner probably should have replaced the truncated
2258  // constant before this, so we shouldn't need
2259  // getIConstantVRegValWithLookThrough.
2260  Optional<ValueAndVReg> MaybeIdxVal =
2261  getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2262  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2263  return true;
2264 
2265  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2266  Register Dst = MI.getOperand(0).getReg();
2267  Register Vec = MI.getOperand(1).getReg();
2268  Register Ins = MI.getOperand(2).getReg();
2269 
2270  LLT VecTy = MRI.getType(Vec);
2271  LLT EltTy = VecTy.getElementType();
2272  assert(EltTy == MRI.getType(Ins));
2273 
2274  if (IdxVal < VecTy.getNumElements())
2275  B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2276  else
2277  B.buildUndef(Dst);
2278 
2279  MI.eraseFromParent();
2280  return true;
2281 }
2282 
 2283 bool AMDGPULegalizerInfo::legalizeShuffleVector(
 2284  MachineInstr &MI, MachineRegisterInfo &MRI,
 2285  MachineIRBuilder &B) const {
2286  const LLT V2S16 = LLT::fixed_vector(2, 16);
2287 
2288  Register Dst = MI.getOperand(0).getReg();
2289  Register Src0 = MI.getOperand(1).getReg();
2290  LLT DstTy = MRI.getType(Dst);
2291  LLT SrcTy = MRI.getType(Src0);
2292 
2293  if (SrcTy == V2S16 && DstTy == V2S16 &&
2294  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2295  return true;
2296 
2297  MachineIRBuilder HelperBuilder(MI);
2298  GISelObserverWrapper DummyObserver;
2299  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
 2300  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
 2301 }
2302 
 2303 bool AMDGPULegalizerInfo::legalizeSinCos(
 2304  MachineInstr &MI, MachineRegisterInfo &MRI,
 2305  MachineIRBuilder &B) const {
2306 
2307  Register DstReg = MI.getOperand(0).getReg();
2308  Register SrcReg = MI.getOperand(1).getReg();
2309  LLT Ty = MRI.getType(DstReg);
2310  unsigned Flags = MI.getFlags();
2311 
2312  Register TrigVal;
2313  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2314  if (ST.hasTrigReducedRange()) {
2315  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2316  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2317  .addUse(MulVal.getReg(0))
2318  .setMIFlags(Flags).getReg(0);
2319  } else
2320  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2321 
2322  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2323  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2324  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2325  .addUse(TrigVal)
2326  .setMIFlags(Flags);
2327  MI.eraseFromParent();
2328  return true;
2329 }
2330 
 2331 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
 2332  MachineIRBuilder &B,
 2333  const GlobalValue *GV,
2334  int64_t Offset,
2335  unsigned GAFlags) const {
2336  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2337  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2338  // to the following code sequence:
2339  //
2340  // For constant address space:
2341  // s_getpc_b64 s[0:1]
2342  // s_add_u32 s0, s0, $symbol
2343  // s_addc_u32 s1, s1, 0
2344  //
2345  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2346  // a fixup or relocation is emitted to replace $symbol with a literal
2347  // constant, which is a pc-relative offset from the encoding of the $symbol
2348  // operand to the global variable.
2349  //
2350  // For global address space:
2351  // s_getpc_b64 s[0:1]
2352  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2353  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2354  //
2355  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2356  // fixups or relocations are emitted to replace $symbol@*@lo and
2357  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2358  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2359  // operand to the global variable.
2360  //
2361  // What we want here is an offset from the value returned by s_getpc
2362  // (which is the address of the s_add_u32 instruction) to the global
2363  // variable, but since the encoding of $symbol starts 4 bytes after the start
2364  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2365  // small. This requires us to add 4 to the global variable offset in order to
2366  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2367  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2368  // instruction.
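  // Illustrative note (not in the upstream source): this is why the
  // addGlobalAddress calls below use Offset + 4 for the @lo half and
  // Offset + 12 for the @hi half of the relocation.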
2369 
2370  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2371 
2372  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2373  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2374 
2375  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2376  .addDef(PCReg);
2377 
2378  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2379  if (GAFlags == SIInstrInfo::MO_NONE)
2380  MIB.addImm(0);
2381  else
2382  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2383 
2384  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2385 
2386  if (PtrTy.getSizeInBits() == 32)
2387  B.buildExtract(DstReg, PCReg, 0);
2388  return true;
2389  }
2390 
 2391 bool AMDGPULegalizerInfo::legalizeGlobalValue(
 2392  MachineInstr &MI, MachineRegisterInfo &MRI,
 2393  MachineIRBuilder &B) const {
2394  Register DstReg = MI.getOperand(0).getReg();
2395  LLT Ty = MRI.getType(DstReg);
2396  unsigned AS = Ty.getAddressSpace();
2397 
2398  const GlobalValue *GV = MI.getOperand(1).getGlobal();
2399  MachineFunction &MF = B.getMF();
 2400  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 2401 
 2402  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
 2403  if (!MFI->isModuleEntryFunction() &&
2404  !GV->getName().equals("llvm.amdgcn.module.lds")) {
2405  const Function &Fn = MF.getFunction();
2406  DiagnosticInfoUnsupported BadLDSDecl(
2407  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2408  DS_Warning);
2409  Fn.getContext().diagnose(BadLDSDecl);
2410 
2411  // We currently don't have a way to correctly allocate LDS objects that
2412  // aren't directly associated with a kernel. We do force inlining of
2413  // functions that use local objects. However, if these dead functions are
2414  // not eliminated, we don't want a compile time error. Just emit a warning
2415  // and a trap, since there should be no callable path here.
2416  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2417  B.buildUndef(DstReg);
2418  MI.eraseFromParent();
2419  return true;
2420  }
2421 
2422  // TODO: We could emit code to handle the initialization somewhere.
2423  // We ignore the initializer for now and legalize it to allow selection.
2424  // The initializer will anyway get errored out during assembly emission.
2425  const SITargetLowering *TLI = ST.getTargetLowering();
2426  if (!TLI->shouldUseLDSConstAddress(GV)) {
2427  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2428  return true; // Leave in place;
2429  }
2430 
2431  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2432  Type *Ty = GV->getValueType();
2433  // HIP uses an unsized array `extern __shared__ T s[]` or similar
2434  // zero-sized type in other languages to declare the dynamic shared
 2435  // memory whose size is not known at compile time. They will be
2436  // allocated by the runtime and placed directly after the static
2437  // allocated ones. They all share the same offset.
2438  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2439  // Adjust alignment for that dynamic shared memory array.
2440  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2441  LLT S32 = LLT::scalar(32);
2442  auto Sz =
2443  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2444  B.buildIntToPtr(DstReg, Sz);
2445  MI.eraseFromParent();
2446  return true;
2447  }
2448  }
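  // Illustrative note (not in the upstream source): for HIP's
  //   extern __shared__ int dynLDS[];
  // the value materialized above is @llvm.amdgcn.groupstaticsize, i.e. the
  // dynamic LDS block begins right after all statically allocated LDS.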
2449 
2450  B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2451  *cast<GlobalVariable>(GV)));
2452  MI.eraseFromParent();
2453  return true;
2454  }
2455 
2456  const SITargetLowering *TLI = ST.getTargetLowering();
2457 
2458  if (TLI->shouldEmitFixup(GV)) {
2459  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2460  MI.eraseFromParent();
2461  return true;
2462  }
2463 
2464  if (TLI->shouldEmitPCReloc(GV)) {
2465  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2466  MI.eraseFromParent();
2467  return true;
2468  }
2469 
 2470  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
 2471  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2472 
2473  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
 2474  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
 2475  MachinePointerInfo::getGOT(MF),
 2476  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
 2477  MachineMemOperand::MOInvariant,
 2478  LoadTy, Align(8));
2479 
2480  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2481 
2482  if (Ty.getSizeInBits() == 32) {
2483  // Truncate if this is a 32-bit constant address.
2484  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2485  B.buildExtract(DstReg, Load, 0);
2486  } else
2487  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2488 
2489  MI.eraseFromParent();
2490  return true;
2491 }
2492 
 2493 static LLT widenToNextPowerOf2(LLT Ty) {
 2494  if (Ty.isVector())
 2495  return Ty.changeElementCount(
 2496  ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
 2497  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2498 }
2499 
 2500 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
 2501  MachineInstr &MI) const {
2502  MachineIRBuilder &B = Helper.MIRBuilder;
2503  MachineRegisterInfo &MRI = *B.getMRI();
2504  GISelChangeObserver &Observer = Helper.Observer;
2505 
2506  Register PtrReg = MI.getOperand(1).getReg();
2507  LLT PtrTy = MRI.getType(PtrReg);
2508  unsigned AddrSpace = PtrTy.getAddressSpace();
2509 
2510  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
 2511  const LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
 2512  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2513  Observer.changingInstr(MI);
2514  MI.getOperand(1).setReg(Cast.getReg(0));
2515  Observer.changedInstr(MI);
2516  return true;
2517  }
2518 
2519  if (MI.getOpcode() != AMDGPU::G_LOAD)
2520  return false;
2521 
2522  Register ValReg = MI.getOperand(0).getReg();
2523  LLT ValTy = MRI.getType(ValReg);
2524 
2525  MachineMemOperand *MMO = *MI.memoperands_begin();
2526  const unsigned ValSize = ValTy.getSizeInBits();
2527  const LLT MemTy = MMO->getMemoryType();
2528  const Align MemAlign = MMO->getAlign();
2529  const unsigned MemSize = MemTy.getSizeInBits();
2530  const unsigned AlignInBits = 8 * MemAlign.value();
2531 
2532  // Widen non-power-of-2 loads to the alignment if needed
2533  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2534  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2535 
2536  // This was already the correct extending load result type, so just adjust
2537  // the memory type.
2538  if (WideMemSize == ValSize) {
2539  MachineFunction &MF = B.getMF();
2540 
2541  MachineMemOperand *WideMMO =
2542  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2543  Observer.changingInstr(MI);
2544  MI.setMemRefs(MF, {WideMMO});
2545  Observer.changedInstr(MI);
2546  return true;
2547  }
2548 
2549  // Don't bother handling edge case that should probably never be produced.
2550  if (ValSize > WideMemSize)
2551  return false;
2552 
2553  LLT WideTy = widenToNextPowerOf2(ValTy);
2554 
2555  Register WideLoad;
2556  if (!WideTy.isVector()) {
2557  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2558  B.buildTrunc(ValReg, WideLoad).getReg(0);
2559  } else {
2560  // Extract the subvector.
2561 
2562  if (isRegisterType(ValTy)) {
2563  // If this a case where G_EXTRACT is legal, use it.
2564  // (e.g. <3 x s32> -> <4 x s32>)
2565  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2566  B.buildExtract(ValReg, WideLoad, 0);
2567  } else {
2568  // For cases where the widened type isn't a nice register value, unmerge
2569  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2570  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2571  WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2572  B.setInsertPt(B.getMBB(), MI.getIterator());
2573  B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2574  }
2575  }
2576 
2577  MI.eraseFromParent();
2578  return true;
2579  }
2580 
2581  return false;
2582 }
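// Illustrative example (not in the upstream source): an underaligned 96-bit
// <3 x s32> load may be widened to a 128-bit load whose low 96 bits are then
// taken with G_EXTRACT, as in the isRegisterType() branch above.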
2583 
 2584 bool AMDGPULegalizerInfo::legalizeFMad(
 2585  MachineInstr &MI, MachineRegisterInfo &MRI,
 2586  MachineIRBuilder &B) const {
2587  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2588  assert(Ty.isScalar());
2589 
2590  MachineFunction &MF = B.getMF();
 2591  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 2592 
2593  // TODO: Always legal with future ftz flag.
2594  // FIXME: Do we need just output?
2595  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2596  return true;
2597  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2598  return true;
2599 
2600  MachineIRBuilder HelperBuilder(MI);
2601  GISelObserverWrapper DummyObserver;
2602  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2603  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2604 }
2605 
 2606 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
 2607  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
 2608  Register DstReg = MI.getOperand(0).getReg();
2609  Register PtrReg = MI.getOperand(1).getReg();
2610  Register CmpVal = MI.getOperand(2).getReg();
2611  Register NewVal = MI.getOperand(3).getReg();
2612 
2614  "this should not have been custom lowered");
2615 
2616  LLT ValTy = MRI.getType(CmpVal);
2617  LLT VecTy = LLT::fixed_vector(2, ValTy);
2618 
2619  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2620 
2621  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2622  .addDef(DstReg)
2623  .addUse(PtrReg)
2624  .addUse(PackedVal)
2625  .setMemRefs(MI.memoperands());
2626 
2627  MI.eraseFromParent();
2628  return true;
2629 }
2630 
 2631 bool AMDGPULegalizerInfo::legalizeFlog(
 2632  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2633  Register Dst = MI.getOperand(0).getReg();
2634  Register Src = MI.getOperand(1).getReg();
2635  LLT Ty = B.getMRI()->getType(Dst);
2636  unsigned Flags = MI.getFlags();
2637 
2638  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2639  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2640 
2641  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2642  MI.eraseFromParent();
2643  return true;
2644 }
2645 
 2646 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
 2647  MachineIRBuilder &B) const {
2648  Register Dst = MI.getOperand(0).getReg();
2649  Register Src = MI.getOperand(1).getReg();
2650  unsigned Flags = MI.getFlags();
2651  LLT Ty = B.getMRI()->getType(Dst);
2652 
2653  auto K = B.buildFConstant(Ty, numbers::log2e);
2654  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2655  B.buildFExp2(Dst, Mul, Flags);
2656  MI.eraseFromParent();
2657  return true;
2658 }
2659 
 2660 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
 2661  MachineIRBuilder &B) const {
2662  Register Dst = MI.getOperand(0).getReg();
2663  Register Src0 = MI.getOperand(1).getReg();
2664  Register Src1 = MI.getOperand(2).getReg();
2665  unsigned Flags = MI.getFlags();
2666  LLT Ty = B.getMRI()->getType(Dst);
2667  const LLT S16 = LLT::scalar(16);
2668  const LLT S32 = LLT::scalar(32);
2669 
2670  if (Ty == S32) {
2671  auto Log = B.buildFLog2(S32, Src0, Flags);
2672  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2673  .addUse(Log.getReg(0))
2674  .addUse(Src1)
2675  .setMIFlags(Flags);
2676  B.buildFExp2(Dst, Mul, Flags);
2677  } else if (Ty == S16) {
2678  // There's no f16 fmul_legacy, so we need to convert for it.
2679  auto Log = B.buildFLog2(S16, Src0, Flags);
2680  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2681  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2682  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2683  .addUse(Ext0.getReg(0))
2684  .addUse(Ext1.getReg(0))
2685  .setMIFlags(Flags);
2686 
2687  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2688  } else
2689  return false;
2690 
2691  MI.eraseFromParent();
2692  return true;
2693 }
2694 
2695 // Find a source register, ignoring any possible source modifiers.
 2696 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
 2697  Register ModSrc = OrigSrc;
2698  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2699  ModSrc = SrcFNeg->getOperand(1).getReg();
2700  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2701  ModSrc = SrcFAbs->getOperand(1).getReg();
2702  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2703  ModSrc = SrcFAbs->getOperand(1).getReg();
2704  return ModSrc;
2705 }
2706 
 2707 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
 2708  MachineRegisterInfo &MRI,
 2709  MachineIRBuilder &B) const {
2710 
2711  const LLT S1 = LLT::scalar(1);
2712  const LLT S64 = LLT::scalar(64);
2713  Register Dst = MI.getOperand(0).getReg();
2714  Register OrigSrc = MI.getOperand(1).getReg();
2715  unsigned Flags = MI.getFlags();
2716  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2717  "this should not have been custom lowered");
2718 
2719  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2720  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2721  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2722  // V_FRACT bug is:
2723  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2724  //
2725  // Convert floor(x) to (x - fract(x))
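  // Worked example (illustrative, not in the upstream source), x = -1.25:
  //   fract(x) = x - floor(x) = 0.75, so x - fract(x) = -2.0 = floor(x).
  // The min with 0x3fefffffffffffff (the largest double below 1.0) clamps the
  // buggy case where V_FRACT could return 1.0.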
2726 
2727  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2728  .addUse(OrigSrc)
2729  .setMIFlags(Flags);
2730 
2731  // Give source modifier matching some assistance before obscuring a foldable
2732  // pattern.
2733 
2734  // TODO: We can avoid the neg on the fract? The input sign to fract
2735  // shouldn't matter?
2736  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2737 
2738  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2739 
 2740  Register Min = MRI.createGenericVirtualRegister(S64);
 2741 
2742  // We don't need to concern ourselves with the snan handling difference, so
2743  // use the one which will directly select.
2744  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2745  if (MFI->getMode().IEEE)
2746  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2747  else
2748  B.buildFMinNum(Min, Fract, Const, Flags);
2749 
2750  Register CorrectedFract = Min;
2751  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2752  auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2753  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2754  }
2755 
2756  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2757  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2758 
2759  MI.eraseFromParent();
2760  return true;
2761 }
2762 
2763 // Turn an illegal packed v2s16 build vector into bit operations.
2764 // TODO: This should probably be a bitcast action in LegalizerHelper.
 2765 bool AMDGPULegalizerInfo::legalizeBuildVector(
 2766  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
 2767  Register Dst = MI.getOperand(0).getReg();
2768  const LLT S32 = LLT::scalar(32);
2769  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2770 
2771  Register Src0 = MI.getOperand(1).getReg();
2772  Register Src1 = MI.getOperand(2).getReg();
2773  assert(MRI.getType(Src0) == LLT::scalar(16));
2774 
2775  auto Merge = B.buildMerge(S32, {Src0, Src1});
2776  B.buildBitcast(Dst, Merge);
2777 
2778  MI.eraseFromParent();
2779  return true;
2780 }
2781 
2782 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2783 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2784 // case with a single min instruction instead of a compare+select.
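// Illustrative example (not in the upstream source): for a zero s32 input,
// ffbh/ffbl produce -1 (0xffffffff), and umin(0xffffffff, 32) == 32, which is
// the expected ctlz(0)/cttz(0) result, so no compare+select is needed.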
 2785 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
 2786  MachineRegisterInfo &MRI,
 2787  MachineIRBuilder &B) const {
2788  Register Dst = MI.getOperand(0).getReg();
2789  Register Src = MI.getOperand(1).getReg();
2790  LLT DstTy = MRI.getType(Dst);
2791  LLT SrcTy = MRI.getType(Src);
2792 
2793  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2794  ? AMDGPU::G_AMDGPU_FFBH_U32
2795  : AMDGPU::G_AMDGPU_FFBL_B32;
2796  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2797  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2798 
2799  MI.eraseFromParent();
2800  return true;
2801 }
2802 
2803 // Check that this is a G_XOR x, -1
2804 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2805  if (MI.getOpcode() != TargetOpcode::G_XOR)
2806  return false;
2807  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2808  return ConstVal && *ConstVal == -1;
2809 }
2810 
2811 // Return the use branch instruction, otherwise null if the usage is invalid.
2812 static MachineInstr *
 2813 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
 2814  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2815  Register CondDef = MI.getOperand(0).getReg();
2816  if (!MRI.hasOneNonDBGUse(CondDef))
2817  return nullptr;
2818 
2819  MachineBasicBlock *Parent = MI.getParent();
 2820  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
 2821 
2822  if (isNot(MRI, *UseMI)) {
2823  Register NegatedCond = UseMI->getOperand(0).getReg();
2824  if (!MRI.hasOneNonDBGUse(NegatedCond))
2825  return nullptr;
2826 
2827  // We're deleting the def of this value, so we need to remove it.
2828  eraseInstr(*UseMI, MRI);
2829 
2830  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2831  Negated = true;
2832  }
2833 
2834  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2835  return nullptr;
2836 
2837  // Make sure the cond br is followed by a G_BR, or is the last instruction.
2838  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2839  if (Next == Parent->end()) {
2840  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2841  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2842  return nullptr;
2843  UncondBrTarget = &*NextMBB;
2844  } else {
2845  if (Next->getOpcode() != AMDGPU::G_BR)
2846  return nullptr;
2847  Br = &*Next;
2848  UncondBrTarget = Br->getOperand(0).getMBB();
2849  }
2850 
2851  return UseMI;
2852 }
2853 
 2854 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
 2855  const ArgDescriptor *Arg,
2856  const TargetRegisterClass *ArgRC,
2857  LLT ArgTy) const {
2858  MCRegister SrcReg = Arg->getRegister();
2859  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2860  assert(DstReg.isVirtual() && "Virtual register expected");
2861 
2862  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2863  ArgTy);
2864  if (Arg->isMasked()) {
2865  // TODO: Should we try to emit this once in the entry block?
2866  const LLT S32 = LLT::scalar(32);
2867  const unsigned Mask = Arg->getMask();
2868  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2869 
2870  Register AndMaskSrc = LiveIn;
2871 
2872  if (Shift != 0) {
2873  auto ShiftAmt = B.buildConstant(S32, Shift);
2874  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2875  }
2876 
2877  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2878  } else {
2879  B.buildCopy(DstReg, LiveIn);
2880  }
2881 
2882  return true;
2883 }
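// Illustrative example (not in the upstream source): for an argument packed
// into bits [29:20] of its live-in register, Mask == 0x3ff << 20, so the code
// above emits a logical shift right by 20 followed by an and with 0x3ff.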
2884 
 2885 bool AMDGPULegalizerInfo::loadInputValue(
 2886  Register DstReg, MachineIRBuilder &B,
2887  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2888  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2889  const ArgDescriptor *Arg;
2890  const TargetRegisterClass *ArgRC;
2891  LLT ArgTy;
2892  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2893 
2894  if (!Arg) {
 2895  if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
 2896  // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2897  // case the pointer argument may be missing and we use null.
2898  B.buildConstant(DstReg, 0);
2899  return true;
2900  }
2901 
2902  // It's undefined behavior if a function marked with the amdgpu-no-*
2903  // attributes uses the corresponding intrinsic.
2904  B.buildUndef(DstReg);
2905  return true;
2906  }
2907 
2908  if (!Arg->isRegister() || !Arg->getRegister().isValid())
2909  return false; // TODO: Handle these
2910  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2911 }
2912 
 2913 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
 2914  MachineInstr &MI, MachineIRBuilder &B,
 2915  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2916  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2917  return false;
2918 
2919  MI.eraseFromParent();
2920  return true;
2921 }
2922 
 2923 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
 2924  MachineRegisterInfo &MRI,
 2925  MachineIRBuilder &B) const {
2926  Register Dst = MI.getOperand(0).getReg();
2927  LLT DstTy = MRI.getType(Dst);
2928  LLT S16 = LLT::scalar(16);
2929  LLT S32 = LLT::scalar(32);
2930  LLT S64 = LLT::scalar(64);
2931 
2932  if (DstTy == S16)
2933  return legalizeFDIV16(MI, MRI, B);
2934  if (DstTy == S32)
2935  return legalizeFDIV32(MI, MRI, B);
2936  if (DstTy == S64)
2937  return legalizeFDIV64(MI, MRI, B);
2938 
2939  return false;
2940 }
2941 
 2942 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
 2943  Register DstDivReg,
2944  Register DstRemReg,
2945  Register X,
2946  Register Y) const {
2947  const LLT S1 = LLT::scalar(1);
2948  const LLT S32 = LLT::scalar(32);
2949 
2950  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2951  // algorithm used here.
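  // Sketch of the expansion below (illustrative, not in the upstream source):
  //   z  = (u32)(2^32 * (1.0f / y));     // reciprocal estimate via RCP
  //   z += umulh(z, z * -y);             // one Newton-Raphson style step
  //   q  = umulh(x, z); r = x - q * y;   // initial quotient/remainder
  //   if (r >= y) { ++q; r -= y; }       // at most two refinements
  //   if (r >= y) { ++q; r -= y; }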
2952 
2953  // Initial estimate of inv(y).
2954  auto FloatY = B.buildUITOFP(S32, Y);
2955  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2956  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2957  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2958  auto Z = B.buildFPTOUI(S32, ScaledY);
2959 
2960  // One round of UNR.
2961  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2962  auto NegYZ = B.buildMul(S32, NegY, Z);
2963  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2964 
2965  // Quotient/remainder estimate.
2966  auto Q = B.buildUMulH(S32, X, Z);
2967  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2968 
2969  // First quotient/remainder refinement.
2970  auto One = B.buildConstant(S32, 1);
2971  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2972  if (DstDivReg)
2973  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2974  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2975 
2976  // Second quotient/remainder refinement.
2977  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2978  if (DstDivReg)
2979  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2980 
2981  if (DstRemReg)
2982  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2983 }
2984 
2985 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2986 //
2987 // Return lo, hi of result
2988 //
2989 // %cvt.lo = G_UITOFP Val.lo
2990 // %cvt.hi = G_UITOFP Val.hi
2991 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2992 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2993 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2994 // %mul2 = G_FMUL %mul1, 2**(-32)
2995 // %trunc = G_INTRINSIC_TRUNC %mul2
2996 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2997 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
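// In other words (illustrative, not in the upstream source), the sequence
// approximates 2^64 / Val and returns it split into {low 32 bits, high 32
// bits}; the 64-bit division below refines this seed value.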
2998 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2999  Register Val) {
3000  const LLT S32 = LLT::scalar(32);
3001  auto Unmerge = B.buildUnmerge(S32, Val);
3002 
3003  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3004  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3005 
3006  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3007  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3008 
3009  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3010  auto Mul1 =
3011  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3012 
3013  // 2**(-32)
3014  auto Mul2 =
3015  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3016  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3017 
3018  // -(2**32)
3019  auto Mad2 = B.buildFMAD(S32, Trunc,
3020  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3021 
3022  auto ResultLo = B.buildFPTOUI(S32, Mad2);
3023  auto ResultHi = B.buildFPTOUI(S32, Trunc);
3024 
3025  return {ResultLo.getReg(0), ResultHi.getReg(0)};
3026 }
3027 
 3028 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
 3029  Register DstDivReg,
3030  Register DstRemReg,
3031  Register Numer,
3032  Register Denom) const {
3033  const LLT S32 = LLT::scalar(32);
3034  const LLT S64 = LLT::scalar(64);
3035  const LLT S1 = LLT::scalar(1);
3036  Register RcpLo, RcpHi;
3037 
3038  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3039 
3040  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3041 
3042  auto Zero64 = B.buildConstant(S64, 0);
3043  auto NegDenom = B.buildSub(S64, Zero64, Denom);
3044 
3045  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3046  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3047 
3048  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3049  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3050  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3051 
3052  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3053  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3054  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3055  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3056 
3057  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3058  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3059  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3060  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3061  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3062 
3063  auto Zero32 = B.buildConstant(S32, 0);
3064  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3065  auto Add2_HiC =
3066  B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3067  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3068  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3069 
3070  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3071  Register NumerLo = UnmergeNumer.getReg(0);
3072  Register NumerHi = UnmergeNumer.getReg(1);
3073 
3074  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3075  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3076  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3077  Register Mul3_Lo = UnmergeMul3.getReg(0);
3078  Register Mul3_Hi = UnmergeMul3.getReg(1);
3079  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3080  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3081  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3082  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3083 
3084  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3085  Register DenomLo = UnmergeDenom.getReg(0);
3086  Register DenomHi = UnmergeDenom.getReg(1);
3087 
3088  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3089  auto C1 = B.buildSExt(S32, CmpHi);
3090 
3091  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3092  auto C2 = B.buildSExt(S32, CmpLo);
3093 
3094  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3095  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3096 
3097  // TODO: Here and below portions of the code can be enclosed into if/endif.
3098  // Currently control flow is unconditional and we have 4 selects after
3099  // potential endif to substitute PHIs.
3100 
3101  // if C3 != 0 ...
3102  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3103  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3104  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3105  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3106 
3107  auto One64 = B.buildConstant(S64, 1);
3108  auto Add3 = B.buildAdd(S64, MulHi3, One64);
3109 
3110  auto C4 =
3111  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3112  auto C5 =
3113  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3114  auto C6 = B.buildSelect(
3115  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3116 
3117  // if (C6 != 0)
3118  auto Add4 = B.buildAdd(S64, Add3, One64);
3119  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3120 
3121  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3122  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3123  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3124 
3125  // endif C6
3126  // endif C3
3127 
3128  if (DstDivReg) {
3129  auto Sel1 = B.buildSelect(
3130  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3131  B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3132  Sel1, MulHi3);
3133  }
3134 
3135  if (DstRemReg) {
3136  auto Sel2 = B.buildSelect(
3137  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3138  B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3139  Sel2, Sub1);
3140  }
3141 }
3142 
 3143 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
 3144  MachineRegisterInfo &MRI,
 3145  MachineIRBuilder &B) const {
3146  Register DstDivReg, DstRemReg;
3147  switch (MI.getOpcode()) {
3148  default:
3149  llvm_unreachable("Unexpected opcode!");
3150  case AMDGPU::G_UDIV: {
3151  DstDivReg = MI.getOperand(0).getReg();
3152  break;
3153  }
3154  case AMDGPU::G_UREM: {
3155  DstRemReg = MI.getOperand(0).getReg();
3156  break;
3157  }
3158  case AMDGPU::G_UDIVREM: {
3159  DstDivReg = MI.getOperand(0).getReg();
3160  DstRemReg = MI.getOperand(1).getReg();
3161  break;
3162  }
3163  }
3164 
3165  const LLT S64 = LLT::scalar(64);
3166  const LLT S32 = LLT::scalar(32);
3167  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3168  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3169  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3170  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3171 
3172  if (Ty == S32)
3173  legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3174  else if (Ty == S64)
3175  legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3176  else
3177  return false;
3178 
3179  MI.eraseFromParent();
3180  return true;
3181 }
3182 
 3183 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
 3184  MachineRegisterInfo &MRI,
 3185  MachineIRBuilder &B) const {
3186  const LLT S64 = LLT::scalar(64);
3187  const LLT S32 = LLT::scalar(32);
3188 
3189  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3190  if (Ty != S32 && Ty != S64)
3191  return false;
3192 
3193  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3194  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3195  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3196 
3197  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3198  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3199  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3200 
3201  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3202  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3203 
3204  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3205  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3206 
3207  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3208  switch (MI.getOpcode()) {
3209  default:
3210  llvm_unreachable("Unexpected opcode!");
3211  case AMDGPU::G_SDIV: {
3212  DstDivReg = MI.getOperand(0).getReg();
3213  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3214  break;
3215  }
3216  case AMDGPU::G_SREM: {
3217  DstRemReg = MI.getOperand(0).getReg();
3218  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3219  break;
3220  }
3221  case AMDGPU::G_SDIVREM: {
3222  DstDivReg = MI.getOperand(0).getReg();
3223  DstRemReg = MI.getOperand(1).getReg();
3224  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3225  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3226  break;
3227  }
3228  }
3229 
3230  if (Ty == S32)
3231  legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3232  else
3233  legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3234 
3235  if (DstDivReg) {
3236  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3237  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3238  B.buildSub(DstDivReg, SignXor, Sign);
3239  }
3240 
3241  if (DstRemReg) {
3242  auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3243  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3244  B.buildSub(DstRemReg, SignXor, Sign);
3245  }
3246 
3247  MI.eraseFromParent();
3248  return true;
3249 }
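// Sketch of the sign handling above (illustrative, not in the upstream
// source), with sgn(v) = v >> (bits - 1), i.e. all ones when v is negative:
//   |v| = (v + sgn(v)) ^ sgn(v)
//   div = ((|x| udiv |y|) ^ (sgn(x) ^ sgn(y))) - (sgn(x) ^ sgn(y))
//   rem = ((|x| urem |y|) ^ sgn(x)) - sgn(x)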
3250 
 3251 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
 3252  MachineRegisterInfo &MRI,
 3253  MachineIRBuilder &B) const {
3254  Register Res = MI.getOperand(0).getReg();
3255  Register LHS = MI.getOperand(1).getReg();
3256  Register RHS = MI.getOperand(2).getReg();
3257  uint16_t Flags = MI.getFlags();
3258  LLT ResTy = MRI.getType(Res);
3259 
3260  const MachineFunction &MF = B.getMF();
3261  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3262  MI.getFlag(MachineInstr::FmAfn);
3263 
3264  if (!AllowInaccurateRcp)
3265  return false;
3266 
3267  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3268  // 1 / x -> RCP(x)
3269  if (CLHS->isExactlyValue(1.0)) {
3270  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3271  .addUse(RHS)
3272  .setMIFlags(Flags);
3273 
3274  MI.eraseFromParent();
3275  return true;
3276  }
3277 
3278  // -1 / x -> RCP( FNEG(x) )
3279  if (CLHS->isExactlyValue(-1.0)) {
3280  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3281  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3282  .addUse(FNeg.getReg(0))
3283  .setMIFlags(Flags);
3284 
3285  MI.eraseFromParent();
3286  return true;
3287  }
3288  }
3289 
3290  // x / y -> x * (1.0 / y)
3291  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3292  .addUse(RHS)
3293  .setMIFlags(Flags);
3294  B.buildFMul(Res, LHS, RCP, Flags);
3295 
3296  MI.eraseFromParent();
3297  return true;
3298 }
3299 
 3300 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
 3301  MachineRegisterInfo &MRI,
 3302  MachineIRBuilder &B) const {
3303  Register Res = MI.getOperand(0).getReg();
3304  Register X = MI.getOperand(1).getReg();
3305  Register Y = MI.getOperand(2).getReg();
3306  uint16_t Flags = MI.getFlags();
3307  LLT ResTy = MRI.getType(Res);
3308 
3309  const MachineFunction &MF = B.getMF();
3310  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3311  MI.getFlag(MachineInstr::FmAfn);
3312 
3313  if (!AllowInaccurateRcp)
3314  return false;
3315 
3316  auto NegY = B.buildFNeg(ResTy, Y);
3317  auto One = B.buildFConstant(ResTy, 1.0);
3318 
3319  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3320  .addUse(Y)
3321  .setMIFlags(Flags);
3322 
3323  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3324  R = B.buildFMA(ResTy, Tmp0, R, R);
3325 
3326  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3327  R = B.buildFMA(ResTy, Tmp1, R, R);
3328 
3329  auto Ret = B.buildFMul(ResTy, X, R);
3330  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3331 
3332  B.buildFMA(Res, Tmp2, R, Ret);
3333  MI.eraseFromParent();
3334  return true;
3335 }
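// Scalar sketch of the refinement above (illustrative, not in the upstream
// source), with r0 = rcp(y):
//   r1 = fma(fma(-y, r0, 1.0), r0, r0)   // first Newton-Raphson step
//   r2 = fma(fma(-y, r1, 1.0), r1, r1)   // second step
//   q  = x * r2
//   q  = fma(fma(-y, q, x), r2, q)       // final residual correction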
3336 
 3337 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
 3338  MachineRegisterInfo &MRI,
 3339  MachineIRBuilder &B) const {
3340  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3341  return true;
3342 
3343  Register Res = MI.getOperand(0).getReg();
3344  Register LHS = MI.getOperand(1).getReg();
3345  Register RHS = MI.getOperand(2).getReg();
3346 
3347  uint16_t Flags = MI.getFlags();
3348 
3349  LLT S16 = LLT::scalar(16);
3350  LLT S32 = LLT::scalar(32);
3351 
3352  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3353  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3354 
3355  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3356  .addUse(RHSExt.getReg(0))
3357  .setMIFlags(Flags);
3358 
3359  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3360  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3361 
3362  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3363  .addUse(RDst.getReg(0))
3364  .addUse(RHS)
3365  .addUse(LHS)
3366  .setMIFlags(Flags);
3367 
3368  MI.eraseFromParent();
3369  return true;
3370 }
3371 
3372 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3373 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3374 static void toggleSPDenormMode(bool Enable,
3375  MachineIRBuilder &B,
3376  const GCNSubtarget &ST,
3377  AMDGPU::SIModeRegisterDefaults Mode) {
3378  // Set SP denorm mode to this value.
3379  unsigned SPDenormMode =
3380  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3381 
3382  if (ST.hasDenormModeInst()) {
3383  // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3384  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3385 
3386  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3387  B.buildInstr(AMDGPU::S_DENORM_MODE)
3388  .addImm(NewDenormModeValue);
3389 
3390  } else {
3391  // Select FP32 bit field in mode register.
3392  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3393  (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3394  (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3395 
3396  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3397  .addImm(SPDenormMode)
3398  .addImm(SPDenormModeBitField);
3399  }
3400 }
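
// --- Illustrative aside (not part of the original file) ---
// How the S_DENORM_MODE immediate built above is laid out: the FP32 (SP) field
// occupies bits [1:0] and the FP64/FP16 (DP) field bits [3:2], which is why the
// DP default is shifted left by 2 before being OR'd in. Sketch only:
#include <cstdint>

static uint32_t packDenormModeImmSketch(uint32_t SPDenormMode,
                                        uint32_t DPDenormMode) {
  return (SPDenormMode & 0x3) | ((DPDenormMode & 0x3) << 2);
}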
3401 
3402 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3403  MachineRegisterInfo &MRI,
3404  MachineIRBuilder &B) const {
3405  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3406  return true;
3407 
3408  Register Res = MI.getOperand(0).getReg();
3409  Register LHS = MI.getOperand(1).getReg();
3410  Register RHS = MI.getOperand(2).getReg();
3411  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3412  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3413 
3414  uint16_t Flags = MI.getFlags();
3415 
3416  LLT S32 = LLT::scalar(32);
3417  LLT S1 = LLT::scalar(1);
3418 
3419  auto One = B.buildFConstant(S32, 1.0f);
3420 
3421  auto DenominatorScaled =
3422  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3423  .addUse(LHS)
3424  .addUse(RHS)
3425  .addImm(0)
3426  .setMIFlags(Flags);
3427  auto NumeratorScaled =
3428  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3429  .addUse(LHS)
3430  .addUse(RHS)
3431  .addImm(1)
3432  .setMIFlags(Flags);
3433 
3434  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3435  .addUse(DenominatorScaled.getReg(0))
3436  .setMIFlags(Flags);
3437  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3438 
3439  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3440  // aren't modeled as reading it.
3441  if (!Mode.allFP32Denormals())
3442  toggleSPDenormMode(true, B, ST, Mode);
3443 
3444  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3445  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3446  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3447  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3448  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3449  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3450 
3451  if (!Mode.allFP32Denormals())
3452  toggleSPDenormMode(false, B, ST, Mode);
3453 
3454  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3455  .addUse(Fma4.getReg(0))
3456  .addUse(Fma1.getReg(0))
3457  .addUse(Fma3.getReg(0))
3458  .addUse(NumeratorScaled.getReg(1))
3459  .setMIFlags(Flags);
3460 
3461  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3462  .addUse(Fmas.getReg(0))
3463  .addUse(RHS)
3464  .addUse(LHS)
3465  .setMIFlags(Flags);
3466 
3467  MI.eraseFromParent();
3468  return true;
3469 }
3470 
3471 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3472  MachineRegisterInfo &MRI,
3473  MachineIRBuilder &B) const {
3474  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3475  return true;
3476 
3477  Register Res = MI.getOperand(0).getReg();
3478  Register LHS = MI.getOperand(1).getReg();
3479  Register RHS = MI.getOperand(2).getReg();
3480 
3481  uint16_t Flags = MI.getFlags();
3482 
3483  LLT S64 = LLT::scalar(64);
3484  LLT S1 = LLT::scalar(1);
3485 
3486  auto One = B.buildFConstant(S64, 1.0);
3487 
3488  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3489  .addUse(LHS)
3490  .addUse(RHS)
3491  .addImm(0)
3492  .setMIFlags(Flags);
3493 
3494  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3495 
3496  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3497  .addUse(DivScale0.getReg(0))
3498  .setMIFlags(Flags);
3499 
3500  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3501  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3502  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3503 
3504  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3505  .addUse(LHS)
3506  .addUse(RHS)
3507  .addImm(1)
3508  .setMIFlags(Flags);
3509 
3510  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3511  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3512  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3513 
3514  Register Scale;
3515  if (!ST.hasUsableDivScaleConditionOutput()) {
3516  // Workaround a hardware bug on SI where the condition output from div_scale
3517  // is not usable.
3518 
3519  LLT S32 = LLT::scalar(32);
3520 
3521  auto NumUnmerge = B.buildUnmerge(S32, LHS);
3522  auto DenUnmerge = B.buildUnmerge(S32, RHS);
3523  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3524  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3525 
3526  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3527  Scale1Unmerge.getReg(1));
3528  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3529  Scale0Unmerge.getReg(1));
3530  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3531  } else {
3532  Scale = DivScale1.getReg(1);
3533  }
3534 
3535  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3536  .addUse(Fma4.getReg(0))
3537  .addUse(Fma3.getReg(0))
3538  .addUse(Mul.getReg(0))
3539  .addUse(Scale)
3540  .setMIFlags(Flags);
3541 
3542  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3543  .addUse(Fmas.getReg(0))
3544  .addUse(RHS)
3545  .addUse(LHS)
3546  .setMIFlags(Flags);
3547 
3548  MI.eraseFromParent();
3549  return true;
3550 }
3551 
3552 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3553  MachineRegisterInfo &MRI,
3554  MachineIRBuilder &B) const {
3555  Register Res = MI.getOperand(0).getReg();
3556  Register LHS = MI.getOperand(2).getReg();
3557  Register RHS = MI.getOperand(3).getReg();
3558  uint16_t Flags = MI.getFlags();
3559 
3560  LLT S32 = LLT::scalar(32);
3561  LLT S1 = LLT::scalar(1);
3562 
3563  auto Abs = B.buildFAbs(S32, RHS, Flags);
3564  const APFloat C0Val(1.0f);
3565 
3566  auto C0 = B.buildConstant(S32, 0x6f800000);
3567  auto C1 = B.buildConstant(S32, 0x2f800000);
3568  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3569 
3570  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3571  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3572 
3573  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3574 
3575  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3576  .addUse(Mul0.getReg(0))
3577  .setMIFlags(Flags);
3578 
3579  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3580 
3581  B.buildFMul(Res, Sel, Mul1, Flags);
3582 
3583  MI.eraseFromParent();
3584  return true;
3585 }
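
// --- Illustrative aside (not part of the original file) ---
// The constants above are 0x6f800000 = 0x1p96f and 0x2f800000 = 0x1p-32f, so
// the sequence pre-scales a huge denominator before the reciprocal and then
// re-applies the same factor to the quotient, avoiding an rcp result that
// would otherwise flush to zero. A scalar sketch (1.0f / x stands in for the
// hardware rcp estimate; names invented):
#include <cmath>

static float fdivFastSketch(float X, float Y) {
  float Scale = std::fabs(Y) > 0x1.0p96f ? 0x1.0p-32f : 1.0f;
  float Rcp = 1.0f / (Y * Scale);
  return Scale * (X * Rcp);
}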
3586 
3587 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3588 // FIXME: Why do we handle this one but not other removed instructions?
3589 //
3590 // Reciprocal square root. The clamp prevents infinite results, clamping
3591 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3592 // +-max_float.
3593 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3594  MachineRegisterInfo &MRI,
3595  MachineIRBuilder &B) const {
3596  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3597  return true;
3598 
3599  Register Dst = MI.getOperand(0).getReg();
3600  Register Src = MI.getOperand(2).getReg();
3601  auto Flags = MI.getFlags();
3602 
3603  LLT Ty = MRI.getType(Dst);
3604 
3605  const fltSemantics *FltSemantics;
3606  if (Ty == LLT::scalar(32))
3607  FltSemantics = &APFloat::IEEEsingle();
3608  else if (Ty == LLT::scalar(64))
3609  FltSemantics = &APFloat::IEEEdouble();
3610  else
3611  return false;
3612 
3613  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3614  .addUse(Src)
3615  .setMIFlags(Flags);
3616 
3617  // We don't need to concern ourselves with the snan handling difference, since
3618  // the rsq quiets it (or not); just use the variant that will directly select.
3619  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3620  const bool UseIEEE = MFI->getMode().IEEE;
3621 
3622  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3623  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3624  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3625 
3626  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3627 
3628  if (UseIEEE)
3629  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3630  else
3631  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3632  MI.eraseFromParent();
3633  return true;
3634 }
3635 
3636 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3637  switch (IID) {
3638  case Intrinsic::amdgcn_ds_fadd:
3639  return AMDGPU::G_ATOMICRMW_FADD;
3640  case Intrinsic::amdgcn_ds_fmin:
3641  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3642  case Intrinsic::amdgcn_ds_fmax:
3643  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3644  default:
3645  llvm_unreachable("not a DS FP intrinsic");
3646  }
3647 }
3648 
3649 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3650  MachineInstr &MI,
3651  Intrinsic::ID IID) const {
3652  GISelChangeObserver &Observer = Helper.Observer;
3653  Observer.changingInstr(MI);
3654 
3655  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3656 
3657  // The remaining operands were used to set fields in the MemOperand on
3658  // construction.
3659  for (int I = 6; I > 3; --I)
3660  MI.RemoveOperand(I);
3661 
3662  MI.RemoveOperand(1); // Remove the intrinsic ID.
3663  Observer.changedInstr(MI);
3664  return true;
3665 }
3666 
3667 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3668  MachineRegisterInfo &MRI,
3669  MachineIRBuilder &B) const {
3670  uint64_t Offset =
3671  ST.getTargetLowering()->getImplicitParameterOffset(
3672  B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3673  LLT DstTy = MRI.getType(DstReg);
3674  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3675 
3676  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3677  if (!loadInputValue(KernargPtrReg, B,
3678  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3679  return false;
3680 
3681  // FIXME: This should be nuw
3682  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3683  return true;
3684 }
3685 
3686 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3687  MachineRegisterInfo &MRI,
3688  MachineIRBuilder &B) const {
3689  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3690  if (!MFI->isEntryFunction()) {
3691  return legalizePreloadedArgIntrin(MI, MRI, B,
3692  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3693  }
3694 
3695  Register DstReg = MI.getOperand(0).getReg();
3696  if (!getImplicitArgPtr(DstReg, MRI, B))
3697  return false;
3698 
3699  MI.eraseFromParent();
3700  return true;
3701 }
3702 
3703 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3704  MachineRegisterInfo &MRI,
3705  MachineIRBuilder &B,
3706  unsigned AddrSpace) const {
3707  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3708  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3709  Register Hi32 = Unmerge.getReg(1);
3710 
3711  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3712  MI.eraseFromParent();
3713  return true;
3714 }
3715 
3716 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3717 // offset (the offset that is included in bounds checking and swizzling, to be
3718 // split between the instruction's voffset and immoffset fields) and soffset
3719 // (the offset that is excluded from bounds checking and swizzling, to go in
3720 // the instruction's soffset field). This function takes the first kind of
3721 // offset and figures out how to split it between voffset and immoffset.
3722 std::pair<Register, unsigned>
3723 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3724  Register OrigOffset) const {
3725  const unsigned MaxImm = 4095;
3726  Register BaseReg;
3727  unsigned ImmOffset;
3728  const LLT S32 = LLT::scalar(32);
3729  MachineRegisterInfo &MRI = *B.getMRI();
3730 
3731  std::tie(BaseReg, ImmOffset) =
3732  AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3733 
3734  // If BaseReg is a pointer, convert it to int.
3735  if (MRI.getType(BaseReg).isPointer())
3736  BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3737 
3738  // If the immediate value is too big for the immoffset field, put the value
3739  // and -4096 into the immoffset field so that the value that is copied/added
3740  // for the voffset field is a multiple of 4096, and it stands more chance
3741  // of being CSEd with the copy/add for another similar load/store.
3742  // However, do not do that rounding down to a multiple of 4096 if that is a
3743  // negative number, as it appears to be illegal to have a negative offset
3744  // in the vgpr, even if adding the immediate offset makes it positive.
3745  unsigned Overflow = ImmOffset & ~MaxImm;
3746  ImmOffset -= Overflow;
3747  if ((int32_t)Overflow < 0) {
3748  Overflow += ImmOffset;
3749  ImmOffset = 0;
3750  }
3751 
3752  if (Overflow != 0) {
3753  if (!BaseReg) {
3754  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3755  } else {
3756  auto OverflowVal = B.buildConstant(S32, Overflow);
3757  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3758  }
3759  }
3760 
3761  if (!BaseReg)
3762  BaseReg = B.buildConstant(S32, 0).getReg(0);
3763 
3764  return std::make_pair(BaseReg, ImmOffset);
3765 }
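
// --- Illustrative aside (not part of the original file) ---
// The split computed above, on plain integers: a constant offset of 5000, for
// example, becomes immoffset = 904 plus a voffset contribution of 4096, so the
// register part stays a multiple of 4096 and can be CSEd across similar
// accesses. Sketch only; the real code also folds a pre-existing base register.
#include <cstdint>
#include <utility>

// Returns {voffset add, immoffset}.
static std::pair<uint32_t, uint32_t> splitOffsetSketch(uint32_t ImmOffset) {
  const uint32_t MaxImm = 4095;
  uint32_t Overflow = ImmOffset & ~MaxImm; // part that must go in the register
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) { // don't create a negative voffset
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}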
3766 
3767 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3768 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3769  Register VOffset, Register SOffset,
3770  unsigned ImmOffset, Register VIndex,
3771  MachineRegisterInfo &MRI) const {
3772  Optional<ValueAndVReg> MaybeVOffsetVal =
3773  getIConstantVRegValWithLookThrough(VOffset, MRI);
3774  Optional<ValueAndVReg> MaybeSOffsetVal =
3775  getIConstantVRegValWithLookThrough(SOffset, MRI);
3776  Optional<ValueAndVReg> MaybeVIndexVal =
3777  getIConstantVRegValWithLookThrough(VIndex, MRI);
3778  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3779  // update the MMO with that offset. The stride is unknown so we can only do
3780  // this if VIndex is constant 0.
3781  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3782  MaybeVIndexVal->Value == 0) {
3783  uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3784  MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3785  MMO->setOffset(TotalOffset);
3786  } else {
3787  // We don't have a constant combined offset to use in the MMO. Give up.
3788  MMO->setValue((Value *)nullptr);
3789  }
3790 }
3791 
3792 /// Handle register layout difference for f16 images for some subtargets.
3793 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3794  MachineRegisterInfo &MRI,
3795  Register Reg,
3796  bool ImageStore) const {
3797  const LLT S16 = LLT::scalar(16);
3798  const LLT S32 = LLT::scalar(32);
3799  LLT StoreVT = MRI.getType(Reg);
3800  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3801 
3802  if (ST.hasUnpackedD16VMem()) {
3803  auto Unmerge = B.buildUnmerge(S16, Reg);
3804 
3805  SmallVector<Register, 4> WideRegs;
3806  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3807  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3808 
3809  int NumElts = StoreVT.getNumElements();
3810 
3811  return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3812  .getReg(0);
3813  }
3814 
3815  if (ImageStore && ST.hasImageStoreD16Bug()) {
3816  if (StoreVT.getNumElements() == 2) {
3817  SmallVector<Register, 4> PackedRegs;
3818  Reg = B.buildBitcast(S32, Reg).getReg(0);
3819  PackedRegs.push_back(Reg);
3820  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3821  return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3822  .getReg(0);
3823  }
3824 
3825  if (StoreVT.getNumElements() == 3) {
3826  SmallVector<Register, 4> PackedRegs;
3827  auto Unmerge = B.buildUnmerge(S16, Reg);
3828  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3829  PackedRegs.push_back(Unmerge.getReg(I));
3830  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3831  Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3832  return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3833  }
3834 
3835  if (StoreVT.getNumElements() == 4) {
3836  SmallVector<Register, 4> PackedRegs;
3837  Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3838  auto Unmerge = B.buildUnmerge(S32, Reg);
3839  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3840  PackedRegs.push_back(Unmerge.getReg(I));
3841  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3842  return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3843  .getReg(0);
3844  }
3845 
3846  llvm_unreachable("invalid data type");
3847  }
3848 
3849  return Reg;
3850 }
3851 
3852 Register AMDGPULegalizerInfo::fixStoreSourceType(
3853  MachineIRBuilder &B, Register VData, bool IsFormat) const {
3854  MachineRegisterInfo *MRI = B.getMRI();
3855  LLT Ty = MRI->getType(VData);
3856 
3857  const LLT S16 = LLT::scalar(16);
3858 
3859  // Fixup illegal register types for i8 stores.
3860  if (Ty == LLT::scalar(8) || Ty == S16) {
3861  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3862  return AnyExt;
3863  }
3864 
3865  if (Ty.isVector()) {
3866  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3867  if (IsFormat)
3868  return handleD16VData(B, *MRI, VData);
3869  }
3870  }
3871 
3872  return VData;
3873 }
3874 
3875 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3876  MachineRegisterInfo &MRI,
3877  MachineIRBuilder &B,
3878  bool IsTyped,
3879  bool IsFormat) const {
3880  Register VData = MI.getOperand(1).getReg();
3881  LLT Ty = MRI.getType(VData);
3882  LLT EltTy = Ty.getScalarType();
3883  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3884  const LLT S32 = LLT::scalar(32);
3885 
3886  VData = fixStoreSourceType(B, VData, IsFormat);
3887  Register RSrc = MI.getOperand(2).getReg();
3888 
3889  MachineMemOperand *MMO = *MI.memoperands_begin();
3890  const int MemSize = MMO->getSize();
3891 
3892  unsigned ImmOffset;
3893 
3894  // The typed intrinsics add an immediate after the registers.
3895  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3896 
3897  // The struct intrinsic variants add one additional operand over raw.
3898  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3899  Register VIndex;
3900  int OpOffset = 0;
3901  if (HasVIndex) {
3902  VIndex = MI.getOperand(3).getReg();
3903  OpOffset = 1;
3904  } else {
3905  VIndex = B.buildConstant(S32, 0).getReg(0);
3906  }
3907 
3908  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3909  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3910 
3911  unsigned Format = 0;
3912  if (IsTyped) {
3913  Format = MI.getOperand(5 + OpOffset).getImm();
3914  ++OpOffset;
3915  }
3916 
3917  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3918 
3919  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3920  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3921 
3922  unsigned Opc;
3923  if (IsTyped) {
3924  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3925  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3926  } else if (IsFormat) {
3927  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3928  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3929  } else {
3930  switch (MemSize) {
3931  case 1:
3932  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3933  break;
3934  case 2:
3935  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3936  break;
3937  default:
3938  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3939  break;
3940  }
3941  }
3942 
3943  auto MIB = B.buildInstr(Opc)
3944  .addUse(VData) // vdata
3945  .addUse(RSrc) // rsrc
3946  .addUse(VIndex) // vindex
3947  .addUse(VOffset) // voffset
3948  .addUse(SOffset) // soffset
3949  .addImm(ImmOffset); // offset(imm)
3950 
3951  if (IsTyped)
3952  MIB.addImm(Format);
3953 
3954  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3955  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3956  .addMemOperand(MMO);
3957 
3958  MI.eraseFromParent();
3959  return true;
3960 }
3961 
3962 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3963  MachineRegisterInfo &MRI,
3964  MachineIRBuilder &B,
3965  bool IsFormat,
3966  bool IsTyped) const {
3967  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3968  MachineMemOperand *MMO = *MI.memoperands_begin();
3969  const LLT MemTy = MMO->getMemoryType();
3970  const LLT S32 = LLT::scalar(32);
3971 
3972  Register Dst = MI.getOperand(0).getReg();
3973  Register RSrc = MI.getOperand(2).getReg();
3974 
3975  // The typed intrinsics add an immediate after the registers.
3976  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3977 
3978  // The struct intrinsic variants add one additional operand over raw.
3979  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3980  Register VIndex;
3981  int OpOffset = 0;
3982  if (HasVIndex) {
3983  VIndex = MI.getOperand(3).getReg();
3984  OpOffset = 1;
3985  } else {
3986  VIndex = B.buildConstant(S32, 0).getReg(0);
3987  }
3988 
3989  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3990  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3991 
3992  unsigned Format = 0;
3993  if (IsTyped) {
3994  Format = MI.getOperand(5 + OpOffset).getImm();
3995  ++OpOffset;
3996  }
3997 
3998  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3999  unsigned ImmOffset;
4000 
4001  LLT Ty = MRI.getType(Dst);
4002  LLT EltTy = Ty.getScalarType();
4003  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4004  const bool Unpacked = ST.hasUnpackedD16VMem();
4005 
4006  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4007  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4008 
4009  unsigned Opc;
4010 
4011  if (IsTyped) {
4012  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4013  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4014  } else if (IsFormat) {
4015  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4016  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4017  } else {
4018  switch (MemTy.getSizeInBits()) {
4019  case 8:
4020  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4021  break;
4022  case 16:
4023  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4024  break;
4025  default:
4026  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4027  break;
4028  }
4029  }
4030 
4031  Register LoadDstReg;
4032 
4033  bool IsExtLoad =
4034  (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4035  LLT UnpackedTy = Ty.changeElementSize(32);
4036 
4037  if (IsExtLoad)
4038  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4039  else if (Unpacked && IsD16 && Ty.isVector())
4040  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4041  else
4042  LoadDstReg = Dst;
4043 
4044  auto MIB = B.buildInstr(Opc)
4045  .addDef(LoadDstReg) // vdata
4046  .addUse(RSrc) // rsrc
4047  .addUse(VIndex) // vindex
4048  .addUse(VOffset) // voffset
4049  .addUse(SOffset) // soffset
4050  .addImm(ImmOffset); // offset(imm)
4051 
4052  if (IsTyped)
4053  MIB.addImm(Format);
4054 
4055  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4056  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4057  .addMemOperand(MMO);
4058 
4059  if (LoadDstReg != Dst) {
4060  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4061 
4062  // The result was widened for the extending load; truncate it back down.
4063  if (IsExtLoad)
4064  B.buildTrunc(Dst, LoadDstReg);
4065  else {
4066  // Repack to original 16-bit vector result
4067  // FIXME: G_TRUNC should work, but legalization currently fails
4068  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4069  SmallVector<Register, 4> Repack;
4070  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4071  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4072  B.buildMerge(Dst, Repack);
4073  }
4074  }
4075 
4076  MI.eraseFromParent();
4077  return true;
4078 }
4079 
4080 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4081  MachineIRBuilder &B,
4082  bool IsInc) const {
4083  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4084  AMDGPU::G_AMDGPU_ATOMIC_DEC;
4085  B.buildInstr(Opc)
4086  .addDef(MI.getOperand(0).getReg())
4087  .addUse(MI.getOperand(2).getReg())
4088  .addUse(MI.getOperand(3).getReg())
4089  .cloneMemRefs(MI);
4090  MI.eraseFromParent();
4091  return true;
4092 }
4093 
4094 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4095  switch (IntrID) {
4096  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4097  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4098  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4099  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4100  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4101  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4102  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4103  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4104  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4105  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4106  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4107  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4108  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4109  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4110  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4111  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4112  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4113  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4114  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4115  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4116  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4117  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4118  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4119  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4120  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4121  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4122  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4123  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4124  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4125  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4126  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4127  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4128  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4129  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4130  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4131  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4132  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4133  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4134  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4135  case Intrinsic::amdgcn_buffer_atomic_fadd:
4136  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4137  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4138  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4139  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4140  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4141  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4142  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4143  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4144  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4145  default:
4146  llvm_unreachable("unhandled atomic opcode");
4147  }
4148 }
4149 
4150 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4151  MachineIRBuilder &B,
4152  Intrinsic::ID IID) const {
4153  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4154  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4155  const bool HasReturn = MI.getNumExplicitDefs() != 0;
4156 
4157  Register Dst;
4158 
4159  int OpOffset = 0;
4160  if (HasReturn) {
4161  // A few FP atomics do not support return values.
4162  Dst = MI.getOperand(0).getReg();
4163  } else {
4164  OpOffset = -1;
4165  }
4166 
4167  Register VData = MI.getOperand(2 + OpOffset).getReg();
4168  Register CmpVal;
4169 
4170  if (IsCmpSwap) {
4171  CmpVal = MI.getOperand(3 + OpOffset).getReg();
4172  ++OpOffset;
4173  }
4174 
4175  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4176  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4177 
4178  // The struct intrinsic variants add one additional operand over raw.
4179  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4180  Register VIndex;
4181  if (HasVIndex) {
4182  VIndex = MI.getOperand(4 + OpOffset).getReg();
4183  ++OpOffset;
4184  } else {
4185  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4186  }
4187 
4188  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4189  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4190  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4191 
4192  MachineMemOperand *MMO = *MI.memoperands_begin();
4193 
4194  unsigned ImmOffset;
4195  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4196  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4197 
4198  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4199 
4200  if (HasReturn)
4201  MIB.addDef(Dst);
4202 
4203  MIB.addUse(VData); // vdata
4204 
4205  if (IsCmpSwap)
4206  MIB.addReg(CmpVal);
4207 
4208  MIB.addUse(RSrc) // rsrc
4209  .addUse(VIndex) // vindex
4210  .addUse(VOffset) // voffset
4211  .addUse(SOffset) // soffset
4212  .addImm(ImmOffset) // offset(imm)
4213  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4214  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4215  .addMemOperand(MMO);
4216 
4217  MI.eraseFromParent();
4218  return true;
4219 }
4220 
4221 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4222 /// vector with s16 typed elements.
4223 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4224  SmallVectorImpl<Register> &PackedAddrs,
4225  unsigned ArgOffset,
4226  const AMDGPU::ImageDimIntrinsicInfo *Intr,
4227  bool IsA16, bool IsG16) {
4228  const LLT S16 = LLT::scalar(16);
4229  const LLT V2S16 = LLT::fixed_vector(2, 16);
4230  auto EndIdx = Intr->VAddrEnd;
4231 
4232  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4233  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4234  if (!SrcOp.isReg())
4235  continue; // _L to _LZ may have eliminated this.
4236 
4237  Register AddrReg = SrcOp.getReg();
4238 
4239  if ((I < Intr->GradientStart) ||
4240  (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4241  (I >= Intr->CoordStart && !IsA16)) {
4242  // Handle any gradient or coordinate operands that should not be packed
4243  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4244  PackedAddrs.push_back(AddrReg);
4245  } else {
4246  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4247  // derivatives dx/dh and dx/dv are packed with undef.
4248  if (((I + 1) >= EndIdx) ||
4249  ((Intr->NumGradients / 2) % 2 == 1 &&
4250  (I == static_cast<unsigned>(Intr->GradientStart +
4251  (Intr->NumGradients / 2) - 1) ||
4252  I == static_cast<unsigned>(Intr->GradientStart +
4253  Intr->NumGradients - 1))) ||
4254  // Check for _L to _LZ optimization
4255  !MI.getOperand(ArgOffset + I + 1).isReg()) {
4256  PackedAddrs.push_back(
4257  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4258  .getReg(0));
4259  } else {
4260  PackedAddrs.push_back(
4261  B.buildBuildVector(
4262  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4263  .getReg(0));
4264  ++I;
4265  }
4266  }
4267  }
4268 }
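
// --- Illustrative aside (not part of the original file) ---
// The core pairing performed above, on plain data: consecutive 16-bit address
// components are packed two per dword, and a trailing odd component is paired
// with an undefined upper half (zero here). This sketch ignores the
// gradient/coordinate boundary cases handled by the real code.
#include <cstdint>
#include <vector>

static std::vector<uint32_t> packA16Sketch(const std::vector<uint16_t> &Comps) {
  std::vector<uint32_t> Packed;
  for (size_t I = 0; I < Comps.size(); I += 2) {
    uint32_t Lo = Comps[I];
    uint32_t Hi = (I + 1 < Comps.size()) ? Comps[I + 1] : 0; // undef pad
    Packed.push_back(Lo | (Hi << 16));
  }
  return Packed;
}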
4269 
4270 /// Convert from separate vaddr components to a single vector address register,
4271 /// and replace the remaining operands with $noreg.
4272 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4273  int DimIdx, int NumVAddrs) {
4274  const LLT S32 = LLT::scalar(32);
4275 
4276  SmallVector<Register, 8> AddrRegs;
4277  for (int I = 0; I != NumVAddrs; ++I) {
4278  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4279  if (SrcOp.isReg()) {
4280  AddrRegs.push_back(SrcOp.getReg());
4281  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4282  }
4283  }
4284 
4285  int NumAddrRegs = AddrRegs.size();
4286  if (NumAddrRegs != 1) {
4287  // Above 8 elements round up to next power of 2 (i.e. 16).
4288  if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4289  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4290  auto Undef = B.buildUndef(S32);
4291  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4292  NumAddrRegs = RoundedNumRegs;
4293  }
4294 
4295  auto VAddr =
4296  B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4297  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4298  }
4299 
4300  for (int I = 1; I != NumVAddrs; ++I) {
4301  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4302  if (SrcOp.isReg())
4303  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4304  }
4305 }
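
// --- Illustrative aside (not part of the original file) ---
// The register-count rounding used above: address vectors wider than 8 dwords
// are padded with undef up to the next power of two, so 9..15 registers all
// become 16. Sketch only:
static unsigned roundAddrRegCountSketch(unsigned NumAddrRegs) {
  if (NumAddrRegs <= 8)
    return NumAddrRegs;
  unsigned Rounded = 1;
  while (Rounded < NumAddrRegs)
    Rounded <<= 1;
  return Rounded;
}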
4306 
4307 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4308 ///
4309 /// Depending on the subtarget, load/store with 16-bit element data need to be
4310 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4311 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4312 /// registers.
4313 ///
4314 /// We don't want to directly select image instructions just yet, but also want
4315 /// to expose all register repacking to the legalizer/combiners. We also don't
4316 /// want a selected instruction entering RegBankSelect. In order to avoid
4317 /// defining a multitude of intermediate image instructions, directly hack on
4318 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4319 /// padding now unnecessary arguments with $noreg.
4320 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4321  MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4322  const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4323 
4324  const unsigned NumDefs = MI.getNumExplicitDefs();
4325  const unsigned ArgOffset = NumDefs + 1;
4326  bool IsTFE = NumDefs == 2;
4327  // We are only processing the operands of d16 image operations on subtargets
4328  // that use the unpacked register layout, or need to repack the TFE result.
4329 
4330  // TODO: Do we need to guard against already legalized intrinsics?
4331  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4332  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4333 
4334  MachineRegisterInfo *MRI = B.getMRI();
4335  const LLT S32 = LLT::scalar(32);
4336  const LLT S16 = LLT::scalar(16);
4337  const LLT V2S16 = LLT::fixed_vector(2, 16);
4338 
4339  unsigned DMask = 0;
4340 
4341  // Check for 16 bit addresses and pack if true.
4342  LLT GradTy =
4343  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4344  LLT AddrTy =
4345  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4346  const bool IsG16 = GradTy == S16;
4347  const bool IsA16 = AddrTy == S16;
4348 
4349  int DMaskLanes = 0;
4350  if (!BaseOpcode->Atomic) {
4351  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4352  if (BaseOpcode->Gather4) {
4353  DMaskLanes = 4;
4354  } else if (DMask != 0) {
4355  DMaskLanes = countPopulation(DMask);
4356  } else if (!IsTFE && !BaseOpcode->Store) {
4357  // If dmask is 0, this is a no-op load. This can be eliminated.
4358  B.buildUndef(MI.getOperand(0));
4359  MI.eraseFromParent();
4360  return true;
4361  }
4362  }
4363 
4364  Observer.changingInstr(MI);
4365  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4366 
4367  unsigned NewOpcode = NumDefs == 0 ?
4368  AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4369 
4370  // Track that we legalized this
4371  MI.setDesc(B.getTII().get(NewOpcode));
4372 
4373  // Expecting to get an error flag since TFC is on and dmask is 0. Force
4374  // dmask to be at least 1, otherwise the instruction will fail.
4375  if (IsTFE && DMask == 0) {
4376  DMask = 0x1;
4377  DMaskLanes = 1;
4378  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4379  }
4380 
4381  if (BaseOpcode->Atomic) {
4382  Register VData0 = MI.getOperand(2).getReg();
4383  LLT Ty = MRI->getType(VData0);
4384 
4385  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4386  if (Ty.isVector())
4387  return false;
4388 
4389  if (BaseOpcode->AtomicX2) {
4390  Register VData1 = MI.getOperand(3).getReg();
4391  // The two values are packed in one register.
4392  LLT PackedTy = LLT::fixed_vector(2, Ty);
4393  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4394  MI.getOperand(2).setReg(Concat.getReg(0));
4395  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4396  }
4397  }
4398 
4399  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4400 
4401  // Optimize _L to _LZ when _L is zero
4402  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4403  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4404  const ConstantFP *ConstantLod;
4405 
4406  if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4407  m_GFCst(ConstantLod))) {
4408  if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4409  // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4410  const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4411  AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4412  Intr->Dim);
4413 
4414  // The starting indexes should remain in the same place.
4415  --CorrectedNumVAddrs;
4416 
4417  MI.getOperand(MI.getNumExplicitDefs())
4418  .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4419  MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4420  Intr = NewImageDimIntr;
4421  }
4422  }
4423  }
4424 
4425  // Optimize _mip away, when 'lod' is zero
4426  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4427  int64_t ConstantLod;
4428  if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4429  m_ICst(ConstantLod))) {
4430  if (ConstantLod == 0) {
4431  // TODO: Change intrinsic opcode and remove operand instead of replacing
4432  // it with 0, as the _L to _LZ handling is done above.
4433  MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4434  --CorrectedNumVAddrs;
4435  }
4436  }
4437  }
4438 
4439  // Rewrite the addressing register layout before doing anything else.
4440  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4441  // 16 bit gradients are supported, but are tied to the A16 control
4442  // so both gradients and addresses must be 16 bit
4443  return false;
4444  }
4445 
4446  if (IsA16 && !ST.hasA16()) {
4447  // A16 not supported
4448  return false;
4449  }
4450 
4451  if (IsA16 || IsG16) {
4452  if (Intr->NumVAddrs > 1) {
4453  SmallVector<Register, 4> PackedRegs;
4454 
4455  packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4456  IsG16);
4457 
4458  // See also below in the non-a16 branch
4459  const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4460  PackedRegs.size() <= ST.getNSAMaxSize();
4461 
4462  if (!UseNSA && PackedRegs.size() > 1) {
4463  LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4464  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4465  PackedRegs[0] = Concat.getReg(0);
4466  PackedRegs.resize(1);
4467  }
4468 
4469  const unsigned NumPacked = PackedRegs.size();
4470  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4471  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4472  if (!SrcOp.isReg()) {
4473  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4474  continue;
4475  }
4476 
4477  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4478 
4479  if (I - Intr->VAddrStart < NumPacked)
4480  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4481  else
4482  SrcOp.setReg(AMDGPU::NoRegister);
4483  }
4484  }
4485  } else {
4486  // If the register allocator cannot place the address registers contiguously
4487  // without introducing moves, then using the non-sequential address encoding
4488  // is always preferable, since it saves VALU instructions and is usually a
4489  // wash in terms of code size or even better.
4490  //
4491  // However, we currently have no way of hinting to the register allocator
4492  // that MIMG addresses should be placed contiguously when it is possible to
4493  // do so, so force non-NSA for the common 2-address case as a heuristic.
4494  //
4495  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4496  // allocation when possible.
4497  const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4498  CorrectedNumVAddrs <= ST.getNSAMaxSize();
4499 
4500  if (!UseNSA && Intr->NumVAddrs > 1)
4501  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4502  Intr->NumVAddrs);
4503  }
4504 
4505  int Flags = 0;
4506  if (IsA16)
4507  Flags |= 1;
4508  if (IsG16)
4509  Flags |= 2;
4510  MI.addOperand(MachineOperand::CreateImm(Flags));
4511 
4512  if (BaseOpcode->Store) { // No TFE for stores?
4513  // TODO: Handle dmask trim
4514  Register VData = MI.getOperand(1).getReg();
4515  LLT Ty = MRI->getType(VData);
4516  if (!Ty.isVector() || Ty.getElementType() != S16)
4517  return true;
4518 
4519  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4520  if (RepackedReg != VData) {
4521  MI.getOperand(1).setReg(RepackedReg);
4522  }
4523 
4524  return true;
4525  }
4526 
4527  Register DstReg = MI.getOperand(0).getReg();
4528  LLT Ty = MRI->getType(DstReg);
4529  const LLT EltTy = Ty.getScalarType();
4530  const bool IsD16 = Ty.getScalarType() == S16;
4531  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4532 
4533  // Confirm that the return type is large enough for the dmask specified
4534  if (NumElts < DMaskLanes)
4535  return false;
4536 
4537  if (NumElts > 4 || DMaskLanes > 4)
4538  return false;
4539 
4540  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4541  const LLT AdjustedTy =
4542  Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4543 
4544  // The raw dword aligned data component of the load. The only legal cases
4545  // where this matters should be when using the packed D16 format, for
4546  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4547  LLT RoundedTy;
4548 
4549  // S32 vector to cover all data, plus TFE result element.
4550  LLT TFETy;
4551 
4552  // Register type to use for each loaded component. Will be S32 or V2S16.
4553  LLT RegTy;
4554 
4555  if (IsD16 && ST.hasUnpackedD16VMem()) {
4556  RoundedTy =
4557  LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4558  TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4559  RegTy = S32;
4560  } else {
4561  unsigned EltSize = EltTy.getSizeInBits();
4562  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4563  unsigned RoundedSize = 32 * RoundedElts;
4564  RoundedTy = LLT::scalarOrVector(
4565  ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4566  TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4567  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4568  }
4569 
4570  // The return type does not need adjustment.
4571  // TODO: Should we change s16 case to s32 or <2 x s16>?
4572  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4573  return true;
4574 
4575  Register Dst1Reg;
4576 
4577  // Insert after the instruction.
4578  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4579 
4580  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4581  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4582  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4583  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4584 
4585  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4586 
4587  MI.getOperand(0).setReg(NewResultReg);
4588 
4589  // In the IR, TFE is supposed to be used with a 2 element struct return
4590  // type. The instruction really returns these two values in one contiguous
4591  // register, with one additional dword beyond the loaded data. Rewrite the
4592  // return type to use a single register result.
4593 
4594  if (IsTFE) {
4595  Dst1Reg = MI.getOperand(1).getReg();
4596  if (MRI->getType(Dst1Reg) != S32)
4597  return false;
4598 
4599  // TODO: Make sure the TFE operand bit is set.
4600  MI.RemoveOperand(1);
4601 
4602  // Handle the easy case that requires no repack instructions.
4603  if (Ty == S32) {
4604  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4605  return true;
4606  }
4607  }
4608 
4609  // Now figure out how to copy the new result register back into the old
4610  // result.
4611  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4612 
4613  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4614 
4615  if (ResultNumRegs == 1) {
4616  assert(!IsTFE);
4617  ResultRegs[0] = NewResultReg;
4618  } else {
4619  // We have to repack into a new vector of some kind.
4620  for (int I = 0; I != NumDataRegs; ++I)
4621  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4622  B.buildUnmerge(ResultRegs, NewResultReg);
4623 
4624  // Drop the final TFE element to get the data part. The TFE result is
4625  // directly written to the right place already.
4626  if (IsTFE)
4627  ResultRegs.resize(NumDataRegs);
4628  }
4629 
4630  // For an s16 scalar result, we form an s32 result with a truncate regardless
4631  // of packed vs. unpacked.
4632  if (IsD16 && !Ty.isVector()) {
4633  B.buildTrunc(DstReg, ResultRegs[0]);
4634  return true;
4635  }
4636 
4637  // Avoid a build/concat_vector of 1 entry.
4638  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4639  B.buildBitcast(DstReg, ResultRegs[0]);
4640  return true;
4641  }
4642 
4643  assert(Ty.isVector());
4644 
4645  if (IsD16) {
4646  // For packed D16 results with TFE enabled, all the data components are
4647  // S32. Cast back to the expected type.
4648  //
4649  // TODO: We don't really need to load s32 elements. We would only need one
4650  // cast for the TFE result if a multiple of v2s16 was used.
4651  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4652  for (Register &Reg : ResultRegs)
4653  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4654  } else if (ST.hasUnpackedD16VMem()) {
4655  for (Register &Reg : ResultRegs)
4656  Reg = B.buildTrunc(S16, Reg).getReg(0);
4657  }
4658  }
4659 
4660  auto padWithUndef = [&](LLT Ty, int NumElts) {
4661  if (NumElts == 0)
4662  return;
4663  Register Undef = B.buildUndef(Ty).getReg(0);
4664  for (int I = 0; I != NumElts; ++I)
4665  ResultRegs.push_back(Undef);
4666  };
4667 
4668  // Pad out any elements eliminated due to the dmask.
4669  LLT ResTy = MRI->getType(ResultRegs[0]);
4670  if (!ResTy.isVector()) {
4671  padWithUndef(ResTy, NumElts - ResultRegs.size());
4672  B.buildBuildVector(DstReg, ResultRegs);
4673  return true;
4674  }
4675 
4676  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4677  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4678 
4679  // Deal with the one annoying legal case.
4680  const LLT V3S16 = LLT::fixed_vector(3, 16);
4681  if (Ty == V3S16) {
4682  padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4683  auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4684  B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4685  return true;
4686  }
4687 
4688  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4689  B.buildConcatVectors(DstReg, ResultRegs);
4690  return true;
4691 }
4692 
4693 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4694  LegalizerHelper &Helper, MachineInstr &MI) const {
4695  MachineIRBuilder &B = Helper.MIRBuilder;
4696  GISelChangeObserver &Observer = Helper.Observer;
4697 
4698  Register Dst = MI.getOperand(0).getReg();
4699  LLT Ty = B.getMRI()->getType(Dst);
4700  unsigned Size = Ty.getSizeInBits();
4701  MachineFunction &MF = B.getMF();
4702 
4703  Observer.changingInstr(MI);
4704 
4705  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4706  Ty = getBitcastRegisterType(Ty);
4707  Helper.bitcastDst(MI, Ty, 0);
4708  Dst = MI.getOperand(0).getReg();
4709  B.setInsertPt(B.getMBB(), MI);
4710  }
4711 
4712  // FIXME: We don't really need this intermediate instruction. The intrinsic
4713  // should be fixed to have a memory operand. Since it's readnone, we're not
4714  // allowed to add one.
4715  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4716  MI.RemoveOperand(1); // Remove intrinsic ID
4717 
4718  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4719  // TODO: Should this use datalayout alignment?
4720  const unsigned MemSize = (Size + 7) / 8;
4721  const Align MemAlign(4);
4722  MachineMemOperand *MMO = MF.getMachineMemOperand(
4723  MachinePointerInfo(),
4724  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4725  MachineMemOperand::MOInvariant,
4726  MemSize, MemAlign);
4727  MI.addMemOperand(MF, MMO);
4728 
4729  // There are no 96-bit result scalar loads, but widening to 128-bit should
4730  // always be legal. We may need to restore this to a 96-bit result if it turns
4731  // out this needs to be converted to a vector load during RegBankSelect.
4732  if (!isPowerOf2_32(Size)) {
4733  if (Ty.isVector())
4734  Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4735  else
4736  Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4737  }
4738 
4739  Observer.changedInstr(MI);
4740  return true;
4741 }
4742 
4743 // TODO: Move to selection
4744 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4745  MachineRegisterInfo &MRI,
4746  MachineIRBuilder &B) const {
4747  if (!ST.isTrapHandlerEnabled() ||
4748  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4749  return legalizeTrapEndpgm(MI, MRI, B);
4750 
4751  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4752  switch (*HsaAbiVer) {
4753  case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4754  case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4755  return legalizeTrapHsaQueuePtr(MI, MRI, B);
4756  case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4757  return ST.supportsGetDoorbellID() ?
4758  legalizeTrapHsa(MI, MRI, B) :
4759  legalizeTrapHsaQueuePtr(MI, MRI, B);
4760  }
4761  }
4762 
4763  llvm_unreachable("Unknown trap handler");
4764 }
4765 
4766 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4767  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4768  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4769  MI.eraseFromParent();
4770  return true;
4771 }
4772 
4773 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4774  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4775  // Pass queue pointer to trap handler as input, and insert trap instruction
4776  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4777  Register LiveIn =
4778  MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4779  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4780  return false;
4781 
4782  Register SGPR01(AMDGPU::SGPR0_SGPR1);
4783  B.buildCopy(SGPR01, LiveIn);
4784  B.buildInstr(AMDGPU::S_TRAP)
4785  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4786  .addReg(SGPR01, RegState::Implicit);
4787 
4788  MI.eraseFromParent();
4789  return true;
4790 }
4791 
4792 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4793  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4794  B.buildInstr(AMDGPU::S_TRAP)
4795  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4796  MI.eraseFromParent();
4797  return true;
4798 }
4799 
4800 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4801  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4802  // If this is a non-HSA path or the trap handler is disabled, report a
4803  // warning accordingly.
4804  if (!ST.isTrapHandlerEnabled() ||
4805  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4806  DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4807  "debugtrap handler not supported",
4808  MI.getDebugLoc(), DS_Warning);
4809  LLVMContext &Ctx = B.getMF().getFunction().getContext();
4810  Ctx.diagnose(NoTrap);
4811  } else {
4812  // Insert debug-trap instruction
4813  B.buildInstr(AMDGPU::S_TRAP)
4814  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4815  }
4816 
4817  MI.eraseFromParent();
4818  return true;
4819 }
4820 
4821 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4822  MachineIRBuilder &B) const {
4823  MachineRegisterInfo &MRI = *B.getMRI();
4824  const LLT S16 = LLT::scalar(16);
4825  const LLT S32 = LLT::scalar(32);
4826 
4827  Register DstReg = MI.getOperand(0).getReg();
4828  Register NodePtr = MI.getOperand(2).getReg();
4829  Register RayExtent = MI.getOperand(3).getReg();
4830  Register RayOrigin = MI.getOperand(4).getReg();
4831  Register RayDir = MI.getOperand(5).getReg();
4832  Register RayInvDir = MI.getOperand(6).getReg();
4833  Register TDescr = MI.getOperand(7).getReg();
4834 
4835  if (!ST.hasGFX10_AEncoding()) {
4836  DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4837  "intrinsic not supported on subtarget",
4838  MI.getDebugLoc());
4839  B.getMF().getFunction().getContext().diagnose(BadIntrin);
4840  return false;
4841  }
4842 
4843  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4844  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4845  const unsigned NumVDataDwords = 4;
4846  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4847  const bool UseNSA =
4848  ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4849  const unsigned BaseOpcodes[2][2] = {
4850  {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4851  {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4852  AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4853  int Opcode;
4854  if (UseNSA) {
4855  Opcode =
4856  AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4857  NumVDataDwords, NumVAddrDwords);
4858  } else {
4859  Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4860  AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4861  PowerOf2Ceil(NumVAddrDwords));
4862  }
4863  assert(Opcode != -1);
4864 
4865  SmallVector<Register, 12> Ops;
4866  if (Is64) {
4867  auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4868  Ops.push_back(Unmerge.getReg(0));
4869  Ops.push_back(Unmerge.getReg(1));
4870  } else {
4871  Ops.push_back(NodePtr);
4872  }
4873  Ops.push_back(RayExtent);
4874 
4875  auto packLanes = [&Ops, &S32, &B] (Register Src) {
4876  auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4877  Ops.push_back(Unmerge.getReg(0));
4878  Ops.push_back(Unmerge.getReg(1));
4879  Ops.push_back(Unmerge.getReg(2));
4880  };
4881 
4882  packLanes(RayOrigin);
4883  if (IsA16) {
4884  auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4885  auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4886  Register R1 = MRI.createGenericVirtualRegister(S32);
4887  Register R2 = MRI.createGenericVirtualRegister(S32);
4888  Register R3 = MRI.createGenericVirtualRegister(S32);
4889  B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4890  B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4891  B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4892  Ops.push_back(R1);
4893  Ops.push_back(R2);
4894  Ops.push_back(R3);
4895  } else {
4896  packLanes(RayDir);
4897  packLanes(RayInvDir);
4898  }
4899 
4900  if (!UseNSA) {
4901  // Build a single vector containing all the operands so far prepared.
4902  LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4903  Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4904  Ops.clear();
4905  Ops.push_back(MergedOps);
4906  }
4907 
4908  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4909  .addDef(DstReg)
4910  .addImm(Opcode);
4911 
4912  for (Register R : Ops) {
4913  MIB.addUse(R);
4914  }
4915 
4916  MIB.addUse(TDescr)
4917  .addImm(IsA16 ? 1 : 0)
4918  .cloneMemRefs(MI);
4919 
4920  MI.eraseFromParent();
4921  return true;
4922 }
4923 
4924 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4925  MachineInstr &MI) const {
4926  MachineIRBuilder &B = Helper.MIRBuilder;
4927  MachineRegisterInfo &MRI = *B.getMRI();
4928 
4929  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4930  auto IntrID = MI.getIntrinsicID();
4931  switch (IntrID) {
4932  case Intrinsic::amdgcn_if:
4933  case Intrinsic::amdgcn_else: {
4934  MachineInstr *Br = nullptr;
4935  MachineBasicBlock *UncondBrTarget = nullptr;
4936  bool Negated = false;
4937  if (MachineInstr *BrCond =
4938  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4939  const SIRegisterInfo *TRI
4940  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4941 
4942  Register Def = MI.getOperand(1).getReg();
4943  Register Use = MI.getOperand(3).getReg();
4944 
4945  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4946 
4947  if (Negated)
4948  std::swap(CondBrTarget, UncondBrTarget);
4949 
4950  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4951  if (IntrID == Intrinsic::amdgcn_if) {
4952  B.buildInstr(AMDGPU::SI_IF)
4953  .addDef(Def)
4954  .addUse(Use)
4955  .addMBB(UncondBrTarget);
4956  } else {
4957  B.buildInstr(AMDGPU::SI_ELSE)
4958  .addDef(Def)
4959  .addUse(Use)
4960  .addMBB(UncondBrTarget);
4961  }
4962 
4963  if (Br) {
4964  Br->getOperand(0).setMBB(CondBrTarget);
4965  } else {
4966  // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4967  // since we're swapping branch targets it needs to be reinserted.
4968  // FIXME: IRTranslator should probably not do this
4969  B.buildBr(*CondBrTarget);
4970  }
4971 
4972  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4973  MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4974  MI.eraseFromParent();
4975  BrCond->eraseFromParent();
4976  return true;
4977  }
4978 
4979  return false;
4980  }
4981  case Intrinsic::amdgcn_loop: {
4982  MachineInstr *Br = nullptr;
4983  MachineBasicBlock *UncondBrTarget = nullptr;
4984  bool Negated = false;
4985  if (MachineInstr *BrCond =
4986  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4987  const SIRegisterInfo *TRI
4988  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4989 
4990  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4991  Register Reg = MI.getOperand(2).getReg();
4992 
4993  if (Negated)
4994  std::swap(CondBrTarget, UncondBrTarget);
4995 
4996  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4997  B.buildInstr(AMDGPU::SI_LOOP)
4998  .addUse(Reg)
4999  .addMBB(UncondBrTarget);
5000 
5001  if (Br)
5002  Br->getOperand(0).setMBB(CondBrTarget);
5003  else
5004  B.buildBr(*CondBrTarget);
5005 
5006  MI.eraseFromParent();
5007  BrCond->eraseFromParent();
5008  MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5009  return true;
5010  }
5011 
5012  return false;
5013  }
5014  case Intrinsic::amdgcn_kernarg_segment_ptr:
5015  if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5016  // This only makes sense to call in a kernel, so just lower to null.
5017  B.buildConstant(MI.getOperand(0).getReg(), 0);
5018  MI.eraseFromParent();
5019  return true;
5020  }
5021 
5022  return legalizePreloadedArgIntrin(
5023  MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5024  case Intrinsic::amdgcn_implicitarg_ptr:
5025  return legalizeImplicitArgPtr(MI, MRI, B);
5026  case Intrinsic::amdgcn_workitem_id_x:
5029  case Intrinsic::amdgcn_workitem_id_y:
5032  case Intrinsic::amdgcn_workitem_id_z:
5035  case Intrinsic::amdgcn_workgroup_id_x:
5038  case Intrinsic::amdgcn_workgroup_id_y:
5041  case Intrinsic::amdgcn_workgroup_id_z:
5044  case Intrinsic::amdgcn_dispatch_ptr:
5047  case Intrinsic::amdgcn_queue_ptr:
5050  case Intrinsic::amdgcn_implicit_buffer_ptr:
5053  case Intrinsic::amdgcn_dispatch_id:
5056  case Intrinsic::amdgcn_fdiv_fast:
5057  return legalizeFDIVFastIntrin(MI, MRI, B);
5058  case Intrinsic::amdgcn_is_shared:
5060  case Intrinsic::amdgcn_is_private:
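// Editor's note (not in the original source): the preloaded-argument
// intrinsics above (workitem/workgroup IDs, dispatch/queue/implicit-buffer
// pointers, dispatch id) all funnel into legalizePreloadedArgIntrin with the
// matching AMDGPUFunctionArgInfo::PreloadedValue, which is expected to
// materialize the value from the function's reserved input registers.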
5062  case Intrinsic::amdgcn_wavefrontsize: {
5063  B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5064  MI.eraseFromParent();
5065  return true;
5066  }
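// Editor's note (not in the original source): amdgcn_wavefrontsize never
// survives legalization; it is folded directly to a G_CONSTANT holding the
// subtarget's wave size, as done just above.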
5067  case Intrinsic::amdgcn_s_buffer_load:
5068  return legalizeSBufferLoad(Helper, MI);
5069  case Intrinsic::amdgcn_raw_buffer_store:
5070  case Intrinsic::amdgcn_struct_buffer_store:
5071  return legalizeBufferStore(MI, MRI, B, false, false);
5072  case Intrinsic::amdgcn_raw_buffer_store_format:
5073  case Intrinsic::amdgcn_struct_buffer_store_format:
5074  return legalizeBufferStore(MI, MRI, B, false, true);
5075  case Intrinsic::amdgcn_raw_tbuffer_store:
5076  case Intrinsic::amdgcn_struct_tbuffer_store:
5077  return legalizeBufferStore(MI, MRI, B, true, true);
5078  case Intrinsic::amdgcn_raw_buffer_load:
5079  case Intrinsic::amdgcn_struct_buffer_load:
5080  return legalizeBufferLoad(MI, MRI, B, false, false);
5081  case Intrinsic::amdgcn_raw_buffer_load_format:
5082  case Intrinsic::amdgcn_struct_buffer_load_format:
5083  return legalizeBufferLoad(MI, MRI, B, true, false);
5084  case Intrinsic::amdgcn_raw_tbuffer_load:
5085  case Intrinsic::amdgcn_struct_tbuffer_load:
5086  return legalizeBufferLoad(MI, MRI, B, true, true);
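// Editor's note (not in the original source): the raw_* and struct_* buffer
// load/store intrinsics above share one legalizer per direction; the two
// boolean flags passed at each call site distinguish the plain, *_format and
// typed (tbuffer) flavours, as can be read off the calls above.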
5087  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5088  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5089  case Intrinsic::amdgcn_raw_buffer_atomic_add:
5090  case Intrinsic::amdgcn_struct_buffer_atomic_add:
5091  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5092  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5093  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5094  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5095  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5096  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5097  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5098  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5099  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5100  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5101  case Intrinsic::amdgcn_raw_buffer_atomic_and:
5102  case Intrinsic::amdgcn_struct_buffer_atomic_and:
5103  case Intrinsic::amdgcn_raw_buffer_atomic_or:
5104  case Intrinsic::amdgcn_struct_buffer_atomic_or:
5105  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5106  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5107  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5108  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5109  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5110  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5111  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5112  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5113  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5114  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5115  case Intrinsic::amdgcn_buffer_atomic_fadd:
5116  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5117  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5118  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5119  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5120  return legalizeBufferAtomic(MI, B, IntrID);
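// Editor's note (not in the original source): every buffer atomic variant
// listed above (swap, add/sub, signed/unsigned min/max, and/or/xor, inc/dec,
// cmpswap and the floating-point fadd/fmin/fmax, in raw and struct forms) is
// handled by a single legalizeBufferAtomic, which switches on IntrID again to
// build the matching buffer-atomic pseudo.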
5121  case Intrinsic::amdgcn_atomic_inc:
5122  return legalizeAtomicIncDec(MI, B, true);
5123  case Intrinsic::amdgcn_atomic_dec:
5124  return legalizeAtomicIncDec(MI, B, false);
5125  case Intrinsic::trap:
5126  return legalizeTrapIntrinsic(MI, MRI, B);
5127  case Intrinsic::debugtrap:
5128  return legalizeDebugTrapIntrinsic(MI, MRI, B);
5129  case Intrinsic::amdgcn_rsq_clamp:
5130  return legalizeRsqClampIntrinsic(MI, MRI, B);
5131  case Intrinsic::amdgcn_ds_fadd:
5132  case Intrinsic::amdgcn_ds_fmin:
5133  case Intrinsic::amdgcn_ds_fmax:
5134  return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5135  case Intrinsic::amdgcn_image_bvh_intersect_ray:
5136  return legalizeBVHIntrinsic(MI, B);
5137  default: {
5138  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5139  AMDGPU::getImageDimIntrinsicInfo(IntrID))
5140  return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5141  return true;
5142  }
5143  }
5144 
5145  return true;
5146 }
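The dispatch above mixes three shapes of custom legalization: fold the
intrinsic to a constant (amdgcn_wavefrontsize), rewrite the surrounding
control flow (amdgcn_if/else/loop), and delegate to a per-family helper
(buffer loads, stores, atomics, images). As a reading aid only, here is an
editor's sketch of the smallest of those shapes, using the same
MachineIRBuilder calls that appear in the listing; lowerToConstantExample is a
hypothetical name and is not part of this file.

// Minimal sketch, assuming the headers already included at the top of this
// file. Replace a single-result G_INTRINSIC with a known constant value.
static bool lowerToConstantExample(MachineInstr &MI, MachineIRBuilder &B,
                                   uint64_t Value) {
  // Write the known value into the intrinsic's result register.
  B.buildConstant(MI.getOperand(0).getReg(), Value);
  // Drop the now-dead G_INTRINSIC so nothing tries to re-legalize it.
  MI.eraseFromParent();
  return true;
}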