1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40  "amdgpu-global-isel-new-legality",
41  cl::desc("Use GlobalISel desired legality, rather than try to use "
42  "rules compatible with selection patterns"),
43  cl::init(false),
44  cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(LLT Ty) {
50  unsigned NElts = Ty.getNumElements();
51  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52  return Ty.changeNumElements(Pow2NElts);
53 }
54 
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(LLT Ty) {
57  unsigned Bits = Ty.getSizeInBits();
58  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59  return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd-sized vector which should be widened by adding
63 /// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66  return [=](const LegalityQuery &Query) {
67  const LLT Ty = Query.Types[TypeIdx];
68  if (!Ty.isVector())
69  return false;
70 
71  const LLT EltTy = Ty.getElementType();
72  const unsigned EltSize = EltTy.getSizeInBits();
73  return Ty.getNumElements() % 2 != 0 &&
74  EltSize > 1 && EltSize < 32 &&
75  Ty.getSizeInBits() % 32 != 0;
76  };
77 }
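// Illustrative examples of the predicate above (derived directly from its
// checks): <3 x s16> (odd element count, 16-bit elements, 48 bits total)
// qualifies and will be widened to <4 x s16>; <3 x s32> does not (32-bit
// elements); <3 x s1> does not (s1 vectors are scalarized instead).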
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80  return [=](const LegalityQuery &Query) {
81  const LLT Ty = Query.Types[TypeIdx];
82  return Ty.getSizeInBits() % 32 == 0;
83  };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87  return [=](const LegalityQuery &Query) {
88  const LLT Ty = Query.Types[TypeIdx];
89  const LLT EltTy = Ty.getScalarType();
90  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91  };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95  return [=](const LegalityQuery &Query) {
96  const LLT Ty = Query.Types[TypeIdx];
97  const LLT EltTy = Ty.getElementType();
98  return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99  };
100 }
101 
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103  return [=](const LegalityQuery &Query) {
104  const LLT Ty = Query.Types[TypeIdx];
105  const LLT EltTy = Ty.getElementType();
106  unsigned Size = Ty.getSizeInBits();
107  unsigned Pieces = (Size + 63) / 64;
108  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109  return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110  };
111 }
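// Worked example of the arithmetic above: for <5 x s32> (160 bits),
// Pieces = (160 + 63) / 64 = 3 and NewNumElts = (5 + 1) / 3 = 2, so the
// mutation requests 64-bit <2 x s32> pieces.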
112 
113 // Increase the number of vector elements to reach the next multiple of 32-bit
114 // type.
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116  return [=](const LegalityQuery &Query) {
117  const LLT Ty = Query.Types[TypeIdx];
118 
119  const LLT EltTy = Ty.getElementType();
120  const int Size = Ty.getSizeInBits();
121  const int EltSize = EltTy.getSizeInBits();
122  const int NextMul32 = (Size + 31) / 32;
123 
124  assert(EltSize < 32);
125 
126  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127  return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128  };
129 }
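// Worked example: <3 x s8> is 24 bits, so NextMul32 = 1 and
// NewNumElts = (32 + 7) / 8 = 4, giving <4 x s8> (32 bits). Similarly
// <5 x s16> (80 bits) becomes <6 x s16> (96 bits).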
130 
131 static LLT getBitcastRegisterType(const LLT Ty) {
132  const unsigned Size = Ty.getSizeInBits();
133 
134  LLT CoercedTy;
135  if (Size <= 32) {
136  // <2 x s8> -> s16
137  // <4 x s8> -> s32
138  return LLT::scalar(Size);
139  }
140 
141  return LLT::scalarOrVector(Size / 32, 32);
142 }
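// For sizes above 32 bits this produces a vector of 32-bit elements, e.g.
// <4 x s16> (64 bits) -> <2 x s32> and <6 x s16> (96 bits) -> <3 x s32>.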
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145  return [=](const LegalityQuery &Query) {
146  const LLT Ty = Query.Types[TypeIdx];
147  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148  };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152  return [=](const LegalityQuery &Query) {
153  const LLT Ty = Query.Types[TypeIdx];
154  unsigned Size = Ty.getSizeInBits();
155  assert(Size % 32 == 0);
156  return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157  };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161  return [=](const LegalityQuery &Query) {
162  const LLT QueryTy = Query.Types[TypeIdx];
163  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164  };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168  return [=](const LegalityQuery &Query) {
169  const LLT QueryTy = Query.Types[TypeIdx];
170  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171  };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175  return [=](const LegalityQuery &Query) {
176  const LLT QueryTy = Query.Types[TypeIdx];
177  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178  };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182  return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186  const int EltSize = EltTy.getSizeInBits();
187  return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191  const int EltSize = Ty.getElementType().getSizeInBits();
192  return EltSize == 32 || EltSize == 64 ||
193  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194  EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198  if (!isRegisterSize(Ty.getSizeInBits()))
199  return false;
200 
201  if (Ty.isVector())
202  return isRegisterVectorType(Ty);
203 
204  return true;
205 }
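// Examples following the rules above: s96, <3 x s32> and <6 x s16> are
// register types; <3 x s16> (48 bits) and <4 x s8> (8-bit elements) are not.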
206 
207 // Any combination of 32 or 64-bit elements up to the maximum register size, and
208 // multiples of v2s16.
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210  return [=](const LegalityQuery &Query) {
211  return isRegisterType(Query.Types[TypeIdx]);
212  };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216  return [=](const LegalityQuery &Query) {
217  const LLT QueryTy = Query.Types[TypeIdx];
218  if (!QueryTy.isVector())
219  return false;
220  const LLT EltTy = QueryTy.getElementType();
221  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222  };
223 }
224 
225 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
226  return [=](const LegalityQuery &Query) {
227  const LLT Ty = Query.Types[TypeIdx];
228  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
229  Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
230  };
231 }
232 
233 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
234 // handle some operations by just promoting the register during
235 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
236 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
237  bool IsLoad) {
238  switch (AS) {
239  case AMDGPUAS::PRIVATE_ADDRESS:
240  // FIXME: Private element size.
241  return ST.enableFlatScratch() ? 128 : 32;
242  case AMDGPUAS::LOCAL_ADDRESS:
243  return ST.useDS128() ? 128 : 64;
244  case AMDGPUAS::GLOBAL_ADDRESS:
245  case AMDGPUAS::CONSTANT_ADDRESS:
246  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
247  // Treat constant and global as identical. SMRD loads are sometimes usable for
248  // global loads (ideally constant address space should be eliminated)
249  // depending on the context. Legality cannot be context dependent, but
250  // RegBankSelect can split the load as necessary depending on the pointer
251  // register bank/uniformity and if the memory is invariant or not written in a
252  // kernel.
253  return IsLoad ? 512 : 128;
254  default:
255  // Flat addresses may contextually need to be split to 32-bit parts if they
256  // may alias scratch depending on the subtarget.
257  return 128;
258  }
259 }
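// Illustrative consequence of the limits above: a 256-bit access to LDS gets
// broken into 64-bit (or 128-bit with DS128) pieces, while a 256-bit global
// or constant load stays whole since it is under the 512-bit load limit.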
260 
261 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
262  const LegalityQuery &Query,
263  unsigned Opcode) {
264  const LLT Ty = Query.Types[0];
265 
266  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
267  const bool IsLoad = Opcode != AMDGPU::G_STORE;
268 
269  unsigned RegSize = Ty.getSizeInBits();
270  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
271  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
272  unsigned AS = Query.Types[1].getAddressSpace();
273 
274  // All of these need to be custom lowered to cast the pointer operand.
275  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
276  return false;
277 
278  // TODO: We should be able to widen loads if the alignment is high enough, but
279  // we also need to modify the memory access size.
280 #if 0
281  // Accept widening loads based on alignment.
282  if (IsLoad && MemSize < Size)
283  MemSize = std::max(MemSize, Align);
284 #endif
285 
286  // Only 1-byte and 2-byte to 32-bit extloads are valid.
287  if (MemSize != RegSize && RegSize != 32)
288  return false;
289 
290  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
291  return false;
292 
293  switch (MemSize) {
294  case 8:
295  case 16:
296  case 32:
297  case 64:
298  case 128:
299  break;
300  case 96:
301  if (!ST.hasDwordx3LoadStores())
302  return false;
303  break;
304  case 256:
305  case 512:
306  // These may contextually need to be broken down.
307  break;
308  default:
309  return false;
310  }
311 
312  assert(RegSize >= MemSize);
313 
314  if (AlignBits < MemSize) {
315  const SITargetLowering *TLI = ST.getTargetLowering();
316  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
317  Align(AlignBits / 8)))
318  return false;
319  }
320 
321  return true;
322 }
323 
324 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
325 // work around this. Eventually it should ignore the type for loads and only care
326 // about the size. Return true in cases where we will work around this for now by
327 // bitcasting.
328 static bool loadStoreBitcastWorkaround(const LLT Ty) {
329  if (EnableNewLegality)
330  return false;
331 
332  const unsigned Size = Ty.getSizeInBits();
333  if (Size <= 64)
334  return false;
335  if (!Ty.isVector())
336  return true;
337 
338  LLT EltTy = Ty.getElementType();
339  if (EltTy.isPointer())
340  return true;
341 
342  unsigned EltSize = EltTy.getSizeInBits();
343  return EltSize != 32 && EltSize != 64;
344 }
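// With the flag off, e.g. a 96-bit <6 x s16> access reports true and is
// handled via bitcast below, while <4 x s32> and <2 x s64> report false and
// go through the normal legality rules.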
345 
346 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
347  unsigned Opcode) {
348  const LLT Ty = Query.Types[0];
349  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
350  !loadStoreBitcastWorkaround(Ty);
351 }
352 
353 /// Return true if a load or store of the type should be lowered with a bitcast
354 /// to a different type.
355 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
356  const unsigned MemSizeInBits) {
357  const unsigned Size = Ty.getSizeInBits();
358  if (Size != MemSizeInBits)
359  return Size <= 32 && Ty.isVector();
360 
361  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
362  return true;
363  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
364  !isRegisterVectorElementType(Ty.getElementType());
365 }
366 
367 /// Return true if we should legalize a load by widening an odd-sized memory
368 /// access up to the alignment. Note that in this case the memory access itself
369 /// changes, not the size of the result register.
370 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
371  unsigned AlignInBits, unsigned AddrSpace,
372  unsigned Opcode) {
373  // We don't want to widen cases that are naturally legal.
374  if (isPowerOf2_32(SizeInBits))
375  return false;
376 
377  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
378  // end up widening these for a scalar load during RegBankSelect, since there
379  // aren't 96-bit scalar loads.
380  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
381  return false;
382 
383  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
384  return false;
385 
386  // A load is known dereferenceable up to the alignment, so it's legal to widen
387  // to it.
388  //
389  // TODO: Could check dereferenceable for less aligned cases.
390  unsigned RoundedSize = NextPowerOf2(SizeInBits);
391  if (AlignInBits < RoundedSize)
392  return false;
393 
394  // Do not widen if it would introduce a slow unaligned load.
395  const SITargetLowering *TLI = ST.getTargetLowering();
396  bool Fast = false;
397  return TLI->allowsMisalignedMemoryAccessesImpl(
398  RoundedSize, AddrSpace, Align(AlignInBits / 8),
399  MachineMemOperand::MOLoad, &Fast) &&
400  Fast;
401 }
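// Example of the policy above: a 96-bit load with 128-bit alignment on a
// subtarget without dwordx3 load/stores is widened to a 128-bit load
// (provided the target allows that access); with dwordx3 support the 96-bit
// load is left untouched.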
402 
403 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
404  unsigned Opcode) {
405  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
406  return false;
407 
408  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
409  Query.MMODescrs[0].AlignInBits,
410  Query.Types[1].getAddressSpace(), Opcode);
411 }
412 
413 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
414  const GCNTargetMachine &TM)
415  : ST(ST_) {
416  using namespace TargetOpcode;
417 
418  auto GetAddrSpacePtr = [&TM](unsigned AS) {
419  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
420  };
421 
422  const LLT S1 = LLT::scalar(1);
423  const LLT S8 = LLT::scalar(8);
424  const LLT S16 = LLT::scalar(16);
425  const LLT S32 = LLT::scalar(32);
426  const LLT S64 = LLT::scalar(64);
427  const LLT S128 = LLT::scalar(128);
428  const LLT S256 = LLT::scalar(256);
429  const LLT S512 = LLT::scalar(512);
430  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
431 
432  const LLT V2S8 = LLT::vector(2, 8);
433  const LLT V2S16 = LLT::vector(2, 16);
434  const LLT V4S16 = LLT::vector(4, 16);
435 
436  const LLT V2S32 = LLT::vector(2, 32);
437  const LLT V3S32 = LLT::vector(3, 32);
438  const LLT V4S32 = LLT::vector(4, 32);
439  const LLT V5S32 = LLT::vector(5, 32);
440  const LLT V6S32 = LLT::vector(6, 32);
441  const LLT V7S32 = LLT::vector(7, 32);
442  const LLT V8S32 = LLT::vector(8, 32);
443  const LLT V9S32 = LLT::vector(9, 32);
444  const LLT V10S32 = LLT::vector(10, 32);
445  const LLT V11S32 = LLT::vector(11, 32);
446  const LLT V12S32 = LLT::vector(12, 32);
447  const LLT V13S32 = LLT::vector(13, 32);
448  const LLT V14S32 = LLT::vector(14, 32);
449  const LLT V15S32 = LLT::vector(15, 32);
450  const LLT V16S32 = LLT::vector(16, 32);
451  const LLT V32S32 = LLT::vector(32, 32);
452 
453  const LLT V2S64 = LLT::vector(2, 64);
454  const LLT V3S64 = LLT::vector(3, 64);
455  const LLT V4S64 = LLT::vector(4, 64);
456  const LLT V5S64 = LLT::vector(5, 64);
457  const LLT V6S64 = LLT::vector(6, 64);
458  const LLT V7S64 = LLT::vector(7, 64);
459  const LLT V8S64 = LLT::vector(8, 64);
460  const LLT V16S64 = LLT::vector(16, 64);
461 
462  std::initializer_list<LLT> AllS32Vectors =
463  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
464  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
465  std::initializer_list<LLT> AllS64Vectors =
466  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
467 
468  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
469  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
470  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
471  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
472  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
473  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
474  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
475 
476  const LLT CodePtr = FlatPtr;
477 
478  const std::initializer_list<LLT> AddrSpaces64 = {
479  GlobalPtr, ConstantPtr, FlatPtr
480  };
481 
482  const std::initializer_list<LLT> AddrSpaces32 = {
483  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484  };
485 
486  const std::initializer_list<LLT> FPTypesBase = {
487  S32, S64
488  };
489 
490  const std::initializer_list<LLT> FPTypes16 = {
491  S32, S64, S16
492  };
493 
494  const std::initializer_list<LLT> FPTypesPK16 = {
495  S32, S64, S16, V2S16
496  };
497 
498  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
499 
500  setAction({G_BRCOND, S1}, Legal); // VCC branches
501  setAction({G_BRCOND, S32}, Legal); // SCC branches
502 
503  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
504 // elements for v3s16
505  getActionDefinitionsBuilder(G_PHI)
506  .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
507  .legalFor(AllS32Vectors)
508  .legalFor(AllS64Vectors)
509  .legalFor(AddrSpaces64)
510  .legalFor(AddrSpaces32)
511  .legalIf(isPointer(0))
512  .clampScalar(0, S16, S256)
513  .widenScalarToNextPow2(0, 32)
514  .clampMaxNumElements(0, S32, 16)
515  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516  .scalarize(0);
517 
518  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519  // Full set of gfx9 features.
520  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
521  .legalFor({S32, S16, V2S16})
522  .clampScalar(0, S16, S32)
523  .clampMaxNumElements(0, S16, 2)
524  .scalarize(0)
525  .widenScalarToNextPow2(0, 32);
526 
527  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
528  .legalFor({S32, S16, V2S16}) // Clamp modifier
529  .minScalarOrElt(0, S16)
530  .clampMaxNumElements(0, S16, 2)
531  .scalarize(0)
532  .widenScalarToNextPow2(0, 32)
533  .lower();
534  } else if (ST.has16BitInsts()) {
535  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
536  .legalFor({S32, S16})
537  .clampScalar(0, S16, S32)
538  .scalarize(0)
539  .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
540 
541  // Technically the saturating operations require clamp bit support, but this
542  // was introduced at the same time as 16-bit operations.
543  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
544  .legalFor({S32, S16}) // Clamp modifier
545  .minScalar(0, S16)
546  .scalarize(0)
547  .widenScalarToNextPow2(0, 16)
548  .lower();
549 
550  // We're just lowering this, but it helps get a better result to try to
551  // coerce to the desired type first.
552  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
553  .minScalar(0, S16)
554  .scalarize(0)
555  .lower();
556  } else {
557  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
558  .legalFor({S32})
559  .clampScalar(0, S32, S32)
560  .scalarize(0);
561 
562  if (ST.hasIntClamp()) {
563  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
564  .legalFor({S32}) // Clamp modifier.
565  .scalarize(0)
566  .minScalarOrElt(0, S32)
567  .lower();
568  } else {
569  // Clamp bit support was added in VI, along with 16-bit operations.
570  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571  .minScalar(0, S32)
572  .scalarize(0)
573  .lower();
574  }
575 
576  // FIXME: DAG expansion gets better results. The widening uses the smaller
577  // range values and goes for the min/max lowering directly.
578  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
579  .minScalar(0, S32)
580  .scalarize(0)
581  .lower();
582  }
583 
584  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
585  .customFor({S32, S64})
586  .clampScalar(0, S32, S64)
587  .widenScalarToNextPow2(0, 32)
588  .scalarize(0);
589 
590  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
591  .legalFor({S32})
592  .maxScalarOrElt(0, S32);
593 
594  if (ST.hasVOP3PInsts()) {
595  Mulh
596  .clampMaxNumElements(0, S8, 2)
597  .lowerFor({V2S8});
598  }
599 
600  Mulh
601  .scalarize(0)
602  .lower();
603 
604  // Report legal for any types we can handle anywhere. For the cases only legal
605  // on the SALU, RegBankSelect will be able to re-legalize.
606  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
607  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
608  .clampScalar(0, S32, S64)
609  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
610  .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
611  .widenScalarToNextPow2(0)
612  .scalarize(0);
613 
614  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
615  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
616  .legalFor({{S32, S1}, {S32, S32}})
617  .minScalar(0, S32)
618  // TODO: .scalarize(0)
619  .lower();
620 
621  getActionDefinitionsBuilder(G_BITCAST)
622  // Don't worry about the size constraint.
623  .legalIf(all(isRegisterType(0), isRegisterType(1)))
624  .lower();
625 
626 
627  getActionDefinitionsBuilder(G_CONSTANT)
628  .legalFor({S1, S32, S64, S16, GlobalPtr,
629  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
630  .legalIf(isPointer(0))
631  .clampScalar(0, S32, S64)
632  .widenScalarToNextPow2(0);
633 
634  getActionDefinitionsBuilder(G_FCONSTANT)
635  .legalFor({S32, S64, S16})
636  .clampScalar(0, S16, S64);
637 
638  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
639  .legalIf(isRegisterType(0))
640  // s1 and s16 are special cases because they have legal operations on
641  // them, but don't really occupy registers in the normal way.
642  .legalFor({S1, S16})
643  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644  .clampScalarOrElt(0, S32, MaxScalar)
645  .widenScalarToNextPow2(0, 32)
646  .clampMaxNumElements(0, S32, 16);
647 
648  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
649 
650  // If the amount is divergent, we have to do a wave reduction to get the
651  // maximum value, so this is expanded during RegBankSelect.
652  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
653  .legalFor({{PrivatePtr, S32}});
654 
655  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
656  .customIf(typeIsNot(0, PrivatePtr));
657 
658  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
659 
660  auto &FPOpActions = getActionDefinitionsBuilder(
661  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
662  .legalFor({S32, S64});
663  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
664  .customFor({S32, S64});
665  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
666  .customFor({S32, S64});
667 
668  if (ST.has16BitInsts()) {
669  if (ST.hasVOP3PInsts())
670  FPOpActions.legalFor({S16, V2S16});
671  else
672  FPOpActions.legalFor({S16});
673 
674  TrigActions.customFor({S16});
675  FDIVActions.customFor({S16});
676  }
677 
678  auto &MinNumMaxNum = getActionDefinitionsBuilder({
679  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
680 
681  if (ST.hasVOP3PInsts()) {
682  MinNumMaxNum.customFor(FPTypesPK16)
684  .clampMaxNumElements(0, S16, 2)
685  .clampScalar(0, S16, S64)
686  .scalarize(0);
687  } else if (ST.has16BitInsts()) {
688  MinNumMaxNum.customFor(FPTypes16)
689  .clampScalar(0, S16, S64)
690  .scalarize(0);
691  } else {
692  MinNumMaxNum.customFor(FPTypesBase)
693  .clampScalar(0, S32, S64)
694  .scalarize(0);
695  }
696 
697  if (ST.hasVOP3PInsts())
698  FPOpActions.clampMaxNumElements(0, S16, 2);
699 
700  FPOpActions
701  .scalarize(0)
702  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
703 
704  TrigActions
705  .scalarize(0)
706  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
707 
708  FDIVActions
709  .scalarize(0)
710  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711 
712  getActionDefinitionsBuilder({G_FNEG, G_FABS})
713  .legalFor(FPTypesPK16)
714  .clampMaxNumElements(0, S16, 2)
715  .scalarize(0)
716  .clampScalar(0, S16, S64);
717 
718  if (ST.has16BitInsts()) {
719  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
720  .legalFor({S32, S64, S16})
721  .scalarize(0)
722  .clampScalar(0, S16, S64);
723  } else {
724  getActionDefinitionsBuilder(G_FSQRT)
725  .legalFor({S32, S64})
726  .scalarize(0)
727  .clampScalar(0, S32, S64);
728 
729  if (ST.hasFractBug()) {
730  getActionDefinitionsBuilder(G_FFLOOR)
731  .customFor({S64})
732  .legalFor({S32, S64})
733  .scalarize(0)
734  .clampScalar(0, S32, S64);
735  } else {
736  getActionDefinitionsBuilder(G_FFLOOR)
737  .legalFor({S32, S64})
738  .scalarize(0)
739  .clampScalar(0, S32, S64);
740  }
741  }
742 
743  getActionDefinitionsBuilder(G_FPTRUNC)
744  .legalFor({{S32, S64}, {S16, S32}})
745  .scalarize(0)
746  .lower();
747 
748  getActionDefinitionsBuilder(G_FPEXT)
749  .legalFor({{S64, S32}, {S32, S16}})
750  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
751  .scalarize(0);
752 
753  getActionDefinitionsBuilder(G_FSUB)
754  // Use actual fsub instruction
755  .legalFor({S32})
756  // Must use fadd + fneg
757  .lowerFor({S64, S16, V2S16})
758  .scalarize(0)
759  .clampScalar(0, S32, S64);
760 
761  // Whether this is legal depends on the floating point mode for the function.
762  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
763  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
764  FMad.customFor({S32, S16});
765  else if (ST.hasMadMacF32Insts())
766  FMad.customFor({S32});
767  else if (ST.hasMadF16())
768  FMad.customFor({S16});
769  FMad.scalarize(0)
770  .lower();
771 
772  auto &FRem = getActionDefinitionsBuilder(G_FREM);
773  if (ST.has16BitInsts()) {
774  FRem.customFor({S16, S32, S64});
775  } else {
776  FRem.minScalar(0, S32)
777  .customFor({S32, S64});
778  }
779  FRem.scalarize(0);
780 
781  // TODO: Do we need to clamp maximum bitwidth?
782  getActionDefinitionsBuilder(G_TRUNC)
783  .legalIf(isScalar(0))
784  .legalFor({{V2S16, V2S32}})
785  .clampMaxNumElements(0, S16, 2)
786  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
787  // situations (like an invalid implicit use), we don't want to infinite loop
788  // in the legalizer.
790  .alwaysLegal();
791 
792  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
793  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
794  {S32, S1}, {S64, S1}, {S16, S1}})
795  .scalarize(0)
796  .clampScalar(0, S32, S64)
797  .widenScalarToNextPow2(1, 32);
798 
799  // TODO: Split s1->s64 during regbankselect for VALU.
800  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
801  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
802  .lowerFor({{S32, S64}})
803  .lowerIf(typeIs(1, S1))
804  .customFor({{S64, S64}});
805  if (ST.has16BitInsts())
806  IToFP.legalFor({{S16, S16}});
807  IToFP.clampScalar(1, S32, S64)
808  .minScalar(0, S32)
809  .scalarize(0)
810  .widenScalarToNextPow2(1);
811 
812  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
813  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
814  .customFor({{S64, S64}})
815  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
816  if (ST.has16BitInsts())
817  FPToI.legalFor({{S16, S16}});
818  else
819  FPToI.minScalar(1, S32);
820 
821  FPToI.minScalar(0, S32)
822  .widenScalarToNextPow2(0, 32)
823  .scalarize(0)
824  .lower();
825 
826  // Lower roundeven into G_FRINT
827  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
828  .scalarize(0)
829  .lower();
830 
831  if (ST.has16BitInsts()) {
832  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
833  .legalFor({S16, S32, S64})
834  .clampScalar(0, S16, S64)
835  .scalarize(0);
836  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
837  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
838  .legalFor({S32, S64})
839  .clampScalar(0, S32, S64)
840  .scalarize(0);
841  } else {
842  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
843  .legalFor({S32})
844  .customFor({S64})
845  .clampScalar(0, S32, S64)
846  .scalarize(0);
847  }
848 
849  getActionDefinitionsBuilder(G_PTR_ADD)
850  .legalIf(all(isPointer(0), sameSize(0, 1)))
851  .scalarize(0)
852  .scalarSameSizeAs(1, 0);
853 
854  getActionDefinitionsBuilder(G_PTRMASK)
855  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
856  .scalarSameSizeAs(1, 0)
857  .scalarize(0);
858 
859  auto &CmpBuilder =
860  getActionDefinitionsBuilder(G_ICMP)
861  // The compare output type differs based on the register bank of the output,
862  // so make both s1 and s32 legal.
863  //
864  // Scalar compares producing output in scc will be promoted to s32, as that
865  // is the allocatable register type that will be needed for the copy from
866  // scc. This will be promoted during RegBankSelect, and we assume something
867  // before that won't try to use s32 result types.
868  //
869  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
870  // bank.
871  .legalForCartesianProduct(
872  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
873  .legalForCartesianProduct(
874  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
875  if (ST.has16BitInsts()) {
876  CmpBuilder.legalFor({{S1, S16}});
877  }
878 
879  CmpBuilder
880  .widenScalarToNextPow2(1)
881  .clampScalar(1, S32, S64)
882  .scalarize(0)
883  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
884 
885  getActionDefinitionsBuilder(G_FCMP)
886  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
887  .widenScalarToNextPow2(1)
888  .clampScalar(1, S32, S64)
889  .scalarize(0);
890 
891  // FIXME: fpow has a selection pattern that should move to custom lowering.
892  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
893  if (ST.has16BitInsts())
894  Exp2Ops.legalFor({S32, S16});
895  else
896  Exp2Ops.legalFor({S32});
897  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
898  Exp2Ops.scalarize(0);
899 
900  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
901  if (ST.has16BitInsts())
902  ExpOps.customFor({{S32}, {S16}});
903  else
904  ExpOps.customFor({S32});
905  ExpOps.clampScalar(0, MinScalarFPTy, S32)
906  .scalarize(0);
907 
908  getActionDefinitionsBuilder(G_FPOWI)
909  .clampScalar(0, MinScalarFPTy, S32)
910  .lower();
911 
912  // The 64-bit versions produce 32-bit results, but only on the SALU.
913  getActionDefinitionsBuilder(G_CTPOP)
914  .legalFor({{S32, S32}, {S32, S64}})
915  .clampScalar(0, S32, S32)
916  .clampScalar(1, S32, S64)
917  .scalarize(0)
918  .widenScalarToNextPow2(0, 32)
919  .widenScalarToNextPow2(1, 32);
920 
921  // The hardware instructions return a different result on 0 than the generic
922  // instructions expect. The hardware produces -1, but these produce the
923  // bitwidth.
924  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
925  .scalarize(0)
926  .clampScalar(0, S32, S32)
927  .clampScalar(1, S32, S64)
928  .widenScalarToNextPow2(0, 32)
929  .widenScalarToNextPow2(1, 32)
930  .lower();
931 
932  // The 64-bit versions produce 32-bit results, but only on the SALU.
933  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
934  .legalFor({{S32, S32}, {S32, S64}})
935  .clampScalar(0, S32, S32)
936  .clampScalar(1, S32, S64)
937  .scalarize(0)
938  .widenScalarToNextPow2(0, 32)
939  .widenScalarToNextPow2(1, 32);
940 
941  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
942  // RegBankSelect.
943  getActionDefinitionsBuilder(G_BITREVERSE)
944  .legalFor({S32, S64})
945  .clampScalar(0, S32, S64)
946  .scalarize(0)
947  .widenScalarToNextPow2(0);
948 
949  if (ST.has16BitInsts()) {
950  getActionDefinitionsBuilder(G_BSWAP)
951  .legalFor({S16, S32, V2S16})
952  .clampMaxNumElements(0, S16, 2)
953  // FIXME: Fixing non-power-of-2 before clamp is workaround for
954  // narrowScalar limitation.
955  .widenScalarToNextPow2(0)
956  .clampScalar(0, S16, S32)
957  .scalarize(0);
958 
959  if (ST.hasVOP3PInsts()) {
960  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
961  .legalFor({S32, S16, V2S16})
962  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
963  .clampMaxNumElements(0, S16, 2)
964  .minScalar(0, S16)
965  .widenScalarToNextPow2(0)
966  .scalarize(0)
967  .lower();
968  } else {
969  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
970  .legalFor({S32, S16})
971  .widenScalarToNextPow2(0)
972  .minScalar(0, S16)
973  .scalarize(0)
974  .lower();
975  }
976  } else {
977  // TODO: Should have same legality without v_perm_b32
978  getActionDefinitionsBuilder(G_BSWAP)
979  .legalFor({S32})
980  .lowerIf(scalarNarrowerThan(0, 32))
981  // FIXME: Fixing non-power-of-2 before clamp is workaround for
982  // narrowScalar limitation.
983  .widenScalarToNextPow2(0)
984  .maxScalar(0, S32)
985  .scalarize(0)
986  .lower();
987 
988  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
989  .legalFor({S32})
990  .minScalar(0, S32)
991  .widenScalarToNextPow2(0)
992  .scalarize(0)
993  .lower();
994  }
995 
996  getActionDefinitionsBuilder(G_INTTOPTR)
997  // List the common cases
998  .legalForCartesianProduct(AddrSpaces64, {S64})
999  .legalForCartesianProduct(AddrSpaces32, {S32})
1000  .scalarize(0)
1001  // Accept any address space as long as the size matches
1002  .legalIf(sameSize(0, 1))
1003  .widenScalarIf(smallerThan(1, 0),
1004  [](const LegalityQuery &Query) {
1005  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1006  })
1007  .narrowScalarIf(largerThan(1, 0),
1008  [](const LegalityQuery &Query) {
1009  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1010  });
1011 
1012  getActionDefinitionsBuilder(G_PTRTOINT)
1013  // List the common cases
1014  .legalForCartesianProduct(AddrSpaces64, {S64})
1015  .legalForCartesianProduct(AddrSpaces32, {S32})
1016  .scalarize(0)
1017  // Accept any address space as long as the size matches
1018  .legalIf(sameSize(0, 1))
1019  .widenScalarIf(smallerThan(0, 1),
1020  [](const LegalityQuery &Query) {
1021  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1022  })
1023  .narrowScalarIf(
1024  largerThan(0, 1),
1025  [](const LegalityQuery &Query) {
1026  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1027  });
1028 
1029  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1030  .scalarize(0)
1031  .custom();
1032 
1033  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1034  bool IsLoad) -> bool {
1035  const LLT DstTy = Query.Types[0];
1036 
1037  // Split vector extloads.
1038  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1039  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1040 
1041  if (MemSize < DstTy.getSizeInBits())
1042  MemSize = std::max(MemSize, AlignBits);
1043 
1044  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1045  return true;
1046 
1047  const LLT PtrTy = Query.Types[1];
1048  unsigned AS = PtrTy.getAddressSpace();
1049  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1050  return true;
1051 
1052  // Catch weird sized loads that don't evenly divide into the access sizes
1053  // TODO: May be able to widen depending on alignment etc.
1054  unsigned NumRegs = (MemSize + 31) / 32;
1055  if (NumRegs == 3) {
1056  if (!ST.hasDwordx3LoadStores())
1057  return true;
1058  } else {
1059  // If the alignment allows, these should have been widened.
1060  if (!isPowerOf2_32(NumRegs))
1061  return true;
1062  }
1063 
1064  if (AlignBits < MemSize) {
1065  const SITargetLowering *TLI = ST.getTargetLowering();
1066  return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1067  Align(AlignBits / 8));
1068  }
1069 
1070  return false;
1071  };
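  // Rough summary of the lambda above: vector extloads (memory narrower than
  // the result), accesses over the per-address-space limit, 96-bit (3-dword)
  // accesses without dwordx3 support, and under-aligned accesses the target
  // rejects all need to be split.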
1072 
1073  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1074  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1075  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1076 
1077  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1078  // LDS
1079  // TODO: Unsupported flat for SI.
1080 
1081  for (unsigned Op : {G_LOAD, G_STORE}) {
1082  const bool IsStore = Op == G_STORE;
1083 
1084  auto &Actions = getActionDefinitionsBuilder(Op);
1085  // Explicitly list some common cases.
1086  // TODO: Does this help compile time at all?
1087  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1088  {V2S32, GlobalPtr, 64, GlobalAlign32},
1089  {V4S32, GlobalPtr, 128, GlobalAlign32},
1090  {S64, GlobalPtr, 64, GlobalAlign32},
1091  {V2S64, GlobalPtr, 128, GlobalAlign32},
1092  {V2S16, GlobalPtr, 32, GlobalAlign32},
1093  {S32, GlobalPtr, 8, GlobalAlign8},
1094  {S32, GlobalPtr, 16, GlobalAlign16},
1095 
1096  {S32, LocalPtr, 32, 32},
1097  {S64, LocalPtr, 64, 32},
1098  {V2S32, LocalPtr, 64, 32},
1099  {S32, LocalPtr, 8, 8},
1100  {S32, LocalPtr, 16, 16},
1101  {V2S16, LocalPtr, 32, 32},
1102 
1103  {S32, PrivatePtr, 32, 32},
1104  {S32, PrivatePtr, 8, 8},
1105  {S32, PrivatePtr, 16, 16},
1106  {V2S16, PrivatePtr, 32, 32},
1107 
1108  {S32, ConstantPtr, 32, GlobalAlign32},
1109  {V2S32, ConstantPtr, 64, GlobalAlign32},
1110  {V4S32, ConstantPtr, 128, GlobalAlign32},
1111  {S64, ConstantPtr, 64, GlobalAlign32},
1112  {V2S32, ConstantPtr, 32, GlobalAlign32}});
1113  Actions.legalIf(
1114  [=](const LegalityQuery &Query) -> bool {
1115  return isLoadStoreLegal(ST, Query, Op);
1116  });
1117 
1118  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1119  // 64-bits.
1120  //
1121  // TODO: Should generalize bitcast action into coerce, which will also cover
1122  // inserting addrspacecasts.
1123  Actions.customIf(typeIs(1, Constant32Ptr));
1124 
1125  // Turn any illegal element vectors into something easier to deal
1126  // with. These will ultimately produce 32-bit scalar shifts to extract the
1127  // parts anyway.
1128  //
1129  // For odd 16-bit element vectors, prefer to split those into pieces with
1130  // 16-bit vector parts.
1131  Actions.bitcastIf(
1132  [=](const LegalityQuery &Query) -> bool {
1133  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1134  Query.MMODescrs[0].SizeInBits);
1135  }, bitcastToRegisterType(0));
1136 
1137  if (!IsStore) {
1138  // Widen suitably aligned loads by loading extra bytes. The standard
1139  // legalization actions can't properly express widening memory operands.
1140  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1141  return shouldWidenLoad(ST, Query, G_LOAD);
1142  });
1143  }
1144 
1145  // FIXME: load/store narrowing should be moved to lower action
1146  Actions
1147  .narrowScalarIf(
1148  [=](const LegalityQuery &Query) -> bool {
1149  return !Query.Types[0].isVector() &&
1150  needToSplitMemOp(Query, Op == G_LOAD);
1151  },
1152  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1153  const LLT DstTy = Query.Types[0];
1154  const LLT PtrTy = Query.Types[1];
1155 
1156  const unsigned DstSize = DstTy.getSizeInBits();
1157  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1158 
1159  // Split extloads.
1160  if (DstSize > MemSize)
1161  return std::make_pair(0, LLT::scalar(MemSize));
1162 
1163  if (!isPowerOf2_32(DstSize)) {
1164  // We're probably decomposing an odd sized store. Try to split
1165  // to the widest type. TODO: Account for alignment. As-is it
1166  // should be OK, since the new parts will be further legalized.
1167  unsigned FloorSize = PowerOf2Floor(DstSize);
1168  return std::make_pair(0, LLT::scalar(FloorSize));
1169  }
1170 
1171  if (DstSize > 32 && (DstSize % 32 != 0)) {
1172  // FIXME: Need a way to specify non-extload of larger size if
1173  // suitably aligned.
1174  return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1175  }
1176 
1177  unsigned MaxSize = maxSizeForAddrSpace(ST,
1178  PtrTy.getAddressSpace(),
1179  Op == G_LOAD);
1180  if (MemSize > MaxSize)
1181  return std::make_pair(0, LLT::scalar(MaxSize));
1182 
1183  unsigned Align = Query.MMODescrs[0].AlignInBits;
1184  return std::make_pair(0, LLT::scalar(Align));
1185  })
1186  .fewerElementsIf(
1187  [=](const LegalityQuery &Query) -> bool {
1188  return Query.Types[0].isVector() &&
1189  needToSplitMemOp(Query, Op == G_LOAD);
1190  },
1191  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1192  const LLT DstTy = Query.Types[0];
1193  const LLT PtrTy = Query.Types[1];
1194 
1195  LLT EltTy = DstTy.getElementType();
1196  unsigned MaxSize = maxSizeForAddrSpace(ST,
1197  PtrTy.getAddressSpace(),
1198  Op == G_LOAD);
1199 
1200  // FIXME: Handle widened to power of 2 results better. This ends
1201  // up scalarizing.
1202  // FIXME: 3 element stores scalarized on SI
1203 
1204  // Split if it's too large for the address space.
1205  if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1206  unsigned NumElts = DstTy.getNumElements();
1207  unsigned EltSize = EltTy.getSizeInBits();
1208 
1209  if (MaxSize % EltSize == 0) {
1210  return std::make_pair(
1211  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1212  }
1213 
1214  unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1215 
1216  // FIXME: Refine when odd breakdowns handled
1217  // The scalars will need to be re-legalized.
1218  if (NumPieces == 1 || NumPieces >= NumElts ||
1219  NumElts % NumPieces != 0)
1220  return std::make_pair(0, EltTy);
1221 
1222  return std::make_pair(0,
1223  LLT::vector(NumElts / NumPieces, EltTy));
1224  }
1225 
1226  // FIXME: We could probably handle weird extending loads better.
1227  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1228  if (DstTy.getSizeInBits() > MemSize)
1229  return std::make_pair(0, EltTy);
1230 
1231  unsigned EltSize = EltTy.getSizeInBits();
1232  unsigned DstSize = DstTy.getSizeInBits();
1233  if (!isPowerOf2_32(DstSize)) {
1234  // We're probably decomposing an odd sized store. Try to split
1235  // to the widest type. TODO: Account for alignment. As-is it
1236  // should be OK, since the new parts will be further legalized.
1237  unsigned FloorSize = PowerOf2Floor(DstSize);
1238  return std::make_pair(
1239  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1240  }
1241 
1242  // Need to split because of alignment.
1243  unsigned Align = Query.MMODescrs[0].AlignInBits;
1244  if (EltSize > Align &&
1245  (EltSize / Align < DstTy.getNumElements())) {
1246  return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1247  }
1248 
1249  // May need relegalization for the scalars.
1250  return std::make_pair(0, EltTy);
1251  })
1252  .lowerIfMemSizeNotPow2()
1253  .minScalar(0, S32);
1254 
1255  if (IsStore)
1256  Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1257 
1258  Actions
1259  .widenScalarToNextPow2(0)
1260  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1261  .lower();
1262  }
1263 
1264  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1265  .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1266  {S32, GlobalPtr, 16, 2 * 8},
1267  {S32, LocalPtr, 8, 8},
1268  {S32, LocalPtr, 16, 16},
1269  {S32, PrivatePtr, 8, 8},
1270  {S32, PrivatePtr, 16, 16},
1271  {S32, ConstantPtr, 8, 8},
1272  {S32, ConstantPtr, 16, 2 * 8}});
1273  if (ST.hasFlatAddressSpace()) {
1274  ExtLoads.legalForTypesWithMemDesc(
1275  {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1276  }
1277 
1278  ExtLoads.clampScalar(0, S32, S32)
1279  .widenScalarToNextPow2(0)
1280  .unsupportedIfMemSizeNotPow2()
1281  .lower();
1282 
1283  auto &Atomics = getActionDefinitionsBuilder(
1284  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1285  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1286  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1287  G_ATOMICRMW_UMIN})
1288  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1289  {S64, GlobalPtr}, {S64, LocalPtr},
1290  {S32, RegionPtr}, {S64, RegionPtr}});
1291  if (ST.hasFlatAddressSpace()) {
1292  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1293  }
1294 
1295  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1296  if (ST.hasLDSFPAtomics()) {
1297  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1298  if (ST.hasGFX90AInsts())
1299  Atomic.legalFor({{S64, LocalPtr}});
1300  }
1301  if (ST.hasAtomicFaddInsts())
1302  Atomic.legalFor({{S32, GlobalPtr}});
1303 
1304  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1305  // demarshalling
1306  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1307  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1308  {S32, FlatPtr}, {S64, FlatPtr}})
1309  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1310  {S32, RegionPtr}, {S64, RegionPtr}});
1311  // TODO: Pointer types, any 32-bit or 64-bit vector
1312 
1313  // Condition should be s32 for scalar, s1 for vector.
1314  getActionDefinitionsBuilder(G_SELECT)
1315  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1316  GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1317  LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1318  .clampScalar(0, S16, S64)
1319  .scalarize(1)
1320  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1321  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1322  .clampMaxNumElements(0, S32, 2)
1323  .clampMaxNumElements(0, LocalPtr, 2)
1324  .clampMaxNumElements(0, PrivatePtr, 2)
1325  .scalarize(0)
1326  .widenScalarToNextPow2(0)
1327  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1328 
1329  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1330  // be more flexible with the shift amount type.
1331  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1332  .legalFor({{S32, S32}, {S64, S32}});
1333  if (ST.has16BitInsts()) {
1334  if (ST.hasVOP3PInsts()) {
1335  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1336  .clampMaxNumElements(0, S16, 2);
1337  } else
1338  Shifts.legalFor({{S16, S16}});
1339 
1340  // TODO: Support 16-bit shift amounts for all types
1341  Shifts.widenScalarIf(
1342  [=](const LegalityQuery &Query) {
1343  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1344  // 32-bit amount.
1345  const LLT ValTy = Query.Types[0];
1346  const LLT AmountTy = Query.Types[1];
1347  return ValTy.getSizeInBits() <= 16 &&
1348  AmountTy.getSizeInBits() < 16;
1349  }, changeTo(1, S16));
1350  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1351  Shifts.clampScalar(1, S32, S32);
1352  Shifts.clampScalar(0, S16, S64);
1353  Shifts.widenScalarToNextPow2(0, 16);
1354 
1355  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1356  .minScalar(0, S16)
1357  .scalarize(0)
1358  .lower();
1359  } else {
1360  // Make sure we legalize the shift amount type first, as the general
1361  // expansion for the shifted type will produce much worse code if it hasn't
1362  // been truncated already.
1363  Shifts.clampScalar(1, S32, S32);
1364  Shifts.clampScalar(0, S32, S64);
1365  Shifts.widenScalarToNextPow2(0, 32);
1366 
1367  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1368  .minScalar(0, S32)
1369  .scalarize(0)
1370  .lower();
1371  }
1372  Shifts.scalarize(0);
1373 
1374  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1375  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1376  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1377  unsigned IdxTypeIdx = 2;
1378 
1379  getActionDefinitionsBuilder(Op)
1380  .customIf([=](const LegalityQuery &Query) {
1381  const LLT EltTy = Query.Types[EltTypeIdx];
1382  const LLT VecTy = Query.Types[VecTypeIdx];
1383  const LLT IdxTy = Query.Types[IdxTypeIdx];
1384  const unsigned EltSize = EltTy.getSizeInBits();
1385  return (EltSize == 32 || EltSize == 64) &&
1386  VecTy.getSizeInBits() % 32 == 0 &&
1387  VecTy.getSizeInBits() <= MaxRegisterSize &&
1388  IdxTy.getSizeInBits() == 32;
1389  })
1390  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1391  bitcastToVectorElement32(VecTypeIdx))
1392  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1393  .bitcastIf(
1394  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1395  [=](const LegalityQuery &Query) {
1396  // For > 64-bit element types, try to turn this into a 64-bit
1397  // element vector since we may be able to do better indexing
1398  // if this is scalar. If not, fall back to 32.
1399  const LLT EltTy = Query.Types[EltTypeIdx];
1400  const LLT VecTy = Query.Types[VecTypeIdx];
1401  const unsigned DstEltSize = EltTy.getSizeInBits();
1402  const unsigned VecSize = VecTy.getSizeInBits();
1403 
1404  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1405  return std::make_pair(
1406  VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1407  })
1408  .clampScalar(EltTypeIdx, S32, S64)
1409  .clampScalar(VecTypeIdx, S32, S64)
1410  .clampScalar(IdxTypeIdx, S32, S32)
1411  .clampMaxNumElements(VecTypeIdx, S32, 32)
1412  // TODO: Clamp elements for 64-bit vectors?
1413  // It should only be necessary with variable indexes.
1414  // As a last resort, lower to the stack
1415  .lower();
1416  }
1417 
1418  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1419  .unsupportedIf([=](const LegalityQuery &Query) {
1420  const LLT &EltTy = Query.Types[1].getElementType();
1421  return Query.Types[0] != EltTy;
1422  });
1423 
1424  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1425  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1426  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1427 
1428  // FIXME: Doesn't handle extract of illegal sizes.
1429  getActionDefinitionsBuilder(Op)
1430  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1431  // FIXME: Multiples of 16 should not be legal.
1432  .legalIf([=](const LegalityQuery &Query) {
1433  const LLT BigTy = Query.Types[BigTyIdx];
1434  const LLT LitTy = Query.Types[LitTyIdx];
1435  return (BigTy.getSizeInBits() % 32 == 0) &&
1436  (LitTy.getSizeInBits() % 16 == 0);
1437  })
1438  .widenScalarIf(
1439  [=](const LegalityQuery &Query) {
1440  const LLT BigTy = Query.Types[BigTyIdx];
1441  return (BigTy.getScalarSizeInBits() < 16);
1442  },
1443  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1444  .widenScalarIf(
1445  [=](const LegalityQuery &Query) {
1446  const LLT LitTy = Query.Types[LitTyIdx];
1447  return (LitTy.getScalarSizeInBits() < 16);
1448  },
1449  LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1450  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1451  .widenScalarToNextPow2(BigTyIdx, 32);
1452 
1453  }
1454 
1455  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1456  .legalForCartesianProduct(AllS32Vectors, {S32})
1457  .legalForCartesianProduct(AllS64Vectors, {S64})
1458  .clampNumElements(0, V16S32, V32S32)
1459  .clampNumElements(0, V2S64, V16S64)
1460  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1461 
1462  if (ST.hasScalarPackInsts()) {
1463  BuildVector
1464  // FIXME: Should probably widen s1 vectors straight to s32
1465  .minScalarOrElt(0, S16)
1466  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1467  .minScalar(1, S32);
1468 
1469  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1470  .legalFor({V2S16, S32})
1471  .lower();
1472  BuildVector.minScalarOrElt(0, S32);
1473  } else {
1474  BuildVector.customFor({V2S16, S16});
1475  BuildVector.minScalarOrElt(0, S32);
1476 
1477  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1478  .customFor({V2S16, S32})
1479  .lower();
1480  }
1481 
1482  BuildVector.legalIf(isRegisterType(0));
1483 
1484  // FIXME: Clamp maximum size
1485  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1486  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1487  .clampMaxNumElements(0, S32, 32)
1488  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1489  .clampMaxNumElements(0, S16, 64);
1490 
1491  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1492  // pre-legalize.
1493  if (ST.hasVOP3PInsts()) {
1494  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1495  .customFor({V2S16, V2S16})
1496  .lower();
1497  } else
1498  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1499 
1500  // Merge/Unmerge
1501  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1502  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1503  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1504 
1505  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1506  const LLT Ty = Query.Types[TypeIdx];
1507  if (Ty.isVector()) {
1508  const LLT &EltTy = Ty.getElementType();
1509  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1510  return true;
1511  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1512  return true;
1513  }
1514  return false;
1515  };
1516 
1517  auto &Builder = getActionDefinitionsBuilder(Op)
1518  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1519  .lowerFor({{S16, V2S16}})
1520  .lowerIf([=](const LegalityQuery &Query) {
1521  const LLT BigTy = Query.Types[BigTyIdx];
1522  return BigTy.getSizeInBits() == 32;
1523  })
1524  // Try to widen to s16 first for small types.
1525  // TODO: Only do this on targets with legal s16 shifts
1526  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1527  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1528  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1529  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1530  elementTypeIs(1, S16)),
1531  changeTo(1, V2S16))
1532  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1533  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1534  // valid.
1535  .clampScalar(LitTyIdx, S32, S512)
1536  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1537  // Break up vectors with weird elements into scalars
1538  .fewerElementsIf(
1539  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1540  scalarize(0))
1541  .fewerElementsIf(
1542  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1543  scalarize(1))
1544  .clampScalar(BigTyIdx, S32, MaxScalar);
1545 
1546  if (Op == G_MERGE_VALUES) {
1547  Builder.widenScalarIf(
1548  // TODO: Use 16-bit shifts if legal for 8-bit values?
1549  [=](const LegalityQuery &Query) {
1550  const LLT Ty = Query.Types[LitTyIdx];
1551  return Ty.getSizeInBits() < 32;
1552  },
1553  changeTo(LitTyIdx, S32));
1554  }
1555 
1556  Builder.widenScalarIf(
1557  [=](const LegalityQuery &Query) {
1558  const LLT Ty = Query.Types[BigTyIdx];
1559  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1560  Ty.getSizeInBits() % 16 != 0;
1561  },
1562  [=](const LegalityQuery &Query) {
1563  // Pick the next power of 2, or a multiple of 64 over 128.
1564  // Whichever is smaller.
1565  const LLT &Ty = Query.Types[BigTyIdx];
1566  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1567  if (NewSizeInBits >= 256) {
1568  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1569  if (RoundedTo < NewSizeInBits)
1570  NewSizeInBits = RoundedTo;
1571  }
1572  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1573  })
1574  // Any vectors left are the wrong size. Scalarize them.
1575  .scalarize(0)
1576  .scalarize(1);
1577  }
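  // Note on the final widenScalarIf above: e.g. a 72-bit big type widens to
  // 128 bits (next power of two), while a 264-bit type widens to 320 bits
  // (the next multiple of 64) rather than all the way to 512.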
1578 
1579  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1580  // RegBankSelect.
1581  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1582  .legalFor({{S32}, {S64}});
1583 
1584  if (ST.hasVOP3PInsts()) {
1585  SextInReg.lowerFor({{V2S16}})
1586  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1587  // get more vector shift opportunities, since we'll get those when
1588  // expanded.
1589  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1590  } else if (ST.has16BitInsts()) {
1591  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1592  } else {
1593  // Prefer to promote to s32 before lowering if we don't have 16-bit
1594  // shifts. This avoids a lot of intermediate truncate and extend operations.
1595  SextInReg.lowerFor({{S32}, {S64}});
1596  }
1597 
1598  SextInReg
1599  .scalarize(0)
1600  .clampScalar(0, S32, S64)
1601  .lower();
1602 
1603  // TODO: Only try to form v2s16 with legal packed instructions.
1604  getActionDefinitionsBuilder(G_FSHR)
1605  .legalFor({{S32, S32}})
1606  .lowerFor({{V2S16, V2S16}})
1607  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1608  .scalarize(0)
1609  .lower();
1610 
1611  if (ST.hasVOP3PInsts()) {
1612  getActionDefinitionsBuilder(G_FSHL)
1613  .lowerFor({{V2S16, V2S16}})
1614  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1615  .scalarize(0)
1616  .lower();
1617  } else {
1618  getActionDefinitionsBuilder(G_FSHL)
1619  .scalarize(0)
1620  .lower();
1621  }
1622 
1623  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1624  .legalFor({S64});
1625 
1626  getActionDefinitionsBuilder(G_FENCE)
1627  .alwaysLegal();
1628 
1629  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1630  .scalarize(0)
1631  .minScalar(0, S32)
1632  .lower();
1633 
1634  getActionDefinitionsBuilder({
1635  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1636  G_FCOPYSIGN,
1637 
1638  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1639  G_ATOMICRMW_NAND,
1640  G_ATOMICRMW_FSUB,
1641  G_READ_REGISTER,
1642  G_WRITE_REGISTER,
1643 
1644  G_SADDO, G_SSUBO,
1645 
1646  // TODO: Implement
1647  G_FMINIMUM, G_FMAXIMUM}).lower();
1648 
1649  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1650  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1651  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1652  .unsupported();
1653 
1654  computeTables();
1655  verify(*ST.getInstrInfo());
1656 }
1657 
1658 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1659  MachineInstr &MI) const {
1660  MachineIRBuilder &B = Helper.MIRBuilder;
1661  MachineRegisterInfo &MRI = *B.getMRI();
1662 
1663  switch (MI.getOpcode()) {
1664  case TargetOpcode::G_ADDRSPACE_CAST:
1665  return legalizeAddrSpaceCast(MI, MRI, B);
1666  case TargetOpcode::G_FRINT:
1667  return legalizeFrint(MI, MRI, B);
1668  case TargetOpcode::G_FCEIL:
1669  return legalizeFceil(MI, MRI, B);
1670  case TargetOpcode::G_FREM:
1671  return legalizeFrem(MI, MRI, B);
1672  case TargetOpcode::G_INTRINSIC_TRUNC:
1673  return legalizeIntrinsicTrunc(MI, MRI, B);
1674  case TargetOpcode::G_SITOFP:
1675  return legalizeITOFP(MI, MRI, B, true);
1676  case TargetOpcode::G_UITOFP:
1677  return legalizeITOFP(MI, MRI, B, false);
1678  case TargetOpcode::G_FPTOSI:
1679  return legalizeFPTOI(MI, MRI, B, true);
1680  case TargetOpcode::G_FPTOUI:
1681  return legalizeFPTOI(MI, MRI, B, false);
1682  case TargetOpcode::G_FMINNUM:
1683  case TargetOpcode::G_FMAXNUM:
1684  case TargetOpcode::G_FMINNUM_IEEE:
1685  case TargetOpcode::G_FMAXNUM_IEEE:
1686  return legalizeMinNumMaxNum(Helper, MI);
1687  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1688  return legalizeExtractVectorElt(MI, MRI, B);
1689  case TargetOpcode::G_INSERT_VECTOR_ELT:
1690  return legalizeInsertVectorElt(MI, MRI, B);
1691  case TargetOpcode::G_SHUFFLE_VECTOR:
1692  return legalizeShuffleVector(MI, MRI, B);
1693  case TargetOpcode::G_FSIN:
1694  case TargetOpcode::G_FCOS:
1695  return legalizeSinCos(MI, MRI, B);
1696  case TargetOpcode::G_GLOBAL_VALUE:
1697  return legalizeGlobalValue(MI, MRI, B);
1698  case TargetOpcode::G_LOAD:
1699  return legalizeLoad(Helper, MI);
1700  case TargetOpcode::G_FMAD:
1701  return legalizeFMad(MI, MRI, B);
1702  case TargetOpcode::G_FDIV:
1703  return legalizeFDIV(MI, MRI, B);
1704  case TargetOpcode::G_UDIV:
1705  case TargetOpcode::G_UREM:
1706  return legalizeUDIV_UREM(MI, MRI, B);
1707  case TargetOpcode::G_SDIV:
1708  case TargetOpcode::G_SREM:
1709  return legalizeSDIV_SREM(MI, MRI, B);
1710  case TargetOpcode::G_ATOMIC_CMPXCHG:
1711  return legalizeAtomicCmpXChg(MI, MRI, B);
1712  case TargetOpcode::G_FLOG:
1713  return legalizeFlog(MI, B, numbers::ln2f);
1714  case TargetOpcode::G_FLOG10:
1715  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1716  case TargetOpcode::G_FEXP:
1717  return legalizeFExp(MI, B);
1718  case TargetOpcode::G_FPOW:
1719  return legalizeFPow(MI, B);
1720  case TargetOpcode::G_FFLOOR:
1721  return legalizeFFloor(MI, MRI, B);
1722  case TargetOpcode::G_BUILD_VECTOR:
1723  return legalizeBuildVector(MI, MRI, B);
1724  default:
1725  return false;
1726  }
1727 
1728  llvm_unreachable("expected switch to return");
1729 }
1730 
1731 Register AMDGPULegalizerInfo::getSegmentAperture(
1732  unsigned AS,
1733  MachineRegisterInfo &MRI,
1734  MachineIRBuilder &B) const {
1735  MachineFunction &MF = B.getMF();
1736  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1737  const LLT S32 = LLT::scalar(32);
1738 
1739  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1740 
1741  if (ST.hasApertureRegs()) {
1742  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1743  // getreg.
1744  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1745  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1746  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1747  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1748  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1749  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1750  unsigned Encoding =
1751  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1752  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1753  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1754 
1755  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1756 
1757  B.buildInstr(AMDGPU::S_GETREG_B32)
1758  .addDef(GetReg)
1759  .addImm(Encoding);
1760  MRI.setType(GetReg, S32);
1761 
1762  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1763  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1764  }
1765 
1766  Register QueuePtr = MRI.createGenericVirtualRegister(
1767  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1768 
1769  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1770  return Register();
1771 
1772  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1773  // private_segment_aperture_base_hi.
1774  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1775 
1776  // TODO: can we be smarter about machine pointer info?
1777  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1778  MachineMemOperand *MMO = MF.getMachineMemOperand(
1779  PtrInfo,
1780  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1781  MachineMemOperand::MOInvariant,
1782  4, commonAlignment(Align(64), StructOffset));
1783 
1784  Register LoadAddr;
1785 
1786  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1787  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1788 }
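// The value produced here is the high 32 bits of the aperture base for the
// requested segment: with aperture registers it comes from s_getreg_b32,
// otherwise it is loaded from the amd_queue_t structure reached through the
// queue pointer. legalizeAddrSpaceCast below pairs it with a 32-bit segment
// offset to form a 64-bit flat pointer.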
1789 
1790 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1791  MachineInstr &MI, MachineRegisterInfo &MRI,
1792  MachineIRBuilder &B) const {
1793  MachineFunction &MF = B.getMF();
1794 
1795  const LLT S32 = LLT::scalar(32);
1796  Register Dst = MI.getOperand(0).getReg();
1797  Register Src = MI.getOperand(1).getReg();
1798 
1799  LLT DstTy = MRI.getType(Dst);
1800  LLT SrcTy = MRI.getType(Src);
1801  unsigned DestAS = DstTy.getAddressSpace();
1802  unsigned SrcAS = SrcTy.getAddressSpace();
1803 
1804  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1805  // vector element.
1806  assert(!DstTy.isVector());
1807 
1808  const AMDGPUTargetMachine &TM
1809  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1810 
1811  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1812  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1813  return true;
1814  }
1815 
1816  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1817  // Truncate.
1818  B.buildExtract(Dst, Src, 0);
1819  MI.eraseFromParent();
1820  return true;
1821  }
1822 
1823  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1824  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1825  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1826 
1827  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1828  // another. Merge operands are required to be the same type, but creating an
1829  // extra ptrtoint would be kind of pointless.
1830  auto HighAddr = B.buildConstant(
1831  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1832  B.buildMerge(Dst, {Src, HighAddr});
1833  MI.eraseFromParent();
1834  return true;
1835  }
1836 
1837  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1838  assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1839  DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1840  unsigned NullVal = TM.getNullPointerValue(DestAS);
1841 
1842  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1843  auto FlatNull = B.buildConstant(SrcTy, 0);
1844 
1845  // Extract low 32-bits of the pointer.
1846  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1847 
1848  auto CmpRes =
1849  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1850  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1851 
1852  MI.eraseFromParent();
1853  return true;
1854  }
1855 
1856  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1857  return false;
1858 
1859  if (!ST.hasFlatAddressSpace())
1860  return false;
1861 
1862  auto SegmentNull =
1863  B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1864  auto FlatNull =
1865  B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1866 
1867  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1868  if (!ApertureReg.isValid())
1869  return false;
1870 
1871  auto CmpRes =
1872  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1873 
1874  // Coerce the type of the low half of the result so we can use merge_values.
1875  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1876 
1877  // TODO: Should we allow mismatched types but matching sizes in merges to
1878  // avoid the ptrtoint?
1879  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1880  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1881 
1882  MI.eraseFromParent();
1883  return true;
1884 }
1885 
1886 bool AMDGPULegalizerInfo::legalizeFrint(
1887  MachineInstr &MI, MachineRegisterInfo &MRI,
1888  MachineIRBuilder &B) const {
1889  Register Src = MI.getOperand(1).getReg();
1890  LLT Ty = MRI.getType(Src);
1891  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1892 
1893  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1894  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1895 
1896  auto C1 = B.buildFConstant(Ty, C1Val);
1897  auto CopySign = B.buildFCopysign(Ty, C1, Src);
1898 
1899  // TODO: Should this propagate fast-math-flags?
1900  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1901  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1902 
1903  auto C2 = B.buildFConstant(Ty, C2Val);
1904  auto Fabs = B.buildFAbs(Ty, Src);
1905 
1906  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1907  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1908  MI.eraseFromParent();
1909  return true;
1910 }
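// The magic constant 0x1.0p+52 (2^52) makes the FP adder do the rounding:
// for |x| < 2^52 the sum x + copysign(2^52, x) has an ulp of 1.0, so the add
// rounds x to the nearest integer and the subtract recovers it. Inputs whose
// magnitude exceeds 0x1.fffffffffffffp+51 are already integral and are
// returned unchanged by the final select.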
1911 
1912 bool AMDGPULegalizerInfo::legalizeFceil(
1913  MachineInstr &MI, MachineRegisterInfo &MRI,
1914  MachineIRBuilder &B) const {
1915 
1916  const LLT S1 = LLT::scalar(1);
1917  const LLT S64 = LLT::scalar(64);
1918 
1919  Register Src = MI.getOperand(1).getReg();
1920  assert(MRI.getType(Src) == S64);
1921 
1922  // result = trunc(src)
1923  // if (src > 0.0 && src != result)
1924  // result += 1.0
1925 
1926  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1927 
1928  const auto Zero = B.buildFConstant(S64, 0.0);
1929  const auto One = B.buildFConstant(S64, 1.0);
1930  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1931  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1932  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1933  auto Add = B.buildSelect(S64, And, One, Zero);
1934 
1935  // TODO: Should this propagate fast-math-flags?
1936  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1937  return true;
1938 }
1939 
1940 bool AMDGPULegalizerInfo::legalizeFrem(
1941  MachineInstr &MI, MachineRegisterInfo &MRI,
1942  MachineIRBuilder &B) const {
1943  Register DstReg = MI.getOperand(0).getReg();
1944  Register Src0Reg = MI.getOperand(1).getReg();
1945  Register Src1Reg = MI.getOperand(2).getReg();
1946  auto Flags = MI.getFlags();
1947  LLT Ty = MRI.getType(DstReg);
1948 
1949  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1950  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1951  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1952  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1953  MI.eraseFromParent();
1954  return true;
1955 }
1956 
1957 static MachineInstrBuilder extractF64Exponent(Register Hi,
1958  MachineIRBuilder &B) {
1959  const unsigned FractBits = 52;
1960  const unsigned ExpBits = 11;
1961  LLT S32 = LLT::scalar(32);
1962 
1963  auto Const0 = B.buildConstant(S32, FractBits - 32);
1964  auto Const1 = B.buildConstant(S32, ExpBits);
1965 
1966  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1967  .addUse(Hi)
1968  .addUse(Const0.getReg(0))
1969  .addUse(Const1.getReg(0));
1970 
1971  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1972 }
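// An IEEE double keeps its biased exponent in bits [62:52], i.e. bits [30:20]
// of the high word. The ubfe above extracts those 11 bits starting at bit 20
// (FractBits - 32), and subtracting the bias of 1023 yields the unbiased
// exponent used by legalizeIntrinsicTrunc below.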
1973 
1974 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1975  MachineInstr &MI, MachineRegisterInfo &MRI,
1976  MachineIRBuilder &B) const {
1977  const LLT S1 = LLT::scalar(1);
1978  const LLT S32 = LLT::scalar(32);
1979  const LLT S64 = LLT::scalar(64);
1980 
1981  Register Src = MI.getOperand(1).getReg();
1982  assert(MRI.getType(Src) == S64);
1983 
1984  // TODO: Should this use extract since the low half is unused?
1985  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1986  Register Hi = Unmerge.getReg(1);
1987 
1988  // Extract the upper half, since this is where we will find the sign and
1989  // exponent.
1990  auto Exp = extractF64Exponent(Hi, B);
1991 
1992  const unsigned FractBits = 52;
1993 
1994  // Extract the sign bit.
1995  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1996  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1997 
1998  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1999 
2000  const auto Zero32 = B.buildConstant(S32, 0);
2001 
2002  // Extend back to 64-bits.
2003  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2004 
2005  auto Shr = B.buildAShr(S64, FractMask, Exp);
2006  auto Not = B.buildNot(S64, Shr);
2007  auto Tmp0 = B.buildAnd(S64, Src, Not);
2008  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2009 
2010  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2011  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2012 
2013  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2014  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2015  MI.eraseFromParent();
2016  return true;
2017 }
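// In essence this clears the fraction bits that sit below the binary point:
// the mask of the low (52 - exp) bits is inverted and ANDed into the source.
// When exp < 0 the result collapses to a signed zero (SignBit64), and when
// exp > 51 the input has no fractional bits left, so it is returned as-is.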
2018 
2019 bool AMDGPULegalizerInfo::legalizeITOFP(
2020  MachineInstr &MI, MachineRegisterInfo &MRI,
2021  MachineIRBuilder &B, bool Signed) const {
2022 
2023  Register Dst = MI.getOperand(0).getReg();
2024  Register Src = MI.getOperand(1).getReg();
2025 
2026  const LLT S64 = LLT::scalar(64);
2027  const LLT S32 = LLT::scalar(32);
2028 
2029  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2030 
2031  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2032 
2033  auto CvtHi = Signed ?
2034  B.buildSITOFP(S64, Unmerge.getReg(1)) :
2035  B.buildUITOFP(S64, Unmerge.getReg(1));
2036 
2037  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2038 
2039  auto ThirtyTwo = B.buildConstant(S32, 32);
2040  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2041  .addUse(CvtHi.getReg(0))
2042  .addUse(ThirtyTwo.getReg(0));
2043 
2044  // TODO: Should this propagate fast-math-flags?
2045  B.buildFAdd(Dst, LdExp, CvtLo);
2046  MI.eraseFromParent();
2047  return true;
2048 }
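// This is the usual 64-bit integer to double conversion expressed as
// (fp)hi * 2^32 + (fp)lo: only the high half carries the sign, the low half
// is always converted as unsigned, and the scaling by 2^32 is done with the
// amdgcn.ldexp intrinsic.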
2049 
2050 // TODO: Copied from DAG implementation. Verify logic and document how this
2051 // actually works.
2052 bool AMDGPULegalizerInfo::legalizeFPTOI(
2053  MachineInstr &MI, MachineRegisterInfo &MRI,
2054  MachineIRBuilder &B, bool Signed) const {
2055 
2056  Register Dst = MI.getOperand(0).getReg();
2057  Register Src = MI.getOperand(1).getReg();
2058 
2059  const LLT S64 = LLT::scalar(64);
2060  const LLT S32 = LLT::scalar(32);
2061 
2062  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2063 
2064  unsigned Flags = MI.getFlags();
2065 
2066  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2067  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2068  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2069 
2070  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2071  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2072  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2073 
2074  auto Hi = Signed ?
2075  B.buildFPTOSI(S32, FloorMul) :
2076  B.buildFPTOUI(S32, FloorMul);
2077  auto Lo = B.buildFPTOUI(S32, Fma);
2078 
2079  B.buildMerge(Dst, { Lo, Hi });
2080  MI.eraseFromParent();
2081 
2082  return true;
2083 }
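// Roughly: K0 is 2^-32 and K1 is -2^32 as double bit patterns, so
// FloorMul = floor(trunc(x) * 2^-32) becomes the high word and
// Fma = trunc(x) - FloorMul * 2^32 becomes the (always non-negative) low
// word; only the high-word conversion respects the sign.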
2084 
2085 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2086  MachineInstr &MI) const {
2087  MachineFunction &MF = Helper.MIRBuilder.getMF();
2088  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2089 
2090  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2091  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2092 
2093  // With ieee_mode disabled, the instructions have the correct behavior
2094  // already for G_FMINNUM/G_FMAXNUM
2095  if (!MFI->getMode().IEEE)
2096  return !IsIEEEOp;
2097 
2098  if (IsIEEEOp)
2099  return true;
2100 
2101  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2102 }
2103 
2104 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2105  MachineInstr &MI, MachineRegisterInfo &MRI,
2106  MachineIRBuilder &B) const {
2107  // TODO: Should move some of this into LegalizerHelper.
2108 
2109  // TODO: Promote dynamic indexing of s16 to s32
2110 
2111  // FIXME: Artifact combiner probably should have replaced the truncated
2112  // constant before this, so we shouldn't need
2113  // getConstantVRegValWithLookThrough.
2114  Optional<ValueAndVReg> MaybeIdxVal =
2115  getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2116  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2117  return true;
2118  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2119 
2120  Register Dst = MI.getOperand(0).getReg();
2121  Register Vec = MI.getOperand(1).getReg();
2122 
2123  LLT VecTy = MRI.getType(Vec);
2124  LLT EltTy = VecTy.getElementType();
2125  assert(EltTy == MRI.getType(Dst));
2126 
2127  if (IdxVal < VecTy.getNumElements())
2128  B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2129  else
2130  B.buildUndef(Dst);
2131 
2132  MI.eraseFromParent();
2133  return true;
2134 }
2135 
2136 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2137  MachineInstr &MI, MachineRegisterInfo &MRI,
2138  MachineIRBuilder &B) const {
2139  // TODO: Should move some of this into LegalizerHelper.
2140 
2141  // TODO: Promote dynamic indexing of s16 to s32
2142 
2143  // FIXME: Artifact combiner probably should have replaced the truncated
2144  // constant before this, so we shouldn't need
2145  // getConstantVRegValWithLookThrough.
2146  Optional<ValueAndVReg> MaybeIdxVal =
2147  getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2148  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2149  return true;
2150 
2151  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2152  Register Dst = MI.getOperand(0).getReg();
2153  Register Vec = MI.getOperand(1).getReg();
2154  Register Ins = MI.getOperand(2).getReg();
2155 
2156  LLT VecTy = MRI.getType(Vec);
2157  LLT EltTy = VecTy.getElementType();
2158  assert(EltTy == MRI.getType(Ins));
2159 
2160  if (IdxVal < VecTy.getNumElements())
2161  B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2162  else
2163  B.buildUndef(Dst);
2164 
2165  MI.eraseFromParent();
2166  return true;
2167 }
2168 
2169 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2170  MachineInstr &MI, MachineRegisterInfo &MRI,
2171  MachineIRBuilder &B) const {
2172  const LLT V2S16 = LLT::vector(2, 16);
2173 
2174  Register Dst = MI.getOperand(0).getReg();
2175  Register Src0 = MI.getOperand(1).getReg();
2176  LLT DstTy = MRI.getType(Dst);
2177  LLT SrcTy = MRI.getType(Src0);
2178 
2179  if (SrcTy == V2S16 && DstTy == V2S16 &&
2180  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2181  return true;
2182 
2183  MachineIRBuilder HelperBuilder(MI);
2184  GISelObserverWrapper DummyObserver;
2185  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2186  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::legalizeSinCos(
2190  MachineInstr &MI, MachineRegisterInfo &MRI,
2191  MachineIRBuilder &B) const {
2192 
2193  Register DstReg = MI.getOperand(0).getReg();
2194  Register SrcReg = MI.getOperand(1).getReg();
2195  LLT Ty = MRI.getType(DstReg);
2196  unsigned Flags = MI.getFlags();
2197 
2198  Register TrigVal;
2199  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2200  if (ST.hasTrigReducedRange()) {
2201  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2202  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2203  .addUse(MulVal.getReg(0))
2204  .setMIFlags(Flags).getReg(0);
2205  } else
2206  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2207 
2208  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2209  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2210  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2211  .addUse(TrigVal)
2212  .setMIFlags(Flags);
2213  MI.eraseFromParent();
2214  return true;
2215 }
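// The hardware sin/cos intrinsics take their input in units of 2*pi
// (revolutions), which is why the source is pre-multiplied by 1/(2*pi); on
// subtargets with a reduced trig range the product is additionally wrapped
// into [0, 1) with amdgcn.fract before being fed to amdgcn.sin/amdgcn.cos.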
2216 
2217 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2218  MachineIRBuilder &B,
2219  const GlobalValue *GV,
2220  int64_t Offset,
2221  unsigned GAFlags) const {
2222  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2223  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2224  // to the following code sequence:
2225  //
2226  // For constant address space:
2227  // s_getpc_b64 s[0:1]
2228  // s_add_u32 s0, s0, $symbol
2229  // s_addc_u32 s1, s1, 0
2230  //
2231  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2232  // a fixup or relocation is emitted to replace $symbol with a literal
2233  // constant, which is a pc-relative offset from the encoding of the $symbol
2234  // operand to the global variable.
2235  //
2236  // For global address space:
2237  // s_getpc_b64 s[0:1]
2238  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2239  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2240  //
2241  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2242  // fixups or relocations are emitted to replace $symbol@*@lo and
2243  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2244  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2245  // operand to the global variable.
2246  //
2247  // What we want here is an offset from the value returned by s_getpc
2248  // (which is the address of the s_add_u32 instruction) to the global
2249  // variable, but since the encoding of $symbol starts 4 bytes after the start
2250  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2251  // small. This requires us to add 4 to the global variable offset in order to
2252  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2253  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2254  // instruction.
2255 
2256  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2257 
2258  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2259  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2260 
2261  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2262  .addDef(PCReg);
2263 
2264  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2265  if (GAFlags == SIInstrInfo::MO_NONE)
2266  MIB.addImm(0);
2267  else
2268  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2269 
2270  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2271 
2272  if (PtrTy.getSizeInBits() == 32)
2273  B.buildExtract(DstReg, PCReg, 0);
2274  return true;
2275  }
2276 
2277 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2278  MachineInstr &MI, MachineRegisterInfo &MRI,
2279  MachineIRBuilder &B) const {
2280  Register DstReg = MI.getOperand(0).getReg();
2281  LLT Ty = MRI.getType(DstReg);
2282  unsigned AS = Ty.getAddressSpace();
2283 
2284  const GlobalValue *GV = MI.getOperand(1).getGlobal();
2285  MachineFunction &MF = B.getMF();
2286  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2287 
2288  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2289  if (!MFI->isModuleEntryFunction()) {
2290  const Function &Fn = MF.getFunction();
2291  DiagnosticInfoUnsupported BadLDSDecl(
2292  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2293  DS_Warning);
2294  Fn.getContext().diagnose(BadLDSDecl);
2295 
2296  // We currently don't have a way to correctly allocate LDS objects that
2297  // aren't directly associated with a kernel. We do force inlining of
2298  // functions that use local objects. However, if these dead functions are
2299  // not eliminated, we don't want a compile time error. Just emit a warning
2300  // and a trap, since there should be no callable path here.
2301  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2302  B.buildUndef(DstReg);
2303  MI.eraseFromParent();
2304  return true;
2305  }
2306 
2307  // TODO: We could emit code to handle the initialization somewhere.
2308  if (!hasDefinedInitializer(GV)) {
2309  const SITargetLowering *TLI = ST.getTargetLowering();
2310  if (!TLI->shouldUseLDSConstAddress(GV)) {
2311  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2312  return true; // Leave in place;
2313  }
2314 
2315  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2316  Type *Ty = GV->getValueType();
2317  // HIP uses an unsized array `extern __shared__ T s[]` or similar
2318  // zero-sized type in other languages to declare the dynamic shared
2319  // memory whose size is not known at compile time. These arrays will be
2320  // allocated by the runtime and placed directly after the statically
2321  // allocated ones. They all share the same offset.
2322  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2323  // Adjust alignment for that dynamic shared memory array.
2324  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2325  LLT S32 = LLT::scalar(32);
2326  auto Sz =
2327  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2328  B.buildIntToPtr(DstReg, Sz);
2329  MI.eraseFromParent();
2330  return true;
2331  }
2332  }
2333 
2334  B.buildConstant(
2335  DstReg,
2336  MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2337  MI.eraseFromParent();
2338  return true;
2339  }
2340 
2341  const Function &Fn = MF.getFunction();
2342  DiagnosticInfoUnsupported BadInit(
2343  Fn, "unsupported initializer for address space", MI.getDebugLoc());
2344  Fn.getContext().diagnose(BadInit);
2345  return true;
2346  }
2347 
2348  const SITargetLowering *TLI = ST.getTargetLowering();
2349 
2350  if (TLI->shouldEmitFixup(GV)) {
2351  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2352  MI.eraseFromParent();
2353  return true;
2354  }
2355 
2356  if (TLI->shouldEmitPCReloc(GV)) {
2357  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2358  MI.eraseFromParent();
2359  return true;
2360  }
2361 
2362  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2363  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2364 
2365  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2366  MachinePointerInfo::getGOT(MF),
2367  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2368  MachineMemOperand::MOInvariant,
2369  8 /*Size*/, Align(8));
2370 
2371  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2372 
2373  if (Ty.getSizeInBits() == 32) {
2374  // Truncate if this is a 32-bit constant address.
2375  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2376  B.buildExtract(DstReg, Load, 0);
2377  } else
2378  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2379 
2380  MI.eraseFromParent();
2381  return true;
2382 }
2383 
2384 static LLT widenToNextPowerOf2(LLT Ty) {
2385  if (Ty.isVector())
2386  return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2387  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2388 }
2389 
2390 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2391  MachineInstr &MI) const {
2392  MachineIRBuilder &B = Helper.MIRBuilder;
2393  MachineRegisterInfo &MRI = *B.getMRI();
2394  GISelChangeObserver &Observer = Helper.Observer;
2395 
2396  Register PtrReg = MI.getOperand(1).getReg();
2397  LLT PtrTy = MRI.getType(PtrReg);
2398  unsigned AddrSpace = PtrTy.getAddressSpace();
2399 
2400  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2401  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2402  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2403  Observer.changingInstr(MI);
2404  MI.getOperand(1).setReg(Cast.getReg(0));
2405  Observer.changedInstr(MI);
2406  return true;
2407  }
2408 
2409  Register ValReg = MI.getOperand(0).getReg();
2410  LLT ValTy = MRI.getType(ValReg);
2411 
2412  MachineMemOperand *MMO = *MI.memoperands_begin();
2413  const unsigned ValSize = ValTy.getSizeInBits();
2414  const unsigned MemSize = 8 * MMO->getSize();
2415  const Align MemAlign = MMO->getAlign();
2416  const unsigned AlignInBits = 8 * MemAlign.value();
2417 
2418  // Widen non-power-of-2 loads to the alignment if needed
2419  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2420  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2421 
2422  // This was already the correct extending load result type, so just adjust
2423  // the memory type.
2424  if (WideMemSize == ValSize) {
2425  MachineFunction &MF = B.getMF();
2426 
2427  MachineMemOperand *WideMMO =
2428  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2429  Observer.changingInstr(MI);
2430  MI.setMemRefs(MF, {WideMMO});
2431  Observer.changedInstr(MI);
2432  return true;
2433  }
2434 
2435  // Don't bother handling edge case that should probably never be produced.
2436  if (ValSize > WideMemSize)
2437  return false;
2438 
2439  LLT WideTy = widenToNextPowerOf2(ValTy);
2440 
2441  Register WideLoad;
2442  if (!WideTy.isVector()) {
2443  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2444  B.buildTrunc(ValReg, WideLoad).getReg(0);
2445  } else {
2446  // Extract the subvector.
2447 
2448  if (isRegisterType(ValTy)) {
2449  // If this a case where G_EXTRACT is legal, use it.
2450  // (e.g. <3 x s32> -> <4 x s32>)
2451  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2452  B.buildExtract(ValReg, WideLoad, 0);
2453  } else {
2454  // For cases where the widened type isn't a nice register value, unmerge
2455  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2456  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2457  WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2458  B.setInsertPt(B.getMBB(), MI.getIterator());
2459  B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2460  }
2461  }
2462 
2463  MI.eraseFromParent();
2464  return true;
2465  }
2466 
2467  return false;
2468 }
2469 
2470 bool AMDGPULegalizerInfo::legalizeFMad(
2471  MachineInstr &MI, MachineRegisterInfo &MRI,
2472  MachineIRBuilder &B) const {
2473  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2474  assert(Ty.isScalar());
2475 
2476  MachineFunction &MF = B.getMF();
2477  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2478 
2479  // TODO: Always legal with future ftz flag.
2480  // FIXME: Do we need just output?
2481  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2482  return true;
2483  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2484  return true;
2485 
2486  MachineIRBuilder HelperBuilder(MI);
2487  GISelObserverWrapper DummyObserver;
2488  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2489  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2490 }
2491 
2492 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2493  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2494  Register DstReg = MI.getOperand(0).getReg();
2495  Register PtrReg = MI.getOperand(1).getReg();
2496  Register CmpVal = MI.getOperand(2).getReg();
2497  Register NewVal = MI.getOperand(3).getReg();
2498 
2500  "this should not have been custom lowered");
2501 
2502  LLT ValTy = MRI.getType(CmpVal);
2503  LLT VecTy = LLT::vector(2, ValTy);
2504 
2505  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2506 
2507  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2508  .addDef(DstReg)
2509  .addUse(PtrReg)
2510  .addUse(PackedVal)
2511  .setMemRefs(MI.memoperands());
2512 
2513  MI.eraseFromParent();
2514  return true;
2515 }
2516 
2517 bool AMDGPULegalizerInfo::legalizeFlog(
2518  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2519  Register Dst = MI.getOperand(0).getReg();
2520  Register Src = MI.getOperand(1).getReg();
2521  LLT Ty = B.getMRI()->getType(Dst);
2522  unsigned Flags = MI.getFlags();
2523 
2524  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2525  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2526 
2527  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2528  MI.eraseFromParent();
2529  return true;
2530 }
2531 
2532 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2533  MachineIRBuilder &B) const {
2534  Register Dst = MI.getOperand(0).getReg();
2535  Register Src = MI.getOperand(1).getReg();
2536  unsigned Flags = MI.getFlags();
2537  LLT Ty = B.getMRI()->getType(Dst);
2538 
2539  auto K = B.buildFConstant(Ty, numbers::log2e);
2540  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2541  B.buildFExp2(Dst, Mul, Flags);
2542  MI.eraseFromParent();
2543  return true;
2544 }
2545 
2546 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2547  MachineIRBuilder &B) const {
2548  Register Dst = MI.getOperand(0).getReg();
2549  Register Src0 = MI.getOperand(1).getReg();
2550  Register Src1 = MI.getOperand(2).getReg();
2551  unsigned Flags = MI.getFlags();
2552  LLT Ty = B.getMRI()->getType(Dst);
2553  const LLT S16 = LLT::scalar(16);
2554  const LLT S32 = LLT::scalar(32);
2555 
2556  if (Ty == S32) {
2557  auto Log = B.buildFLog2(S32, Src0, Flags);
2558  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2559  .addUse(Log.getReg(0))
2560  .addUse(Src1)
2561  .setMIFlags(Flags);
2562  B.buildFExp2(Dst, Mul, Flags);
2563  } else if (Ty == S16) {
2564  // There's no f16 fmul_legacy, so we need to convert for it.
2565  auto Log = B.buildFLog2(S16, Src0, Flags);
2566  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2567  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2568  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2569  .addUse(Ext0.getReg(0))
2570  .addUse(Ext1.getReg(0))
2571  .setMIFlags(Flags);
2572 
2573  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2574  } else
2575  return false;
2576 
2577  MI.eraseFromParent();
2578  return true;
2579 }
2580 
2581 // Find a source register, ignoring any possible source modifiers.
2582 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2583  Register ModSrc = OrigSrc;
2584  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2585  ModSrc = SrcFNeg->getOperand(1).getReg();
2586  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2587  ModSrc = SrcFAbs->getOperand(1).getReg();
2588  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2589  ModSrc = SrcFAbs->getOperand(1).getReg();
2590  return ModSrc;
2591 }
2592 
2593 bool AMDGPULegalizerInfo::legalizeFFloor(
2594  MachineInstr &MI, MachineRegisterInfo &MRI,
2595  MachineIRBuilder &B) const {
2596 
2597  const LLT S1 = LLT::scalar(1);
2598  const LLT S64 = LLT::scalar(64);
2599  Register Dst = MI.getOperand(0).getReg();
2600  Register OrigSrc = MI.getOperand(1).getReg();
2601  unsigned Flags = MI.getFlags();
2602  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2603  "this should not have been custom lowered");
2604 
2605  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2606  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2607  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2608  // V_FRACT bug is:
2609  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2610  //
2611  // Convert floor(x) to (x - fract(x))
2612 
2613  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2614  .addUse(OrigSrc)
2615  .setMIFlags(Flags);
2616 
2617  // Give source modifier matching some assistance before obscuring a foldable
2618  // pattern.
2619 
2620  // TODO: We can avoid the neg on the fract? The input sign to fract
2621  // shouldn't matter?
2622  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2623 
2624  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2625 
2626  Register Min = MRI.createGenericVirtualRegister(S64);
2627 
2628  // We don't need to concern ourselves with the snan handling difference, so
2629  // use the one which will directly select.
2630  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2631  if (MFI->getMode().IEEE)
2632  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2633  else
2634  B.buildFMinNum(Min, Fract, Const, Flags);
2635 
2636  Register CorrectedFract = Min;
2637  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2638  auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2639  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2640  }
2641 
2642  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2643  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2644 
2645  MI.eraseFromParent();
2646  return true;
2647 }
2648 
2649 // Turn an illegal packed v2s16 build vector into bit operations.
2650 // TODO: This should probably be a bitcast action in LegalizerHelper.
2651 bool AMDGPULegalizerInfo::legalizeBuildVector(
2652  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2653  Register Dst = MI.getOperand(0).getReg();
2654  const LLT S32 = LLT::scalar(32);
2655  assert(MRI.getType(Dst) == LLT::vector(2, 16));
2656 
2657  Register Src0 = MI.getOperand(1).getReg();
2658  Register Src1 = MI.getOperand(2).getReg();
2659  assert(MRI.getType(Src0) == LLT::scalar(16));
2660 
2661  auto Merge = B.buildMerge(S32, {Src0, Src1});
2662  B.buildBitcast(Dst, Merge);
2663 
2664  MI.eraseFromParent();
2665  return true;
2666 }
2667 
2668 // Check that this is a G_XOR x, -1
2669 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2670  if (MI.getOpcode() != TargetOpcode::G_XOR)
2671  return false;
2672  auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2673  return ConstVal && *ConstVal == -1;
2674 }
2675 
2676 // Return the use branch instruction, otherwise null if the usage is invalid.
2677 static MachineInstr *
2678 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2679  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2680  Register CondDef = MI.getOperand(0).getReg();
2681  if (!MRI.hasOneNonDBGUse(CondDef))
2682  return nullptr;
2683 
2684  MachineBasicBlock *Parent = MI.getParent();
2685  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2686 
2687  if (isNot(MRI, *UseMI)) {
2688  Register NegatedCond = UseMI->getOperand(0).getReg();
2689  if (!MRI.hasOneNonDBGUse(NegatedCond))
2690  return nullptr;
2691 
2692  // We're deleting the def of this value, so we need to remove it.
2693  UseMI->eraseFromParent();
2694 
2695  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2696  Negated = true;
2697  }
2698 
2699  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2700  return nullptr;
2701 
2702  // Make sure the cond br is followed by a G_BR, or is the last instruction.
2703  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2704  if (Next == Parent->end()) {
2705  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2706  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2707  return nullptr;
2708  UncondBrTarget = &*NextMBB;
2709  } else {
2710  if (Next->getOpcode() != AMDGPU::G_BR)
2711  return nullptr;
2712  Br = &*Next;
2713  UncondBrTarget = Br->getOperand(0).getMBB();
2714  }
2715 
2716  return UseMI;
2717 }
2718 
2719 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2720  const ArgDescriptor *Arg,
2721  const TargetRegisterClass *ArgRC,
2722  LLT ArgTy) const {
2723  MCRegister SrcReg = Arg->getRegister();
2724  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2725  assert(DstReg.isVirtual() && "Virtual register expected");
2726 
2727  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2728  ArgTy);
2729  if (Arg->isMasked()) {
2730  // TODO: Should we try to emit this once in the entry block?
2731  const LLT S32 = LLT::scalar(32);
2732  const unsigned Mask = Arg->getMask();
2733  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2734 
2735  Register AndMaskSrc = LiveIn;
2736 
2737  if (Shift != 0) {
2738  auto ShiftAmt = B.buildConstant(S32, Shift);
2739  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2740  }
2741 
2742  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2743  } else {
2744  B.buildCopy(DstReg, LiveIn);
2745  }
2746 
2747  return true;
2748 }
2749 
2750 bool AMDGPULegalizerInfo::loadInputValue(
2751  Register DstReg, MachineIRBuilder &B,
2752  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2753  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2754  const ArgDescriptor *Arg;
2755  const TargetRegisterClass *ArgRC;
2756  LLT ArgTy;
2757  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2758 
2759  if (!Arg->isRegister() || !Arg->getRegister().isValid())
2760  return false; // TODO: Handle these
2761  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2762 }
2763 
2764 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2765  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2766  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2767  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2768  return false;
2769 
2770  MI.eraseFromParent();
2771  return true;
2772 }
2773 
2774 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2775  MachineRegisterInfo &MRI,
2776  MachineIRBuilder &B) const {
2777  Register Dst = MI.getOperand(0).getReg();
2778  LLT DstTy = MRI.getType(Dst);
2779  LLT S16 = LLT::scalar(16);
2780  LLT S32 = LLT::scalar(32);
2781  LLT S64 = LLT::scalar(64);
2782 
2783  if (DstTy == S16)
2784  return legalizeFDIV16(MI, MRI, B);
2785  if (DstTy == S32)
2786  return legalizeFDIV32(MI, MRI, B);
2787  if (DstTy == S64)
2788  return legalizeFDIV64(MI, MRI, B);
2789 
2790  return false;
2791 }
2792 
2793 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2794  Register DstReg,
2795  Register X,
2796  Register Y,
2797  bool IsDiv) const {
2798  const LLT S1 = LLT::scalar(1);
2799  const LLT S32 = LLT::scalar(32);
2800 
2801  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2802  // algorithm used here.
2803 
2804  // Initial estimate of inv(y).
2805  auto FloatY = B.buildUITOFP(S32, Y);
2806  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2807  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2808  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2809  auto Z = B.buildFPTOUI(S32, ScaledY);
2810 
2811  // One round of UNR.
2812  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2813  auto NegYZ = B.buildMul(S32, NegY, Z);
2814  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2815 
2816  // Quotient/remainder estimate.
2817  auto Q = B.buildUMulH(S32, X, Z);
2818  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2819 
2820  // First quotient/remainder refinement.
2821  auto One = B.buildConstant(S32, 1);
2822  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2823  if (IsDiv)
2824  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2825  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2826 
2827  // Second quotient/remainder refinement.
2828  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2829  if (IsDiv)
2830  B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2831  else
2832  B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2833 }
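// The sequence above is a fixed-point Newton-Raphson reciprocal: z ~= 2^32/y
// comes from v_rcp_iflag_f32 (scaled by 0x4f7ffffe ~= 2^32), then one
// refinement computes z += umulh(z, -y*z), i.e. z += z*(2^32 - y*z)/2^32.
// The quotient estimate q = umulh(x, z) can still be slightly short, which
// the two conditional q+1 / r-y refinement steps correct.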
2834 
2835 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2836 //
2837 // Return lo, hi of result
2838 //
2839 // %cvt.lo = G_UITOFP Val.lo
2840 // %cvt.hi = G_UITOFP Val.hi
2841 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2842 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2843 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2844 // %mul2 = G_FMUL %mul1, 2**(-32)
2845 // %trunc = G_INTRINSIC_TRUNC %mul2
2846 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2847 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2848 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2849  Register Val) {
2850  const LLT S32 = LLT::scalar(32);
2851  auto Unmerge = B.buildUnmerge(S32, Val);
2852 
2853  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2854  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2855 
2856  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2857  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2858 
2859  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2860  auto Mul1 =
2861  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2862 
2863  // 2**(-32)
2864  auto Mul2 =
2865  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2866  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2867 
2868  // -(2**32)
2869  auto Mad2 = B.buildFMAD(S32, Trunc,
2870  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2871 
2872  auto ResultLo = B.buildFPTOUI(S32, Mad2);
2873  auto ResultHi = B.buildFPTOUI(S32, Trunc);
2874 
2875  return {ResultLo.getReg(0), ResultHi.getReg(0)};
2876 }
2877 
2878 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2879  Register DstReg,
2880  Register Numer,
2881  Register Denom,
2882  bool IsDiv) const {
2883  const LLT S32 = LLT::scalar(32);
2884  const LLT S64 = LLT::scalar(64);
2885  const LLT S1 = LLT::scalar(1);
2886  Register RcpLo, RcpHi;
2887 
2888  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2889 
2890  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2891 
2892  auto Zero64 = B.buildConstant(S64, 0);
2893  auto NegDenom = B.buildSub(S64, Zero64, Denom);
2894 
2895  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2896  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2897 
2898  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2899  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2900  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2901 
2902  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2903  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2904  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2905  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2906 
2907  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2908  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2909  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2910  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2911  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2912 
2913  auto Zero32 = B.buildConstant(S32, 0);
2914  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2915  auto Add2_HiC =
2916  B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2917  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2918  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2919 
2920  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2921  Register NumerLo = UnmergeNumer.getReg(0);
2922  Register NumerHi = UnmergeNumer.getReg(1);
2923 
2924  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2925  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2926  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2927  Register Mul3_Lo = UnmergeMul3.getReg(0);
2928  Register Mul3_Hi = UnmergeMul3.getReg(1);
2929  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2930  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2931  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2932  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2933 
2934  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2935  Register DenomLo = UnmergeDenom.getReg(0);
2936  Register DenomHi = UnmergeDenom.getReg(1);
2937 
2938  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2939  auto C1 = B.buildSExt(S32, CmpHi);
2940 
2941  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2942  auto C2 = B.buildSExt(S32, CmpLo);
2943 
2944  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2945  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2946 
2947  // TODO: Here and below portions of the code can be enclosed into if/endif.
2948  // Currently control flow is unconditional and we have 4 selects after
2949  // potential endif to substitute PHIs.
2950 
2951  // if C3 != 0 ...
2952  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2953  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2954  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2955  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2956 
2957  auto One64 = B.buildConstant(S64, 1);
2958  auto Add3 = B.buildAdd(S64, MulHi3, One64);
2959 
2960  auto C4 =
2961  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2962  auto C5 =
2963  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2964  auto C6 = B.buildSelect(
2965  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2966 
2967  // if (C6 != 0)
2968  auto Add4 = B.buildAdd(S64, Add3, One64);
2969  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2970 
2971  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2972  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2973  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2974 
2975  // endif C6
2976  // endif C3
2977 
2978  if (IsDiv) {
2979  auto Sel1 = B.buildSelect(
2980  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2981  B.buildSelect(DstReg,
2982  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2983  } else {
2984  auto Sel2 = B.buildSelect(
2985  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2986  B.buildSelect(DstReg,
2987  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2988  }
2989 }
2990 
2991 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2992  MachineRegisterInfo &MRI,
2993  MachineIRBuilder &B) const {
2994  const LLT S64 = LLT::scalar(64);
2995  const LLT S32 = LLT::scalar(32);
2996  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2997  Register DstReg = MI.getOperand(0).getReg();
2998  Register Num = MI.getOperand(1).getReg();
2999  Register Den = MI.getOperand(2).getReg();
3000  LLT Ty = MRI.getType(DstReg);
3001 
3002  if (Ty == S32)
3003  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
3004  else if (Ty == S64)
3005  legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
3006  else
3007  return false;
3008 
3009  MI.eraseFromParent();
3010  return true;
3011 
3012 }
3013 
3014 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3015  MachineRegisterInfo &MRI,
3016  MachineIRBuilder &B) const {
3017  const LLT S64 = LLT::scalar(64);
3018  const LLT S32 = LLT::scalar(32);
3019 
3020  Register DstReg = MI.getOperand(0).getReg();
3021  const LLT Ty = MRI.getType(DstReg);
3022  if (Ty != S32 && Ty != S64)
3023  return false;
3024 
3025  const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3026 
3027  Register LHS = MI.getOperand(1).getReg();
3028  Register RHS = MI.getOperand(2).getReg();
3029 
3030  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3031  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3032  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3033 
3034  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3035  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3036 
3037  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3038  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3039 
3040  Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3041  if (Ty == S32)
3042  legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3043  else
3044  legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3045 
3046  Register Sign;
3047  if (IsDiv)
3048  Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3049  else
3050  Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3051 
3052  UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3053  B.buildSub(DstReg, UDivRem, Sign);
3054 
3055  MI.eraseFromParent();
3056  return true;
3057 }
3058 
3059 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3060  MachineRegisterInfo &MRI,
3061  MachineIRBuilder &B) const {
3062  Register Res = MI.getOperand(0).getReg();
3063  Register LHS = MI.getOperand(1).getReg();
3064  Register RHS = MI.getOperand(2).getReg();
3065  uint16_t Flags = MI.getFlags();
3066  LLT ResTy = MRI.getType(Res);
3067 
3068  const MachineFunction &MF = B.getMF();
3069  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3070  MI.getFlag(MachineInstr::FmAfn);
3071 
3072  if (!AllowInaccurateRcp)
3073  return false;
3074 
3075  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3076  // 1 / x -> RCP(x)
3077  if (CLHS->isExactlyValue(1.0)) {
3078  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3079  .addUse(RHS)
3080  .setMIFlags(Flags);
3081 
3082  MI.eraseFromParent();
3083  return true;
3084  }
3085 
3086  // -1 / x -> RCP( FNEG(x) )
3087  if (CLHS->isExactlyValue(-1.0)) {
3088  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3089  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3090  .addUse(FNeg.getReg(0))
3091  .setMIFlags(Flags);
3092 
3093  MI.eraseFromParent();
3094  return true;
3095  }
3096  }
3097 
3098  // x / y -> x * (1.0 / y)
3099  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3100  .addUse(RHS)
3101  .setMIFlags(Flags);
3102  B.buildFMul(Res, LHS, RCP, Flags);
3103 
3104  MI.eraseFromParent();
3105  return true;
3106 }
3107 
3108 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3109  MachineRegisterInfo &MRI,
3110  MachineIRBuilder &B) const {
3111  Register Res = MI.getOperand(0).getReg();
3112  Register X = MI.getOperand(1).getReg();
3113  Register Y = MI.getOperand(2).getReg();
3114  uint16_t Flags = MI.getFlags();
3115  LLT ResTy = MRI.getType(Res);
3116 
3117  const MachineFunction &MF = B.getMF();
3118  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3119  MI.getFlag(MachineInstr::FmAfn);
3120 
3121  if (!AllowInaccurateRcp)
3122  return false;
3123 
3124  auto NegY = B.buildFNeg(ResTy, Y);
3125  auto One = B.buildFConstant(ResTy, 1.0);
3126 
3127  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3128  .addUse(Y)
3129  .setMIFlags(Flags);
3130 
3131  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3132  R = B.buildFMA(ResTy, Tmp0, R, R);
3133 
3134  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3135  R = B.buildFMA(ResTy, Tmp1, R, R);
3136 
3137  auto Ret = B.buildFMul(ResTy, X, R);
3138  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3139 
3140  B.buildFMA(Res, Tmp2, R, Ret);
3141  MI.eraseFromParent();
3142  return true;
3143 }
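// This is the standard Newton-Raphson refinement of the rcp estimate: each
// step computes r = r + r*(1 - y*r) with two fmas, and the final fma pair
// folds the residual x - y*(x*r) back into the quotient x*r.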
3144 
3145 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3146  MachineRegisterInfo &MRI,
3147  MachineIRBuilder &B) const {
3148  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3149  return true;
3150 
3151  Register Res = MI.getOperand(0).getReg();
3152  Register LHS = MI.getOperand(1).getReg();
3153  Register RHS = MI.getOperand(2).getReg();
3154 
3155  uint16_t Flags = MI.getFlags();
3156 
3157  LLT S16 = LLT::scalar(16);
3158  LLT S32 = LLT::scalar(32);
3159 
3160  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3161  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3162 
3163  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3164  .addUse(RHSExt.getReg(0))
3165  .setMIFlags(Flags);
3166 
3167  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3168  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3169 
3170  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3171  .addUse(RDst.getReg(0))
3172  .addUse(RHS)
3173  .addUse(LHS)
3174  .setMIFlags(Flags);
3175 
3176  MI.eraseFromParent();
3177  return true;
3178 }
3179 
3180 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3181 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3182 static void toggleSPDenormMode(bool Enable,
3183  MachineIRBuilder &B,
3184  const GCNSubtarget &ST,
3185  AMDGPU::SIModeRegisterDefaults Mode) {
3186  // Set SP denorm mode to this value.
3187  unsigned SPDenormMode =
3188  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3189 
3190  if (ST.hasDenormModeInst()) {
3191  // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3192  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3193 
3194  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3195  B.buildInstr(AMDGPU::S_DENORM_MODE)
3196  .addImm(NewDenormModeValue);
3197 
3198  } else {
3199  // Select FP32 bit field in mode register.
3200  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3201  (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3202  (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3203 
3204  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3205  .addImm(SPDenormMode)
3206  .addImm(SPDenormModeBitField);
3207  }
3208 }
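// With S_DENORM_MODE the immediate packs the FP32 denorm control in its low
// bits and the FP64/FP16 control above it, which is why the default DP value
// is shifted left by 2 before being ORed in. The S_SETREG fallback instead
// writes only the 2-bit FP32 denorm field of the MODE register selected by
// SPDenormModeBitField.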
3209 
3210 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3211  MachineRegisterInfo &MRI,
3212  MachineIRBuilder &B) const {
3213  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3214  return true;
3215 
3216  Register Res = MI.getOperand(0).getReg();
3217  Register LHS = MI.getOperand(1).getReg();
3218  Register RHS = MI.getOperand(2).getReg();
3219  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3220  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3221 
3222  uint16_t Flags = MI.getFlags();
3223 
3224  LLT S32 = LLT::scalar(32);
3225  LLT S1 = LLT::scalar(1);
3226 
3227  auto One = B.buildFConstant(S32, 1.0f);
3228 
3229  auto DenominatorScaled =
3230  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3231  .addUse(LHS)
3232  .addUse(RHS)
3233  .addImm(0)
3234  .setMIFlags(Flags);
3235  auto NumeratorScaled =
3236  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3237  .addUse(LHS)
3238  .addUse(RHS)
3239  .addImm(1)
3240  .setMIFlags(Flags);
3241 
3242  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3243  .addUse(DenominatorScaled.getReg(0))
3244  .setMIFlags(Flags);
3245  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3246 
3247  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3248  // aren't modeled as reading it.
3249  if (!Mode.allFP32Denormals())
3250  toggleSPDenormMode(true, B, ST, Mode);
3251 
3252  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3253  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3254  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3255  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3256  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3257  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3258 
3259  if (!Mode.allFP32Denormals())
3260  toggleSPDenormMode(false, B, ST, Mode);
3261 
3262  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3263  .addUse(Fma4.getReg(0))
3264  .addUse(Fma1.getReg(0))
3265  .addUse(Fma3.getReg(0))
3266  .addUse(NumeratorScaled.getReg(1))
3267  .setMIFlags(Flags);
3268 
3269  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3270  .addUse(Fmas.getReg(0))
3271  .addUse(RHS)
3272  .addUse(LHS)
3273  .setMIFlags(Flags);
3274 
3275  MI.eraseFromParent();
3276  return true;
3277 }
3278 
3279 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3280  MachineRegisterInfo &MRI,
3281  MachineIRBuilder &B) const {
3282  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3283  return true;
3284 
3285  Register Res = MI.getOperand(0).getReg();
3286  Register LHS = MI.getOperand(1).getReg();
3287  Register RHS = MI.getOperand(2).getReg();
3288 
3289  uint16_t Flags = MI.getFlags();
3290 
3291  LLT S64 = LLT::scalar(64);
3292  LLT S1 = LLT::scalar(1);
3293 
3294  auto One = B.buildFConstant(S64, 1.0);
3295 
3296  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3297  .addUse(LHS)
3298  .addUse(RHS)
3299  .addImm(0)
3300  .setMIFlags(Flags);
3301 
3302  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3303 
3304  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3305  .addUse(DivScale0.getReg(0))
3306  .setMIFlags(Flags);
3307 
3308  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3309  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3310  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3311 
3312  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3313  .addUse(LHS)
3314  .addUse(RHS)
3315  .addImm(1)
3316  .setMIFlags(Flags);
3317 
3318  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3319  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3320  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3321 
3322  Register Scale;
3323  if (!ST.hasUsableDivScaleConditionOutput()) {
3324  // Workaround a hardware bug on SI where the condition output from div_scale
3325  // is not usable.
3326 
3327  LLT S32 = LLT::scalar(32);
3328 
3329  auto NumUnmerge = B.buildUnmerge(S32, LHS);
3330  auto DenUnmerge = B.buildUnmerge(S32, RHS);
3331  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3332  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3333 
3334  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3335  Scale1Unmerge.getReg(1));
3336  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3337  Scale0Unmerge.getReg(1));
3338  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3339  } else {
3340  Scale = DivScale1.getReg(1);
3341  }
3342 
3343  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3344  .addUse(Fma4.getReg(0))
3345  .addUse(Fma3.getReg(0))
3346  .addUse(Mul.getReg(0))
3347  .addUse(Scale)
3348  .setMIFlags(Flags);
3349 
3350  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3351  .addUse(Fmas.getReg(0))
3352  .addUse(RHS)
3353  .addUse(LHS)
3354  .setMIFlags(Flags);
3355 
3356  MI.eraseFromParent();
3357  return true;
3358 }
3359 
3360 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3361  MachineRegisterInfo &MRI,
3362  MachineIRBuilder &B) const {
3363  Register Res = MI.getOperand(0).getReg();
3364  Register LHS = MI.getOperand(2).getReg();
3365  Register RHS = MI.getOperand(3).getReg();
3366  uint16_t Flags = MI.getFlags();
3367 
3368  LLT S32 = LLT::scalar(32);
3369  LLT S1 = LLT::scalar(1);
3370 
3371  auto Abs = B.buildFAbs(S32, RHS, Flags);
3372  const APFloat C0Val(1.0f);
3373 
3374  auto C0 = B.buildConstant(S32, 0x6f800000);
3375  auto C1 = B.buildConstant(S32, 0x2f800000);
3376  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3377 
3378  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3379  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3380 
3381  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3382 
3383  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3384  .addUse(Mul0.getReg(0))
3385  .setMIFlags(Flags);
3386 
3387  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3388 
3389  B.buildFMul(Res, Sel, Mul1, Flags);
3390 
3391  MI.eraseFromParent();
3392  return true;
3393 }
3394 
3395 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3396 // FIXME: Why do we handle this one but not other removed instructions?
3397 //
3398 // Reciprocal square root. The clamp prevents infinite results, clamping
3399 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3400 // +-max_float.
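 // Roughly: Dst = clamp(rsq(Src), -max_float, +max_float), where the exact
 // NaN behaviour of the clamp depends on the function's IEEE mode below.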
3401 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3402  MachineRegisterInfo &MRI,
3403  MachineIRBuilder &B) const {
3404  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3405  return true;
3406 
3407  Register Dst = MI.getOperand(0).getReg();
3408  Register Src = MI.getOperand(2).getReg();
3409  auto Flags = MI.getFlags();
3410 
3411  LLT Ty = MRI.getType(Dst);
3412 
3413  const fltSemantics *FltSemantics;
3414  if (Ty == LLT::scalar(32))
3415  FltSemantics = &APFloat::IEEEsingle();
3416  else if (Ty == LLT::scalar(64))
3417  FltSemantics = &APFloat::IEEEdouble();
3418  else
3419  return false;
3420 
3421  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3422  .addUse(Src)
3423  .setMIFlags(Flags);
3424 
3425  // We don't need to concern ourselves with the snan handling difference, since
3426  // the rsq has already quieted it (or not); use the variant that selects directly.
3427  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3428  const bool UseIEEE = MFI->getMode().IEEE;
3429 
3430  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3431  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3432  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3433 
3434  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3435 
3436  if (UseIEEE)
3437  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3438  else
3439  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3440  MI.eraseFromParent();
3441  return true;
3442 }
3443 
3444 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3445  switch (IID) {
3446  case Intrinsic::amdgcn_ds_fadd:
3447  return AMDGPU::G_ATOMICRMW_FADD;
3448  case Intrinsic::amdgcn_ds_fmin:
3449  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3450  case Intrinsic::amdgcn_ds_fmax:
3451  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3452  default:
3453  llvm_unreachable("not a DS FP intrinsic");
3454  }
3455 }
3456 
3457 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3458  MachineInstr &MI,
3459  Intrinsic::ID IID) const {
3460  GISelChangeObserver &Observer = Helper.Observer;
3461  Observer.changingInstr(MI);
3462 
3463  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3464 
3465  // The remaining operands were used to set fields in the MemOperand on
3466  // construction.
3467  for (int I = 6; I > 3; --I)
3468  MI.RemoveOperand(I);
3469 
3470  MI.RemoveOperand(1); // Remove the intrinsic ID.
3471  Observer.changedInstr(MI);
3472  return true;
3473 }
3474 
3475 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3476  MachineRegisterInfo &MRI,
3477  MachineIRBuilder &B) const {
3478  uint64_t Offset =
3479  ST.getTargetLowering()->getImplicitParameterOffset(
3480  B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3481  LLT DstTy = MRI.getType(DstReg);
3482  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3483 
3484  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3485  if (!loadInputValue(KernargPtrReg, B,
3486  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3487  return false;
3488 
3489  // FIXME: This should be nuw
3490  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3491  return true;
3492 }
3493 
3494 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3495  MachineRegisterInfo &MRI,
3496  MachineIRBuilder &B) const {
3497  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3498  if (!MFI->isEntryFunction()) {
3499  return legalizePreloadedArgIntrin(MI, MRI, B,
3500  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3501  }
3502 
3503  Register DstReg = MI.getOperand(0).getReg();
3504  if (!getImplicitArgPtr(DstReg, MRI, B))
3505  return false;
3506 
3507  MI.eraseFromParent();
3508  return true;
3509 }
3510 
3511 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3512  MachineRegisterInfo &MRI,
3513  MachineIRBuilder &B,
3514  unsigned AddrSpace) const {
3515  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3516  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3517  Register Hi32 = Unmerge.getReg(1);
3518 
3519  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3520  MI.eraseFromParent();
3521  return true;
3522 }
3523 
3524 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3525 // offset (the offset that is included in bounds checking and swizzling, to be
3526 // split between the instruction's voffset and immoffset fields) and soffset
3527 // (the offset that is excluded from bounds checking and swizzling, to go in
3528 // the instruction's soffset field). This function takes the first kind of
3529 // offset and figures out how to split it between voffset and immoffset.
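 // For example, a constant offset of 8196 is returned as a voffset add of
 // 8192 plus an immoffset of 4, since the immoffset field only holds values
 // up to 4095.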
3530 std::tuple<Register, unsigned, unsigned>
3531 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3532  Register OrigOffset) const {
3533  const unsigned MaxImm = 4095;
3534  Register BaseReg;
3535  unsigned TotalConstOffset;
3536  const LLT S32 = LLT::scalar(32);
3537  MachineRegisterInfo &MRI = *B.getMRI();
3538 
3539  std::tie(BaseReg, TotalConstOffset) =
3540  AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3541 
3542  unsigned ImmOffset = TotalConstOffset;
3543 
3544  // If BaseReg is a pointer, convert it to int.
3545  if (MRI.getType(BaseReg).isPointer())
3546  BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3547 
3548  // If the immediate value is too big for the immoffset field, keep only the
3549  // low 12 bits (value mod 4096) in the immoffset field, so that the value
3550  // copied/added into the voffset field is a multiple of 4096 and stands a
3551  // better chance of being CSEd with the copy/add for another similar load/store.
3552  // However, do not round down to a multiple of 4096 if that yields a
3553  // negative number, as it appears to be illegal to have a negative offset
3554  // in the vgpr, even if adding the immediate offset makes it positive.
3555  unsigned Overflow = ImmOffset & ~MaxImm;
3556  ImmOffset -= Overflow;
3557  if ((int32_t)Overflow < 0) {
3558  Overflow += ImmOffset;
3559  ImmOffset = 0;
3560  }
3561 
3562  if (Overflow != 0) {
3563  if (!BaseReg) {
3564  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3565  } else {
3566  auto OverflowVal = B.buildConstant(S32, Overflow);
3567  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3568  }
3569  }
3570 
3571  if (!BaseReg)
3572  BaseReg = B.buildConstant(S32, 0).getReg(0);
3573 
3574  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3575 }
3576 
3577 /// Handle register layout difference for f16 images for some subtargets.
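/// For example, on subtargets with unpacked d16 memory instructions a
/// <4 x s16> source becomes a <4 x s32> value with each element any-extended
/// into its own dword.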
3578 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3579  MachineRegisterInfo &MRI,
3580  Register Reg,
3581  bool ImageStore) const {
3582  const LLT S16 = LLT::scalar(16);
3583  const LLT S32 = LLT::scalar(32);
3584  LLT StoreVT = MRI.getType(Reg);
3585  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3586 
3587  if (ST.hasUnpackedD16VMem()) {
3588  auto Unmerge = B.buildUnmerge(S16, Reg);
3589 
3590  SmallVector<Register, 4> WideRegs;
3591  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3592  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3593 
3594  int NumElts = StoreVT.getNumElements();
3595 
3596  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3597  }
3598 
3599  if (ImageStore && ST.hasImageStoreD16Bug()) {
3600  if (StoreVT.getNumElements() == 2) {
3601  SmallVector<Register, 4> PackedRegs;
3602  Reg = B.buildBitcast(S32, Reg).getReg(0);
3603  PackedRegs.push_back(Reg);
3604  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3605  return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3606  }
3607 
3608  if (StoreVT.getNumElements() == 3) {
3609  SmallVector<Register, 4> PackedRegs;
3610  auto Unmerge = B.buildUnmerge(S16, Reg);
3611  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3612  PackedRegs.push_back(Unmerge.getReg(I));
3613  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3614  Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3615  return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3616  }
3617 
3618  if (StoreVT.getNumElements() == 4) {
3619  SmallVector<Register, 4> PackedRegs;
3620  Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3621  auto Unmerge = B.buildUnmerge(S32, Reg);
3622  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3623  PackedRegs.push_back(Unmerge.getReg(I));
3624  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3625  return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3626  }
3627 
3628  llvm_unreachable("invalid data type");
3629  }
3630 
3631  return Reg;
3632 }
3633 
3634 Register AMDGPULegalizerInfo::fixStoreSourceType(
3635  MachineIRBuilder &B, Register VData, bool IsFormat) const {
3636  MachineRegisterInfo *MRI = B.getMRI();
3637  LLT Ty = MRI->getType(VData);
3638 
3639  const LLT S16 = LLT::scalar(16);
3640 
3641  // Fixup illegal register types for i8 stores.
3642  if (Ty == LLT::scalar(8) || Ty == S16) {
3643  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3644  return AnyExt;
3645  }
3646 
3647  if (Ty.isVector()) {
3648  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3649  if (IsFormat)
3650  return handleD16VData(B, *MRI, VData);
3651  }
3652  }
3653 
3654  return VData;
3655 }
3656 
3657 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3658  MachineRegisterInfo &MRI,
3659  MachineIRBuilder &B,
3660  bool IsTyped,
3661  bool IsFormat) const {
3662  Register VData = MI.getOperand(1).getReg();
3663  LLT Ty = MRI.getType(VData);
3664  LLT EltTy = Ty.getScalarType();
3665  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3666  const LLT S32 = LLT::scalar(32);
3667 
3668  VData = fixStoreSourceType(B, VData, IsFormat);
3669  Register RSrc = MI.getOperand(2).getReg();
3670 
3671  MachineMemOperand *MMO = *MI.memoperands_begin();
3672  const int MemSize = MMO->getSize();
3673 
3674  unsigned ImmOffset;
3675  unsigned TotalOffset;
3676 
3677  // The typed intrinsics add an immediate after the registers.
3678  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3679 
3680  // The struct intrinsic variants add one additional operand over raw.
3681  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3682  Register VIndex;
3683  int OpOffset = 0;
3684  if (HasVIndex) {
3685  VIndex = MI.getOperand(3).getReg();
3686  OpOffset = 1;
3687  }
3688 
3689  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3690  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3691 
3692  unsigned Format = 0;
3693  if (IsTyped) {
3694  Format = MI.getOperand(5 + OpOffset).getImm();
3695  ++OpOffset;
3696  }
3697 
3698  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3699 
3700  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3701  if (TotalOffset != 0)
3702  MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3703 
3704  unsigned Opc;
3705  if (IsTyped) {
3706  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3707  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3708  } else if (IsFormat) {
3709  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3710  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3711  } else {
3712  switch (MemSize) {
3713  case 1:
3714  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3715  break;
3716  case 2:
3717  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3718  break;
3719  default:
3720  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3721  break;
3722  }
3723  }
3724 
3725  if (!VIndex)
3726  VIndex = B.buildConstant(S32, 0).getReg(0);
3727 
3728  auto MIB = B.buildInstr(Opc)
3729  .addUse(VData) // vdata
3730  .addUse(RSrc) // rsrc
3731  .addUse(VIndex) // vindex
3732  .addUse(VOffset) // voffset
3733  .addUse(SOffset) // soffset
3734  .addImm(ImmOffset); // offset(imm)
3735 
3736  if (IsTyped)
3737  MIB.addImm(Format);
3738 
3739  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3740  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3741  .addMemOperand(MMO);
3742 
3743  MI.eraseFromParent();
3744  return true;
3745 }
3746 
3747 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3748  MachineRegisterInfo &MRI,
3749  MachineIRBuilder &B,
3750  bool IsFormat,
3751  bool IsTyped) const {
3752  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3753  MachineMemOperand *MMO = *MI.memoperands_begin();
3754  const int MemSize = MMO->getSize();
3755  const LLT S32 = LLT::scalar(32);
3756 
3757  Register Dst = MI.getOperand(0).getReg();
3758  Register RSrc = MI.getOperand(2).getReg();
3759 
3760  // The typed intrinsics add an immediate after the registers.
3761  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3762 
3763  // The struct intrinsic variants add one additional operand over raw.
3764  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3765  Register VIndex;
3766  int OpOffset = 0;
3767  if (HasVIndex) {
3768  VIndex = MI.getOperand(3).getReg();
3769  OpOffset = 1;
3770  }
3771 
3772  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3773  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3774 
3775  unsigned Format = 0;
3776  if (IsTyped) {
3777  Format = MI.getOperand(5 + OpOffset).getImm();
3778  ++OpOffset;
3779  }
3780 
3781  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3782  unsigned ImmOffset;
3783  unsigned TotalOffset;
3784 
3785  LLT Ty = MRI.getType(Dst);
3786  LLT EltTy = Ty.getScalarType();
3787  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3788  const bool Unpacked = ST.hasUnpackedD16VMem();
3789 
3790  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3791  if (TotalOffset != 0)
3792  MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3793 
3794  unsigned Opc;
3795 
3796  if (IsTyped) {
3797  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3798  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3799  } else if (IsFormat) {
3800  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3801  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3802  } else {
3803  switch (MemSize) {
3804  case 1:
3805  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3806  break;
3807  case 2:
3808  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3809  break;
3810  default:
3811  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3812  break;
3813  }
3814  }
3815 
3816  Register LoadDstReg;
3817 
3818  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3819  LLT UnpackedTy = Ty.changeElementSize(32);
3820 
3821  if (IsExtLoad)
3822  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3823  else if (Unpacked && IsD16 && Ty.isVector())
3824  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3825  else
3826  LoadDstReg = Dst;
3827 
3828  if (!VIndex)
3829  VIndex = B.buildConstant(S32, 0).getReg(0);
3830 
3831  auto MIB = B.buildInstr(Opc)
3832  .addDef(LoadDstReg) // vdata
3833  .addUse(RSrc) // rsrc
3834  .addUse(VIndex) // vindex
3835  .addUse(VOffset) // voffset
3836  .addUse(SOffset) // soffset
3837  .addImm(ImmOffset); // offset(imm)
3838 
3839  if (IsTyped)
3840  MIB.addImm(Format);
3841 
3842  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3843  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3844  .addMemOperand(MMO);
3845 
3846  if (LoadDstReg != Dst) {
3847  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3848 
3849  // The result register was widened for the extending load; truncate back.
3850  if (IsExtLoad)
3851  B.buildTrunc(Dst, LoadDstReg);
3852  else {
3853  // Repack to original 16-bit vector result
3854  // FIXME: G_TRUNC should work, but legalization currently fails
3855  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3856  SmallVector<Register, 4> Repack;
3857  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3858  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3859  B.buildMerge(Dst, Repack);
3860  }
3861  }
3862 
3863  MI.eraseFromParent();
3864  return true;
3865 }
3866 
3867 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3868  MachineIRBuilder &B,
3869  bool IsInc) const {
3870  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3871  AMDGPU::G_AMDGPU_ATOMIC_DEC;
3872  B.buildInstr(Opc)
3873  .addDef(MI.getOperand(0).getReg())
3874  .addUse(MI.getOperand(2).getReg())
3875  .addUse(MI.getOperand(3).getReg())
3876  .cloneMemRefs(MI);
3877  MI.eraseFromParent();
3878  return true;
3879 }
3880 
3881 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3882  switch (IntrID) {
3883  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3884  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3885  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3886  case Intrinsic::amdgcn_raw_buffer_atomic_add:
3887  case Intrinsic::amdgcn_struct_buffer_atomic_add:
3888  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3889  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3890  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3891  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3892  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3893  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3894  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3895  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3896  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3897  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3898  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3899  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3900  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3901  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3902  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3903  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3904  case Intrinsic::amdgcn_raw_buffer_atomic_and:
3905  case Intrinsic::amdgcn_struct_buffer_atomic_and:
3906  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3907  case Intrinsic::amdgcn_raw_buffer_atomic_or:
3908  case Intrinsic::amdgcn_struct_buffer_atomic_or:
3909  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3910  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3911  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3912  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3913  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3914  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3915  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3916  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3917  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3918  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3919  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3920  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3921  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3922  case Intrinsic::amdgcn_buffer_atomic_fadd:
3923  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3924  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3925  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3926  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3927  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3928  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3929  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3930  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3931  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
3932  default:
3933  llvm_unreachable("unhandled atomic opcode");
3934  }
3935 }
3936 
3937 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3938  MachineIRBuilder &B,
3939  Intrinsic::ID IID) const {
3940  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3941  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3942  const bool HasReturn = MI.getNumExplicitDefs() != 0;
3943 
3944  Register Dst;
3945 
3946  int OpOffset = 0;
3947  if (HasReturn) {
3948  // A few FP atomics do not support return values.
3949  Dst = MI.getOperand(0).getReg();
3950  } else {
3951  OpOffset = -1;
3952  }
3953 
3954  Register VData = MI.getOperand(2 + OpOffset).getReg();
3955  Register CmpVal;
3956 
3957  if (IsCmpSwap) {
3958  CmpVal = MI.getOperand(3 + OpOffset).getReg();
3959  ++OpOffset;
3960  }
3961 
3962  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3963  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3964 
3965  // The struct intrinsic variants add one additional operand over raw.
3966  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3967  Register VIndex;
3968  if (HasVIndex) {
3969  VIndex = MI.getOperand(4 + OpOffset).getReg();
3970  ++OpOffset;
3971  }
3972 
3973  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3974  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3975  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3976 
3977  MachineMemOperand *MMO = *MI.memoperands_begin();
3978 
3979  unsigned ImmOffset;
3980  unsigned TotalOffset;
3981  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3982  if (TotalOffset != 0)
3983  MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3984 
3985  if (!VIndex)
3986  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3987 
3988  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3989 
3990  if (HasReturn)
3991  MIB.addDef(Dst);
3992 
3993  MIB.addUse(VData); // vdata
3994 
3995  if (IsCmpSwap)
3996  MIB.addReg(CmpVal);
3997 
3998  MIB.addUse(RSrc) // rsrc
3999  .addUse(VIndex) // vindex
4000  .addUse(VOffset) // voffset
4001  .addUse(SOffset) // soffset
4002  .addImm(ImmOffset) // offset(imm)
4003  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4004  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4005  .addMemOperand(MMO);
4006 
4007  MI.eraseFromParent();
4008  return true;
4009 }
4010 
4011 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
4012 /// vector with s16 typed elements.
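/// For example, 16-bit coordinates (x, y, z) are packed as the two dwords
/// <2 x s16>(x, y) and <2 x s16>(z, undef).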
4013 static void packImageA16AddressToDwords(
4014  MachineIRBuilder &B, MachineInstr &MI,
4015  SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
4016  const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
4017  const LLT S16 = LLT::scalar(16);
4018  const LLT V2S16 = LLT::vector(2, 16);
4019 
4020  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4021  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4022  if (!SrcOp.isReg())
4023  continue; // _L to _LZ may have eliminated this.
4024 
4025  Register AddrReg = SrcOp.getReg();
4026 
4027  if (I < Intr->GradientStart) {
4028  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4029  PackedAddrs.push_back(AddrReg);
4030  } else {
4031  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4032  // derivatives dx/dh and dx/dv are packed with undef.
4033  if (((I + 1) >= EndIdx) ||
4034  ((Intr->NumGradients / 2) % 2 == 1 &&
4035  (I == static_cast<unsigned>(Intr->GradientStart +
4036  (Intr->NumGradients / 2) - 1) ||
4037  I == static_cast<unsigned>(Intr->GradientStart +
4038  Intr->NumGradients - 1))) ||
4039  // Check for _L to _LZ optimization
4040  !MI.getOperand(ArgOffset + I + 1).isReg()) {
4041  PackedAddrs.push_back(
4042  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4043  .getReg(0));
4044  } else {
4045  PackedAddrs.push_back(
4046  B.buildBuildVector(
4047  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4048  .getReg(0));
4049  ++I;
4050  }
4051  }
4052  }
4053 }
4054 
4055 /// Convert from separate vaddr components to a single vector address register,
4056 /// and replace the remaining operands with $noreg.
4057 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4058  int DimIdx, int NumVAddrs) {
4059  const LLT S32 = LLT::scalar(32);
4060 
4061  SmallVector<Register, 8> AddrRegs;
4062  for (int I = 0; I != NumVAddrs; ++I) {
4063  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4064  if (SrcOp.isReg()) {
4065  AddrRegs.push_back(SrcOp.getReg());
4066  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4067  }
4068  }
4069 
4070  int NumAddrRegs = AddrRegs.size();
4071  if (NumAddrRegs != 1) {
4072  // Round up to 8 elements for v5-v7
4073  // FIXME: Missing intermediate sized register classes and instructions.
4074  if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4075  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4076  auto Undef = B.buildUndef(S32);
4077  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4078  NumAddrRegs = RoundedNumRegs;
4079  }
4080 
4081  auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4082  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4083  }
4084 
4085  for (int I = 1; I != NumVAddrs; ++I) {
4086  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4087  if (SrcOp.isReg())
4088  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4089  }
4090 }
4091 
4092 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4093 ///
4094 /// Depending on the subtarget, load/store with 16-bit element data need to be
4095 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4096 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4097 /// registers.
4098 ///
4099 /// We don't want to directly select image instructions just yet, but also want
4100 /// to expose all register repacking to the legalizer/combiners. We also don't
4101 /// want a selected instruction entering RegBankSelect. In order to avoid
4102 /// defining a multitude of intermediate image instructions, directly hack on
4103 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4104 /// now unnecessary arguments with $noreg.
4105 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4106  MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4107  const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4108 
4109  const unsigned NumDefs = MI.getNumExplicitDefs();
4110  const unsigned ArgOffset = NumDefs + 1;
4111  bool IsTFE = NumDefs == 2;
4112  // We are only processing the operands of d16 image operations on subtargets
4113  // that use the unpacked register layout, or need to repack the TFE result.
4114 
4115  // TODO: Do we need to guard against already legalized intrinsics?
4116  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4117  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4118 
4119  MachineRegisterInfo *MRI = B.getMRI();
4120  const LLT S32 = LLT::scalar(32);
4121  const LLT S16 = LLT::scalar(16);
4122  const LLT V2S16 = LLT::vector(2, 16);
4123 
4124  unsigned DMask = 0;
4125 
4126  // Check for 16 bit addresses and pack if true.
4127  LLT GradTy =
4128  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4129  LLT AddrTy =
4130  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4131  const bool IsG16 = GradTy == S16;
4132  const bool IsA16 = AddrTy == S16;
4133 
4134  int DMaskLanes = 0;
4135  if (!BaseOpcode->Atomic) {
4136  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4137  if (BaseOpcode->Gather4) {
4138  DMaskLanes = 4;
4139  } else if (DMask != 0) {
4140  DMaskLanes = countPopulation(DMask);
4141  } else if (!IsTFE && !BaseOpcode->Store) {
4142  // If dmask is 0, this is a no-op load. This can be eliminated.
4143  B.buildUndef(MI.getOperand(0));
4144  MI.eraseFromParent();
4145  return true;
4146  }
4147  }
4148 
4149  Observer.changingInstr(MI);
4150  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4151 
4152  unsigned NewOpcode = NumDefs == 0 ?
4153  AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4154 
4155  // Track that we legalized this
4156  MI.setDesc(B.getTII().get(NewOpcode));
4157 
4158  // We expect to get an error flag since TFE is on and dmask is 0. Force
4159  // dmask to be at least 1, otherwise the instruction will fail.
4160  if (IsTFE && DMask == 0) {
4161  DMask = 0x1;
4162  DMaskLanes = 1;
4163  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4164  }
4165 
4166  if (BaseOpcode->Atomic) {
4167  Register VData0 = MI.getOperand(2).getReg();
4168  LLT Ty = MRI->getType(VData0);
4169 
4170  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4171  if (Ty.isVector())
4172  return false;
4173 
4174  if (BaseOpcode->AtomicX2) {
4175  Register VData1 = MI.getOperand(3).getReg();
4176  // The two values are packed in one register.
4177  LLT PackedTy = LLT::vector(2, Ty);
4178  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4179  MI.getOperand(2).setReg(Concat.getReg(0));
4180  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4181  }
4182  }
4183 
4184  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4185 
4186  // Optimize _L to _LZ when _L is zero
4187  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4188  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4189  const ConstantFP *ConstantLod;
4190 
4191  if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4192  m_GFCst(ConstantLod))) {
4193  if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4194  // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4195  const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4196  AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4197  Intr->Dim);
4198 
4199  // The starting indexes should remain in the same place.
4200  --CorrectedNumVAddrs;
4201 
4202  MI.getOperand(MI.getNumExplicitDefs())
4203  .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4204  MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4205  Intr = NewImageDimIntr;
4206  }
4207  }
4208  }
4209 
4210  // Optimize _mip away, when 'lod' is zero
4211  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4212  int64_t ConstantLod;
4213  if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4214  m_ICst(ConstantLod))) {
4215  if (ConstantLod == 0) {
4216  // TODO: Change intrinsic opcode and remove operand instead of replacing
4217  // it with 0, as the _L to _LZ handling is done above.
4218  MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4219  --CorrectedNumVAddrs;
4220  }
4221  }
4222  }
4223 
4224  // Rewrite the addressing register layout before doing anything else.
4225  if (IsA16 || IsG16) {
4226  if (IsA16) {
4227  // Target must support the feature and gradients need to be 16 bit too
4228  if (!ST.hasA16() || !IsG16)
4229  return false;
4230  } else if (!ST.hasG16())
4231  return false;
4232 
4233  if (Intr->NumVAddrs > 1) {
4234  SmallVector<Register, 4> PackedRegs;
4235  // Don't compress addresses for G16
4236  const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4237  packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4238  PackEndIdx);
4239 
4240  if (!IsA16) {
4241  // Add uncompressed address
4242  for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4243  int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4244  assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4245  PackedRegs.push_back(AddrReg);
4246  }
4247  }
4248 
4249  // See also below in the non-a16 branch
4250  const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4251 
4252  if (!UseNSA && PackedRegs.size() > 1) {
4253  LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4254  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4255  PackedRegs[0] = Concat.getReg(0);
4256  PackedRegs.resize(1);
4257  }
4258 
4259  const unsigned NumPacked = PackedRegs.size();
4260  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4261  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4262  if (!SrcOp.isReg()) {
4263  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4264  continue;
4265  }
4266 
4267  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4268 
4269  if (I - Intr->VAddrStart < NumPacked)
4270  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4271  else
4272  SrcOp.setReg(AMDGPU::NoRegister);
4273  }
4274  }
4275  } else {
4276  // If the register allocator cannot place the address registers contiguously
4277  // without introducing moves, then using the non-sequential address encoding
4278  // is always preferable, since it saves VALU instructions and is usually a
4279  // wash in terms of code size or even better.
4280  //
4281  // However, we currently have no way of hinting to the register allocator
4282  // that MIMG addresses should be placed contiguously when it is possible to
4283  // do so, so force non-NSA for the common 2-address case as a heuristic.
4284  //
4285  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4286  // allocation when possible.
4287  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4288 
4289  if (!UseNSA && Intr->NumVAddrs > 1)
4290  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4291  Intr->NumVAddrs);
4292  }
4293 
4294  int Flags = 0;
4295  if (IsA16)
4296  Flags |= 1;
4297  if (IsG16)
4298  Flags |= 2;
4299  MI.addOperand(MachineOperand::CreateImm(Flags));
4300 
4301  if (BaseOpcode->Store) { // No TFE for stores?
4302  // TODO: Handle dmask trim
4303  Register VData = MI.getOperand(1).getReg();
4304  LLT Ty = MRI->getType(VData);
4305  if (!Ty.isVector() || Ty.getElementType() != S16)
4306  return true;
4307 
4308  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4309  if (RepackedReg != VData) {
4310  MI.getOperand(1).setReg(RepackedReg);
4311  }
4312 
4313  return true;
4314  }
4315 
4316  Register DstReg = MI.getOperand(0).getReg();
4317  LLT Ty = MRI->getType(DstReg);
4318  const LLT EltTy = Ty.getScalarType();
4319  const bool IsD16 = Ty.getScalarType() == S16;
4320  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4321 
4322  // Confirm that the return type is large enough for the dmask specified
4323  if (NumElts < DMaskLanes)
4324  return false;
4325 
4326  if (NumElts > 4 || DMaskLanes > 4)
4327  return false;
4328 
4329  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4330  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4331 
4332  // The raw dword aligned data component of the load. The only legal cases
4333  // where this matters should be when using the packed D16 format, for
4334  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
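  // e.g. a packed d16 <3 x s16> result is loaded as <4 x s16> (two dwords)
  // and trimmed back down afterwards.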
4335  LLT RoundedTy;
4336 
4337  // S32 vector to cover all data, plus TFE result element.
4338  LLT TFETy;
4339 
4340  // Register type to use for each loaded component. Will be S32 or V2S16.
4341  LLT RegTy;
4342 
4343  if (IsD16 && ST.hasUnpackedD16VMem()) {
4344  RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4345  TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4346  RegTy = S32;
4347  } else {
4348  unsigned EltSize = EltTy.getSizeInBits();
4349  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4350  unsigned RoundedSize = 32 * RoundedElts;
4351  RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4352  TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4353  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4354  }
4355 
4356  // The return type does not need adjustment.
4357  // TODO: Should we change s16 case to s32 or <2 x s16>?
4358  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4359  return true;
4360 
4361  Register Dst1Reg;
4362 
4363  // Insert after the instruction.
4364  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4365 
4366  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4367  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4368  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4369  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4370 
4371  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4372 
4373  MI.getOperand(0).setReg(NewResultReg);
4374 
4375  // In the IR, TFE is supposed to be used with a 2 element struct return
4376  // type. The intruction really returns these two values in one contiguous
4377  // register, with one additional dword beyond the loaded data. Rewrite the
4378  // return type to use a single register result.
4379 
4380  if (IsTFE) {
4381  Dst1Reg = MI.getOperand(1).getReg();
4382  if (MRI->getType(Dst1Reg) != S32)
4383  return false;
4384 
4385  // TODO: Make sure the TFE operand bit is set.
4386  MI.RemoveOperand(1);
4387 
4388  // Handle the easy case that requires no repack instructions.
4389  if (Ty == S32) {
4390  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4391  return true;
4392  }
4393  }
4394 
4395  // Now figure out how to copy the new result register back into the old
4396  // result.
4397  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4398 
4399  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4400 
4401  if (ResultNumRegs == 1) {
4402  assert(!IsTFE);
4403  ResultRegs[0] = NewResultReg;
4404  } else {
4405  // We have to repack into a new vector of some kind.
4406  for (int I = 0; I != NumDataRegs; ++I)
4407  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4408  B.buildUnmerge(ResultRegs, NewResultReg);
4409 
4410  // Drop the final TFE element to get the data part. The TFE result is
4411  // directly written to the right place already.
4412  if (IsTFE)
4413  ResultRegs.resize(NumDataRegs);
4414  }
4415 
4416  // For an s16 scalar result, we form an s32 result with a truncate regardless
4417  // of packed vs. unpacked.
4418  if (IsD16 && !Ty.isVector()) {
4419  B.buildTrunc(DstReg, ResultRegs[0]);
4420  return true;
4421  }
4422 
4423  // Avoid a build/concat_vector of 1 entry.
4424  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4425  B.buildBitcast(DstReg, ResultRegs[0]);
4426  return true;
4427  }
4428 
4429  assert(Ty.isVector());
4430 
4431  if (IsD16) {
4432  // For packed D16 results with TFE enabled, all the data components are
4433  // S32. Cast back to the expected type.
4434  //
4435  // TODO: We don't really need to load s32 elements. We would only need one
4436  // cast for the TFE result if a multiple of v2s16 was used.
4437  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4438  for (Register &Reg : ResultRegs)
4439  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4440  } else if (ST.hasUnpackedD16VMem()) {
4441  for (Register &Reg : ResultRegs)
4442  Reg = B.buildTrunc(S16, Reg).getReg(0);
4443  }
4444  }
4445 
4446  auto padWithUndef = [&](LLT Ty, int NumElts) {
4447  if (NumElts == 0)
4448  return;
4449  Register Undef = B.buildUndef(Ty).getReg(0);
4450  for (int I = 0; I != NumElts; ++I)
4451  ResultRegs.push_back(Undef);
4452  };
4453 
4454  // Pad out any elements eliminated due to the dmask.
4455  LLT ResTy = MRI->getType(ResultRegs[0]);
4456  if (!ResTy.isVector()) {
4457  padWithUndef(ResTy, NumElts - ResultRegs.size());
4458  B.buildBuildVector(DstReg, ResultRegs);
4459  return true;
4460  }
4461 
4462  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4463  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4464 
4465  // Deal with the one annoying legal case.
4466  const LLT V3S16 = LLT::vector(3, 16);
4467  if (Ty == V3S16) {
4468  padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4469  auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4470  B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4471  return true;
4472  }
4473 
4474  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4475  B.buildConcatVectors(DstReg, ResultRegs);
4476  return true;
4477 }
4478 
4479 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4480  LegalizerHelper &Helper, MachineInstr &MI) const {
4481  MachineIRBuilder &B = Helper.MIRBuilder;
4482  GISelChangeObserver &Observer = Helper.Observer;
4483 
4484  Register Dst = MI.getOperand(0).getReg();
4485  LLT Ty = B.getMRI()->getType(Dst);
4486  unsigned Size = Ty.getSizeInBits();
4487  MachineFunction &MF = B.getMF();
4488 
4489  Observer.changingInstr(MI);
4490 
4491  if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4492  Ty = getBitcastRegisterType(Ty);
4493  Helper.bitcastDst(MI, Ty, 0);
4494  Dst = MI.getOperand(0).getReg();
4495  B.setInsertPt(B.getMBB(), MI);
4496  }
4497 
4498  // FIXME: We don't really need this intermediate instruction. The intrinsic
4499  // should be fixed to have a memory operand. Since it's readnone, we're not
4500  // allowed to add one.
4501  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4502  MI.RemoveOperand(1); // Remove intrinsic ID
4503 
4504  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4505  // TODO: Should this use datalayout alignment?
4506  const unsigned MemSize = (Size + 7) / 8;
4507  const Align MemAlign(4);
4508  MachineMemOperand *MMO = MF.getMachineMemOperand(
4509  MachinePointerInfo(),
4510  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4511  MachineMemOperand::MOInvariant,
4512  MemSize, MemAlign);
4513  MI.addMemOperand(MF, MMO);
4514 
4515  // There are no 96-bit result scalar loads, but widening to 128-bit should
4516  // always be legal. We may need to restore this to a 96-bit result if it turns
4517  // out this needs to be converted to a vector load during RegBankSelect.
4518  if (!isPowerOf2_32(Size)) {
4519  if (Ty.isVector())
4520  Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4521  else
4522  Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4523  }
4524 
4525  Observer.changedInstr(MI);
4526  return true;
4527 }
4528 
4529 // TODO: Move to selection
4530 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4531  MachineRegisterInfo &MRI,
4532  MachineIRBuilder &B) const {
4533  if (!ST.isTrapHandlerEnabled() ||
4534  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4535  return legalizeTrapEndpgm(MI, MRI, B);
4536 
4537  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4538  switch (*HsaAbiVer) {
4539  case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4540  case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4541  return legalizeTrapHsaQueuePtr(MI, MRI, B);
4542  case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4543  return ST.supportsGetDoorbellID() ?
4544  legalizeTrapHsa(MI, MRI, B) :
4545  legalizeTrapHsaQueuePtr(MI, MRI, B);
4546  }
4547  }
4548 
4549  llvm_unreachable("Unknown trap handler");
4550 }
4551 
4552 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4553  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4554  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4555  MI.eraseFromParent();
4556  return true;
4557 }
4558 
4559 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4560  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4561  // Pass queue pointer to trap handler as input, and insert trap instruction
4562  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4563  Register LiveIn =
4564  MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4565  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4566  return false;
4567 
4568  Register SGPR01(AMDGPU::SGPR0_SGPR1);
4569  B.buildCopy(SGPR01, LiveIn);
4570  B.buildInstr(AMDGPU::S_TRAP)
4571  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4572  .addReg(SGPR01, RegState::Implicit);
4573 
4574  MI.eraseFromParent();
4575  return true;
4576 }
4577 
4578 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4579  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4580  B.buildInstr(AMDGPU::S_TRAP)
4581  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4582  MI.eraseFromParent();
4583  return true;
4584 }
4585 
4586 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4587  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4588  // If this is a non-HSA path or the trap handler is disabled, report a
4589  // warning accordingly.
4590  if (!ST.isTrapHandlerEnabled() ||
4591  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4592  DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4593  "debugtrap handler not supported",
4594  MI.getDebugLoc(), DS_Warning);
4595  LLVMContext &Ctx = B.getMF().getFunction().getContext();
4596  Ctx.diagnose(NoTrap);
4597  } else {
4598  // Insert debug-trap instruction
4599  B.buildInstr(AMDGPU::S_TRAP)
4600  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4601  }
4602 
4603  MI.eraseFromParent();
4604  return true;
4605 }
4606 
4607 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4608  MachineIRBuilder &B) const {
4609  MachineRegisterInfo &MRI = *B.getMRI();
4610  const LLT S16 = LLT::scalar(16);
4611  const LLT S32 = LLT::scalar(32);
4612 
4613  Register DstReg = MI.getOperand(0).getReg();
4614  Register NodePtr = MI.getOperand(2).getReg();
4615  Register RayExtent = MI.getOperand(3).getReg();
4616  Register RayOrigin = MI.getOperand(4).getReg();
4617  Register RayDir = MI.getOperand(5).getReg();
4618  Register RayInvDir = MI.getOperand(6).getReg();
4619  Register TDescr = MI.getOperand(7).getReg();
4620 
4621  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4622  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4623  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4624  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4625  : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4626  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4627 
4628  SmallVector<Register, 12> Ops;
4629  if (Is64) {
4630  auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4631  Ops.push_back(Unmerge.getReg(0));
4632  Ops.push_back(Unmerge.getReg(1));
4633  } else {
4634  Ops.push_back(NodePtr);
4635  }
4636  Ops.push_back(RayExtent);
4637 
4638  auto packLanes = [&Ops, &S32, &B] (Register Src) {
4639  auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4640  Ops.push_back(Unmerge.getReg(0));
4641  Ops.push_back(Unmerge.getReg(1));
4642  Ops.push_back(Unmerge.getReg(2));
4643  };
4644 
4645  packLanes(RayOrigin);
4646  if (IsA16) {
4647  auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4648  auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4649  Register R1 = MRI.createGenericVirtualRegister(S32);
4650  Register R2 = MRI.createGenericVirtualRegister(S32);
4651  Register R3 = MRI.createGenericVirtualRegister(S32);
4652  B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4653  B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4654  B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4655  Ops.push_back(R1);
4656  Ops.push_back(R2);
4657  Ops.push_back(R3);
4658  } else {
4659  packLanes(RayDir);
4660  packLanes(RayInvDir);
4661  }
4662 
4663  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4664  .addDef(DstReg)
4665  .addImm(Opcode);
4666 
4667  for (Register R : Ops) {
4668  MIB.addUse(R);
4669  }
4670 
4671  MIB.addUse(TDescr)
4672  .addImm(IsA16 ? 1 : 0)
4673  .cloneMemRefs(MI);
4674 
4675  MI.eraseFromParent();
4676  return true;
4677 }
4678 
4679 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4680  MachineInstr &MI) const {
4681  MachineIRBuilder &B = Helper.MIRBuilder;
4682  MachineRegisterInfo &MRI = *B.getMRI();
4683 
4684  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4685  auto IntrID = MI.getIntrinsicID();
4686  switch (IntrID) {
4687  case Intrinsic::amdgcn_if:
4688  case Intrinsic::amdgcn_else: {
4689  MachineInstr *Br = nullptr;
4690  MachineBasicBlock *UncondBrTarget = nullptr;
4691  bool Negated = false;
4692  if (MachineInstr *BrCond =
4693  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4694  const SIRegisterInfo *TRI
4695  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4696 
4697  Register Def = MI.getOperand(1).getReg();
4698  Register Use = MI.getOperand(3).getReg();
4699 
4700  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4701 
4702  if (Negated)
4703  std::swap(CondBrTarget, UncondBrTarget);
4704 
4705  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4706  if (IntrID == Intrinsic::amdgcn_if) {
4707  B.buildInstr(AMDGPU::SI_IF)
4708  .addDef(Def)
4709  .addUse(Use)
4710  .addMBB(UncondBrTarget);
4711  } else {
4712  B.buildInstr(AMDGPU::SI_ELSE)
4713  .addDef(Def)
4714  .addUse(Use)
4715  .addMBB(UncondBrTarget);
4716  }
4717 
4718  if (Br) {
4719  Br->getOperand(0).setMBB(CondBrTarget);
4720  } else {
4721  // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4722  // since we're swapping branch targets it needs to be reinserted.
4723  // FIXME: IRTranslator should probably not do this
4724  B.buildBr(*CondBrTarget);
4725  }
4726 
4727  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4728  MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4729  MI.eraseFromParent();
4730  BrCond->eraseFromParent();
4731  return true;
4732  }
4733 
4734  return false;
4735  }
4736  case Intrinsic::amdgcn_loop: {
4737  MachineInstr *Br = nullptr;
4738  MachineBasicBlock *UncondBrTarget = nullptr;
4739  bool Negated = false;
4740  if (MachineInstr *BrCond =
4741  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4742  const SIRegisterInfo *TRI
4743  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4744 
4745  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4746  Register Reg = MI.getOperand(2).getReg();
4747 
4748  if (Negated)
4749  std::swap(CondBrTarget, UncondBrTarget);
4750 
4751  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4752  B.buildInstr(AMDGPU::SI_LOOP)
4753  .addUse(Reg)
4754  .addMBB(UncondBrTarget);
4755 
4756  if (Br)
4757  Br->getOperand(0).setMBB(CondBrTarget);
4758  else
4759  B.buildBr(*CondBrTarget);
4760 
4761  MI.eraseFromParent();
4762  BrCond->eraseFromParent();
4763  MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4764  return true;
4765  }
4766 
4767  return false;
4768  }
4769  case Intrinsic::amdgcn_kernarg_segment_ptr:
4770  if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4771  // This only makes sense to call in a kernel, so just lower to null.
4772  B.buildConstant(MI.getOperand(0).getReg(), 0);
4773  MI.eraseFromParent();
4774  return true;
4775  }
4776 
4777  return legalizePreloadedArgIntrin(
4778  MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4779  case Intrinsic::amdgcn_implicitarg_ptr:
4780  return legalizeImplicitArgPtr(MI, MRI, B);
4781  case Intrinsic::amdgcn_workitem_id_x:
4782  return legalizePreloadedArgIntrin(MI, MRI, B,
4783  AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4784  case Intrinsic::amdgcn_workitem_id_y:
4785  return legalizePreloadedArgIntrin(MI, MRI, B,
4786  AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4787  case Intrinsic::amdgcn_workitem_id_z:
4788  return legalizePreloadedArgIntrin(MI, MRI, B,
4789  AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4790  case Intrinsic::amdgcn_workgroup_id_x:
4791  return legalizePreloadedArgIntrin(MI, MRI, B,
4792  AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4793  case Intrinsic::amdgcn_workgroup_id_y:
4794  return legalizePreloadedArgIntrin(MI, MRI, B,
4795  AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4796  case Intrinsic::amdgcn_workgroup_id_z:
4797  return legalizePreloadedArgIntrin(MI, MRI, B,
4798  AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4799  case Intrinsic::amdgcn_dispatch_ptr:
4800  return legalizePreloadedArgIntrin(MI, MRI, B,
4801  AMDGPUFunctionArgInfo::DISPATCH_PTR);
4802  case Intrinsic::amdgcn_queue_ptr:
4803  return legalizePreloadedArgIntrin(MI, MRI, B,
4804  AMDGPUFunctionArgInfo::QUEUE_PTR);
4805  case Intrinsic::amdgcn_implicit_buffer_ptr:
4806  return legalizePreloadedArgIntrin(MI, MRI, B,
4807  AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4808  case Intrinsic::amdgcn_dispatch_id:
4809  return legalizePreloadedArgIntrin(MI, MRI, B,
4810  AMDGPUFunctionArgInfo::DISPATCH_ID);
4811  case Intrinsic::amdgcn_fdiv_fast:
4812  return legalizeFDIVFastIntrin(MI, MRI, B);
4813  case Intrinsic::amdgcn_is_shared:
4814  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4815  case Intrinsic::amdgcn_is_private:
4816  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4817  case Intrinsic::amdgcn_wavefrontsize: {
4818  B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4819  MI.eraseFromParent();
4820  return true;
4821  }
4822  case Intrinsic::amdgcn_s_buffer_load:
4823  return legalizeSBufferLoad(Helper, MI);
4824  case Intrinsic::amdgcn_raw_buffer_store:
4825  case Intrinsic::amdgcn_struct_buffer_store:
4826  return legalizeBufferStore(MI, MRI, B, false, false);
4827  case Intrinsic::amdgcn_raw_buffer_store_format:
4828  case Intrinsic::amdgcn_struct_buffer_store_format:
4829  return legalizeBufferStore(MI, MRI, B, false, true);
4830  case Intrinsic::amdgcn_raw_tbuffer_store:
4831  case Intrinsic::amdgcn_struct_tbuffer_store:
4832  return legalizeBufferStore(MI, MRI, B, true, true);
4833  case Intrinsic::amdgcn_raw_buffer_load:
4834  case Intrinsic::amdgcn_struct_buffer_load:
4835  return legalizeBufferLoad(MI, MRI, B, false, false);
4836  case Intrinsic::amdgcn_raw_buffer_load_format:
4837  case Intrinsic::amdgcn_struct_buffer_load_format:
4838  return legalizeBufferLoad(MI, MRI, B, true, false);
4839  case Intrinsic::amdgcn_raw_tbuffer_load:
4840  case Intrinsic::amdgcn_struct_tbuffer_load:
4841  return legalizeBufferLoad(MI, MRI, B, true, true);
4842  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4843  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4844  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4845  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4846  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4847  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4848  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4849  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4850  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4851  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4852  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4853  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4854  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4855  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4856  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4857  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4858  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4859  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4860  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4861  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4862  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4863  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4864  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4865  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4866  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4867  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4868  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4869  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4870  case Intrinsic::amdgcn_buffer_atomic_fadd:
4871  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4872  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4873  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4874  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4875  return legalizeBufferAtomic(MI, B, IntrID);
4876  case Intrinsic::amdgcn_atomic_inc:
4877  return legalizeAtomicIncDec(MI, B, true);
4878  case Intrinsic::amdgcn_atomic_dec:
4879  return legalizeAtomicIncDec(MI, B, false);
4880  case Intrinsic::trap:
4881  return legalizeTrapIntrinsic(MI, MRI, B);
4882  case Intrinsic::debugtrap:
4883  return legalizeDebugTrapIntrinsic(MI, MRI, B);
4884  case Intrinsic::amdgcn_rsq_clamp:
4885  return legalizeRsqClampIntrinsic(MI, MRI, B);
4886  case Intrinsic::amdgcn_ds_fadd:
4887  case Intrinsic::amdgcn_ds_fmin:
4888  case Intrinsic::amdgcn_ds_fmax:
4889  return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4890  case Intrinsic::amdgcn_image_bvh_intersect_ray:
4891  return legalizeBVHIntrinsic(MI, B);
4892  default: {
4893  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4894  AMDGPU::getImageDimIntrinsicInfo(IntrID))
4895  return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4896  return true;
4897  }
4898  }
4899 
4900  return true;
4901 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
llvm::NextPowerOf2
uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:683
MIPatternMatch.h
llvm::AMDGPUFunctionArgInfo::PreloadedValue
PreloadedValue
Definition: AMDGPUArgumentUsageInfo.h:98
llvm::AMDGPUFunctionArgInfo::QUEUE_PTR
@ QUEUE_PTR
Definition: AMDGPUArgumentUsageInfo.h:102
llvm::LegalizeRuleSet::widenScalarToNextPow2
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
Definition: LegalizerInfo.h:799
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4543
llvm::AMDGPUMachineFunction::allocateLDSGlobal
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
Definition: AMDGPUMachineFunction.cpp:41
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:100
Merge
R600 Clause Merge
Definition: R600ClauseMergePass.cpp:69
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:132
llvm
Definition: AllocatorList.h:23
llvm::LegalityPredicates::scalarOrEltWiderThan
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
Definition: LegalityPredicates.cpp:141
llvm::tgtok::Def
@ Def
Definition: TGLexer.h:50
Reg
unsigned Reg
Definition: MachineSink.cpp:1566
llvm::AMDGPULegalizerInfo::legalizeImplicitArgPtr
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:3494
llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition: InstrTypes.h:743
UseMI
MachineInstrBuilder & UseMI
Definition: AArch64ExpandPseudoInsts.cpp:100
llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:65
llvm::AMDGPU::MIMGBaseOpcodeInfo::Store
bool Store
Definition: AMDGPUBaseInfo.h:282
llvm::LLT::getScalarSizeInBits
unsigned getScalarSizeInBits() const
Definition: LowLevelTypeImpl.h:163
llvm::MachineRegisterInfo::createVirtualRegister
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Definition: MachineRegisterInfo.cpp:158
llvm::GlobalValue::hasExternalLinkage
bool hasExternalLinkage() const
Definition: GlobalValue.h:431
llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition: DiagnosticInfo.h:993
llvm::MachineMemOperand::getAlign
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
Definition: MachineOperand.cpp:1069
vectorWiderThan
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
Definition: AMDGPULegalizerInfo.cpp:167
llvm::APFloatBase::IEEEsingle
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:163
SIMachineFunctionInfo.h
llvm::AMDGPU::Hwreg::ID_SHIFT_
@ ID_SHIFT_
Definition: SIDefines.h:375
llvm::LegalizerInfo::getActionDefinitionsBuilder
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
Definition: LegalizerInfo.cpp:435
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:52
llvm::ArgDescriptor
Definition: AMDGPUArgumentUsageInfo.h:23
getPow2VectorType
static LLT getPow2VectorType(LLT Ty)
Definition: AMDGPULegalizerInfo.cpp:49
widenToNextPowerOf2
static LLT widenToNextPowerOf2(LLT Ty)
Definition: AMDGPULegalizerInfo.cpp:2384
llvm::Function
Definition: Function.h:61
convertImageAddrToPacked
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
Definition: AMDGPULegalizerInfo.cpp:4057
llvm::getOpcodeDef
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition: Utils.cpp:410
llvm::AMDGPU::SIModeRegisterDefaults::IEEE
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
Definition: AMDGPUBaseInfo.h:890
C1
instcombine should handle this C2 when C1
Definition: README.txt:263
llvm::SIInstrInfo::MO_GOTPCREL32
@ MO_GOTPCREL32
Definition: SIInstrInfo.h:152
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::MachineFunction::end
iterator end()
Definition: MachineFunction.h:739
llvm::LegalizeRuleSet::customFor
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
Definition: LegalizerInfo.h:773
llvm::AMDGPULegalizerInfo::legalizeIsAddrSpace
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
Definition: AMDGPULegalizerInfo.cpp:3511
llvm::AMDGPU::MIMGBaseOpcodeInfo::Gather4
bool Gather4
Definition: AMDGPUBaseInfo.h:286
llvm::MachineFunction::getMachineMemOperand
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
Definition: MachineFunction.cpp:430
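A hedged example of the overload shown above; the size, alignment, and address-space choice are made up for illustration:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Describe a plain 4-byte, 4-aligned load from an unknown location in
// address space 0.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF) {
  return MF.getMachineMemOperand(MachinePointerInfo(),
                                 MachineMemOperand::MOLoad,
                                 /*Size=*/4, Align(4));
}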
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1643
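For example (a self-contained illustration, not from this file):

#include "llvm/ADT/APInt.h"
#include <cassert>

// An 8-bit APInt holding 0xFF reads back as -1 when sign extended.
static void signExtendExample() {
  llvm::APInt V(/*numBits=*/8, /*val=*/0xFF);
  assert(V.getSExtValue() == -1);
  (void)V;
}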
llvm::CmpInst::FCMP_ONE
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:730
llvm::MipsISD::Lo
@ Lo
Definition: MipsISelLowering.h:79
llvm::AMDGPU::ImageDimIntrinsicInfo
Definition: AMDGPUInstrInfo.h:50
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::MachineRegisterInfo::getTargetRegisterInfo
const TargetRegisterInfo * getTargetRegisterInfo() const
Definition: MachineRegisterInfo.h:153
llvm::MachineMemOperand::MOInvariant
@ MOInvariant
The memory access always returns the same value (or traps).
Definition: MachineMemOperand.h:144
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:744
llvm::AMDGPU::getHsaAbiVersion
Optional< uint8_t > getHsaAbiVersion(const MCSubtargetInfo *STI)
Definition: AMDGPUBaseInfo.cpp:91
llvm::LegalityPredicates::typeIsNot
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Definition: LegalizerInfo.h:227
llvm::AMDGPUSubtarget::SEA_ISLANDS
@ SEA_ISLANDS
Definition: AMDGPUSubtarget.h:38
llvm::AMDGPULegalizerInfo::loadInputValue
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
Definition: AMDGPULegalizerInfo.cpp:2719
llvm::MachineRegisterInfo::use_instr_nodbg_begin
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
Definition: MachineRegisterInfo.h:535
loadStoreBitcastWorkaround
static bool loadStoreBitcastWorkaround(const LLT Ty)
Definition: AMDGPULegalizerInfo.cpp:328
llvm::AMDGPULegalizerInfo::legalizeShuffleVector
bool legalizeShuffleVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:2169
llvm::AMDGPULegalizerInfo::legalizeSDIV_SREM
bool legalizeSDIV_SREM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:3014
shouldWidenLoad
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, unsigned AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
Definition: AMDGPULegalizerInfo.cpp:370
sizeIsMultipleOf32
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
Definition: AMDGPULegalizerInfo.cpp:79
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:749
llvm::LLT::getScalarType
LLT getScalarType() const
Definition: LowLevelTypeImpl.h:121
llvm::Function::getContext
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:316
llvm::AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE
@ WIDTH_M1_SRC_PRIVATE_BASE
Definition: SIDefines.h:399
llvm::AMDGPULegalizerInfo::legalizeFrem
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:1940
llvm::ConstantFP::isZero
bool isZero() const
Return true if the value is positive or negative zero.
Definition: Constants.h:299
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::LegalizerHelper::lowerFMad
LegalizeResult lowerFMad(MachineInstr &MI)
Definition: LegalizerHelper.cpp:6059
llvm::LegalityPredicates::largerThan
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
Definition: LegalityPredicates.cpp:125
llvm::getFunctionLiveInPhysReg
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:617
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::FloatToBits
uint32_t FloatToBits(float Float)
This function takes a float and returns the bit equivalent 32-bit integer.
Definition: MathExtras.h:663
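For instance (illustrative check):

#include "llvm/Support/MathExtras.h"
#include <cassert>

// 1.0f is encoded as 0x3f800000 in IEEE-754 single precision.
static void floatToBitsExample() {
  assert(llvm::FloatToBits(1.0f) == 0x3f800000u);
}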
llvm::LegalityPredicates::scalarNarrowerThan
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
Definition: LegalityPredicates.cpp:101
llvm::GCNSubtarget::isTrapHandlerEnabled
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:524
llvm::MachineInstr::FmAfn
@ FmAfn
Definition: MachineInstr.h:98
llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition: MachineMemOperand.h:127
llvm::MachineMemOperand::MODereferenceable
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
Definition: MachineMemOperand.h:142
llvm::LegalityPredicates::elementTypeIs
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
Definition: LegalityPredicates.cpp:93
llvm::SITargetLowering::shouldEmitFixup
bool shouldEmitFixup(const GlobalValue *GV) const
Definition: SIISelLowering.cpp:4932
llvm::GCNSubtarget::hasFractBug
bool hasFractBug() const
Definition: GCNSubtarget.h:326
llvm::GCNSubtarget::hasNSAEncoding
bool hasNSAEncoding() const
Definition: GCNSubtarget.h:869
llvm::GCNSubtarget::TrapID::LLVMAMDHSADebugTrap
@ LLVMAMDHSADebugTrap
llvm::Optional
Definition: APInt.h:33
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:885
llvm::AMDGPULegalizerInfo::legalizeInsertVectorElt
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:2136
llvm::LLT::scalarOrVector
static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy)
Definition: LowLevelTypeImpl.h:74
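A short illustration of the collapsing behavior (names are illustrative):

#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

// One element collapses to the scalar type; more elements form a vector.
static void scalarOrVectorExample() {
  const LLT S32 = LLT::scalar(32);
  LLT A = LLT::scalarOrVector(1, S32); // s32
  LLT B = LLT::scalarOrVector(4, S32); // <4 x s32>
  (void)A;
  (void)B;
}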
Offset
uint64_t Offset
Definition: ELFObjHandler.cpp:81
getDSFPAtomicOpcode
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID)
Definition: AMDGPULegalizerInfo.cpp:3444
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl
void legalizeUDIV_UREM32Impl(MachineIRBuilder &B, Register DstReg, Register Num, Register Den, bool IsRem) const
Definition: AMDGPULegalizerInfo.cpp:2793
llvm::AMDGPULegalizerInfo::legalizeSBufferLoad
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Definition: AMDGPULegalizerInfo.cpp:4479
numElementsNotEven
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
Definition: AMDGPULegalizerInfo.cpp:174
llvm::AMDGPULegalizerInfo::legalizeLoad
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Definition: AMDGPULegalizerInfo.cpp:2390
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
MachineIRBuilder.h
llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:702
llvm::AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic
bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:4586
llvm::LegalizeRuleSet::scalarize
LegalizeRuleSet & scalarize(unsigned TypeIdx)
Definition: LegalizerInfo.h:823
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
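A quick self-contained check of this helper together with PowerOf2Ceil above (illustrative only):

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void powerOfTwoExamples() {
  static_assert(llvm::isPowerOf2_32(64), "64 is a power of two");
  static_assert(!llvm::isPowerOf2_32(48), "48 is not a power of two");
  assert(llvm::PowerOf2Ceil(48) == 64); // rounds up to the next power of two
}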
llvm::AMDGPULegalizerInfo::legalizeAtomicCmpXChg
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:2492
llvm::LegalizeRuleSet::lower
LegalizeRuleSet & lower()
The instruction is lowered.
Definition: LegalizerInfo.h:604
llvm::LegalizerHelper
Definition: LegalizerHelper.h:37
llvm::LegalizeMutations::changeTo
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Definition: LegalizeMutations.cpp:17
llvm::AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE
@ OFFSET_SRC_PRIVATE_BASE
Definition: SIDefines.h:389
llvm::CmpInst::FCMP_OGT
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:726
llvm::LegalityPredicates::isPointer
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
Definition: LegalityPredicates.cpp:79
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
TRI
const TargetRegisterInfo *TRI
Definition: MachineSink.cpp:1567
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:141
llvm::ValueAndVReg::Value
APInt Value
Definition: Utils.h:175
MaxRegisterSize
static constexpr unsigned MaxRegisterSize
Definition: AMDGPULegalizerInfo.cpp:46
stripAnySourceMods
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
Definition: AMDGPULegalizerInfo.cpp:2582
llvm::AMDGPUFunctionArgInfo::WORKGROUP_ID_X
@ WORKGROUP_ID_X
Definition: AMDGPUArgumentUsageInfo.h:106
llvm::LegalizeRuleSet::legalIf
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
Definition: LegalizerInfo.h:531
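A sketch of pairing this with a custom predicate; the opcode and rule are illustrative, not this file's actual rules:

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

// G_BITCAST is legal only when source and destination have the same size.
static void buildBitcastRule(LegalizerInfo &LI) {
  LI.getActionDefinitionsBuilder(TargetOpcode::G_BITCAST)
      .legalIf([](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() ==
               Query.Types[1].getSizeInBits();
      });
}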
llvm::AMDGPULegalizerInfo::legalizeFlog
bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const
Definition: AMDGPULegalizerInfo.cpp:2517
llvm::GCNSubtarget::getInstrInfo
const SIInst