1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUMachineFunction.h"
19 #include "GCNSubtarget.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/Analysis.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
26 #include "llvm/Support/KnownBits.h"
28 
29 using namespace llvm;
30 
31 #include "AMDGPUGenCallingConv.inc"
32 
33 static cl::opt<bool> AMDGPUBypassSlowDiv(
34  "amdgpu-bypass-slow-div",
35  cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
36  cl::init(true));
37 
38 // Find a larger type to do a load / store of a vector with.
39 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
40  unsigned StoreSize = VT.getStoreSizeInBits();
41  if (StoreSize <= 32)
42  return EVT::getIntegerVT(Ctx, StoreSize);
43 
44  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
45  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
46 }
47 
50 }
51 
52 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
53  // In order for this to be a signed 24-bit value, bit 23 must
54  // be a sign bit.
55  return DAG.ComputeMaxSignificantBits(Op);
56 }
57 
59  const AMDGPUSubtarget &STI)
60  : TargetLowering(TM), Subtarget(&STI) {
61  // Lower floating point store/load to integer store/load to reduce the number
62  // of patterns in tablegen.
129  // There are no 64-bit extloads. These should be done as a 32-bit extload and
130  // an extension to 64-bit.
131  for (MVT VT : MVT::integer_valuetypes())
133  Expand);
134 
135  for (MVT VT : MVT::integer_valuetypes()) {
136  if (VT == MVT::i64)
137  continue;
138 
139  for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
144  }
145  }
146 
148  for (auto MemVT :
151  Expand);
152 
291  // This is totally unsupported, just custom lower to produce an error.
293 
294  // Library functions. These default to Expand, but we have instructions
295  // for them.
298  ISD::FMAXNUM},
299  MVT::f32, Legal);
300 
302 
304 
306 
308 
309  // Expand to fneg + fadd.
311 
316  Custom);
326  Custom);
327 
330 
331  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
332  for (MVT VT : ScalarIntVTs) {
333  // These should use [SU]DIVREM, so set them to expand
335  Expand);
336 
337  // GPU does not have a divrem function for signed or unsigned division.
339 
340  // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
342 
344 
345  // AMDGPU uses ADDC/SUBC/ADDE/SUBE
347  }
348 
349  // The hardware supports 32-bit FSHR, but not FSHL.
351 
352  // The hardware supports 32-bit ROTR, but not ROTL.
355 
357 
361  MVT::i64, Custom);
363 
365  Legal);
366 
369  MVT::i64, Custom);
370 
371  static const MVT::SimpleValueType VectorIntTypes[] = {
373 
374  for (MVT VT : VectorIntTypes) {
375  // Expand the following operations for the current type by default.
387  ISD::SETCC},
388  VT, Expand);
389  }
390 
391  static const MVT::SimpleValueType FloatVectorTypes[] = {
393 
394  for (MVT VT : FloatVectorTypes) {
404  VT, Expand);
405  }
406 
407  // This causes an unrolled select operation to be used rather than expansion
408  // with bit operations. This is in general better, but the alternative using
409  // BFI instructions may be better if the select sources are SGPRs.
412 
415 
418 
421 
424 
427 
428  // There are no libcalls of any kind.
429  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
430  setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
431 
433  setJumpIsExpensive(true);
434 
435  // FIXME: This is only partially true. If we have to do vector compares, any
436  // SGPR pair can be a condition register. If we have a uniform condition, we
437  // are better off doing SALU operations, where there is only one SCC. For now,
438  // we don't have a way of knowing during instruction selection if a condition
439  // will be uniform and we always use vector compares. Assume we are using
440  // vector compares until that is fixed.
442 
445 
447 
448  // We want to find all load dependencies for long chains of stores to enable
449  // merging into very wide vectors. The problem is with vectors with > 4
450  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
451  // vectors are a legal type, even though we have to split the loads
452  // usually. When we can more precisely specify load legality per address
453  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
454  // smarter so that they can figure out what to do in 2 iterations without all
455  // N > 4 stores on the same chain.
457 
458  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
459  // about these during lowering.
460  MaxStoresPerMemcpy = 0xffffffff;
461  MaxStoresPerMemmove = 0xffffffff;
462  MaxStoresPerMemset = 0xffffffff;
463 
464  // The expansion for 64-bit division is enormous.
465  if (AMDGPUBypassSlowDiv)
466  addBypassSlowDiv(64, 32);
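  // Note: addBypassSlowDiv requests that CodeGenPrepare emit a runtime check
  // which falls back to a 32-bit divide when both 64-bit operands happen to
  // fit in 32 bits, avoiding the enormous 64-bit expansion in that case.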
467 
478 }
479 
480 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
481  if (getTargetMachine().Options.NoSignedZerosFPMath)
482  return true;
483 
484  const auto Flags = Op.getNode()->getFlags();
485  if (Flags.hasNoSignedZeros())
486  return true;
487 
488  return false;
489 }
490 
491 //===----------------------------------------------------------------------===//
492 // Target Information
493 //===----------------------------------------------------------------------===//
494 
496 static bool fnegFoldsIntoOp(unsigned Opc) {
497  switch (Opc) {
498  case ISD::FADD:
499  case ISD::FSUB:
500  case ISD::FMUL:
501  case ISD::FMA:
502  case ISD::FMAD:
503  case ISD::FMINNUM:
504  case ISD::FMAXNUM:
505  case ISD::FMINNUM_IEEE:
506  case ISD::FMAXNUM_IEEE:
507  case ISD::FSIN:
508  case ISD::FTRUNC:
509  case ISD::FRINT:
510  case ISD::FNEARBYINT:
511  case ISD::FCANONICALIZE:
512  case AMDGPUISD::RCP:
515  case AMDGPUISD::SIN_HW:
519  case AMDGPUISD::FMED3:
520  // TODO: handle llvm.amdgcn.fma.legacy
521  return true;
522  default:
523  return false;
524  }
525 }
526 
527 /// \returns true if the operation will definitely need to use a 64-bit
528 /// encoding, and thus will use a VOP3 encoding regardless of the source
529 /// modifiers.
531 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
532  return N->getNumOperands() > 2 || VT == MVT::f64;
533 }
534 
535 // Most FP instructions support source modifiers, but this could be refined
536 // slightly.
538 static bool hasSourceMods(const SDNode *N) {
539  if (isa<MemSDNode>(N))
540  return false;
541 
542  switch (N->getOpcode()) {
543  case ISD::CopyToReg:
544  case ISD::SELECT:
545  case ISD::FDIV:
546  case ISD::FREM:
547  case ISD::INLINEASM:
548  case ISD::INLINEASM_BR:
551 
552  // TODO: Should really be looking at the users of the bitcast. These are
553  // problematic because bitcasts are used to legalize all stores to integer
554  // types.
555  case ISD::BITCAST:
556  return false;
558  switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
559  case Intrinsic::amdgcn_interp_p1:
560  case Intrinsic::amdgcn_interp_p2:
561  case Intrinsic::amdgcn_interp_mov:
562  case Intrinsic::amdgcn_interp_p1_f16:
563  case Intrinsic::amdgcn_interp_p2_f16:
564  return false;
565  default:
566  return true;
567  }
568  }
569  default:
570  return true;
571  }
572 }
573 
575  unsigned CostThreshold) {
576  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and for
577  // them using a source modifier is truly free in all cases. If there are
578  // multiple users and each one would be forced into a VOP3 encoding, there
579  // will be a code size increase. Try to avoid increasing code size unless we
580  // know it will save on the instruction count.
581  unsigned NumMayIncreaseSize = 0;
582  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
583 
584  // XXX - Should this limit number of uses to check?
585  for (const SDNode *U : N->uses()) {
586  if (!hasSourceMods(U))
587  return false;
588 
589  if (!opMustUseVOP3Encoding(U, VT)) {
590  if (++NumMayIncreaseSize > CostThreshold)
591  return false;
592  }
593  }
594 
595  return true;
596 }
597 
599  ISD::NodeType ExtendKind) const {
600  assert(!VT.isVector() && "only scalar expected");
601 
602  // Round to the next multiple of 32-bits.
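  // (For example, an illegal i40 or i48 value is extended to i64; anything
  //  of 32 bits or fewer becomes i32.)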
603  unsigned Size = VT.getSizeInBits();
604  if (Size <= 32)
605  return MVT::i32;
606  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
607 }
608 
610  return MVT::i32;
611 }
612 
614  return true;
615 }
616 
617 // The backend supports 32 and 64 bit floating point immediates.
618 // FIXME: Why are we reporting vectors of FP immediates as legal?
620  bool ForCodeSize) const {
621  EVT ScalarVT = VT.getScalarType();
622  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
623  (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
624 }
625 
626 // We don't want to shrink f64 / f32 constants.
628  EVT ScalarVT = VT.getScalarType();
629  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
630 }
631 
633  ISD::LoadExtType ExtTy,
634  EVT NewVT) const {
635  // TODO: This may be worth removing. Check regression tests for diffs.
636  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
637  return false;
638 
639  unsigned NewSize = NewVT.getStoreSizeInBits();
640 
641  // If we are reducing to a 32-bit load or a smaller multi-dword load,
642  // this is always better.
643  if (NewSize >= 32)
644  return true;
645 
646  EVT OldVT = N->getValueType(0);
647  unsigned OldSize = OldVT.getStoreSizeInBits();
648 
649  MemSDNode *MN = cast<MemSDNode>(N);
650  unsigned AS = MN->getAddressSpace();
651  // Do not shrink an aligned scalar load to sub-dword.
652  // Scalar engine cannot do sub-dword loads.
653  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
656  (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
657  MN->isInvariant())) &&
659  return false;
660 
661  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
662  // extloads, so doing one requires using a buffer_load. In cases where we
663  // still couldn't use a scalar load, using the wider load shouldn't really
664  // hurt anything.
665 
666  // If the old size already had to be an extload, there's no harm in continuing
667  // to reduce the width.
668  return (OldSize < 32);
669 }
670 
672  const SelectionDAG &DAG,
673  const MachineMemOperand &MMO) const {
674 
675  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
676 
677  if (LoadTy.getScalarType() == MVT::i32)
678  return false;
679 
680  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
681  unsigned CastScalarSize = CastTy.getScalarSizeInBits();
682 
683  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
684  return false;
685 
686  bool Fast = false;
687  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
688  CastTy, MMO, &Fast) &&
689  Fast;
690 }
691 
692 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
693 // profitable with the expansion for 64-bit since it's generally good to
694 // speculate things.
695 // FIXME: These should really have the size as a parameter.
697  return true;
698 }
699 
701  return true;
702 }
703 
705  switch (N->getOpcode()) {
706  case ISD::EntryToken:
707  case ISD::TokenFactor:
708  return true;
710  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
711  switch (IntrID) {
712  case Intrinsic::amdgcn_readfirstlane:
713  case Intrinsic::amdgcn_readlane:
714  return true;
715  }
716  return false;
717  }
718  case ISD::LOAD:
719  if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
721  return true;
722  return false;
723  case AMDGPUISD::SETCC: // ballot-style instruction
724  return true;
725  }
726  return false;
727 }
728 
730  SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
731  NegatibleCost &Cost, unsigned Depth) const {
732 
733  switch (Op.getOpcode()) {
734  case ISD::FMA:
735  case ISD::FMAD: {
736  // Negating a fma is not free if it has users without source mods.
737  if (!allUsesHaveSourceMods(Op.getNode()))
738  return SDValue();
739  break;
740  }
741  default:
742  break;
743  }
744 
745  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
746  ForCodeSize, Cost, Depth);
747 }
748 
749 //===---------------------------------------------------------------------===//
750 // Target Properties
751 //===---------------------------------------------------------------------===//
752 
754  assert(VT.isFloatingPoint());
755 
756  // Packed operations do not have a fabs modifier.
757  return VT == MVT::f32 || VT == MVT::f64 ||
758  (Subtarget->has16BitInsts() && VT == MVT::f16);
759 }
760 
762  assert(VT.isFloatingPoint());
763  // Report this based on the end legalized type.
764  VT = VT.getScalarType();
765  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
766 }
767 
769  unsigned NumElem,
770  unsigned AS) const {
771  return true;
772 }
773 
775  // There are few operations which truly have vector input operands. Any vector
776  // operation is going to involve operations on each component, and a
777  // build_vector will be a copy per element, so it always makes sense to use a
778  // build_vector input in place of the extracted element to avoid a copy into a
779  // super register.
780  //
781  // We should probably only do this if all users are extracts only, but this
782  // should be the common case.
783  return true;
784 }
785 
787  // Truncate is just accessing a subregister.
788 
789  unsigned SrcSize = Source.getSizeInBits();
790  unsigned DestSize = Dest.getSizeInBits();
791 
792  return DestSize < SrcSize && DestSize % 32 == 0;
793 }
794 
796  // Truncate is just accessing a subregister.
797 
798  unsigned SrcSize = Source->getScalarSizeInBits();
799  unsigned DestSize = Dest->getScalarSizeInBits();
800 
801  if (DestSize == 16 && Subtarget->has16BitInsts())
802  return SrcSize >= 32;
803 
804  return DestSize < SrcSize && DestSize % 32 == 0;
805 }
806 
807 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
808  unsigned SrcSize = Src->getScalarSizeInBits();
809  unsigned DestSize = Dest->getScalarSizeInBits();
810 
811  if (SrcSize == 16 && Subtarget->has16BitInsts())
812  return DestSize >= 32;
813 
814  return SrcSize == 32 && DestSize == 64;
815 }
816 
817 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
818  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
819  // practical purposes, the extra mov 0 to load a 64-bit value is free. As used,
820  // this will enable reducing 64-bit operations to 32-bit, which is always
821  // good.
822 
823  if (Src == MVT::i16)
824  return Dest == MVT::i32 || Dest == MVT::i64;
825 
826  return Src == MVT::i32 && Dest == MVT::i64;
827 }
828 
830  return isZExtFree(Val.getValueType(), VT2);
831 }
832 
834  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
835  // limited number of native 64-bit operations. Shrinking an operation to fit
836  // in a single 32-bit register should always be helpful. As currently used,
837  // this is much less general than the name suggests, and is only used in
838  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
839  // not profitable, and may actually be harmful.
840  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
841 }
842 
843 //===---------------------------------------------------------------------===//
844 // TargetLowering Callbacks
845 //===---------------------------------------------------------------------===//
846 
848  bool IsVarArg) {
849  switch (CC) {
857  return CC_AMDGPU;
858  case CallingConv::C:
859  case CallingConv::Fast:
860  case CallingConv::Cold:
861  return CC_AMDGPU_Func;
863  return CC_SI_Gfx;
866  default:
867  report_fatal_error("Unsupported calling convention for call");
868  }
869 }
870 
872  bool IsVarArg) {
873  switch (CC) {
876  llvm_unreachable("kernels should not be handled here");
884  return RetCC_SI_Shader;
886  return RetCC_SI_Gfx;
887  case CallingConv::C:
888  case CallingConv::Fast:
889  case CallingConv::Cold:
890  return RetCC_AMDGPU_Func;
891  default:
892  report_fatal_error("Unsupported calling convention.");
893  }
894 }
895 
896 /// The SelectionDAGBuilder will automatically promote function arguments
897 /// with illegal types. However, this does not work for the AMDGPU targets
898 /// since the function arguments are stored in memory as these illegal types.
899 /// In order to handle this properly we need to get the original type sizes
900 /// from the LLVM IR Function and fix up the ISD::InputArg values before
901 /// passing them to AnalyzeFormalArguments().
902 
903 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
904 /// input values across multiple registers. Each item in the Ins array
905 /// represents a single value that will be stored in registers. Ins[x].VT is
906 /// the value type of the value that will be stored in the register, so
907 /// whatever SDNode we lower the argument to needs to be this type.
908 ///
909 /// In order to correctly lower the arguments we need to know the size of each
910 /// argument. Since Ins[x].VT gives us the size of the register that will
911 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
912 /// for the original function argument so that we can deduce the correct memory
913 /// type to use for Ins[x]. In most cases the correct memory type will be
914 /// Ins[x].ArgVT. However, this will not always be the case. If, for example,
915 /// we have a kernel argument of type v8i8, this argument will be split into
916 /// 8 parts and each part will be represented by its own item in the Ins array.
917 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
918 /// the argument before it was split. From this, we deduce that the memory type
919 /// for each individual part is i8. We pass the memory type as LocVT to the
920 /// calling convention analysis function and the register type (Ins[x].VT) as
921 /// the ValVT.
923  CCState &State,
924  const SmallVectorImpl<ISD::InputArg> &Ins) const {
925  const MachineFunction &MF = State.getMachineFunction();
926  const Function &Fn = MF.getFunction();
927  LLVMContext &Ctx = Fn.getParent()->getContext();
928  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(getTargetMachine(), Fn);
929  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
930  CallingConv::ID CC = Fn.getCallingConv();
931 
932  Align MaxAlign = Align(1);
933  uint64_t ExplicitArgOffset = 0;
934  const DataLayout &DL = Fn.getParent()->getDataLayout();
935 
936  unsigned InIndex = 0;
937 
938  for (const Argument &Arg : Fn.args()) {
939  const bool IsByRef = Arg.hasByRefAttr();
940  Type *BaseArgTy = Arg.getType();
941  Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
942  Align Alignment = DL.getValueOrABITypeAlignment(
943  IsByRef ? Arg.getParamAlign() : None, MemArgTy);
944  MaxAlign = std::max(Alignment, MaxAlign);
945  uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
946 
947  uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
948  ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
949 
950  // We're basically throwing away everything passed into us and starting over
951  // to get accurate in-memory offsets. The "PartOffset" is completely useless
952  // to us as computed in Ins.
953  //
954  // We also need to figure out what type legalization is trying to do to get
955  // the correct memory offsets.
956 
957  SmallVector<EVT, 16> ValueVTs;
958  SmallVector<uint64_t, 16> Offsets;
959  ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
960 
961  for (unsigned Value = 0, NumValues = ValueVTs.size();
962  Value != NumValues; ++Value) {
963  uint64_t BasePartOffset = Offsets[Value];
964 
965  EVT ArgVT = ValueVTs[Value];
966  EVT MemVT = ArgVT;
967  MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
968  unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
969 
970  if (NumRegs == 1) {
971  // This argument is not split, so the IR type is the memory type.
972  if (ArgVT.isExtended()) {
973  // We have an extended type, like i24, so we should just use the
974  // register type.
975  MemVT = RegisterVT;
976  } else {
977  MemVT = ArgVT;
978  }
979  } else if (ArgVT.isVector() && RegisterVT.isVector() &&
980  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
981  assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
982  // We have a vector value which has been split into a vector with
983  // the same scalar type, but fewer elements. This should handle
984  // all the floating-point vector types.
985  MemVT = RegisterVT;
986  } else if (ArgVT.isVector() &&
987  ArgVT.getVectorNumElements() == NumRegs) {
988  // This arg has been split so that each element is stored in a separate
989  // register.
990  MemVT = ArgVT.getScalarType();
991  } else if (ArgVT.isExtended()) {
992  // We have an extended type, like i65.
993  MemVT = RegisterVT;
994  } else {
995  unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
996  assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
997  if (RegisterVT.isInteger()) {
998  MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
999  } else if (RegisterVT.isVector()) {
1000  assert(!RegisterVT.getScalarType().isFloatingPoint());
1001  unsigned NumElements = RegisterVT.getVectorNumElements();
1002  assert(MemoryBits % NumElements == 0);
1003  // This vector type has been split into another vector type with
1004  // a different elements size.
1005  EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1006  MemoryBits / NumElements);
1007  MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1008  } else {
1009  llvm_unreachable("cannot deduce memory type.");
1010  }
1011  }
1012 
1013  // Convert one element vectors to scalar.
1014  if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1015  MemVT = MemVT.getScalarType();
1016 
1017  // Round up vec3/vec5 argument.
1018  if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1019  assert(MemVT.getVectorNumElements() == 3 ||
1020  MemVT.getVectorNumElements() == 5);
1021  MemVT = MemVT.getPow2VectorType(State.getContext());
1022  } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1023  MemVT = MemVT.getRoundIntegerType(State.getContext());
1024  }
1025 
1026  unsigned PartOffset = 0;
1027  for (unsigned i = 0; i != NumRegs; ++i) {
1028  State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1029  BasePartOffset + PartOffset,
1030  MemVT.getSimpleVT(),
1032  PartOffset += MemVT.getStoreSize();
1033  }
1034  }
1035  }
1036 }
1037 
1039  SDValue Chain, CallingConv::ID CallConv,
1040  bool isVarArg,
1041  const SmallVectorImpl<ISD::OutputArg> &Outs,
1042  const SmallVectorImpl<SDValue> &OutVals,
1043  const SDLoc &DL, SelectionDAG &DAG) const {
1044  // FIXME: Fails for r600 tests
1045  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1046  // "wave terminate should not have return values");
1047  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1048 }
1049 
1050 //===---------------------------------------------------------------------===//
1051 // Target specific lowering
1052 //===---------------------------------------------------------------------===//
1053 
1054 /// Selects the correct CCAssignFn for a given CallingConvention value.
1056  bool IsVarArg) {
1057  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1058 }
1059 
1061  bool IsVarArg) {
1062  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1063 }
1064 
1066  SelectionDAG &DAG,
1067  MachineFrameInfo &MFI,
1068  int ClobberedFI) const {
1069  SmallVector<SDValue, 8> ArgChains;
1070  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1071  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1072 
1073  // Include the original chain at the beginning of the list. When this is
1074  // used by target LowerCall hooks, this helps legalize find the
1075  // CALLSEQ_BEGIN node.
1076  ArgChains.push_back(Chain);
1077 
1078  // Add a chain value for each stack argument load that overlaps the clobbered object.
1079  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1080  if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1081  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1082  if (FI->getIndex() < 0) {
1083  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1084  int64_t InLastByte = InFirstByte;
1085  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1086 
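  // Two byte ranges [FirstByte, LastByte] and [InFirstByte, InLastByte]
  // overlap iff the start of either range lies within the other, which is
  // what the condition below checks.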
1087  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1088  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1089  ArgChains.push_back(SDValue(L, 1));
1090  }
1091  }
1092  }
1093  }
1094 
1095  // Build a tokenfactor for all the chains.
1096  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1097 }
1098 
1100  SmallVectorImpl<SDValue> &InVals,
1101  StringRef Reason) const {
1102  SDValue Callee = CLI.Callee;
1103  SelectionDAG &DAG = CLI.DAG;
1104 
1105  const Function &Fn = DAG.getMachineFunction().getFunction();
1106 
1107  StringRef FuncName("<unknown>");
1108 
1109  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1110  FuncName = G->getSymbol();
1111  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1112  FuncName = G->getGlobal()->getName();
1113 
1114  DiagnosticInfoUnsupported NoCalls(
1115  Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1116  DAG.getContext()->diagnose(NoCalls);
1117 
1118  if (!CLI.IsTailCall) {
1119  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1120  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1121  }
1122 
1123  return DAG.getEntryNode();
1124 }
1125 
1127  SmallVectorImpl<SDValue> &InVals) const {
1128  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1129 }
1130 
1132  SelectionDAG &DAG) const {
1133  const Function &Fn = DAG.getMachineFunction().getFunction();
1134 
1135  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1136  SDLoc(Op).getDebugLoc());
1137  DAG.getContext()->diagnose(NoDynamicAlloca);
1138  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1139  return DAG.getMergeValues(Ops, SDLoc());
1140 }
1141 
1143  SelectionDAG &DAG) const {
1144  switch (Op.getOpcode()) {
1145  default:
1146  Op->print(errs(), &DAG);
1147  llvm_unreachable("Custom lowering code for this "
1148  "instruction is not implemented yet!");
1149  break;
1151  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1153  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1154  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1155  case ISD::FREM: return LowerFREM(Op, DAG);
1156  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1157  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1158  case ISD::FRINT: return LowerFRINT(Op, DAG);
1159  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1160  case ISD::FROUND: return LowerFROUND(Op, DAG);
1161  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1162  case ISD::FLOG:
1163  return LowerFLOG(Op, DAG, numbers::ln2f);
1164  case ISD::FLOG10:
1165  return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1166  case ISD::FEXP:
1167  return lowerFEXP(Op, DAG);
1168  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1169  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1170  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1171  case ISD::FP_TO_SINT:
1172  case ISD::FP_TO_UINT:
1173  return LowerFP_TO_INT(Op, DAG);
1174  case ISD::CTTZ:
1175  case ISD::CTTZ_ZERO_UNDEF:
1176  case ISD::CTLZ:
1177  case ISD::CTLZ_ZERO_UNDEF:
1178  return LowerCTLZ_CTTZ(Op, DAG);
1180  }
1181  return Op;
1182 }
1183 
1186  SelectionDAG &DAG) const {
1187  switch (N->getOpcode()) {
1189  // Different parts of legalization seem to interpret which type of
1190  // sign_extend_inreg is the one to check for custom lowering. The extended
1191  // from type is what really matters, but some places check for custom
1192  // lowering of the result type. This results in trying to use
1193  // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1194  // nothing here and let the illegal result integer be handled normally.
1195  return;
1196  default:
1197  return;
1198  }
1199 }
1200 
1202  SDValue Op,
1203  SelectionDAG &DAG) const {
1204 
1205  const DataLayout &DL = DAG.getDataLayout();
1206  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1207  const GlobalValue *GV = G->getGlobal();
1208 
1209  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1210  G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1211  if (!MFI->isModuleEntryFunction() &&
1212  !GV->getName().equals("llvm.amdgcn.module.lds")) {
1213  SDLoc DL(Op);
1214  const Function &Fn = DAG.getMachineFunction().getFunction();
1215  DiagnosticInfoUnsupported BadLDSDecl(
1216  Fn, "local memory global used by non-kernel function",
1217  DL.getDebugLoc(), DS_Warning);
1218  DAG.getContext()->diagnose(BadLDSDecl);
1219 
1220  // We currently don't have a way to correctly allocate LDS objects that
1221  // aren't directly associated with a kernel. We do force inlining of
1222  // functions that use local objects. However, if these dead functions are
1223  // not eliminated, we don't want a compile time error. Just emit a warning
1224  // and a trap, since there should be no callable path here.
1225  SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1226  SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1227  Trap, DAG.getRoot());
1228  DAG.setRoot(OutputChain);
1229  return DAG.getUNDEF(Op.getValueType());
1230  }
1231 
1232  // XXX: What does the value of G->getOffset() mean?
1233  assert(G->getOffset() == 0 &&
1234  "Do not know what to do with a non-zero offset");
1235 
1236  // TODO: We could emit code to handle the initialization somewhere.
1237  // We ignore the initializer for now and legalize it to allow selection.
1238  // The initializer will get diagnosed as an error during assembly emission anyway.
1239  unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1240  return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1241  }
1242  return SDValue();
1243 }
1244 
1246  SelectionDAG &DAG) const {
1247  SmallVector<SDValue, 8> Args;
1248 
1249  EVT VT = Op.getValueType();
1250  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1251  SDLoc SL(Op);
1252  SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1253  SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1254 
1255  SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1256  return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1257  }
1258 
1259  for (const SDUse &U : Op->ops())
1260  DAG.ExtractVectorElements(U.get(), Args);
1261 
1262  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1263 }
1264 
1266  SelectionDAG &DAG) const {
1267 
1268  SmallVector<SDValue, 8> Args;
1269  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1270  EVT VT = Op.getValueType();
1271  EVT SrcVT = Op.getOperand(0).getValueType();
1272 
1273  // For these types, we have some TableGen patterns except if the index is 1
1274  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1275  (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1276  Start != 1)
1277  return Op;
1278 
1279  if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
1280  (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
1281  (Start == 0 || Start == 4))
1282  return Op;
1283 
1284  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1285  VT.getVectorNumElements());
1286 
1287  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1288 }
1289 
1290 /// Generate Min/Max node
1293  SDValue True, SDValue False,
1294  SDValue CC,
1295  DAGCombinerInfo &DCI) const {
1296  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1297  return SDValue();
1298 
1299  SelectionDAG &DAG = DCI.DAG;
1300  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1301  switch (CCOpcode) {
1302  case ISD::SETOEQ:
1303  case ISD::SETONE:
1304  case ISD::SETUNE:
1305  case ISD::SETNE:
1306  case ISD::SETUEQ:
1307  case ISD::SETEQ:
1308  case ISD::SETFALSE:
1309  case ISD::SETFALSE2:
1310  case ISD::SETTRUE:
1311  case ISD::SETTRUE2:
1312  case ISD::SETUO:
1313  case ISD::SETO:
1314  break;
1315  case ISD::SETULE:
1316  case ISD::SETULT: {
1317  if (LHS == True)
1318  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1319  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1320  }
1321  case ISD::SETOLE:
1322  case ISD::SETOLT:
1323  case ISD::SETLE:
1324  case ISD::SETLT: {
1325  // Ordered. Assume ordered for undefined.
1326 
1327  // Only do this after legalization to avoid interfering with other combines
1328  // which might occur.
1329  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1330  !DCI.isCalledByLegalizer())
1331  return SDValue();
1332 
1333  // We need to permute the operands to get the correct NaN behavior. The
1334  // selected operand is the second one based on the failing compare with NaN,
1335  // so permute it based on the compare type the hardware uses.
1336  if (LHS == True)
1337  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1338  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1339  }
1340  case ISD::SETUGE:
1341  case ISD::SETUGT: {
1342  if (LHS == True)
1343  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1344  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1345  }
1346  case ISD::SETGT:
1347  case ISD::SETGE:
1348  case ISD::SETOGE:
1349  case ISD::SETOGT: {
1350  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1351  !DCI.isCalledByLegalizer())
1352  return SDValue();
1353 
1354  if (LHS == True)
1355  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1356  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1357  }
1358  case ISD::SETCC_INVALID:
1359  llvm_unreachable("Invalid setcc condcode!");
1360  }
1361  return SDValue();
1362 }
1363 
1364 std::pair<SDValue, SDValue>
1366  SDLoc SL(Op);
1367 
1368  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1369 
1370  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1371  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1372 
1373  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1374  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1375 
1376  return std::make_pair(Lo, Hi);
1377 }
1378 
1380  SDLoc SL(Op);
1381 
1382  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1383  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1384  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1385 }
1386 
1388  SDLoc SL(Op);
1389 
1390  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1391  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1392  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1393 }
1394 
1395 // Split a vector type into two parts. The first part is a power of two vector.
1396 // The second part is whatever is left over, and is a scalar if it would
1397 // otherwise be a 1-vector.
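// For example, a v5i32 splits into v4i32 plus a scalar i32, and a v3f16
// splits into v2f16 plus a scalar f16.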
1398 std::pair<EVT, EVT>
1400  EVT LoVT, HiVT;
1401  EVT EltVT = VT.getVectorElementType();
1402  unsigned NumElts = VT.getVectorNumElements();
1403  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1404  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1405  HiVT = NumElts - LoNumElts == 1
1406  ? EltVT
1407  : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1408  return std::make_pair(LoVT, HiVT);
1409 }
1410 
1411 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1412 // scalar.
1413 std::pair<SDValue, SDValue>
1415  const EVT &LoVT, const EVT &HiVT,
1416  SelectionDAG &DAG) const {
1417  assert(LoVT.getVectorNumElements() +
1418  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1419  N.getValueType().getVectorNumElements() &&
1420  "More vector elements requested than available!");
1421  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1422  DAG.getVectorIdxConstant(0, DL));
1423  SDValue Hi = DAG.getNode(
1424  HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1425  HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1426  return std::make_pair(Lo, Hi);
1427 }
1428 
1430  SelectionDAG &DAG) const {
1431  LoadSDNode *Load = cast<LoadSDNode>(Op);
1432  EVT VT = Op.getValueType();
1433  SDLoc SL(Op);
1434 
1435 
1436  // If this is a 2 element vector, we really want to scalarize and not create
1437  // weird 1 element vectors.
1438  if (VT.getVectorNumElements() == 2) {
1439  SDValue Ops[2];
1440  std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1441  return DAG.getMergeValues(Ops, SL);
1442  }
1443 
1444  SDValue BasePtr = Load->getBasePtr();
1445  EVT MemVT = Load->getMemoryVT();
1446 
1447  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1448 
1449  EVT LoVT, HiVT;
1450  EVT LoMemVT, HiMemVT;
1451  SDValue Lo, Hi;
1452 
1453  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1454  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1455  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1456 
1457  unsigned Size = LoMemVT.getStoreSize();
1458  Align BaseAlign = Load->getAlign();
1459  Align HiAlign = commonAlignment(BaseAlign, Size);
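  // commonAlignment(BaseAlign, Size) is the largest power of two dividing
  // both the base alignment and the byte offset of the high half, i.e. the
  // alignment guaranteed for the second load.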
1460 
1461  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1462  Load->getChain(), BasePtr, SrcValue, LoMemVT,
1463  BaseAlign, Load->getMemOperand()->getFlags());
1464  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1465  SDValue HiLoad =
1466  DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1467  HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1468  HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1469 
1470  SDValue Join;
1471  if (LoVT == HiVT) {
1472  // This is the case that the vector is power of two so was evenly split.
1473  Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1474  } else {
1475  Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1476  DAG.getVectorIdxConstant(0, SL));
1477  Join = DAG.getNode(
1478  HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1479  VT, Join, HiLoad,
1480  DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1481  }
1482 
1483  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1484  LoLoad.getValue(1), HiLoad.getValue(1))};
1485 
1486  return DAG.getMergeValues(Ops, SL);
1487 }
1488 
1490  SelectionDAG &DAG) const {
1491  LoadSDNode *Load = cast<LoadSDNode>(Op);
1492  EVT VT = Op.getValueType();
1493  SDValue BasePtr = Load->getBasePtr();
1494  EVT MemVT = Load->getMemoryVT();
1495  SDLoc SL(Op);
1496  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1497  Align BaseAlign = Load->getAlign();
1498  unsigned NumElements = MemVT.getVectorNumElements();
1499 
1500  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1501  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
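  // (Widening a vec3 access to vec4 over-reads one extra element, e.g. a
  //  12-byte v3i32 load becomes a 16-byte v4i32 load; the alignment or
  //  dereferenceability requirement is there to ensure the extra bytes can
  //  be read without faulting.)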
1502  if (NumElements != 3 ||
1503  (BaseAlign < Align(8) &&
1504  !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1505  return SplitVectorLoad(Op, DAG);
1506 
1507  assert(NumElements == 3);
1508 
1509  EVT WideVT =
1510  EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1511  EVT WideMemVT =
1512  EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1513  SDValue WideLoad = DAG.getExtLoad(
1514  Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1515  WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1516  return DAG.getMergeValues(
1517  {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1518  DAG.getVectorIdxConstant(0, SL)),
1519  WideLoad.getValue(1)},
1520  SL);
1521 }
1522 
1524  SelectionDAG &DAG) const {
1525  StoreSDNode *Store = cast<StoreSDNode>(Op);
1526  SDValue Val = Store->getValue();
1527  EVT VT = Val.getValueType();
1528 
1529  // If this is a 2 element vector, we really want to scalarize and not create
1530  // weird 1 element vectors.
1531  if (VT.getVectorNumElements() == 2)
1532  return scalarizeVectorStore(Store, DAG);
1533 
1534  EVT MemVT = Store->getMemoryVT();
1535  SDValue Chain = Store->getChain();
1536  SDValue BasePtr = Store->getBasePtr();
1537  SDLoc SL(Op);
1538 
1539  EVT LoVT, HiVT;
1540  EVT LoMemVT, HiMemVT;
1541  SDValue Lo, Hi;
1542 
1543  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1544  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1545  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1546 
1547  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1548 
1549  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1550  Align BaseAlign = Store->getAlign();
1551  unsigned Size = LoMemVT.getStoreSize();
1552  Align HiAlign = commonAlignment(BaseAlign, Size);
1553 
1554  SDValue LoStore =
1555  DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1556  Store->getMemOperand()->getFlags());
1557  SDValue HiStore =
1558  DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1559  HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1560 
1561  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1562 }
1563 
1564 // This is a shortcut for integer division because we have fast i32<->f32
1565 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1566 // float is enough to accurately represent up to a 24-bit signed integer.
1568  bool Sign) const {
1569  SDLoc DL(Op);
1570  EVT VT = Op.getValueType();
1571  SDValue LHS = Op.getOperand(0);
1572  SDValue RHS = Op.getOperand(1);
1573  MVT IntVT = MVT::i32;
1574  MVT FltVT = MVT::f32;
1575 
1576  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1577  if (LHSSignBits < 9)
1578  return SDValue();
1579 
1580  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1581  if (RHSSignBits < 9)
1582  return SDValue();
1583 
1584  unsigned BitSize = VT.getSizeInBits();
1585  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1586  unsigned DivBits = BitSize - SignBits;
1587  if (Sign)
1588  ++DivBits;
1589 
1590  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1591  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1592 
1593  SDValue jq = DAG.getConstant(1, DL, IntVT);
1594 
1595  if (Sign) {
1596  // char|short jq = ia ^ ib;
1597  jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1598 
1599  // jq = jq >> (bitsize - 2)
1600  jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1601  DAG.getConstant(BitSize - 2, DL, VT));
1602 
1603  // jq = jq | 0x1
1604  jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1605  }
1606 
1607  // int ia = (int)LHS;
1608  SDValue ia = LHS;
1609 
1610  // int ib = (int)RHS;
1611  SDValue ib = RHS;
1612 
1613  // float fa = (float)ia;
1614  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1615 
1616  // float fb = (float)ib;
1617  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1618 
1619  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1620  fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1621 
1622  // fq = trunc(fq);
1623  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1624 
1625  // float fqneg = -fq;
1626  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1627 
1628  MachineFunction &MF = DAG.getMachineFunction();
1629  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1630 
1631  // float fr = mad(fqneg, fb, fa);
1632  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1633  (unsigned)ISD::FMA :
1634  !MFI->getMode().allFP32Denormals() ?
1635  (unsigned)ISD::FMAD :
1636  (unsigned)AMDGPUISD::FMAD_FTZ;
1637  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1638 
1639  // int iq = (int)fq;
1640  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1641 
1642  // fr = fabs(fr);
1643  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1644 
1645  // fb = fabs(fb);
1646  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1647 
1648  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1649 
1650  // int cv = fr >= fb;
1651  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1652 
1653  // jq = (cv ? jq : 0);
1654  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1655 
1656  // dst = iq + jq;
1657  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1658 
1659  // Rem needs compensation; it's easier to recompute it.
1660  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1661  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1662 
1663  // Truncate to number of bits this divide really is.
1664  if (Sign) {
1665  SDValue InRegSize
1666  = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1667  Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1668  Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1669  } else {
1670  SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1671  Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1672  Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1673  }
1674 
1675  return DAG.getMergeValues({ Div, Rem }, DL);
1676 }
1677 
1679  SelectionDAG &DAG,
1680  SmallVectorImpl<SDValue> &Results) const {
1681  SDLoc DL(Op);
1682  EVT VT = Op.getValueType();
1683 
1684  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1685 
1686  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1687 
1688  SDValue One = DAG.getConstant(1, DL, HalfVT);
1689  SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1690 
1691  // HiLo split
1692  SDValue LHS = Op.getOperand(0);
1693  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1694  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1695 
1696  SDValue RHS = Op.getOperand(1);
1697  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1698  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1699 
1700  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1701  DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1702 
1703  SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1704  LHS_Lo, RHS_Lo);
1705 
1706  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1707  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1708 
1709  Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1710  Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1711  return;
1712  }
1713 
1714  if (isTypeLegal(MVT::i64)) {
1715  // The algorithm here is based on ideas from "Software Integer Division",
1716  // Tom Rodeheffer, August 2008.
1717 
1718  MachineFunction &MF = DAG.getMachineFunction();
1719  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1720 
1721  // Compute denominator reciprocal.
1722  unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1723  (unsigned)ISD::FMA :
1724  !MFI->getMode().allFP32Denormals() ?
1725  (unsigned)ISD::FMAD :
1726  (unsigned)AMDGPUISD::FMAD_FTZ;
1727 
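  // The f32 constants below are bit patterns: 0x4f800000 is 2^32,
  // 0x5f7ffffc is just under 2^64, 0x2f800000 is 2^-32, and 0xcf800000 is
  // -2^32. Cvt_Hi * 2^32 + Cvt_Lo approximates RHS as a float, and the
  // scaled reciprocal is split back into two 32-bit halves forming an
  // estimate of 2^64 / RHS.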
1728  SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1729  SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1730  SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1731  DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1732  Cvt_Lo);
1733  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1734  SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1735  DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1736  SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1737  DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1738  SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1739  SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1740  DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1741  Mul1);
1742  SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1743  SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1744  SDValue Rcp64 = DAG.getBitcast(VT,
1745  DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1746 
1747  SDValue Zero64 = DAG.getConstant(0, DL, VT);
1748  SDValue One64 = DAG.getConstant(1, DL, VT);
1749  SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1750  SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1751 
1752  // First round of UNR (Unsigned integer Newton-Raphson).
1753  SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1754  SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1755  SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1756  SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1757  Zero);
1758  SDValue Mulhi1_Hi =
1759  DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
1760  SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1761  Mulhi1_Lo, Zero1);
1762  SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1763  Mulhi1_Hi, Add1_Lo.getValue(1));
1764  SDValue Add1 = DAG.getBitcast(VT,
1765  DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1766 
1767  // Second round of UNR.
1768  SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1769  SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1770  SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1771  Zero);
1772  SDValue Mulhi2_Hi =
1773  DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
1774  SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1775  Mulhi2_Lo, Zero1);
1776  SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
1777  Mulhi2_Hi, Add2_Lo.getValue(1));
1778  SDValue Add2 = DAG.getBitcast(VT,
1779  DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1780 
1781  SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1782 
1783  SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1784 
1785  SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1786  SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1787  SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1788  Mul3_Lo, Zero1);
1789  SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1790  Mul3_Hi, Sub1_Lo.getValue(1));
1791  SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1792  SDValue Sub1 = DAG.getBitcast(VT,
1793  DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1794 
1795  SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1796  SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1797  ISD::SETUGE);
1798  SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1799  ISD::SETUGE);
1800  SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1801 
1802  // TODO: Here and below, portions of the code could be enclosed in if/endif.
1803  // Currently the control flow is unconditional and we have 4 selects after the
1804  // potential endif to substitute PHIs.
1805 
1806  // if C3 != 0 ...
1807  SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1808  RHS_Lo, Zero1);
1809  SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1810  RHS_Hi, Sub1_Lo.getValue(1));
1811  SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1812  Zero, Sub2_Lo.getValue(1));
1813  SDValue Sub2 = DAG.getBitcast(VT,
1814  DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1815 
1816  SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1817 
1818  SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1819  ISD::SETUGE);
1820  SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1821  ISD::SETUGE);
1822  SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1823 
1824  // if (C6 != 0)
1825  SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1826 
1827  SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1828  RHS_Lo, Zero1);
1829  SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1830  RHS_Hi, Sub2_Lo.getValue(1));
1831  SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1832  Zero, Sub3_Lo.getValue(1));
1833  SDValue Sub3 = DAG.getBitcast(VT,
1834  DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1835 
1836  // endif C6
1837  // endif C3
1838 
1839  SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1840  SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1841 
1842  SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1843  SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1844 
1845  Results.push_back(Div);
1846  Results.push_back(Rem);
1847 
1848  return;
1849  }
1850 
1851  // R600 expansion.
1852  // Get speculative values
1853  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1854  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1855 
1856  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1857  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1858  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1859 
1860  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1861  SDValue DIV_Lo = Zero;
1862 
1863  const unsigned halfBitWidth = HalfVT.getSizeInBits();
1864 
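  // Classic restoring long division: each iteration shifts one bit of LHS_Lo
  // into the 64-bit remainder, and when the remainder reaches RHS, subtracts
  // RHS and sets the corresponding quotient bit.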
1865  for (unsigned i = 0; i < halfBitWidth; ++i) {
1866  const unsigned bitPos = halfBitWidth - i - 1;
1867  SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1868  // Get value of high bit
1869  SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1870  HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1871  HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1872 
1873  // Shift
1874  REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1875  // Add LHS high bit
1876  REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1877 
1878  SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1879  SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1880 
1881  DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1882 
1883  // Update REM
1884  SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1885  REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1886  }
1887 
1888  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1889  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1890  Results.push_back(DIV);
1891  Results.push_back(REM);
1892 }
1893 
1894 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1895  SelectionDAG &DAG) const {
1896  SDLoc DL(Op);
1897  EVT VT = Op.getValueType();
1898 
1899  if (VT == MVT::i64) {
1900  SmallVector<SDValue, 2> Results;
1901  LowerUDIVREM64(Op, DAG, Results);
1902  return DAG.getMergeValues(Results, DL);
1903  }
1904 
1905  if (VT == MVT::i32) {
1906  if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1907  return Res;
1908  }
1909 
1910  SDValue X = Op.getOperand(0);
1911  SDValue Y = Op.getOperand(1);
1912 
1913  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
1914  // algorithm used here.
1915 
1916  // Initial estimate of inv(y).
1917  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
1918 
1919  // One round of UNR.
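  // (UNR = unsigned Newton-Raphson: NegYZ = -Y*Z is congruent to the error
  //  2^32 - Y*Z of the estimate, and Z + MULHU(Z, NegYZ) refines Z toward
  //  2^32 / Y.)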
1920  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
1921  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
1922  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
1923  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
1924 
1925  // Quotient/remainder estimate.
1926  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
1927  SDValue R =
1928  DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
1929 
1930  // First quotient/remainder refinement.
1931  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1932  SDValue One = DAG.getConstant(1, DL, VT);
1933  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
1934  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1935  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
1936  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1937  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
1938 
1939  // Second quotient/remainder refinement.
1940  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
1941  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1942  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
1943  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1944  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
1945 
1946  return DAG.getMergeValues({Q, R}, DL);
1947 }
1948 
1949 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1950  SelectionDAG &DAG) const {
1951  SDLoc DL(Op);
1952  EVT VT = Op.getValueType();
1953 
1954  SDValue LHS = Op.getOperand(0);
1955  SDValue RHS = Op.getOperand(1);
1956 
1957  SDValue Zero = DAG.getConstant(0, DL, VT);
1958  SDValue NegOne = DAG.getConstant(-1, DL, VT);
1959 
1960  if (VT == MVT::i32) {
1961  if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1962  return Res;
1963  }
1964 
1965  if (VT == MVT::i64 &&
1966  DAG.ComputeNumSignBits(LHS) > 32 &&
1967  DAG.ComputeNumSignBits(RHS) > 32) {
1968  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1969 
1970  // Hi/Lo split
1971  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1972  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1973  SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1974  LHS_Lo, RHS_Lo);
1975  SDValue Res[2] = {
1976  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1977  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1978  };
1979  return DAG.getMergeValues(Res, DL);
1980  }
1981 
1982  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1983  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1984  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1985  SDValue RSign = LHSign; // Remainder sign is the same as LHS
1986 
1987  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1988  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1989 
1990  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1991  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1992 
1993  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1994  SDValue Rem = Div.getValue(1);
1995 
1996  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1997  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1998 
1999  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2000  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2001 
2002  SDValue Res[2] = {
2003  Div,
2004  Rem
2005  };
2006  return DAG.getMergeValues(Res, DL);
2007 }
2008 
2009 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
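 // Worked example of this expansion (plain arithmetic, not emitted nodes):
 // frem(5.5, 2.0): fdiv gives 2.75, ftrunc gives 2.0, and
 // fma(-2.0, 2.0, 5.5) = 5.5 - 4.0 = 1.5.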
2010 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2011  SDLoc SL(Op);
2012  EVT VT = Op.getValueType();
2013  auto Flags = Op->getFlags();
2014  SDValue X = Op.getOperand(0);
2015  SDValue Y = Op.getOperand(1);
2016 
2017  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2018  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2019  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2020  // TODO: For f32 use FMAD instead if !hasFastFMA32?
2021  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2022 }
2023 
2024 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2025  SDLoc SL(Op);
2026  SDValue Src = Op.getOperand(0);
2027 
2028  // result = trunc(src)
2029  // if (src > 0.0 && src != result)
2030  // result += 1.0
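 // For example (illustration only): src = 2.3 truncates to 2.0; since
 // 2.3 > 0.0 and 2.3 != 2.0, the result becomes 3.0. For src = -2.3 the
 // truncated value -2.0 is already the ceiling, so nothing is added.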
2031 
2032  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2033 
2034  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2035  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2036 
2037  EVT SetCCVT =
2038  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2039 
2040  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2041  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2042  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2043 
2044  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2045  // TODO: Should this propagate fast-math-flags?
2046  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2047 }
2048 
2049 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2050  SelectionDAG &DAG) {
2051  const unsigned FractBits = 52;
2052  const unsigned ExpBits = 11;
2053 
2054  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2055  Hi,
2056  DAG.getConstant(FractBits - 32, SL, MVT::i32),
2057  DAG.getConstant(ExpBits, SL, MVT::i32));
2058  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2059  DAG.getConstant(1023, SL, MVT::i32));
2060 
2061  return Exp;
2062 }
2063 
2064 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2065  SDLoc SL(Op);
2066  SDValue Src = Op.getOperand(0);
2067 
2068  assert(Op.getValueType() == MVT::f64);
2069 
2070  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2071 
2072  // Extract the upper half, since this is where we will find the sign and
2073  // exponent.
2074  SDValue Hi = getHiHalf64(Src, DAG);
2075 
2076  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2077 
2078  const unsigned FractBits = 52;
2079 
2080  // Extract the sign bit.
2081  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2082  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2083 
2084  // Extend back to 64-bits.
2085  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2086  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2087 
2088  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2089  const SDValue FractMask
2090  = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2091 
2092  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2093  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2094  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2095 
2096  EVT SetCCVT =
2097  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2098 
2099  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2100 
2101  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2102  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2103 
2104  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2105  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2106 
2107  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2108 }
2109 
2110 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2111  SDLoc SL(Op);
2112  SDValue Src = Op.getOperand(0);
2113 
2114  assert(Op.getValueType() == MVT::f64);
2115 
2116  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2117  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2118  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2119 
2120  // TODO: Should this propagate fast-math-flags?
2121 
2122  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2123  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2124 
2125  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2126 
2127  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2128  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2129 
2130  EVT SetCCVT =
2131  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2132  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2133 
2134  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2135 }
2136 
2137 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2138  // FNEARBYINT and FRINT are the same, except in their handling of FP
2139  // exceptions. Those aren't really meaningful for us, and OpenCL only has
2140  // rint, so just treat them as equivalent.
2141  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2142 }
2143 
2144 // XXX - May require not supporting f32 denormals?
2145 
2146 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2147 // compare and vselect end up producing worse code than scalarizing the whole
2148 // operation.
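 // Informally, the expansion below computes
 //   round(x) = trunc(x) + (|x - trunc(x)| >= 0.5 ? copysign(1.0, x) : 0.0)
 // e.g. x = -2.5: trunc = -2.0, |diff| = 0.5, so the result is -3.0
 // (halfway cases round away from zero, per the SETOGE select below).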
2149 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2150  SDLoc SL(Op);
2151  SDValue X = Op.getOperand(0);
2152  EVT VT = Op.getValueType();
2153 
2154  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2155 
2156  // TODO: Should this propagate fast-math-flags?
2157 
2158  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2159 
2160  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2161 
2162  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2163  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2164  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2165 
2166  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2167 
2168  EVT SetCCVT =
2169  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2170 
2171  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2172 
2173  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2174 
2175  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2176 }
2177 
2178 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2179  SDLoc SL(Op);
2180  SDValue Src = Op.getOperand(0);
2181 
2182  // result = trunc(src);
2183  // if (src < 0.0 && src != result)
2184  // result += -1.0.
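 // For example (illustration only): src = -2.3 truncates to -2.0; since
 // -2.3 < 0.0 and -2.3 != -2.0, adding -1.0 yields -3.0. Non-negative inputs
 // are already handled by the truncation alone.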
2185 
2186  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2187 
2188  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2189  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2190 
2191  EVT SetCCVT =
2192  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2193 
2194  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2195  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2196  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2197 
2198  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2199  // TODO: Should this propagate fast-math-flags?
2200  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2201 }
2202 
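 // Change-of-base expansion used below: log_b(x) = log2(x) * (1 / log2(b)),
 // where the caller passes Log2BaseInverted = 1 / log2(b) (e.g. log10(2) for
 // log10, ln(2) for the natural log).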
2203 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2204  double Log2BaseInverted) const {
2205  EVT VT = Op.getValueType();
2206 
2207  SDLoc SL(Op);
2208  SDValue Operand = Op.getOperand(0);
2209  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2210  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2211 
2212  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2213 }
2214 
2215 // exp2(M_LOG2E_F * f);
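 // i.e. exp(x) = exp2(x * log2(e)); the multiply and the exp2 node below both
 // reuse the original node's fast-math flags.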
2216 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2217  EVT VT = Op.getValueType();
2218  SDLoc SL(Op);
2219  SDValue Src = Op.getOperand(0);
2220 
2221  const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2222  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2223  return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2224 }
2225 
2226 static bool isCtlzOpc(unsigned Opc) {
2227  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2228 }
2229 
2230 static bool isCttzOpc(unsigned Opc) {
2231  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2232 }
2233 
2234 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2235  SDLoc SL(Op);
2236  SDValue Src = Op.getOperand(0);
2237 
2238  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2239  bool Ctlz = isCtlzOpc(Op.getOpcode());
2240  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2241 
2242  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2243  Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2244 
2245  if (Src.getValueType() == MVT::i32) {
2246  // (ctlz hi:lo) -> (umin (ffbh src), 32)
2247  // (cttz hi:lo) -> (umin (ffbl src), 32)
2248  // (ctlz_zero_undef src) -> (ffbh src)
2249  // (cttz_zero_undef src) -> (ffbl src)
2250  SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2251  if (!ZeroUndef) {
2252  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2253  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2254  }
2255  return NewOpr;
2256  }
2257 
2258  SDValue Lo, Hi;
2259  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2260 
2261  SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2262  SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2263 
2264  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2265  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2266  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2267  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2268 
2269  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2270  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2271  if (Ctlz)
2272  OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2273  else
2274  OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2275 
2276  SDValue NewOpr;
2277  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2278  if (!ZeroUndef) {
2279  const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2280  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2281  }
2282 
2283  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2284 }
2285 
2286 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2287  bool Signed) const {
2288  // The regular method converting a 64-bit integer to float roughly consists of
2289  // 2 steps: normalization and rounding. In fact, after normalization, the
2290  // conversion from a 64-bit integer to a float is essentially the same as the
2291  // one from a 32-bit integer. The only difference is that it has more
2292  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2293  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2294  // converted into the correct float number. The basic steps for the unsigned
2295  // conversion are illustrated in the following pseudo code:
2296  //
2297  // f32 uitofp(i64 u) {
2298  // i32 hi, lo = split(u);
2299  // // Only count the leading zeros in hi as we have native support of the
2300  // // conversion from i32 to f32. If hi is all 0s, the conversion is
2301  // // reduced to a 32-bit one automatically.
2302  // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2303  // u <<= shamt;
2304  // hi, lo = split(u);
2305  // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2306  // // convert it as a 32-bit integer and scale the result back.
2307  // return uitofp(hi) * 2^(32 - shamt);
2308  // }
2309  //
2310  // The signed one follows the same principle but uses 'ffbh_i32' to count its
2311  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2312  // converted instead, followed by negation based on its sign bit.
2313 
2314  SDLoc SL(Op);
2315  SDValue Src = Op.getOperand(0);
2316 
2317  SDValue Lo, Hi;
2318  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2319  SDValue Sign;
2320  SDValue ShAmt;
2321  if (Signed && Subtarget->isGCN()) {
2322  // We also need to consider the sign bit in Lo if Hi has just sign bits,
2323  // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2324  // account. That is, the maximal shift is
2325  // - 32 if Lo and Hi have opposite signs;
2326  // - 33 if Lo and Hi have the same sign.
2327  //
2328  // Or, MaxShAmt = 33 + OppositeSign, where
2329  //
2330  // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2331  // - -1 if Lo and Hi have opposite signs; and
2332  // - 0 otherwise.
2333  //
2334  // All in all, ShAmt is calculated as
2335  //
2336  // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2337  //
2338  // or
2339  //
2340  // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2341  //
2342  // to reduce the critical path.
2343  SDValue OppositeSign = DAG.getNode(
2344  ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2345  DAG.getConstant(31, SL, MVT::i32));
2346  SDValue MaxShAmt =
2347  DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2348  OppositeSign);
2349  // Count the leading sign bits.
2350  ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2351  // Different from unsigned conversion, the shift should be one bit less to
2352  // preserve the sign bit.
2353  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2354  DAG.getConstant(1, SL, MVT::i32));
2355  ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2356  } else {
2357  if (Signed) {
2358  // Without 'ffbh_i32', only leading zeros could be counted. Take the
2359  // absolute value first.
2360  Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2361  DAG.getConstant(63, SL, MVT::i64));
2362  SDValue Abs =
2363  DAG.getNode(ISD::XOR, SL, MVT::i64,
2364  DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2365  std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2366  }
2367  // Count the leading zeros.
2368  ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2369  // The shift amount for signed integers is [0, 32].
2370  }
2371  // Normalize the given 64-bit integer.
2372  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2373  // Split it again.
2374  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2375  // Calculate the adjust bit for rounding.
2376  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2377  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2378  DAG.getConstant(1, SL, MVT::i32), Lo);
2379  // Get the 32-bit normalized integer.
2380  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2381  // Convert the normalized 32-bit integer into f32.
2382  unsigned Opc =
2383  (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2384  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2385 
2386  // Finally, scale the converted floating-point number back, since the
2387  // original 64-bit integer was converted as a 32-bit one.
2388  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2389  ShAmt);
2390  // On GCN, use LDEXP directly.
2391  if (Subtarget->isGCN())
2392  return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2393 
2394  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2395  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2396  // exponent is enough to avoid overflowing into the sign bit.
2397  SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2398  DAG.getConstant(23, SL, MVT::i32));
2399  SDValue IVal =
2400  DAG.getNode(ISD::ADD, SL, MVT::i32,
2401  DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2402  if (Signed) {
2403  // Set the sign bit.
2404  Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2405  DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2406  DAG.getConstant(31, SL, MVT::i32));
2407  IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2408  }
2409  return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2410 }
2411 
2412 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2413  bool Signed) const {
2414  SDLoc SL(Op);
2415  SDValue Src = Op.getOperand(0);
2416 
2417  SDValue Lo, Hi;
2418  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2419 
2420  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2421  SL, MVT::f64, Hi);
2422 
2423  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2424 
2425  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2426  DAG.getConstant(32, SL, MVT::i32));
2427  // TODO: Should this propagate fast-math-flags?
2428  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2429 }
2430 
2431 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2432  SelectionDAG &DAG) const {
2433  // TODO: Factor out code common with LowerSINT_TO_FP.
2434  EVT DestVT = Op.getValueType();
2435  SDValue Src = Op.getOperand(0);
2436  EVT SrcVT = Src.getValueType();
2437 
2438  if (SrcVT == MVT::i16) {
2439  if (DestVT == MVT::f16)
2440  return Op;
2441  SDLoc DL(Op);
2442 
2443  // Promote src to i32
2444  SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2445  return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2446  }
2447 
2448  assert(SrcVT == MVT::i64 && "operation should be legal");
2449 
2450  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2451  SDLoc DL(Op);
2452 
2453  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2454  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2455  SDValue FPRound =
2456  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2457 
2458  return FPRound;
2459  }
2460 
2461  if (DestVT == MVT::f32)
2462  return LowerINT_TO_FP32(Op, DAG, false);
2463 
2464  assert(DestVT == MVT::f64);
2465  return LowerINT_TO_FP64(Op, DAG, false);
2466 }
2467 
2468 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2469  SelectionDAG &DAG) const {
2470  EVT DestVT = Op.getValueType();
2471 
2472  SDValue Src = Op.getOperand(0);
2473  EVT SrcVT = Src.getValueType();
2474 
2475  if (SrcVT == MVT::i16) {
2476  if (DestVT == MVT::f16)
2477  return Op;
2478 
2479  SDLoc DL(Op);
2480  // Promote src to i32
2481  SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2482  return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2483  }
2484 
2485  assert(SrcVT == MVT::i64 && "operation should be legal");
2486 
2487  // TODO: Factor out code common with LowerUINT_TO_FP.
2488 
2489  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2490  SDLoc DL(Op);
2491  SDValue Src = Op.getOperand(0);
2492 
2493  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2494  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2495  SDValue FPRound =
2496  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2497 
2498  return FPRound;
2499  }
2500 
2501  if (DestVT == MVT::f32)
2502  return LowerINT_TO_FP32(Op, DAG, true);
2503 
2504  assert(DestVT == MVT::f64);
2505  return LowerINT_TO_FP64(Op, DAG, true);
2506 }
2507 
2508 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2509  bool Signed) const {
2510  SDLoc SL(Op);
2511 
2512  SDValue Src = Op.getOperand(0);
2513  EVT SrcVT = Src.getValueType();
2514 
2515  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2516 
2517  // The basic idea of converting a floating point number into a pair of 32-bit
2518  // integers is illustrated as follows:
2519  //
2520  // tf := trunc(val);
2521  // hif := floor(tf * 2^-32);
2522  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2523  // hi := fptoi(hif);
2524  // lo := fptoi(lof);
2525  //
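 // Numeric check of the scheme above (plain arithmetic): val = 5.0e9 gives
 // tf = 5.0e9, hif = floor(5.0e9 * 2^-32) = 1.0, lof = 5.0e9 - 1.0 * 2^32 =
 // 705032704.0, so hi = 1, lo = 705032704 and hi * 2^32 + lo = 5000000000.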
2526  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2527  SDValue Sign;
2528  if (Signed && SrcVT == MVT::f32) {
2529  // However, a 32-bit floating point number has only a 23-bit mantissa,
2530  // which is not enough to hold all the significant bits of `lof` if val is
2531  // negative. To avoid the loss of precision, we need to take the absolute
2532  // value after truncating and flip the result back based on the original
2533  // signedness.
2534  Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2535  DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2536  DAG.getConstant(31, SL, MVT::i32));
2537  Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2538  }
2539 
2540  SDValue K0, K1;
2541  if (SrcVT == MVT::f64) {
2542  K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2543  SL, SrcVT);
2544  K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2545  SL, SrcVT);
2546  } else {
2547  K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2548  SrcVT);
2549  K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2550  SrcVT);
2551  }
2552  // TODO: Should this propagate fast-math-flags?
2553  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2554 
2555  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2556 
2557  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2558 
2559  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2560  : ISD::FP_TO_UINT,
2561  SL, MVT::i32, FloorMul);
2562  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2563 
2564  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2565  DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2566 
2567  if (Signed && SrcVT == MVT::f32) {
2568  assert(Sign);
2569  // Flip the result based on the signedness, which is either all 0s or 1s.
2570  Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2571  DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2572  // r := xor(r, sign) - sign;
2573  Result =
2574  DAG.getNode(ISD::SUB, SL, MVT::i64,
2575  DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2576  }
2577 
2578  return Result;
2579 }
2580 
2581 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2582  SDLoc DL(Op);
2583  SDValue N0 = Op.getOperand(0);
2584 
2585  // Convert to target node to get known bits
2586  if (N0.getValueType() == MVT::f32)
2587  return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2588 
2589  if (getTargetMachine().Options.UnsafeFPMath) {
2590  // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2591  return SDValue();
2592  }
2593 
2593 
2594  assert(N0.getSimpleValueType() == MVT::f64);
2595 
2596  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
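 // Informal outline of the expansion below (a summary, not normative):
 //  - split the f64 bits into high/low words and extract the biased exponent;
 //  - rebias from the f64 bias (1023) to the f16 bias (15);
 //  - collapse the 52-bit mantissa towards 10 bits, keeping a sticky bit so
 //    that round-to-nearest-even can still be applied;
 //  - right-shift the significand further when the result is denormal (E < 1);
 //  - on overflow (E > 30) produce 0x7c00 (infinity), and when the source
 //    exponent is all ones produce infinity or a quiet NaN (0x7e00 pattern);
 //  - finally OR in the sign bit taken from the f64 sign.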
2597  const unsigned ExpMask = 0x7ff;
2598  const unsigned ExpBiasf64 = 1023;
2599  const unsigned ExpBiasf16 = 15;
2600  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2601  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2602  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2603  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2604  DAG.getConstant(32, DL, MVT::i64));
2605  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2606  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2607  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2608  DAG.getConstant(20, DL, MVT::i64));
2609  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2610  DAG.getConstant(ExpMask, DL, MVT::i32));
2611  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2612  // add the f16 bias (15) to get the biased exponent for the f16 format.
2613  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2614  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2615 
2616  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2617  DAG.getConstant(8, DL, MVT::i32));
2618  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2619  DAG.getConstant(0xffe, DL, MVT::i32));
2620 
2621  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2622  DAG.getConstant(0x1ff, DL, MVT::i32));
2623  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2624 
2625  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2626  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2627 
2628  // (M != 0 ? 0x0200 : 0) | 0x7c00;
2629  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2630  DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2631  Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2632 
2633  // N = M | (E << 12);
2634  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2635  DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2636  DAG.getConstant(12, DL, MVT::i32)));
2637 
2638  // B = clamp(1-E, 0, 13);
2639  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2640  One, E);
2641  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2642  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2643  DAG.getConstant(13, DL, MVT::i32));
2644 
2645  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2646  DAG.getConstant(0x1000, DL, MVT::i32));
2647 
2648  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2649  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2650  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2651  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2652 
2653  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2654  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2655  DAG.getConstant(0x7, DL, MVT::i32));
2656  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2657  DAG.getConstant(2, DL, MVT::i32));
2658  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2659  One, Zero, ISD::SETEQ);
2660  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2661  One, Zero, ISD::SETGT);
2662  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2663  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2664 
2665  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2666  DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2667  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2668  I, V, ISD::SETEQ);
2669 
2670  // Extract the sign bit.
2671  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2672  DAG.getConstant(16, DL, MVT::i32));
2673  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2674  DAG.getConstant(0x8000, DL, MVT::i32));
2675 
2676  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2677  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2678 }
2679 
2680 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2681  SelectionDAG &DAG) const {
2682  SDValue Src = Op.getOperand(0);
2683  unsigned OpOpcode = Op.getOpcode();
2684  EVT SrcVT = Src.getValueType();
2685  EVT DestVT = Op.getValueType();
2686 
2687  // Will be selected natively
2688  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2689  return Op;
2690 
2691  // Promote i16 to i32
2692  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2693  SDLoc DL(Op);
2694 
2695  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2696  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2697  }
2698 
2699  if (SrcVT == MVT::f16 ||
2700  (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2701  SDLoc DL(Op);
2702 
2703  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2704  unsigned Ext =
2705  OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2706  return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2707  }
2708 
2709  if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2710  return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2711 
2712  return SDValue();
2713 }
2714 
2715 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2716  SelectionDAG &DAG) const {
2717  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2718  MVT VT = Op.getSimpleValueType();
2719  MVT ScalarVT = VT.getScalarType();
2720 
2721  assert(VT.isVector());
2722 
2723  SDValue Src = Op.getOperand(0);
2724  SDLoc DL(Op);
2725 
2726  // TODO: Don't scalarize on Evergreen?
2727  unsigned NElts = VT.getVectorNumElements();
2728  SmallVector<SDValue, 8> Args;
2729  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2730 
2731  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2732  for (unsigned I = 0; I < NElts; ++I)
2733  Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2734 
2735  return DAG.getBuildVector(VT, DL, Args);
2736 }
2737 
2738 //===----------------------------------------------------------------------===//
2739 // Custom DAG optimizations
2740 //===----------------------------------------------------------------------===//
2741 
2742 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2743  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2744 }
2745 
2746 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2747  EVT VT = Op.getValueType();
2748  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2749  // as unsigned 24-bit values.
2750  AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2751 }
2752 
2753 static SDValue simplifyMul24(SDNode *Node24,
2754  TargetLowering::DAGCombinerInfo &DCI) {
2755  SelectionDAG &DAG = DCI.DAG;
2756  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2757  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2758 
2759  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2760  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2761  unsigned NewOpcode = Node24->getOpcode();
2762  if (IsIntrin) {
2763  unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2764  switch (IID) {
2765  case Intrinsic::amdgcn_mul_i24:
2766  NewOpcode = AMDGPUISD::MUL_I24;
2767  break;
2768  case Intrinsic::amdgcn_mul_u24:
2769  NewOpcode = AMDGPUISD::MUL_U24;
2770  break;
2771  case Intrinsic::amdgcn_mulhi_i24:
2772  NewOpcode = AMDGPUISD::MULHI_I24;
2773  break;
2774  case Intrinsic::amdgcn_mulhi_u24:
2775  NewOpcode = AMDGPUISD::MULHI_U24;
2776  break;
2777  default:
2778  llvm_unreachable("Expected 24-bit mul intrinsic");
2779  }
2780  }
2781 
2782  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2783 
2784  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2785  // the operands to have other uses, but will only perform simplifications that
2786  // involve bypassing some nodes for this user.
2787  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2788  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2789  if (DemandedLHS || DemandedRHS)
2790  return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2791  DemandedLHS ? DemandedLHS : LHS,
2792  DemandedRHS ? DemandedRHS : RHS);
2793 
2794  // Now try SimplifyDemandedBits which can simplify the nodes used by our
2795  // operands if this node is the only user.
2796  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2797  return SDValue(Node24, 0);
2798  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2799  return SDValue(Node24, 0);
2800 
2801  return SDValue();
2802 }
2803 
2804 template <typename IntTy>
2805 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2806  uint32_t Width, const SDLoc &DL) {
2807  if (Width + Offset < 32) {
2808  uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2809  IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2810  return DAG.getConstant(Result, DL, MVT::i32);
2811  }
2812 
2813  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2814 }
2815 
2816 static bool hasVolatileUser(SDNode *Val) {
2817  for (SDNode *U : Val->uses()) {
2818  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2819  if (M->isVolatile())
2820  return true;
2821  }
2822  }
2823 
2824  return false;
2825 }
2826 
2827 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2828  // i32 vectors are the canonical memory type.
2829  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2830  return false;
2831 
2832  if (!VT.isByteSized())
2833  return false;
2834 
2835  unsigned Size = VT.getStoreSize();
2836 
2837  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2838  return false;
2839 
2840  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2841  return false;
2842 
2843  return true;
2844 }
2845 
2846 // Replace load of an illegal type with a store of a bitcast to a friendlier
2847 // type.
2848 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2849  DAGCombinerInfo &DCI) const {
2850  if (!DCI.isBeforeLegalize())
2851  return SDValue();
2852 
2853  LoadSDNode *LN = cast<LoadSDNode>(N);
2854  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2855  return SDValue();
2856 
2857  SDLoc SL(N);
2858  SelectionDAG &DAG = DCI.DAG;
2859  EVT VT = LN->getMemoryVT();
2860 
2861  unsigned Size = VT.getStoreSize();
2862  Align Alignment = LN->getAlign();
2863  if (Alignment < Size && isTypeLegal(VT)) {
2864  bool IsFast;
2865  unsigned AS = LN->getAddressSpace();
2866 
2867  // Expand unaligned loads earlier than legalization. Due to visitation order
2868  // problems during legalization, the emitted instructions to pack and unpack
2869  // the bytes again are not eliminated in the case of an unaligned copy.
2870  if (!allowsMisalignedMemoryAccesses(
2871  VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2872  if (VT.isVector())
2873  return SplitVectorLoad(SDValue(LN, 0), DAG);
2874 
2875  SDValue Ops[2];
2876  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2877 
2878  return DAG.getMergeValues(Ops, SDLoc(N));
2879  }
2880 
2881  if (!IsFast)
2882  return SDValue();
2883  }
2884 
2885  if (!shouldCombineMemoryType(VT))
2886  return SDValue();
2887 
2888  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2889 
2890  SDValue NewLoad
2891  = DAG.getLoad(NewVT, SL, LN->getChain(),
2892  LN->getBasePtr(), LN->getMemOperand());
2893 
2894  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2895  DCI.CombineTo(N, BC, NewLoad.getValue(1));
2896  return SDValue(N, 0);
2897 }
2898 
2899 // Replace store of an illegal type with a store of a bitcast to a friendlier
2900 // type.
2901 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2902  DAGCombinerInfo &DCI) const {
2903  if (!DCI.isBeforeLegalize())
2904  return SDValue();
2905 
2906  StoreSDNode *SN = cast<StoreSDNode>(N);
2907  if (!SN->isSimple() || !ISD::isNormalStore(SN))
2908  return SDValue();
2909 
2910  EVT VT = SN->getMemoryVT();
2911  unsigned Size = VT.getStoreSize();
2912 
2913  SDLoc SL(N);
2914  SelectionDAG &DAG = DCI.DAG;
2915  Align Alignment = SN->getAlign();
2916  if (Alignment < Size && isTypeLegal(VT)) {
2917  bool IsFast;
2918  unsigned AS = SN->getAddressSpace();
2919 
2920  // Expand unaligned stores earlier than legalization. Due to visitation
2921  // order problems during legalization, the emitted instructions to pack and
2922  // unpack the bytes again are not eliminated in the case of an unaligned
2923  // copy.
2924  if (!allowsMisalignedMemoryAccesses(
2925  VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
2926  if (VT.isVector())
2927  return SplitVectorStore(SDValue(SN, 0), DAG);
2928 
2929  return expandUnalignedStore(SN, DAG);
2930  }
2931 
2932  if (!IsFast)
2933  return SDValue();
2934  }
2935 
2936  if (!shouldCombineMemoryType(VT))
2937  return SDValue();
2938 
2939  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2940  SDValue Val = SN->getValue();
2941 
2942  //DCI.AddToWorklist(Val.getNode());
2943 
2944  bool OtherUses = !Val.hasOneUse();
2945  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2946  if (OtherUses) {
2947  SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2948  DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2949  }
2950 
2951  return DAG.getStore(SN->getChain(), SL, CastVal,
2952  SN->getBasePtr(), SN->getMemOperand());
2953 }
2954 
2955 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2956 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2957 // issues.
2958 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2959  DAGCombinerInfo &DCI) const {
2960  SelectionDAG &DAG = DCI.DAG;
2961  SDValue N0 = N->getOperand(0);
2962 
2963  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2964  // (vt2 (truncate (assertzext vt0:x, vt1)))
2965  if (N0.getOpcode() == ISD::TRUNCATE) {
2966  SDValue N1 = N->getOperand(1);
2967  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2968  SDLoc SL(N);
2969 
2970  SDValue Src = N0.getOperand(0);
2971  EVT SrcVT = Src.getValueType();
2972  if (SrcVT.bitsGE(ExtVT)) {
2973  SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2974  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2975  }
2976  }
2977 
2978  return SDValue();
2979 }
2980 
2981 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
2982  SDNode *N, DAGCombinerInfo &DCI) const {
2983  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2984  switch (IID) {
2985  case Intrinsic::amdgcn_mul_i24:
2986  case Intrinsic::amdgcn_mul_u24:
2987  case Intrinsic::amdgcn_mulhi_i24:
2988  case Intrinsic::amdgcn_mulhi_u24:
2989  return simplifyMul24(N, DCI);
2990  case Intrinsic::amdgcn_fract:
2991  case Intrinsic::amdgcn_rsq:
2992  case Intrinsic::amdgcn_rcp_legacy:
2993  case Intrinsic::amdgcn_rsq_legacy:
2994  case Intrinsic::amdgcn_rsq_clamp:
2995  case Intrinsic::amdgcn_ldexp: {
2996  // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
2997  SDValue Src = N->getOperand(1);
2998  return Src.isUndef() ? Src : SDValue();
2999  }
3000  default:
3001  return SDValue();
3002  }
3003 }
3004 
3005 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3006 /// binary operation \p Opc to it with the corresponding constant operands.
3007 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3008  DAGCombinerInfo &DCI, const SDLoc &SL,
3009  unsigned Opc, SDValue LHS,
3010  uint32_t ValLo, uint32_t ValHi) const {
3011  SelectionDAG &DAG = DCI.DAG;
3012  SDValue Lo, Hi;
3013  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3014 
3015  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3016  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3017 
3018  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3019  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3020 
3021  // Re-visit the ands. It's possible we eliminated one of them and it could
3022  // simplify the vector.
3023  DCI.AddToWorklist(Lo.getNode());
3024  DCI.AddToWorklist(Hi.getNode());
3025 
3026  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3027  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3028 }
3029 
3030 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3031  DAGCombinerInfo &DCI) const {
3032  EVT VT = N->getValueType(0);
3033 
3034  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3035  if (!RHS)
3036  return SDValue();
3037 
3038  SDValue LHS = N->getOperand(0);
3039  unsigned RHSVal = RHS->getZExtValue();
3040  if (!RHSVal)
3041  return LHS;
3042 
3043  SDLoc SL(N);
3044  SelectionDAG &DAG = DCI.DAG;
3045 
3046  switch (LHS->getOpcode()) {
3047  default:
3048  break;
3049  case ISD::ZERO_EXTEND:
3050  case ISD::SIGN_EXTEND:
3051  case ISD::ANY_EXTEND: {
3052  SDValue X = LHS->getOperand(0);
3053 
3054  if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3055  isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3056  // Prefer build_vector as the canonical form if packed types are legal.
3057  // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3058  SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3059  { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3060  return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3061  }
3062 
3063  // shl (ext x) => zext (shl x), if shift does not overflow int
3064  if (VT != MVT::i64)
3065  break;
3066  KnownBits Known = DAG.computeKnownBits(X);
3067  unsigned LZ = Known.countMinLeadingZeros();
3068  if (LZ < RHSVal)
3069  break;
3070  EVT XVT = X.getValueType();
3071  SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3072  return DAG.getZExtOrTrunc(Shl, SL, VT);
3073  }
3074  }
3075 
3076  if (VT != MVT::i64)
3077  return SDValue();
3078 
3079  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3080 
3081  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3082  // common case, splitting this into a move and a 32-bit shift is faster and
3083  // the same code size.
3084  if (RHSVal < 32)
3085  return SDValue();
3086 
3087  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3088 
3089  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3090  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3091 
3092  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3093 
3094  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3095  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3096 }
3097 
3098 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3099  DAGCombinerInfo &DCI) const {
3100  if (N->getValueType(0) != MVT::i64)
3101  return SDValue();
3102 
3103  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3104  if (!RHS)
3105  return SDValue();
3106 
3107  SelectionDAG &DAG = DCI.DAG;
3108  SDLoc SL(N);
3109  unsigned RHSVal = RHS->getZExtValue();
3110 
3111  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3112  if (RHSVal == 32) {
3113  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3114  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3115  DAG.getConstant(31, SL, MVT::i32));
3116 
3117  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3118  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3119  }
3120 
3121  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3122  if (RHSVal == 63) {
3123  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3124  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3125  DAG.getConstant(31, SL, MVT::i32));
3126  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3127  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3128  }
3129 
3130  return SDValue();
3131 }
3132 
3133 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3134  DAGCombinerInfo &DCI) const {
3135  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3136  if (!RHS)
3137  return SDValue();
3138 
3139  EVT VT = N->getValueType(0);
3140  SDValue LHS = N->getOperand(0);
3141  unsigned ShiftAmt = RHS->getZExtValue();
3142  SelectionDAG &DAG = DCI.DAG;
3143  SDLoc SL(N);
3144 
3145  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3146  // this improves the ability to match BFE patterns in isel.
3147  if (LHS.getOpcode() == ISD::AND) {
3148  if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3149  unsigned MaskIdx, MaskLen;
3150  if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3151  MaskIdx == ShiftAmt) {
3152  return DAG.getNode(
3153  ISD::AND, SL, VT,
3154  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3155  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3156  }
3157  }
3158  }
3159 
3160  if (VT != MVT::i64)
3161  return SDValue();
3162 
3163  if (ShiftAmt < 32)
3164  return SDValue();
3165 
3166  // srl i64:x, C for C >= 32
3167  // =>
3168  // build_pair (srl hi_32(x), C - 32), 0
3169  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3170 
3171  SDValue Hi = getHiHalf64(LHS, DAG);
3172 
3173  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3174  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3175 
3176  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3177 
3178  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3179 }
3180 
3181 SDValue AMDGPUTargetLowering::performTruncateCombine(
3182  SDNode *N, DAGCombinerInfo &DCI) const {
3183  SDLoc SL(N);
3184  SelectionDAG &DAG = DCI.DAG;
3185  EVT VT = N->getValueType(0);
3186  SDValue Src = N->getOperand(0);
3187 
3188  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3189  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3190  SDValue Vec = Src.getOperand(0);
3191  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3192  SDValue Elt0 = Vec.getOperand(0);
3193  EVT EltVT = Elt0.getValueType();
3194  if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3195  if (EltVT.isFloatingPoint()) {
3196  Elt0 = DAG.getNode(ISD::BITCAST, SL,
3197  EltVT.changeTypeToInteger(), Elt0);
3198  }
3199 
3200  return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3201  }
3202  }
3203  }
3204 
3205  // Equivalent of above for accessing the high element of a vector as an
3206  // integer operation.
3207  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3208  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3209  if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3210  if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3211  SDValue BV = stripBitcast(Src.getOperand(0));
3212  if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3213  BV.getValueType().getVectorNumElements() == 2) {
3214  SDValue SrcElt = BV.getOperand(1);
3215  EVT SrcEltVT = SrcElt.getValueType();
3216  if (SrcEltVT.isFloatingPoint()) {
3217  SrcElt = DAG.getNode(ISD::BITCAST, SL,
3218  SrcEltVT.changeTypeToInteger(), SrcElt);
3219  }
3220 
3221  return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3222  }
3223  }
3224  }
3225  }
3226 
3227  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3228  //
3229  // i16 (trunc (srl i64:x, K)), K <= 16 ->
3230  // i16 (trunc (srl (i32 (trunc x), K)))
3231  if (VT.getScalarSizeInBits() < 32) {
3232  EVT SrcVT = Src.getValueType();
3233  if (SrcVT.getScalarSizeInBits() > 32 &&
3234  (Src.getOpcode() == ISD::SRL ||
3235  Src.getOpcode() == ISD::SRA ||
3236  Src.getOpcode() == ISD::SHL)) {
3237  SDValue Amt = Src.getOperand(1);
3238  KnownBits Known = DAG.computeKnownBits(Amt);
3239  unsigned Size = VT.getScalarSizeInBits();
3240  if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3241  (Known.countMaxActiveBits() <= Log2_32(Size))) {
3242  EVT MidVT = VT.isVector() ?
3243  EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3244  VT.getVectorNumElements()) : MVT::i32;
3245 
3246  EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3247  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3248  Src.getOperand(0));
3249  DCI.AddToWorklist(Trunc.getNode());
3250 
3251  if (Amt.getValueType() != NewShiftVT) {
3252  Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3253  DCI.AddToWorklist(Amt.getNode());
3254  }
3255 
3256  SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3257  Trunc, Amt);
3258  return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3259  }
3260  }
3261  }
3262 
3263  return SDValue();
3264 }
3265 
3266 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3267 // instructions. If we only match on the legalized i64 mul expansion,
3268 // SimplifyDemandedBits will be unable to remove them because there will be
3269 // multiple uses due to the separate mul + mulh[su].
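// Sketch of how getMul24 below assembles a wide product from the 24-bit ops
// (informal, Size > 32 case):
//   lo = mul_{u,i}24(a, b);     // low 32 bits of the product
//   hi = mulhi_{u,i}24(a, b);   // high 32 bits of the product
//   result = build_pair(lo, hi);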
3270 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3271  SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3272  if (Size <= 32) {
3273  unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3274  return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3275  }
3276 
3277  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3278  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3279 
3280  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3281  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3282 
3283  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3284 }
3285 
3286 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3287  DAGCombinerInfo &DCI) const {
3288  EVT VT = N->getValueType(0);
3289 
3290  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3291  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3292  // unnecessarily). isDivergent() is used as an approximation of whether the
3293  // value is in an SGPR.
3294  if (!N->isDivergent())
3295  return SDValue();
3296 
3297  unsigned Size = VT.getSizeInBits();
3298  if (VT.isVector() || Size > 64)
3299  return SDValue();
3300 
3301  // There are i16 integer mul/mad.
3302  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3303  return SDValue();
3304 
3305  SelectionDAG &DAG = DCI.DAG;
3306  SDLoc DL(N);
3307 
3308  SDValue N0 = N->getOperand(0);
3309  SDValue N1 = N->getOperand(1);
3310 
3311  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3312  // in the source into any_extends if the result of the mul is truncated. Since
3313  // we can assume the high bits are whatever we want, use the underlying value
3314  // to avoid the unknown high bits from interfering.
3315  if (N0.getOpcode() == ISD::ANY_EXTEND)
3316  N0 = N0.getOperand(0);
3317 
3318  if (N1.getOpcode() == ISD::ANY_EXTEND)
3319  N1 = N1.getOperand(0);
3320 
3321  SDValue Mul;
3322 
3323  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3324  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3325  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3326  Mul = getMul24(DAG, DL, N0, N1, Size, false);
3327  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3328  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3329  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3330  Mul = getMul24(DAG, DL, N0, N1, Size, true);
3331  } else {
3332  return SDValue();
3333  }
3334 
3335  // We need to use sext even for MUL_U24, because MUL_U24 is used
3336  // for signed multiply of 8 and 16-bit types.
3337  return DAG.getSExtOrTrunc(Mul, DL, VT);
3338 }
3339 
3340 SDValue
3341 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3342  DAGCombinerInfo &DCI) const {
3343  if (N->getValueType(0) != MVT::i32)
3344  return SDValue();
3345 
3346  SelectionDAG &DAG = DCI.DAG;
3347  SDLoc DL(N);
3348 
3349  SDValue N0 = N->getOperand(0);
3350  SDValue N1 = N->getOperand(1);
3351 
3352  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3353  // in the source into any_extends if the result of the mul is truncated. Since
3354  // we can assume the high bits are whatever we want, use the underlying value
3355  // to avoid the unknown high bits from interfering.
3356  if (N0.getOpcode() == ISD::ANY_EXTEND)
3357  N0 = N0.getOperand(0);
3358  if (N1.getOpcode() == ISD::ANY_EXTEND)
3359  N1 = N1.getOperand(0);
3360 
3361  // Try to use two fast 24-bit multiplies (one for each half of the result)
3362  // instead of one slow extending multiply.
3363  unsigned LoOpcode, HiOpcode;
3364  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3365  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3366  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3367  LoOpcode = AMDGPUISD::MUL_U24;
3368  HiOpcode = AMDGPUISD::MULHI_U24;
3369  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3370  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3371  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3372  LoOpcode = AMDGPUISD::MUL_I24;
3373  HiOpcode = AMDGPUISD::MULHI_I24;
3374  } else {
3375  return SDValue();
3376  }
3377 
3378  SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3379  SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3380  DCI.CombineTo(N, Lo, Hi);
3381  return SDValue(N, 0);
3382 }
3383 
3384 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3385  DAGCombinerInfo &DCI) const {
3386  EVT VT = N->getValueType(0);
3387 
3388  if (!Subtarget->hasMulI24() || VT.isVector())
3389  return SDValue();
3390 
3391  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3392  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3393  // unnecessarily). isDivergent() is used as an approximation of whether the
3394  // value is in an SGPR.
3395  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3396  // valu op anyway)
3397  if (Subtarget->hasSMulHi() && !N->isDivergent())
3398  return SDValue();
3399 
3400  SelectionDAG &DAG = DCI.DAG;
3401  SDLoc DL(N);
3402 
3403  SDValue N0 = N->getOperand(0);
3404  SDValue N1 = N->getOperand(1);
3405 
3406  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3407  return SDValue();
3408 
3409  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3410  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3411 
3412  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3413  DCI.AddToWorklist(Mulhi.getNode());
3414  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3415 }
3416 
3417 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3418  DAGCombinerInfo &DCI) const {
3419  EVT VT = N->getValueType(0);
3420 
3421  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3422  return SDValue();
3423 
3424  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3425  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3426  // unnecessarily). isDivergent() is used as an approximation of whether the
3427  // value is in an SGPR.
3428  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3429  // valu op anyway)
3430  if (Subtarget->hasSMulHi() && !N->isDivergent())
3431  return SDValue();
3432 
3433  SelectionDAG &DAG = DCI.DAG;
3434  SDLoc DL(N);
3435 
3436  SDValue N0 = N->getOperand(0);
3437  SDValue N1 = N->getOperand(1);
3438 
3439  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3440  return SDValue();
3441 
3442  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3443  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3444 
3445  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3446  DCI.AddToWorklist(Mulhi.getNode());
3447  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3448 }
3449 
3450 static bool isNegativeOne(SDValue Val) {
3451  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3452  return C->isAllOnes();
3453  return false;
3454 }
3455 
3456 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3457  SDValue Op,
3458  const SDLoc &DL,
3459  unsigned Opc) const {
3460  EVT VT = Op.getValueType();
3461  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3462  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3463  LegalVT != MVT::i16))
3464  return SDValue();
3465 
3466  if (VT != MVT::i32)
3467  Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3468 
3469  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3470  if (VT != MVT::i32)
3471  FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3472 
3473  return FFBX;
3474 }
3475 
3476 // The native instructions return -1 on 0 input. Optimize out a select that
3477 // produces -1 on 0.
3478 //
3479 // TODO: If zero is not undef, we could also do this if the output is compared
3480 // against the bitwidth.
3481 //
3482 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3483 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3484  SDValue LHS, SDValue RHS,
3485  DAGCombinerInfo &DCI) const {
3486  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3487  if (!CmpRhs || !CmpRhs->isZero())
3488  return SDValue();
3489 
3490  SelectionDAG &DAG = DCI.DAG;
3491  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3492  SDValue CmpLHS = Cond.getOperand(0);
3493 
3494  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3495  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3496  if (CCOpcode == ISD::SETEQ &&
3497  (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3498  RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3499  unsigned Opc =
3500  isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3501  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3502  }
3503 
3504  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3505  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3506  if (CCOpcode == ISD::SETNE &&
3507  (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3508  LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3509  unsigned Opc =
3510  isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3511 
3512  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3513  }
3514 
3515  return SDValue();
3516 }
3517 
3518 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3519  unsigned Op,
3520  const SDLoc &SL,
3521  SDValue Cond,
3522  SDValue N1,
3523  SDValue N2) {
3524  SelectionDAG &DAG = DCI.DAG;
3525  EVT VT = N1.getValueType();
3526 
3527  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3528  N1.getOperand(0), N2.getOperand(0));
3529  DCI.AddToWorklist(NewSelect.getNode());
3530  return DAG.getNode(Op, SL, VT, NewSelect);
3531 }
3532 
3533 // Pull a free FP operation out of a select so it may fold into uses.
3534 //
3535 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3536 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3537 //
3538 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3539 // select c, (fabs x), +k -> fabs (select c, x, k)
3540 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3541  SDValue N) {
3542  SelectionDAG &DAG = DCI.DAG;
3543  SDValue Cond = N.getOperand(0);
3544  SDValue LHS = N.getOperand(1);
3545  SDValue RHS = N.getOperand(2);
3546 
3547  EVT VT = N.getValueType();
3548  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3549  (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3550  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3551  SDLoc(N), Cond, LHS, RHS);
3552  }
3553 
3554  bool Inv = false;
3555  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3556  std::swap(LHS, RHS);
3557  Inv = true;
3558  }
3559 
3560  // TODO: Support vector constants.
3561  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3562  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3563  SDLoc SL(N);
3564  // If one side is an fneg/fabs and the other is a constant, we can push the
3565  // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3566  SDValue NewLHS = LHS.getOperand(0);
3567  SDValue NewRHS = RHS;
3568 
3569  // Careful: if the neg can be folded up, don't try to pull it back down.
3570  bool ShouldFoldNeg = true;
3571 
3572  if (NewLHS.hasOneUse()) {
3573  unsigned Opc = NewLHS.getOpcode();
3574  if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3575  ShouldFoldNeg = false;
3576  if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3577  ShouldFoldNeg = false;
3578  }
3579 
3580  if (ShouldFoldNeg) {
3581  if (LHS.getOpcode() == ISD::FNEG)
3582  NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3583  else if (CRHS->isNegative())
3584  return SDValue();
3585 
3586  if (Inv)
3587  std::swap(NewLHS, NewRHS);
3588 
3589  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3590  Cond, NewLHS, NewRHS);
3591  DCI.AddToWorklist(NewSelect.getNode());
3592  return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3593  }
3594  }
3595 
3596  return SDValue();
3597 }
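// Worked example of the constant case above (illustrative):
//   select c, (fneg x), 2.0  ->  fneg (select c, x, -2.0)
// The negate moves out of the select, where it can later fold into a user as
// a source modifier; for fabs the constant operand must be non-negative,
// which is why a negative constant bails out above.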
3598 
3599 
3600 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3601  DAGCombinerInfo &DCI) const {
3602  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3603  return Folded;
3604 
3605  SDValue Cond = N->getOperand(0);
3606  if (Cond.getOpcode() != ISD::SETCC)
3607  return SDValue();
3608 
3609  EVT VT = N->getValueType(0);
3610  SDValue LHS = Cond.getOperand(0);
3611  SDValue RHS = Cond.getOperand(1);
3612  SDValue CC = Cond.getOperand(2);
3613 
3614  SDValue True = N->getOperand(1);
3615  SDValue False = N->getOperand(2);
3616 
3617  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3618  SelectionDAG &DAG = DCI.DAG;
3619  if (DAG.isConstantValueOfAnyType(True) &&
3620  !DAG.isConstantValueOfAnyType(False)) {
3621  // Swap cmp + select pair to move constant to false input.
3622  // This will allow using VOPC cndmasks more often.
3623  // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3624 
3625  SDLoc SL(N);
3626  ISD::CondCode NewCC =
3627  getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3628 
3629  SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3630  return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3631  }
3632 
3633  if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3634  SDValue MinMax
3635  = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3636  // Revisit this node so we can catch min3/max3/med3 patterns.
3637  //DCI.AddToWorklist(MinMax.getNode());
3638  return MinMax;
3639  }
3640  }
3641 
3642  // There's no reason not to do this if the condition has other uses.
3643  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3644 }
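// Illustrative example of the swap above:
//   select (setcc x, y, slt), k, x  ->  select (setcc x, y, sge), x, k
// Inverting the condition moves the constant to the false operand, which
// tends to map better onto VOPC + v_cndmask sequences.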
3645 
3646 static bool isInv2Pi(const APFloat &APF) {
3647  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3648  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3649  static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3650 
3651  return APF.bitwiseIsEqual(KF16) ||
3652  APF.bitwiseIsEqual(KF32) ||
3653  APF.bitwiseIsEqual(KF64);
3654 }
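// The three bit patterns above encode 1.0 / (2.0 * pi) ~= 0.15915494 in half,
// single, and double precision respectively (e.g. 0x3e22f983 is that value as
// an f32), which some subtargets accept as an inline immediate.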
3655 
3656 // -0 and -1.0 / (2.0 * pi) do not have inline immediates, so there is an
3657 // additional cost to negate 0 and 1.0 / (2.0 * pi).
3658 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3659  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3660  if (C->isZero() && !C->isNegative())
3661  return true;
3662 
3663  if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3664  return true;
3665  }
3666 
3667  return false;
3668 }
3669 
3670 static unsigned inverseMinMax(unsigned Opc) {
3671  switch (Opc) {
3672  case ISD::FMAXNUM:
3673  return ISD::FMINNUM;
3674  case ISD::FMINNUM:
3675  return ISD::FMAXNUM;
3676  case ISD::FMAXNUM_IEEE:
3677  return ISD::FMINNUM_IEEE;
3678  case ISD::FMINNUM_IEEE:
3679  return ISD::FMAXNUM_IEEE;
3680  case AMDGPUISD::FMAX_LEGACY:
3681  return AMDGPUISD::FMIN_LEGACY;
3682  case AMDGPUISD::FMIN_LEGACY:
3683  return AMDGPUISD::FMAX_LEGACY;
3684  default:
3685  llvm_unreachable("invalid min/max opcode");
3686  }
3687 }
3688 
3689 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3690  DAGCombinerInfo &DCI) const {
3691  SelectionDAG &DAG = DCI.DAG;
3692  SDValue N0 = N->getOperand(0);
3693  EVT VT = N->getValueType(0);
3694 
3695  unsigned Opc = N0.getOpcode();
3696 
3697  // If the input has multiple uses and we can either fold the negate down, or
3698  // the other uses cannot, give up. This both prevents unprofitable
3699  // transformations and infinite loops: we won't repeatedly try to fold around
3700  // a negate that has no 'good' form.
3701  if (N0.hasOneUse()) {
3702  // This may be able to fold into the source, but at a code size cost. Don't
3703  // fold if the fold into the user is free.
3704  if (allUsesHaveSourceMods(N, 0))
3705  return SDValue();
3706  } else {
3707  if (fnegFoldsIntoOp(Opc) &&
3708  (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3709  return SDValue();
3710  }
3711 
3712  SDLoc SL(N);
3713  switch (Opc) {
3714  case ISD::FADD: {
3715  if (!mayIgnoreSignedZero(N0))
3716  return SDValue();
3717 
3718  // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3719  SDValue LHS = N0.getOperand(0);
3720  SDValue RHS = N0.getOperand(1);
3721 
3722  if (LHS.getOpcode() != ISD::FNEG)
3723  LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3724  else
3725  LHS = LHS.getOperand(0);
3726 
3727  if (RHS.getOpcode() != ISD::FNEG)
3728  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3729  else
3730  RHS = RHS.getOperand(0);
3731 
3732  SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3733  if (Res.getOpcode() != ISD::FADD)
3734  return SDValue(); // Op got folded away.
3735  if (!N0.hasOneUse())
3736  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3737  return Res;
3738  }
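// Rough illustration of the FADD case above: under no-signed-zeros,
//   fneg (fadd a, b)  ->  fadd (fneg a), (fneg b)
// so each negate can later be absorbed as a source modifier on the add's
// operands instead of requiring a separate negate of the sum.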
3739  case ISD::FMUL:
3740  case AMDGPUISD::FMUL_LEGACY: {
3741  // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3742  // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3743  SDValue LHS = N0.getOperand(0);
3744  SDValue RHS = N0.getOperand(1);
3745 
3746  if (LHS.getOpcode() == ISD::FNEG)
3747  LHS = LHS.getOperand(0);
3748  else if (RHS.getOpcode() == ISD::FNEG)
3749  RHS = RHS.getOperand(0);
3750  else
3751  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3752 
3753  SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3754  if (Res.getOpcode() != Opc)
3755  return SDValue(); // Op got folded away.
3756  if (!N0.hasOneUse())
3757  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3758  return Res;
3759  }
3760  case ISD::FMA:
3761  case ISD::FMAD: {
3762  // TODO: handle llvm.amdgcn.fma.legacy
3763  if (!mayIgnoreSignedZero(N0))
3764  return SDValue();
3765 
3766  // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3767  SDValue LHS = N0.getOperand(0);
3768  SDValue MHS = N0.getOperand(1);
3769  SDValue RHS = N0.getOperand(2);
3770 
3771  if (LHS.getOpcode() == ISD::FNEG)
3772  LHS = LHS.getOperand(0);
3773  else if (MHS.getOpcode() == ISD::FNEG)
3774  MHS = MHS.getOperand(0);
3775  else
3776  MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3777 
3778  if (RHS.getOpcode() != ISD::FNEG)
3779  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3780  else
3781  RHS = RHS.getOperand(0);
3782 
3783  SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3784  if (Res.getOpcode() != Opc)
3785  return SDValue(); // Op got folded away.
3786  if (!N0.hasOneUse())
3787  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3788  return Res;
3789  }
3790  case ISD::FMAXNUM:
3791  case ISD::FMINNUM:
3792  case ISD::FMAXNUM_IEEE:
3793  case ISD::FMINNUM_IEEE:
3794  case AMDGPUISD::FMAX_LEGACY:
3795  case AMDGPUISD::FMIN_LEGACY: {
3796  // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3797  // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3798  // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3799  // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3800 
3801  SDValue LHS = N0.getOperand(0);
3802  SDValue RHS = N0.getOperand(1);
3803 
3804  // 0 doesn't have a negated inline immediate.
3805  // TODO: This constant check should be generalized to other operations.
3806  if (isConstantCostlierToNegate(RHS))
3807  return SDValue();
3808 
3809  SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3810  SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3811  unsigned Opposite = inverseMinMax(Opc);
3812 
3813  SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3814  if (Res.getOpcode() != Opposite)
3815  return SDValue(); // Op got folded away.
3816  if (!N0.hasOneUse())
3817  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3818  return Res;
3819  }
3820  case AMDGPUISD::FMED3: {
3821  SDValue Ops[3];
3822  for (unsigned I = 0; I < 3; ++I)
3823  Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3824 
3825  SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3826  if (Res.getOpcode() != AMDGPUISD::FMED3)
3827  return SDValue(); // Op got folded away.
3828 
3829  if (!N0.hasOneUse()) {
3830  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
3831  DAG.ReplaceAllUsesWith(N0, Neg);
3832 
3833  for (SDNode *U : Neg->uses())
3834  DCI.AddToWorklist(U);
3835  }
3836 
3837  return Res;
3838  }
3839  case ISD::FP_EXTEND:
3840  case ISD::FTRUNC:
3841  case ISD::FRINT:
3842  case ISD::FNEARBYINT: // XXX - Should fround be handled?
3843  case ISD::FSIN:
3844  case ISD::FCANONICALIZE:
3845  case AMDGPUISD::RCP:
3846  case AMDGPUISD::RCP_LEGACY:
3847  case AMDGPUISD::RCP_IFLAG:
3848  case AMDGPUISD::SIN_HW: {
3849  SDValue CvtSrc = N0.getOperand(0);
3850  if (CvtSrc.getOpcode() == ISD::FNEG) {
3851  // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3852  // (fneg (rcp (fneg x))) -> (rcp x)
3853  return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3854  }
3855 
3856  if (!N0.hasOneUse())
3857  return SDValue();
3858 
3859  // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3860  // (fneg (rcp x)) -> (rcp (fneg x))
3861  SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3862  return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3863  }
3864  case ISD::FP_ROUND: {
3865  SDValue CvtSrc = N0.getOperand(0);
3866 
3867  if (CvtSrc.getOpcode() == ISD::FNEG) {
3868  // (fneg (fp_round (fneg x))) -> (fp_round x)
3869  return DAG.getNode(ISD::FP_ROUND, SL, VT,
3870  CvtSrc.getOperand(0), N0.getOperand(1));
3871  }
3872 
3873  if (!N0.hasOneUse())
3874  return SDValue();
3875 
3876  // (fneg (fp_round x)) -> (fp_round (fneg x))
3877  SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3878  return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3879  }
3880  case ISD::FP16_TO_FP: {
3881  // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3882  // f16, but legalization of f16 fneg ends up pulling it out of the source.
3883  // Put the fneg back as a legal source operation that can be matched later.
3884  SDLoc SL(N);
3885 
3886  SDValue Src = N0.getOperand(0);
3887  EVT SrcVT = Src.getValueType();
3888 
3889  // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3890  SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,