LLVM 14.0.0git — AMDGPUISelLowering.cpp
NOTE: doxygen-rendered source listing. Original source line numbers are embedded
at the start of each line, and many lines were lost during extraction.
1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUMachineFunction.h"
19 #include "GCNSubtarget.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/Analysis.h"
22 #include "llvm/IR/DiagnosticInfo.h"
23 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/Support/KnownBits.h"
27 
28 using namespace llvm;
29 
30 #include "AMDGPUGenCallingConv.inc"
31 
// Command-line flag controlling 64-bit integer division bypass (see
// addBypassSlowDiv(64, 32) in the constructor below).
// NOTE(review): the opening of this definition (orig line 32, presumably
// "static cl::opt<bool> AMDGPUBypassSlowDiv(") was lost in extraction — restore
// from upstream before compiling.
33  "amdgpu-bypass-slow-div",
34  cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
35  cl::init(true));
36 
37 // Find a larger type to do a load / store of a vector with.
// NOTE(review): the signature line (orig 38) was lost in extraction — restore
// from upstream. Stores of <= 32 bits use a single iN integer; larger stores
// must be a whole number of dwords and use a vNi32 vector.
39  unsigned StoreSize = VT.getStoreSizeInBits();
40  if (StoreSize <= 32)
41  return EVT::getIntegerVT(Ctx, StoreSize);
42 
43  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
44  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
45 }
46 
// Number of bits needed to hold Op as an unsigned value: the type width minus
// the leading zero bits that computeKnownBits can prove.
// NOTE(review): signature line (orig 47) lost in extraction — restore upstream.
48  EVT VT = Op.getValueType();
49  KnownBits Known = DAG.computeKnownBits(Op);
50  return VT.getSizeInBits() - Known.countMinLeadingZeros();
51 }
52 
// Number of bits needed to hold Op as a signed value: width minus the provable
// redundant sign bits.
// NOTE(review): signature line (orig 53) lost in extraction — restore upstream.
54  EVT VT = Op.getValueType();
55 
56  // In order for this to be a signed 24-bit value, bit 23 must
57  // be a sign bit.
58  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
59 }
60 
// AMDGPUTargetLowering constructor: configures operation legality, load/store
// extension actions, libcall removal, and store-merging limits shared by all
// AMDGPU subtargets.
// NOTE(review): the constructor's opening line (orig 61) and the vast majority
// of its setOperationAction / setLoadExtAction / setTruncStoreAction calls
// were dropped by the extraction (every bare line number below marks a lost
// line). Restore the body from upstream LLVM before attempting to compile.
62  const AMDGPUSubtarget &STI)
63  : TargetLowering(TM), Subtarget(&STI) {
64  // Lower floating point store/load to integer store/load to reduce the number
65  // of patterns in tablegen.
68 
71 
74 
77 
80 
83 
86 
89 
92 
95 
98 
101 
104 
107 
110 
113 
116 
119 
122 
125 
128 
131 
132  // There are no 64-bit extloads. These should be done as a 32-bit extload and
133  // an extension to 64-bit.
134  for (MVT VT : MVT::integer_valuetypes()) {
138  }
139 
140  for (MVT VT : MVT::integer_valuetypes()) {
141  if (VT == MVT::i64)
142  continue;
143 
148 
153 
158  }
159 
176  }
177 
185 
192 
199 
202 
205 
208 
211 
214 
217 
220 
223 
226 
229 
232 
235 
238 
241 
244 
247 
250 
253 
256 
259 
262 
265 
270 
275 
283 
286 
289 
294 
299 
302 
310 
315 
318 
319  // This is totally unsupported, just custom lower to produce an error.
321 
322  // Library functions. These default to Expand, but we have instructions
323  // for them.
334 
337 
341 
342 
345 
349 
350  // Expand to fneg + fadd.
352 
395 
399 
400  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
401  for (MVT VT : ScalarIntVTs) {
402  // These should use [SU]DIVREM, so set them to expand
407 
408  // GPU does not have divrem function for signed or unsigned.
411 
412  // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
415 
419 
420  // AMDGPU uses ADDC/SUBC/ADDE/SUBE
425  }
426 
427  // The hardware supports 32-bit FSHR, but not FSHL.
429 
430  // The hardware supports 32-bit ROTR, but not ROTL.
434 
437 
446 
451 
456 
457  static const MVT::SimpleValueType VectorIntTypes[] = {
459 
460  for (MVT VT : VectorIntTypes) {
461  // Expand the following operations for the current type by default.
496  }
497 
498  static const MVT::SimpleValueType FloatVectorTypes[] = {
500 
501  for (MVT VT : FloatVectorTypes) {
532  }
533 
534  // This causes using an unrolled select operation rather than expansion with
535  // bit operations. This is in general better, but the alternative using BFI
536  // instructions may be better if the select sources are SGPRs.
539 
542 
545 
548 
551 
554 
555  // There are no libcalls of any kind.
556  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
557  setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
558 
560  setJumpIsExpensive(true);
561 
562  // FIXME: This is only partially true. If we have to do vector compares, any
563  // SGPR pair can be a condition register. If we have a uniform condition, we
564  // are better off doing SALU operations, where there is only one SCC. For now,
565  // we don't have a way of knowing during instruction selection if a condition
566  // will be uniform and we always use vector compares. Assume we are using
567  // vector compares until that is fixed.
569 
572 
574 
575  // We want to find all load dependencies for long chains of stores to enable
576  // merging into very wide vectors. The problem is with vectors with > 4
577  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
578  // vectors are a legal type, even though we have to split the loads
579  // usually. When we can more precisely specify load legality per address
580  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
581  // smarter so that they can figure out what to do in 2 iterations without all
582  // N > 4 stores on the same chain.
584 
585  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
586  // about these during lowering.
587  MaxStoresPerMemcpy = 0xffffffff;
588  MaxStoresPerMemmove = 0xffffffff;
589  MaxStoresPerMemset = 0xffffffff;
590 
591  // The expansion for 64-bit division is enormous.
593  addBypassSlowDiv(64, 32);
594 
613 }
614 
// Returns true when the sign of a floating-point zero may be ignored for Op:
// either globally via -enable-no-signed-zeros-fp-math, or per-node via the
// nsz fast-math flag.
// NOTE(review): signature line (orig 615) lost in extraction — presumably
// AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const; confirm upstream.
616  if (getTargetMachine().Options.NoSignedZerosFPMath)
617  return true;
618 
619  const auto Flags = Op.getNode()->getFlags();
620  if (Flags.hasNoSignedZeros())
621  return true;
622 
623  return false;
624 }
625 
626 //===----------------------------------------------------------------------===//
627 // Target Information
628 //===----------------------------------------------------------------------===//
629 
// Returns true if an fneg of the result of the given opcode can be folded into
// the operation itself (via source/output modifiers), so the negation is free.
// NOTE(review): extraction dropped the lines between the visible case labels
// (orig 648-649, 651-653) — additional AMDGPUISD case labels are missing here;
// restore from upstream.
631 static bool fnegFoldsIntoOp(unsigned Opc) {
632  switch (Opc) {
633  case ISD::FADD:
634  case ISD::FSUB:
635  case ISD::FMUL:
636  case ISD::FMA:
637  case ISD::FMAD:
638  case ISD::FMINNUM:
639  case ISD::FMAXNUM:
640  case ISD::FMINNUM_IEEE:
641  case ISD::FMAXNUM_IEEE:
642  case ISD::FSIN:
643  case ISD::FTRUNC:
644  case ISD::FRINT:
645  case ISD::FNEARBYINT:
646  case ISD::FCANONICALIZE:
647  case AMDGPUISD::RCP:
650  case AMDGPUISD::SIN_HW:
654  case AMDGPUISD::FMED3:
655  // TODO: handle llvm.amdgcn.fma.legacy
656  return true;
657  default:
658  return false;
659  }
660 }
661 
662 /// \p returns true if the operation will definitely need to use a 64-bit
663 /// encoding, and thus will use a VOP3 encoding regardless of the source
664 /// modifiers.
666 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
667  return N->getNumOperands() > 2 || VT == MVT::f64;
668 }
669 
670 // Most FP instructions support source modifiers, but this could be refined
671 // slightly.
// Returns false for node kinds that cannot absorb a source modifier (so an
// fneg feeding them would need a separate instruction); true otherwise.
// NOTE(review): extraction lost orig lines 672 (LLVM_READONLY?), 684-685 (the
// "return false;" and possibly more case labels for the list above), and 692
// (presumably "case ISD::INTRINSIC_WO_CHAIN: {"). Restore from upstream.
673 static bool hasSourceMods(const SDNode *N) {
674  if (isa<MemSDNode>(N))
675  return false;
676 
677  switch (N->getOpcode()) {
678  case ISD::CopyToReg:
679  case ISD::SELECT:
680  case ISD::FDIV:
681  case ISD::FREM:
682  case ISD::INLINEASM:
683  case ISD::INLINEASM_BR:
686 
687  // TODO: Should really be looking at the users of the bitcast. These are
688  // problematic because bitcasts are used to legalize all stores to integer
689  // types.
690  case ISD::BITCAST:
691  return false;
693  switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
694  case Intrinsic::amdgcn_interp_p1:
695  case Intrinsic::amdgcn_interp_p2:
696  case Intrinsic::amdgcn_interp_mov:
697  case Intrinsic::amdgcn_interp_p1_f16:
698  case Intrinsic::amdgcn_interp_p2_f16:
699  return false;
700  default:
701  return true;
702  }
703  }
704  default:
705  return true;
706  }
707 }
708 
// Returns true if every user of N can absorb a source modifier, and at most
// CostThreshold of them might grow to a larger (VOP3) encoding to do so.
// NOTE(review): the start of the signature (orig 709, presumably
// "bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,") was
// lost in extraction — restore from upstream.
710  unsigned CostThreshold) {
711  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
712  // it is truly free to use a source modifier in all cases. If there are
713  // multiple users but for each one will necessitate using VOP3, there will be
714  // a code size increase. Try to avoid increasing code size unless we know it
715  // will save on the instruction count.
716  unsigned NumMayIncreaseSize = 0;
717  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
718 
719  // XXX - Should this limit number of uses to check?
720  for (const SDNode *U : N->uses()) {
721  if (!hasSourceMods(U))
722  return false;
723 
724  if (!opMustUseVOP3Encoding(U, VT)) {
725  if (++NumMayIncreaseSize > CostThreshold)
726  return false;
727  }
728  }
729 
730  return true;
731 }
732 
// Chooses the type an illegal scalar return value is extended to: i32 for
// anything that fits, otherwise the size rounded up to a multiple of 32 bits.
// NOTE(review): the start of the signature (orig 733) was lost in extraction.
734  ISD::NodeType ExtendKind) const {
735  assert(!VT.isVector() && "only scalar expected");
736 
737  // Round to the next multiple of 32-bits.
738  unsigned Size = VT.getSizeInBits();
739  if (Size <= 32)
740  return MVT::i32;
741  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
742 }
743 
// Body of a one-line hook returning MVT::i32 (signature, orig 744, lost in
// extraction — presumably getVectorIdxTy; confirm against upstream).
745  return MVT::i32;
746 }
747 
// Body of a one-line predicate hook returning true (signature, orig 748, lost
// in extraction — confirm the hook name against upstream).
749  return true;
750 }
751 
752 // The backend supports 32 and 64 bit floating point immediates.
753 // FIXME: Why are we reporting vectors of FP immediates as legal?
// Legal for f32/f64 scalars, and f16 when the subtarget has 16-bit
// instructions; the check is on the scalar element type only.
// NOTE(review): the start of the signature (orig 754) was lost in extraction.
755  bool ForCodeSize) const {
756  EVT ScalarVT = VT.getScalarType();
757  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
758  (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
759 }
760 
761 // We don't want to shrink f64 / f32 constants.
// NOTE(review): signature line (orig 762) lost in extraction — presumably
// ShouldShrinkFPConstant(EVT VT); confirm against upstream.
763  EVT ScalarVT = VT.getScalarType();
764  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
765 }
766 
// Decides whether a load may be narrowed to NewVT: always fine down to a
// dword-multiple, but never shrink an aligned, likely-scalar load below 32
// bits, since the scalar memory unit cannot do sub-dword loads.
// NOTE(review): extraction lost the signature start (orig 767) and the middle
// of the condition below (orig 789-790, 793) — the visible condition is
// incomplete; restore from upstream before compiling.
768  ISD::LoadExtType ExtTy,
769  EVT NewVT) const {
770  // TODO: This may be worth removing. Check regression tests for diffs.
771  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
772  return false;
773 
774  unsigned NewSize = NewVT.getStoreSizeInBits();
775 
776  // If we are reducing to a 32-bit load or a smaller multi-dword load,
777  // this is always better.
778  if (NewSize >= 32)
779  return true;
780 
781  EVT OldVT = N->getValueType(0);
782  unsigned OldSize = OldVT.getStoreSizeInBits();
783 
784  MemSDNode *MN = cast<MemSDNode>(N);
785  unsigned AS = MN->getAddressSpace();
786  // Do not shrink an aligned scalar load to sub-dword.
787  // Scalar engine cannot do sub-dword loads.
788  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
791  (isa<LoadSDNode>(N) &&
792  AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
794  return false;
795 
796  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
797  // extloads, so doing one requires using a buffer_load. In cases where we
798  // still couldn't use a scalar load, using the wider load shouldn't really
799  // hurt anything.
800 
801  // If the old size already had to be an extload, there's no harm in continuing
802  // to reduce the width.
803  return (OldSize < 32);
804 }
805 
// Returns whether folding a bitcast into a same-sized load is profitable:
// rejected for i32-scalar loads and for casts that only shrink sub-dword
// elements; otherwise allowed when the memory access is legal and fast.
// NOTE(review): extraction lost the signature start (orig 806) and orig 822
// (presumably the "return allowsMemoryAccessForAlignment(...," call opening).
807  const SelectionDAG &DAG,
808  const MachineMemOperand &MMO) const {
809 
810  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
811 
812  if (LoadTy.getScalarType() == MVT::i32)
813  return false;
814 
815  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
816  unsigned CastScalarSize = CastTy.getScalarSizeInBits();
817 
818  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
819  return false;
820 
821  bool Fast = false;
823  CastTy, MMO, &Fast) &&
824  Fast;
825 }
826 
827 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
828 // profitable with the expansion for 64-bit since it's generally good to
829 // speculate things.
830 // FIXME: These should really have the size as a parameter.
// NOTE(review): signature line (orig 831) lost in extraction — presumably
// isCheapToSpeculateCttz(); confirm against upstream.
832  return true;
833 }
834 
// NOTE(review): signature line (orig 835) lost in extraction — presumably
// isCheapToSpeculateCtlz(); confirm against upstream.
836  return true;
837 }
838 
// Returns true for nodes whose result is uniform across a wavefront:
// chain-only tokens, readfirstlane/readlane intrinsics, and certain loads.
// NOTE(review): extraction lost the signature (orig 839), orig 844 (presumably
// "case ISD::INTRINSIC_WO_CHAIN: {"), and orig 855 (the address-space constant
// being compared, closing the condition). Restore from upstream.
840  switch (N->getOpcode()) {
841  case ISD::EntryToken:
842  case ISD::TokenFactor:
843  return true;
845  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
846  switch (IntrID) {
847  case Intrinsic::amdgcn_readfirstlane:
848  case Intrinsic::amdgcn_readlane:
849  return true;
850  }
851  return false;
852  }
853  case ISD::LOAD:
854  if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
856  return true;
857  return false;
858  }
859  return false;
860 }
861 
// Target override of getNegatedExpression: refuses to negate an FMA/FMAD whose
// users cannot all absorb a source modifier (the negation would not be free),
// otherwise defers to the generic TargetLowering implementation.
// NOTE(review): the start of the signature (orig 862) was lost in extraction.
863  SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
864  NegatibleCost &Cost, unsigned Depth) const {
865 
866  switch (Op.getOpcode()) {
867  case ISD::FMA:
868  case ISD::FMAD: {
869  // Negating a fma is not free if it has users without source mods.
870  if (!allUsesHaveSourceMods(Op.getNode()))
871  return SDValue();
872  break;
873  }
874  default:
875  break;
876  }
877 
878  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
879  ForCodeSize, Cost, Depth);
880 }
881 
882 //===---------------------------------------------------------------------===//
883 // Target Properties
884 //===---------------------------------------------------------------------===//
885 
// fabs is free via the source modifier for f32/f64, and for f16 with 16-bit
// instructions; packed (vector) types are excluded.
// NOTE(review): signature line (orig 886) lost in extraction.
887  assert(VT.isFloatingPoint());
888 
889  // Packed operations do not have a fabs modifier.
890  return VT == MVT::f32 || VT == MVT::f64 ||
891  (Subtarget->has16BitInsts() && VT == MVT::f16);
892 }
893 
// fneg is free via the source modifier for any f16/f32/f64 scalar element.
// NOTE(review): signature line (orig 894) lost in extraction.
895  assert(VT.isFloatingPoint());
896  // Report this based on the end legalized type.
897  VT = VT.getScalarType();
898  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
899 }
900 
// Unconditionally-true predicate hook taking (…, unsigned NumElem, unsigned AS).
// NOTE(review): the start of the signature (orig 900-901) was lost in
// extraction — presumably storeOfVectorConstantIsCheap; confirm upstream.
902  unsigned NumElem,
903  unsigned AS) const {
904  return true;
905 }
906 
// NOTE(review): signature line (orig 907) lost in extraction — presumably
// aggressivelyPreferBuildVectorSources(EVT VecVT); confirm upstream.
908  // There are few operations which truly have vector input operands. Any vector
909  // operation is going to involve operations on each component, and a
910  // build_vector will be a copy per element, so it always makes sense to use a
911  // build_vector input in place of the extracted element to avoid a copy into a
912  // super register.
913  //
914  // We should probably only do this if all users are extracts only, but this
915  // should be the common case.
916  return true;
917 }
918 
// isTruncateFree (EVT overload): free when the destination is a smaller,
// dword-multiple-sized type — i.e. the truncate is a subregister access.
// NOTE(review): signature line (orig 919) lost in extraction.
920  // Truncate is just accessing a subregister.
921 
922  unsigned SrcSize = Source.getSizeInBits();
923  unsigned DestSize = Dest.getSizeInBits();
924 
925  return DestSize < SrcSize && DestSize % 32 == 0 ;
926 }
927 
// isTruncateFree (Type* overload): additionally treats truncation to 16 bits
// as free when the subtarget has 16-bit instructions and the source is at
// least a dword.
// NOTE(review): signature line (orig 928) lost in extraction.
929  // Truncate is just accessing a subregister.
930 
931  unsigned SrcSize = Source->getScalarSizeInBits();
932  unsigned DestSize = Dest->getScalarSizeInBits();
933 
934  if (DestSize== 16 && Subtarget->has16BitInsts())
935  return SrcSize >= 32;
936 
937  return DestSize < SrcSize && DestSize % 32 == 0;
938 }
939 
940 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
941  unsigned SrcSize = Src->getScalarSizeInBits();
942  unsigned DestSize = Dest->getScalarSizeInBits();
943 
944  if (SrcSize == 16 && Subtarget->has16BitInsts())
945  return DestSize >= 32;
946 
947  return SrcSize == 32 && DestSize == 64;
948 }
949 
950 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
951  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
952  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
953  // this will enable reducing 64-bit operations the 32-bit, which is always
954  // good.
955 
956  if (Src == MVT::i16)
957  return Dest == MVT::i32 ||Dest == MVT::i64 ;
958 
959  return Src == MVT::i32 && Dest == MVT::i64;
960 }
961 
// SDValue convenience overload forwarding to isZExtFree(EVT, EVT).
// NOTE(review): signature line (orig 962) lost in extraction.
963  return isZExtFree(Val.getValueType(), VT2);
964 }
965 
// NOTE(review): signature line (orig 966) lost in extraction — presumably
// isNarrowingProfitable(EVT SrcVT, EVT DestVT); confirm upstream.
967  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
968  // limited number of native 64-bit operations. Shrinking an operation to fit
969  // in a single 32-bit register should always be helpful. As currently used,
970  // this is much less general than the name suggests, and is only used in
971  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
972  // not profitable, and may actually be harmful.
973  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
974 }
975 
976 //===---------------------------------------------------------------------===//
977 // TargetLowering Callbacks
978 //===---------------------------------------------------------------------===//
979 
// Maps a calling convention to the CCAssignFn used for incoming call
// arguments.
// NOTE(review): extraction lost the signature start (orig 980) and several
// case labels (orig 983-989: the shader calling-convention cases returning
// CC_AMDGPU; orig 995, 997-998: the Gfx and remaining cases). Restore from
// upstream before compiling.
981  bool IsVarArg) {
982  switch (CC) {
990  return CC_AMDGPU;
991  case CallingConv::C:
992  case CallingConv::Fast:
993  case CallingConv::Cold:
994  return CC_AMDGPU_Func;
996  return CC_SI_Gfx;
999  default:
1000  report_fatal_error("Unsupported calling convention for call");
1001  }
1002 }
1003 
// Maps a calling convention to the CCAssignFn used for return values; kernels
// never return through this path.
// NOTE(review): extraction lost the signature start (orig 1004) and several
// case labels (orig 1007-1008, 1010-1016, 1018). Restore from upstream.
1005  bool IsVarArg) {
1006  switch (CC) {
1009  llvm_unreachable("kernels should not be handled here");
1017  return RetCC_SI_Shader;
1019  return RetCC_SI_Gfx;
1020  case CallingConv::C:
1021  case CallingConv::Fast:
1022  case CallingConv::Cold:
1023  return RetCC_AMDGPU_Func;
1024  default:
1025  report_fatal_error("Unsupported calling convention.");
1026  }
1027 }
1028 
1029 /// The SelectionDAGBuilder will automatically promote function arguments
1030 /// with illegal types. However, this does not work for the AMDGPU targets
1031 /// since the function arguments are stored in memory as these illegal types.
1032 /// In order to handle this properly we need to get the original types sizes
1033 /// from the LLVM IR Function and fixup the ISD:InputArg values before
1034 /// passing them to AnalyzeFormalArguments()
1035 
1036 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1037 /// input values across multiple registers. Each item in the Ins array
1038 /// represents a single value that will be stored in registers. Ins[x].VT is
1039 /// the value type of the value that will be stored in the register, so
1040 /// whatever SDNode we lower the argument to needs to be this type.
1041 ///
1042 /// In order to correctly lower the arguments we need to know the size of each
1043 /// argument. Since Ins[x].VT gives us the size of the register that will
1044 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1045  /// for the original function argument so that we can deduce the correct memory
1046 /// type to use for Ins[x]. In most cases the correct memory type will be
1047 /// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1048 /// we have a kernel argument of type v8i8, this argument will be split into
1049 /// 8 parts and each part will be represented by its own item in the Ins array.
1050 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1051 /// the argument before it was split. From this, we deduce that the memory type
1052 /// for each individual part is i8. We pass the memory type as LocVT to the
1053 /// calling convention analysis function and the register type (Ins[x].VT) as
1054 /// the ValVT.
// Recomputes accurate in-memory offsets and memory types (LocVT) for each
// kernel formal argument, then records one CCValAssign per register part.
// See the long comment block above for the full rationale.
// NOTE(review): extraction lost the signature start (orig 1055), orig 1061
// (the subtarget reference "ST" initialization), orig 1092 (presumably
// "SmallVector<uint64_t, 16> Offsets;"), and orig 1165 (the final argument of
// CCValAssign::getCustomMem, presumably "CCValAssign::Full));"). Restore from
// upstream before compiling.
1056  CCState &State,
1057  const SmallVectorImpl<ISD::InputArg> &Ins) const {
1058  const MachineFunction &MF = State.getMachineFunction();
1059  const Function &Fn = MF.getFunction();
1060  LLVMContext &Ctx = Fn.getParent()->getContext();
1062  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1063  CallingConv::ID CC = Fn.getCallingConv();
1064 
1065  Align MaxAlign = Align(1);
1066  uint64_t ExplicitArgOffset = 0;
1067  const DataLayout &DL = Fn.getParent()->getDataLayout();
1068 
1069  unsigned InIndex = 0;
1070 
1071  for (const Argument &Arg : Fn.args()) {
1072  const bool IsByRef = Arg.hasByRefAttr();
1073  Type *BaseArgTy = Arg.getType();
1074  Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1075  MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
1076  if (!Alignment)
1077  Alignment = DL.getABITypeAlign(MemArgTy);
1078  MaxAlign = max(Alignment, MaxAlign);
1079  uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1080 
1081  uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1082  ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1083 
1084  // We're basically throwing away everything passed into us and starting over
1085  // to get accurate in-memory offsets. The "PartOffset" is completely useless
1086  // to us as computed in Ins.
1087  //
1088  // We also need to figure out what type legalization is trying to do to get
1089  // the correct memory offsets.
1090 
1091  SmallVector<EVT, 16> ValueVTs;
1093  ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1094 
1095  for (unsigned Value = 0, NumValues = ValueVTs.size();
1096  Value != NumValues; ++Value) {
1097  uint64_t BasePartOffset = Offsets[Value];
1098 
1099  EVT ArgVT = ValueVTs[Value];
1100  EVT MemVT = ArgVT;
1101  MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1102  unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1103 
1104  if (NumRegs == 1) {
1105  // This argument is not split, so the IR type is the memory type.
1106  if (ArgVT.isExtended()) {
1107  // We have an extended type, like i24, so we should just use the
1108  // register type.
1109  MemVT = RegisterVT;
1110  } else {
1111  MemVT = ArgVT;
1112  }
1113  } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1114  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1115  assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1116  // We have a vector value which has been split into a vector with
1117  // the same scalar type, but fewer elements. This should handle
1118  // all the floating-point vector types.
1119  MemVT = RegisterVT;
1120  } else if (ArgVT.isVector() &&
1121  ArgVT.getVectorNumElements() == NumRegs) {
1122  // This arg has been split so that each element is stored in a separate
1123  // register.
1124  MemVT = ArgVT.getScalarType();
1125  } else if (ArgVT.isExtended()) {
1126  // We have an extended type, like i65.
1127  MemVT = RegisterVT;
1128  } else {
1129  unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1130  assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1131  if (RegisterVT.isInteger()) {
1132  MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1133  } else if (RegisterVT.isVector()) {
1134  assert(!RegisterVT.getScalarType().isFloatingPoint());
1135  unsigned NumElements = RegisterVT.getVectorNumElements();
1136  assert(MemoryBits % NumElements == 0);
1137  // This vector type has been split into another vector type with
1138  // a different elements size.
1139  EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1140  MemoryBits / NumElements);
1141  MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1142  } else {
1143  llvm_unreachable("cannot deduce memory type.");
1144  }
1145  }
1146 
1147  // Convert one element vectors to scalar.
1148  if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1149  MemVT = MemVT.getScalarType();
1150 
1151  // Round up vec3/vec5 argument.
1152  if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1153  assert(MemVT.getVectorNumElements() == 3 ||
1154  MemVT.getVectorNumElements() == 5);
1155  MemVT = MemVT.getPow2VectorType(State.getContext());
1156  } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1157  MemVT = MemVT.getRoundIntegerType(State.getContext());
1158  }
1159 
1160  unsigned PartOffset = 0;
1161  for (unsigned i = 0; i != NumRegs; ++i) {
1162  State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1163  BasePartOffset + PartOffset,
1164  MemVT.getSimpleVT(),
1166  PartOffset += MemVT.getStoreSize();
1167  }
1168  }
1169  }
1170 }
1171 
// Lowers a return by emitting an ENDPGM terminator; return values are not
// handled here (see the FIXME about r600 tests).
// NOTE(review): the start of the signature (orig 1172) was lost in extraction.
1173  SDValue Chain, CallingConv::ID CallConv,
1174  bool isVarArg,
1175  const SmallVectorImpl<ISD::OutputArg> &Outs,
1176  const SmallVectorImpl<SDValue> &OutVals,
1177  const SDLoc &DL, SelectionDAG &DAG) const {
1178  // FIXME: Fails for r600 tests
1179  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1180  // "wave terminate should not have return values");
1181  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1182 }
1183 
1184 //===---------------------------------------------------------------------===//
1185 // Target specific lowering
1186 //===---------------------------------------------------------------------===//
1187 
1188 /// Selects the correct CCAssignFn for a given CallingConvention value.
// Thin wrapper delegating to AMDGPUCallLowering.
// NOTE(review): the start of the signature (orig 1189) was lost in extraction.
1190  bool IsVarArg) {
1191  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1192 }
1193 
// Thin wrapper delegating return-value CC selection to AMDGPUCallLowering.
// NOTE(review): the start of the signature (orig 1194) was lost in extraction.
1195  bool IsVarArg) {
1196  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1197 }
1198 
// Builds a TokenFactor combining Chain with every load from a fixed stack
// object that overlaps the byte range of ClobberedFI, so those loads are
// ordered before the store that clobbers the slot.
// NOTE(review): the start of the signature (orig 1199) was lost in extraction.
1200  SelectionDAG &DAG,
1201  MachineFrameInfo &MFI,
1202  int ClobberedFI) const {
1203  SmallVector<SDValue, 8> ArgChains;
1204  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1205  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1206 
1207  // Include the original chain at the beginning of the list. When this is
1208  // used by target LowerCall hooks, this helps legalize find the
1209  // CALLSEQ_BEGIN node.
1210  ArgChains.push_back(Chain);
1211 
1212  // Add a chain value for each stack argument corresponding
1213  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1214  UE = DAG.getEntryNode().getNode()->use_end();
1215  U != UE; ++U) {
1216  if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1217  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1218  if (FI->getIndex() < 0) {
1219  int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1220  int64_t InLastByte = InFirstByte;
1221  InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1222 
1223  if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1224  (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1225  ArgChains.push_back(SDValue(L, 1));
1226  }
1227  }
1228  }
1229  }
1230 
1231  // Build a tokenfactor for all the chains.
1232  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1233 }
1234 
// Diagnoses an unsupported call (emitting "Reason + callee name"), fills
// InVals with UNDEFs so lowering can proceed, and returns the entry chain.
// NOTE(review): the start of the signature (orig 1235) was lost in extraction.
1236  SmallVectorImpl<SDValue> &InVals,
1237  StringRef Reason) const {
1238  SDValue Callee = CLI.Callee;
1239  SelectionDAG &DAG = CLI.DAG;
1240 
1241  const Function &Fn = DAG.getMachineFunction().getFunction();
1242 
1243  StringRef FuncName("<unknown>");
1244 
1245  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1246  FuncName = G->getSymbol();
1247  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1248  FuncName = G->getGlobal()->getName();
1249 
1250  DiagnosticInfoUnsupported NoCalls(
1251  Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1252  DAG.getContext()->diagnose(NoCalls);
1253 
1254  if (!CLI.IsTailCall) {
1255  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1256  InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1257  }
1258 
1259  return DAG.getEntryNode();
1260 }
1261 
// Default LowerCall: all calls are unsupported at this level; delegate to the
// diagnostic helper above.
// NOTE(review): the start of the signature (orig 1262) was lost in extraction.
1263  SmallVectorImpl<SDValue> &InVals) const {
1264  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1265 }
1266 
// Dynamic allocas are unsupported: emit a diagnostic and return a zero value
// plus the incoming chain so lowering can continue.
// NOTE(review): the start of the signature (orig 1267) was lost in extraction.
1268  SelectionDAG &DAG) const {
1269  const Function &Fn = DAG.getMachineFunction().getFunction();
1270 
1271  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1272  SDLoc(Op).getDebugLoc());
1273  DAG.getContext()->diagnose(NoDynamicAlloca);
1274  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1275  return DAG.getMergeValues(Ops, SDLoc());
1276 }
1277 
// Central dispatch for custom-lowered operations; unknown opcodes are a hard
// error (the node is printed first for debugging).
// NOTE(review): extraction lost the signature start (orig 1278) and several
// case lines (orig 1286, 1288, 1315) — restore from upstream.
1279  SelectionDAG &DAG) const {
1280  switch (Op.getOpcode()) {
1281  default:
1282  Op->print(errs(), &DAG);
1283  llvm_unreachable("Custom lowering code for this "
1284  "instruction is not implemented yet!");
1285  break;
1287  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1289  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1290  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1291  case ISD::FREM: return LowerFREM(Op, DAG);
1292  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1293  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1294  case ISD::FRINT: return LowerFRINT(Op, DAG);
1295  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1296  case ISD::FROUND: return LowerFROUND(Op, DAG);
1297  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1298  case ISD::FLOG:
1299  return LowerFLOG(Op, DAG, numbers::ln2f);
1300  case ISD::FLOG10:
1301  return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1302  case ISD::FEXP:
1303  return lowerFEXP(Op, DAG);
1304  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1305  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1306  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1307  case ISD::FP_TO_SINT:
1308  case ISD::FP_TO_UINT:
1309  return LowerFP_TO_INT(Op, DAG);
1310  case ISD::CTTZ:
1311  case ISD::CTTZ_ZERO_UNDEF:
1312  case ISD::CTLZ:
1313  case ISD::CTLZ_ZERO_UNDEF:
1314  return LowerCTLZ_CTTZ(Op, DAG);
1316  }
1317  return Op;
1318 }
1319 
// Custom result-type legalization: deliberately does nothing (see comment).
// NOTE(review): extraction lost the signature start (orig 1320-1321) and the
// case label at orig 1324 (presumably "case ISD::SIGN_EXTEND_INREG:").
1322  SelectionDAG &DAG) const {
1323  switch (N->getOpcode()) {
1325  // Different parts of legalization seem to interpret which type of
1326  // sign_extend_inreg is the one to check for custom lowering. The extended
1327  // from type is what really matters, but some places check for custom
1328  // lowering of the result type. This results in trying to use
1329  // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1330  // nothing here and let the illegal result integer be handled normally.
1331  return;
1332  default:
1333  return;
1334  }
1335 }
1336 
// True if GV is a global variable with a non-undef initializer.
// NOTE(review): signature line (orig 1337) lost in extraction — presumably
// "static bool hasDefinedInitializer(const GlobalValue *GV) {".
1338  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1339  if (!GVar || !GVar->hasInitializer())
1340  return false;
1341 
1342  return !isa<UndefValue>(GVar->getInitializer());
1343 }
1344 
// Lowers a global address: LDS/region globals are allocated to an offset (or
// diagnosed + trapped when referenced from a non-kernel function); globals
// with defined initializers in these address spaces are unsupported.
// NOTE(review): the start of the signature (orig 1344-1345) was lost in
// extraction (it also binds the MFI used below); restore from upstream.
1346  SDValue Op,
1347  SelectionDAG &DAG) const {
1348 
1349  const DataLayout &DL = DAG.getDataLayout();
1350  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1351  const GlobalValue *GV = G->getGlobal();
1352 
1353  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1354  G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1355  if (!MFI->isModuleEntryFunction() &&
1356  !GV->getName().equals("llvm.amdgcn.module.lds")) {
1357  SDLoc DL(Op);
1358  const Function &Fn = DAG.getMachineFunction().getFunction();
1359  DiagnosticInfoUnsupported BadLDSDecl(
1360  Fn, "local memory global used by non-kernel function",
1361  DL.getDebugLoc(), DS_Warning);
1362  DAG.getContext()->diagnose(BadLDSDecl);
1363 
1364  // We currently don't have a way to correctly allocate LDS objects that
1365  // aren't directly associated with a kernel. We do force inlining of
1366  // functions that use local objects. However, if these dead functions are
1367  // not eliminated, we don't want a compile time error. Just emit a warning
1368  // and a trap, since there should be no callable path here.
1369  SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1370  SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1371  Trap, DAG.getRoot());
1372  DAG.setRoot(OutputChain);
1373  return DAG.getUNDEF(Op.getValueType());
1374  }
1375 
1376  // XXX: What does the value of G->getOffset() mean?
1377  assert(G->getOffset() == 0 &&
1378  "Do not know what to do with an non-zero offset");
1379 
1380  // TODO: We could emit code to handle the initialization somewhere.
1381  if (!hasDefinedInitializer(GV)) {
1382  unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1383  return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1384  }
1385  }
1386 
1387  const Function &Fn = DAG.getMachineFunction().getFunction();
1388  DiagnosticInfoUnsupported BadInit(
1389  Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1390  DAG.getContext()->diagnose(BadInit);
1391  return SDValue();
1392 }
1393 
// Lowers CONCAT_VECTORS: v4i16/v4f16 become a bitcast of a v2i32
// build_vector of the two bitcast halves; everything else is rebuilt from
// the extracted elements of each operand.
// NOTE(review): extraction lost the signature start (orig 1394) and orig 1396
// (presumably "SmallVector<SDValue, 8> Args;" used below).
1395  SelectionDAG &DAG) const {
1397 
1398  EVT VT = Op.getValueType();
1399  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1400  SDLoc SL(Op);
1401  SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1402  SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1403 
1404  SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1405  return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1406  }
1407 
1408  for (const SDUse &U : Op->ops())
1409  DAG.ExtractVectorElements(U.get(), Args);
1410 
1411  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1412 }
1413 
// Lowers EXTRACT_SUBVECTOR by rebuilding the result from extracted elements,
// except for v4{i,f}16 -> v2{i,f}16 at index != 1 which existing TableGen
// patterns already handle.
// NOTE(review): extraction lost the signature start (orig 1414) and orig 1417
// (presumably "SmallVector<SDValue, 8> Args;" used below).
1415  SelectionDAG &DAG) const {
1416 
1418  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1419  EVT VT = Op.getValueType();
1420  EVT SrcVT = Op.getOperand(0).getValueType();
1421 
1422  // For these types, we have some TableGen patterns except if the index is 1
1423  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1424  (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1425  Start != 1)
1426  return Op;
1427 
1428  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1429  VT.getVectorNumElements());
1430 
1431  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1432 }
1433 
1434 /// Generate Min/Max node
// NOTE(review): signature line dropped by extraction (original line 1435);
// presumably AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, ...)
// — confirm in repo.
// Combines select_cc-style patterns (select (setcc LHS, RHS, CC), True, False)
// into FMIN_LEGACY / FMAX_LEGACY nodes when {True,False} == {LHS,RHS}.
// The LEGACY min/max ops differ from IEEE min/max in their NaN handling, so
// the operand order is permuted below to reproduce the compare's NaN behavior.
1436  SDValue LHS, SDValue RHS,
1437  SDValue True, SDValue False,
1438  SDValue CC,
1439  DAGCombinerInfo &DCI) const {
// Only handle the case where the select arms are exactly the compared values
// (in either order); anything else is not a min/max.
1440  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1441  return SDValue();
1442 
1443  SelectionDAG &DAG = DCI.DAG;
1444  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1445  switch (CCOpcode) {
// Equality / unordered-test conditions do not correspond to a min or max;
// fall through to the "no combine" return below.
1446  case ISD::SETOEQ:
1447  case ISD::SETONE:
1448  case ISD::SETUNE:
1449  case ISD::SETNE:
1450  case ISD::SETUEQ:
1451  case ISD::SETEQ:
1452  case ISD::SETFALSE:
1453  case ISD::SETFALSE2:
1454  case ISD::SETTRUE:
1455  case ISD::SETTRUE2:
1456  case ISD::SETUO:
1457  case ISD::SETO:
1458  break;
// Unordered less-than: select favors the second operand on NaN, which the
// legacy ops model when the operands are swapped as below.
1459  case ISD::SETULE:
1460  case ISD::SETULT: {
1461  if (LHS == True)
1462  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1463  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1464  }
1465  case ISD::SETOLE:
1466  case ISD::SETOLT:
1467  case ISD::SETLE:
1468  case ISD::SETLT: {
1469  // Ordered. Assume ordered for undefined.
1470 
1471  // Only do this after legalization to avoid interfering with other combines
1472  // which might occur.
1473  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1474  !DCI.isCalledByLegalizer())
1475  return SDValue();
1476 
1477  // We need to permute the operands to get the correct NaN behavior. The
1478  // selected operand is the second one based on the failing compare with NaN,
1479  // so permute it based on the compare type the hardware uses.
1480  if (LHS == True)
1481  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1482  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1483  }
// Unordered greater-than: mirror image of the SETULT case above.
1484  case ISD::SETUGE:
1485  case ISD::SETUGT: {
1486  if (LHS == True)
1487  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1488  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1489  }
1490  case ISD::SETGT:
1491  case ISD::SETGE:
1492  case ISD::SETOGE:
1493  case ISD::SETOGT: {
// Ordered greater-than: same post-legalization restriction as the ordered
// less-than case above.
1494  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1495  !DCI.isCalledByLegalizer())
1496  return SDValue();
1497 
1498  if (LHS == True)
1499  return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1500  return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1501  }
1502  case ISD::SETCC_INVALID:
1503  llvm_unreachable("Invalid setcc condcode!");
1504  }
1505  return SDValue();
1506 }
1507 
1508 std::pair<SDValue, SDValue>
// NOTE(review): the line carrying the function name and parameters was dropped
// by extraction (original line 1509); presumably
// AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const —
// confirm in repo.
// Splits a 64-bit value into its low and high 32-bit halves by bitcasting to
// v2i32 and extracting elements 0 (lo) and 1 (hi). Returns {Lo, Hi}.
1510  SDLoc SL(Op);
1511 
1512  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1513 
1514  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1515  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1516 
1517  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1518  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1519 
1520  return std::make_pair(Lo, Hi);
1521 }
1522 
// NOTE(review): signature line dropped by extraction (original line 1523);
// presumably AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG)
// — confirm in repo.
// Returns only the low 32-bit half of a 64-bit value (element 0 of the v2i32
// bitcast); avoids materializing the high half when it is not needed.
1524  SDLoc SL(Op);
1525 
1526  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1527  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1528  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1529 }
1530 
// NOTE(review): signature line dropped by extraction (original line 1531);
// presumably AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG)
// — confirm in repo.
// Returns only the high 32-bit half of a 64-bit value (element 1 of the v2i32
// bitcast); companion to the lo-half helper above.
1532  SDLoc SL(Op);
1533 
1534  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1535  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1536  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1537 }
1538 
1539 // Split a vector type into two parts. The first part is a power of two vector.
1540 // The second part is whatever is left over, and is a scalar if it would
1541 // otherwise be a 1-vector.
1542 std::pair<EVT, EVT>
// NOTE(review): the line carrying the function name and parameters was dropped
// by extraction (original line 1543); presumably
// AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) —
// confirm in repo. Returns {LoVT, HiVT}.
1544  EVT LoVT, HiVT;
1545  EVT EltVT = VT.getVectorElementType();
1546  unsigned NumElts = VT.getVectorNumElements();
// Round half the element count up to a power of two for the low part, e.g.
// 3 elements -> LoNumElts = 2, leaving 1 element for the (scalar) high part.
1547  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1548  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
// A leftover of exactly one element becomes a scalar, not a 1-vector.
1549  HiVT = NumElts - LoNumElts == 1
1550  ? EltVT
1551  : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1552  return std::make_pair(LoVT, HiVT);
1553 }
1554 
1555 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1556 // scalar.
1557 std::pair<SDValue, SDValue>
// NOTE(review): the line carrying the function name was dropped by extraction
// (original line 1558); presumably
// AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, ...) —
// confirm in repo. Returns {Lo, Hi}.
1559  const EVT &LoVT, const EVT &HiVT,
1560  SelectionDAG &DAG) const {
1561  assert(LoVT.getVectorNumElements() +
1562  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1563  N.getValueType().getVectorNumElements() &&
1564  "More vector elements requested than available!");
1565  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1566  DAG.getVectorIdxConstant(0, DL));
// NOTE(review): the opcode argument line was dropped by extraction (original
// line 1568); from the surrounding code this is presumably
// ISD::EXTRACT_SUBVECTOR (or EXTRACT_VECTOR_ELT when HiVT is scalar) with DL —
// confirm in repo. The Hi part starts at index LoVT.getVectorNumElements().
1567  SDValue Hi = DAG.getNode(
1569  HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1570  return std::make_pair(Lo, Hi);
1571 }
1572 
// NOTE(review): signature line dropped by extraction (original line 1573);
// presumably AMDGPUTargetLowering::SplitVectorLoad(SDValue Op, SelectionDAG &DAG)
// — confirm in repo.
// Splits a vector load into two smaller loads (a power-of-two lo part and a
// remainder hi part, per getSplitDestVTs) and rejoins the results plus chains.
1574  SelectionDAG &DAG) const {
1575  LoadSDNode *Load = cast<LoadSDNode>(Op);
1576  EVT VT = Op.getValueType();
1577  SDLoc SL(Op);
1578 
1579 
1580  // If this is a 2 element vector, we really want to scalarize and not create
1581  // weird 1 element vectors.
1582  if (VT.getVectorNumElements() == 2) {
1583  SDValue Ops[2];
1584  std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1585  return DAG.getMergeValues(Ops, SL);
1586  }
1587 
1588  SDValue BasePtr = Load->getBasePtr();
1589  EVT MemVT = Load->getMemoryVT();
1590 
1591  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1592 
1593  EVT LoVT, HiVT;
1594  EVT LoMemVT, HiMemVT;
1595  SDValue Lo, Hi;
1596 
// Split both the register type and the (possibly narrower) memory type the
// same way so the two loads line up.
1597  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1598  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1599  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1600 
// The high load's alignment is the best that survives offsetting the base
// alignment by the size of the low part.
1601  unsigned Size = LoMemVT.getStoreSize();
1602  unsigned BaseAlign = Load->getAlignment();
1603  unsigned HiAlign = MinAlign(BaseAlign, Size);
1604 
1605  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1606  Load->getChain(), BasePtr, SrcValue, LoMemVT,
1607  BaseAlign, Load->getMemOperand()->getFlags());
1608  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1609  SDValue HiLoad =
1610  DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1611  HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1612  HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1613 
1614  SDValue Join;
1615  if (LoVT == HiVT) {
1616  // This is the case that the vector is power of two so was evenly split.
1617  Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1618  } else {
// Uneven split: insert the lo subvector into UNDEF, then insert the hi part
// (subvector or scalar element) at the lo part's element count.
1619  Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1620  DAG.getVectorIdxConstant(0, SL));
// NOTE(review): the opcode argument line was dropped by extraction (original
// line 1622); presumably ISD::INSERT_SUBVECTOR (or INSERT_VECTOR_ELT when the
// hi part is scalar) with SL — confirm in repo.
1621  Join = DAG.getNode(
1623  VT, Join, HiLoad,
1624  DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1625  }
1626 
// Merge the two load chains so both memory operations are kept ordered.
1627  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1628  LoLoad.getValue(1), HiLoad.getValue(1))};
1629 
1630  return DAG.getMergeValues(Ops, SL);
1631 }
1632 
// NOTE(review): signature line dropped by extraction (original line 1633);
// presumably AMDGPUTargetLowering::WidenOrSplitVectorLoad — confirm in repo.
// For a 3-element vector load that is sufficiently aligned (>= 8 bytes) or
// provably dereferenceable to 16 bytes, widen it to a 4-element load and
// extract the first 3 elements; otherwise fall back to splitting the load.
1634  SelectionDAG &DAG) const {
1635  LoadSDNode *Load = cast<LoadSDNode>(Op);
1636  EVT VT = Op.getValueType();
1637  SDValue BasePtr = Load->getBasePtr();
1638  EVT MemVT = Load->getMemoryVT();
1639  SDLoc SL(Op);
1640  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1641  unsigned BaseAlign = Load->getAlignment();
1642  unsigned NumElements = MemVT.getVectorNumElements();
1643 
1644  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1645  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1646  if (NumElements != 3 ||
1647  (BaseAlign < 8 &&
1648  !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1649  return SplitVectorLoad(Op, DAG);
1650 
1651  assert(NumElements == 3);
1652 
// NOTE(review): the line initializing WideVT was dropped by extraction
// (original line 1654); by symmetry with WideMemVT below it is presumably
// EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4) — confirm.
1653  EVT WideVT =
1655  EVT WideMemVT =
1656  EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1657  SDValue WideLoad = DAG.getExtLoad(
1658  Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1659  WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
// Return {3-element slice of the wide result, load chain}.
1660  return DAG.getMergeValues(
1661  {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1662  DAG.getVectorIdxConstant(0, SL)),
1663  WideLoad.getValue(1)},
1664  SL);
1665 }
1666 
// NOTE(review): signature line dropped by extraction (original line 1667);
// presumably AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SelectionDAG &DAG)
// — confirm in repo.
// Splits a vector store into two smaller truncating stores, mirroring
// SplitVectorLoad above, and joins the chains with a TokenFactor.
1668  SelectionDAG &DAG) const {
1669  StoreSDNode *Store = cast<StoreSDNode>(Op);
1670  SDValue Val = Store->getValue();
1671  EVT VT = Val.getValueType();
1672 
1673  // If this is a 2 element vector, we really want to scalarize and not create
1674  // weird 1 element vectors.
1675  if (VT.getVectorNumElements() == 2)
1676  return scalarizeVectorStore(Store, DAG);
1677 
1678  EVT MemVT = Store->getMemoryVT();
1679  SDValue Chain = Store->getChain();
1680  SDValue BasePtr = Store->getBasePtr();
1681  SDLoc SL(Op);
1682 
1683  EVT LoVT, HiVT;
1684  EVT LoMemVT, HiMemVT;
1685  SDValue Lo, Hi;
1686 
// Split register type, memory type, and the stored value consistently.
1687  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1688  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1689  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1690 
1691  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1692 
1693  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
// Alignment of the high store can be no better than the base alignment
// adjusted for the low part's size.
1694  unsigned BaseAlign = Store->getAlignment();
1695  unsigned Size = LoMemVT.getStoreSize();
1696  unsigned HiAlign = MinAlign(BaseAlign, Size);
1697 
1698  SDValue LoStore =
1699  DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1700  Store->getMemOperand()->getFlags());
1701  SDValue HiStore =
1702  DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1703  HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1704 
// Both stores hang off the original chain; order them with a TokenFactor.
1705  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1706 }
1707 
1708 // This is a shortcut for integer division because we have fast i32<->f32
1709 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1710 // float is enough to accurately represent up to a 24-bit signed integer.
// NOTE(review): signature line dropped by extraction (original line 1711);
// presumably AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
// bool Sign) — confirm in repo. Returns merged {Div, Rem}, or SDValue() if the
// operands are not provably narrow enough (see sign-bit checks below).
1712  bool Sign) const {
1713  SDLoc DL(Op);
1714  EVT VT = Op.getValueType();
1715  SDValue LHS = Op.getOperand(0);
1716  SDValue RHS = Op.getOperand(1);
1717  MVT IntVT = MVT::i32;
1718  MVT FltVT = MVT::f32;
1719 
// Both operands must have at least 9 sign bits (i.e. fit in 24 bits) for the
// f32-based shortcut to be exact.
1720  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1721  if (LHSSignBits < 9)
1722  return SDValue();
1723 
1724  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1725  if (RHSSignBits < 9)
1726  return SDValue();
1727 
// DivBits = number of result bits that are actually meaningful; used below to
// truncate/extend the final Div and Rem back to that width.
1728  unsigned BitSize = VT.getSizeInBits();
1729  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1730  unsigned DivBits = BitSize - SignBits;
1731  if (Sign)
1732  ++DivBits;
1733 
// NOTE(review): two lines were dropped by extraction here (original lines
// 1734-1735); from the uses of ToFp/ToInt below they presumably selected
// SINT_TO_FP/FP_TO_SINT vs. UINT_TO_FP/FP_TO_UINT based on Sign — confirm.
1736 
// jq is the rounding-correction term: +/-1 for signed, 1 for unsigned.
1737  SDValue jq = DAG.getConstant(1, DL, IntVT);
1738 
1739  if (Sign) {
1740  // char|short jq = ia ^ ib;
1741  jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1742 
1743  // jq = jq >> (bitsize - 2)
1744  jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1745  DAG.getConstant(BitSize - 2, DL, VT));
1746 
1747  // jq = jq | 0x1
1748  jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1749  }
1750 
1751  // int ia = (int)LHS;
1752  SDValue ia = LHS;
1753 
1754  // int ib, (int)RHS;
1755  SDValue ib = RHS;
1756 
1757  // float fa = (float)ia;
1758  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1759 
1760  // float fb = (float)ib;
1761  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1762 
// fq = fa * (1 / fb), using the hardware reciprocal.
1763  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1764  fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1765 
1766  // fq = trunc(fq);
1767  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1768 
1769  // float fqneg = -fq;
1770  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1771 
1772  MachineFunction &MF = DAG.getMachineFunction();
// NOTE(review): a declaration was dropped by extraction here (original line
// 1773); MFI is used below — presumably
// const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1774 
1775  // float fr = mad(fqneg, fb, fa);
// Pick the FMA-style opcode: FMA when MAD/MAC is unavailable, FMAD when f32
// denormals are flushed, else the flush-to-zero FMAD_FTZ variant.
1776  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1777  (unsigned)ISD::FMA :
1778  !MFI->getMode().allFP32Denormals() ?
1779  (unsigned)ISD::FMAD :
1780  (unsigned)AMDGPUISD::FMAD_FTZ;
1781  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1782 
1783  // int iq = (int)fq;
1784  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1785 
1786  // fr = fabs(fr);
1787  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1788 
1789  // fb = fabs(fb);
1790  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1791 
1792  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1793 
1794  // int cv = fr >= fb;
1795  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1796 
1797  // jq = (cv ? jq : 0);
1798  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1799 
1800  // dst = iq + jq;
1801  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1802 
1803  // Rem needs compensation, it's easier to recompute it
1804  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1805  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1806 
1807  // Truncate to number of bits this divide really is.
1808  if (Sign) {
1809  SDValue InRegSize
1810  = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1811  Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1812  Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1813  } else {
1814  SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1815  Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1816  Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1817  }
1818 
1819  return DAG.getMergeValues({ Div, Rem }, DL);
1820 }
1821 
// NOTE(review): signature line dropped by extraction (original line 1822);
// presumably void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
// SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) — confirm in repo.
// Expands 64-bit unsigned divrem: (1) a 32-bit UDIVREM when both operands
// provably fit in 32 bits, (2) a Newton-Raphson reciprocal expansion when i64
// is legal, (3) a bit-at-a-time restoring-division loop for r600.
1823  SelectionDAG &DAG,
// NOTE(review): the Results parameter line was dropped by extraction (original
// line 1824); Results.push_back is used below.
1825  SDLoc DL(Op);
1826  EVT VT = Op.getValueType();
1827 
1828  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1829 
1830  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1831 
1832  SDValue One = DAG.getConstant(1, DL, HalfVT);
1833  SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1834 
1835  //HiLo split
1836  SDValue LHS = Op.getOperand(0);
1837  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1838  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1839 
1840  SDValue RHS = Op.getOperand(1);
1841  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1842  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1843 
// Fast path: if the high 32 bits of both operands are known zero, a single
// 32-bit UDIVREM suffices; zero-extend the results back to 64 bits.
1844  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1845  DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1846 
1847  SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1848  LHS_Lo, RHS_Lo);
1849 
1850  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1851  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1852 
1853  Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1854  Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1855  return;
1856  }
1857 
1858  if (isTypeLegal(MVT::i64)) {
1859  MachineFunction &MF = DAG.getMachineFunction();
// NOTE(review): a declaration was dropped by extraction here (original line
// 1860); MFI is used below — presumably
// const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1861 
1862  // Compute denominator reciprocal.
// Same FMA/FMAD/FMAD_FTZ opcode selection as in LowerDIVREM24 above.
1863  unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1864  (unsigned)ISD::FMA :
1865  !MFI->getMode().allFP32Denormals() ?
1866  (unsigned)ISD::FMAD :
1867  (unsigned)AMDGPUISD::FMAD_FTZ;
1868 
// Build an initial ~64-bit reciprocal estimate of RHS in f32 pieces. The hex
// float constants are bit patterns: 0x4f800000 = 2^32, 0x5f7ffffc ~= 2^63,
// 0x2f800000 = 2^-32, 0xcf800000 = -2^32 (scaling factors for the split).
1869  SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1870  SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1871  SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1872  DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1873  Cvt_Lo);
1874  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1875  SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1876  DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1877  SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1878  DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1879  SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1880  SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1881  DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1882  Mul1);
1883  SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1884  SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1885  SDValue Rcp64 = DAG.getBitcast(VT,
1886  DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1887 
1888  SDValue Zero64 = DAG.getConstant(0, DL, VT);
1889  SDValue One64 = DAG.getConstant(1, DL, VT);
1890  SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1891  SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1892 
// First Newton-Raphson refinement of the 64-bit reciprocal estimate:
// Rcp64 += mulhu(Rcp64, -RHS * Rcp64), done in 32-bit halves with ADDCARRY.
1893  SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1894  SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1895  SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1896  SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1897  Zero);
1898  SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1899  One);
1900 
1901  SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1902  Mulhi1_Lo, Zero1);
1903  SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1904  Mulhi1_Hi, Add1_Lo.getValue(1));
1905  SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1906  SDValue Add1 = DAG.getBitcast(VT,
1907  DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1908 
// Second refinement round, same shape as the first.
1909  SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1910  SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1911  SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1912  Zero);
1913  SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1914  One);
1915 
1916  SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1917  Mulhi2_Lo, Zero1);
1918  SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1919  Mulhi2_Hi, Add1_Lo.getValue(1));
1920  SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1921  Zero, Add2_Lo.getValue(1));
1922  SDValue Add2 = DAG.getBitcast(VT,
1923  DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
// Quotient estimate: mulhu(LHS, refined reciprocal).
1924  SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1925 
1926  SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1927 
// Remainder estimate: LHS - RHS * quotient, computed in halves with SUBCARRY.
1928  SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1929  SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1930  SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1931  Mul3_Lo, Zero1);
1932  SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1933  Mul3_Hi, Sub1_Lo.getValue(1));
1934  SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1935  SDValue Sub1 = DAG.getBitcast(VT,
1936  DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1937 
// C3 != 0 means remainder >= RHS, i.e. the quotient estimate is low by at
// least one; the estimate can be off by at most two, hence two corrections.
1938  SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1939  SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1940  ISD::SETUGE);
1941  SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1942  ISD::SETUGE);
1943  SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1944 
1945  // TODO: Here and below portions of the code can be enclosed into if/endif.
1946  // Currently control flow is unconditional and we have 4 selects after
1947  // potential endif to substitute PHIs.
1948 
1949  // if C3 != 0 ...
1950  SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1951  RHS_Lo, Zero1);
1952  SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1953  RHS_Hi, Sub1_Lo.getValue(1));
1954  SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1955  Zero, Sub2_Lo.getValue(1));
1956  SDValue Sub2 = DAG.getBitcast(VT,
1957  DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1958 
1959  SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1960 
1961  SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1962  ISD::SETUGE);
1963  SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1964  ISD::SETUGE);
1965  SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1966 
1967  // if (C6 != 0)
1968  SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1969 
1970  SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1971  RHS_Lo, Zero1);
1972  SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1973  RHS_Hi, Sub2_Lo.getValue(1));
1974  SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1975  Zero, Sub3_Lo.getValue(1));
1976  SDValue Sub3 = DAG.getBitcast(VT,
1977  DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1978 
1979  // endif C6
1980  // endif C3
1981 
// Select the correct quotient/remainder among the 0/1/2-correction versions.
1982  SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1983  SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1984 
1985  SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1986  SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1987 
1988  Results.push_back(Div);
1989  Results.push_back(Rem);
1990 
1991  return;
1992  }
1993 
1994  // r600 expandion.
1995  // Get Speculative values
1996  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1997  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1998 
1999  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2000  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2001  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2002 
2003  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2004  SDValue DIV_Lo = Zero;
2005 
2006  const unsigned halfBitWidth = HalfVT.getSizeInBits();
2007 
// Classic restoring division over the low half: shift a bit of LHS_Lo into
// REM each iteration, and subtract RHS whenever REM >= RHS, setting the
// corresponding quotient bit.
2008  for (unsigned i = 0; i < halfBitWidth; ++i) {
2009  const unsigned bitPos = halfBitWidth - i - 1;
2010  SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2011  // Get value of high bit
2012  SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2013  HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2014  HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2015 
2016  // Shift
2017  REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2018  // Add LHS high bit
2019  REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2020 
2021  SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2022  SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2023 
2024  DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2025 
2026  // Update REM
2027  SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2028  REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2029  }
2030 
2031  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2032  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2033  Results.push_back(DIV);
2034  Results.push_back(REM);
2035 }
2036 
// Lowers unsigned UDIVREM. i64 defers to LowerUDIVREM64; narrow i32 values try
// the 24-bit float shortcut; otherwise a reciprocal estimate plus one
// Newton-Raphson round and two quotient/remainder refinements is emitted.
2037 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2038  SelectionDAG &DAG) const {
2039  SDLoc DL(Op);
2040  EVT VT = Op.getValueType();
2041 
2042  if (VT == MVT::i64) {
// NOTE(review): a declaration was dropped by extraction here (original line
// 2043); Results is used below — presumably SmallVector<SDValue, 2> Results;
2044  LowerUDIVREM64(Op, DAG, Results);
2045  return DAG.getMergeValues(Results, DL);
2046  }
2047 
2048  if (VT == MVT::i32) {
// false => unsigned variant of the 24-bit divide; may fail and fall through.
2049  if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2050  return Res;
2051  }
2052 
2053  SDValue X = Op.getOperand(0);
2054  SDValue Y = Op.getOperand(1);
2055 
2056  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2057  // algorithm used here.
2058 
2059  // Initial estimate of inv(y).
2060  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2061 
2062  // One round of UNR.
2063  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2064  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2065  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2066  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2067 
2068  // Quotient/remainder estimate.
2069  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2070  SDValue R =
2071  DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2072 
// The estimate may be low by up to 2; each refinement conditionally adds one
// to Q and subtracts Y from R while R >= Y.
2073  // First quotient/remainder refinement.
2074  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2075  SDValue One = DAG.getConstant(1, DL, VT);
2076  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2077  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2078  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2079  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2080  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2081 
2082  // Second quotient/remainder refinement.
2083  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2084  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2085  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2086  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2087  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2088 
2089  return DAG.getMergeValues({Q, R}, DL);
2090 }
2091 
// NOTE(review): signature line dropped by extraction (original line 2092);
// presumably AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SelectionDAG &DAG)
// — confirm in repo.
// Lowers signed SDIVREM: tries the 24-bit shortcut for i32, narrows i64 to a
// 32-bit SDIVREM when both operands have > 32 sign bits, and otherwise
// reduces to unsigned UDIVREM via the |a|/|b| sign-fixup identity.
2093  SelectionDAG &DAG) const {
2094  SDLoc DL(Op);
2095  EVT VT = Op.getValueType();
2096 
2097  SDValue LHS = Op.getOperand(0);
2098  SDValue RHS = Op.getOperand(1);
2099 
2100  SDValue Zero = DAG.getConstant(0, DL, VT);
2101  SDValue NegOne = DAG.getConstant(-1, DL, VT);
2102 
2103  if (VT == MVT::i32) {
// true => signed variant of the 24-bit divide; may fail and fall through.
2104  if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2105  return Res;
2106  }
2107 
// i64 whose operands fit in 32 signed bits: divide the low halves and
// sign-extend the 32-bit results back to 64 bits.
2108  if (VT == MVT::i64 &&
2109  DAG.ComputeNumSignBits(LHS) > 32 &&
2110  DAG.ComputeNumSignBits(RHS) > 32) {
2111  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2112 
2113  //HiLo split
2114  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2115  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2116  SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2117  LHS_Lo, RHS_Lo);
2118  SDValue Res[2] = {
2119  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2120  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2121  };
2122  return DAG.getMergeValues(Res, DL);
2123  }
2124 
// General case: compute sign masks (-1 if negative, else 0). Quotient sign is
// the XOR of the operand signs; remainder takes the dividend's sign.
2125  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2126  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2127  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2128  SDValue RSign = LHSign; // Remainder sign is the same as LHS
2129 
// abs(x) computed branchlessly as (x + mask) ^ mask.
2130  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2131  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2132 
2133  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2134  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2135 
2136  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2137  SDValue Rem = Div.getValue(1);
2138 
// Re-apply the signs: negate via (x ^ mask) - mask.
2139  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2140  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2141 
2142  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2143  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2144 
2145  SDValue Res[2] = {
2146  Div,
2147  Rem
2148  };
2149  return DAG.getMergeValues(Res, DL);
2150 }
2151 
2152 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
// NOTE(review): signature line dropped by extraction (original line 2153);
// presumably AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG)
// — confirm in repo. Op's fast-math flags are propagated to every node.
2154  SDLoc SL(Op);
2155  EVT VT = Op.getValueType();
2156  auto Flags = Op->getFlags();
2157  SDValue X = Op.getOperand(0);
2158  SDValue Y = Op.getOperand(1);
2159 
2160  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2161  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2162  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2163  // TODO: For f32 use FMAD instead if !hasFastFMA32?
2164  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2165 }
2166 
// NOTE(review): signature line dropped by extraction (original line 2167);
// presumably AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG)
// — confirm in repo. Lowers f64 ceil as trunc plus a conditional +1.0.
2168  SDLoc SL(Op);
2169  SDValue Src = Op.getOperand(0);
2170 
2171  // result = trunc(src)
2172  // if (src > 0.0 && src != result)
2173  // result += 1.0
2174 
2175  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2176 
2177  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2178  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2179 
2180  EVT SetCCVT =
// NOTE(review): the initializer line was dropped by extraction (original line
// 2181); presumably getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
// MVT::f64) — confirm in repo.
2182 
// Ordered compares (SETOGT/SETONE) so NaN inputs select Add = 0.0 below.
2183  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2184  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2185  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2186 
2187  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2188  // TODO: Should this propagate fast-math-flags?
2189  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2190 }
2191 
// NOTE(review): signature line dropped by extraction (original line 2192);
// presumably static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
// SelectionDAG &DAG) — confirm in repo.
// Extracts the unbiased exponent of an f64 from its high 32 bits: a BFE of the
// 11 exponent bits at bit 52-32=20 of the high word, minus the bias 1023.
2193  SelectionDAG &DAG) {
2194  const unsigned FractBits = 52;
2195  const unsigned ExpBits = 11;
2196 
2197  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2198  Hi,
2199  DAG.getConstant(FractBits - 32, SL, MVT::i32),
2200  DAG.getConstant(ExpBits, SL, MVT::i32));
2201  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2202  DAG.getConstant(1023, SL, MVT::i32));
2203 
2204  return Exp;
2205 }
2206 
// NOTE(review): signature line dropped by extraction (original line 2207);
// presumably AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG)
// — confirm in repo.
// Lowers f64 trunc by masking off fraction bits below the binary point, based
// on the exponent: exponent < 0 yields +/-0, exponent > 51 means the value is
// already an integer and is returned unchanged.
2208  SDLoc SL(Op);
2209  SDValue Src = Op.getOperand(0);
2210 
2211  assert(Op.getValueType() == MVT::f64);
2212 
2213  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2214  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2215 
2216  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2217 
2218  // Extract the upper half, since this is where we will find the sign and
2219  // exponent.
2220  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
2221 
2222  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2223 
2224  const unsigned FractBits = 52;
2225 
2226  // Extract the sign bit.
2227  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2228  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2229 
2230  // Extend back to 64-bits.
2231  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2232  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2233 
2234  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2235  const SDValue FractMask
2236  = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2237 
// Arithmetic shift of the fraction mask by Exp leaves set bits exactly over
// the sub-integer fraction bits; invert and AND to clear them.
2238  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2239  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2240  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2241 
2242  EVT SetCCVT =
// NOTE(review): the initializer line was dropped by extraction (original line
// 2243); presumably getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
// MVT::i32) — confirm in repo.
2244 
2245  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2246 
2247  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2248  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2249 
// |x| < 1 -> signed zero; |x| too large to have a fraction -> x unchanged.
2250  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2251  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2252 
2253  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2254 }
2255 
// NOTE(review): signature line dropped by extraction (original line 2256);
// presumably AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG)
// — confirm in repo.
// Lowers f64 rint via the classic add/subtract of copysign(2^52, x): adding
// 2^52 forces rounding to integer in the current mode, subtracting restores
// the magnitude. Values with |x| > 0x1.fffffffffffffp+51 are already integers
// and are passed through unchanged.
2257  SDLoc SL(Op);
2258  SDValue Src = Op.getOperand(0);
2259 
2260  assert(Op.getValueType() == MVT::f64);
2261 
2262  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2263  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
// copysign keeps the trick exact for negative inputs.
2264  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2265 
2266  // TODO: Should this propagate fast-math-flags?
2267 
2268  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2269  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2270 
2271  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2272 
2273  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2274  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2275 
2276  EVT SetCCVT =
// NOTE(review): the initializer line was dropped by extraction (original line
// 2277); presumably getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
// MVT::f64) — confirm in repo.
2278  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2279 
2280  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2281 }
2282 
// NOTE(review): signature line dropped by extraction (original line 2283);
// presumably AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG)
// — confirm in repo.
2284  // FNEARBYINT and FRINT are the same, except in their handling of FP
2285  // exceptions. Those aren't really meaningful for us, and OpenCL only has
2286  // rint, so just treat them as equivalent.
2287  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2288 }
2289 
2290 // XXX - May require not supporting f32 denormals?
2291 
2292 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2293 // compare and vselect end up producing worse code than scalarizing the whole
2294 // operation.
// round(x) = trunc(x) + (|x - trunc(x)| >= 0.5 ? copysign(1.0, x) : 0.0),
// i.e. round half away from zero.
2296  SDLoc SL(Op);
2297  SDValue X = Op.getOperand(0);
2298  EVT VT = Op.getValueType();
2299 
2300  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2301 
2302  // TODO: Should this propagate fast-math-flags?
2303 
// Fractional part (signed), then its magnitude for the >= 0.5 test.
2304  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2305 
2306  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2307 
2308  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2309  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2310  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2311 
// +/-1.0 carrying the sign of the original input.
2312  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2313 
2314  EVT SetCCVT =
2315  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2316 
2317  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2318 
2319  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2320 
2321  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2322 }
2323 
// f64 FFLOOR lowering: truncate toward zero, then subtract 1.0 when the
// input was negative and non-integral.
2325  SDLoc SL(Op);
2326  SDValue Src = Op.getOperand(0);
2327 
2328  // result = trunc(src);
2329  // if (src < 0.0 && src != result)
2330  // result += -1.0.
2331 
2332  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2333 
2334  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2335  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2336 
2337  EVT SetCCVT =
// Both compares are ordered (SETOLT / SETONE), so a NaN input fails both
// and the adjustment selected below is 0.0.
2340  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2341  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2342  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2343 
2344  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2345  // TODO: Should this propagate fast-math-flags?
2346  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2347 }
2348 
// Generic log lowering: log_b(x) = log2(x) * (1 / log2(b)).  The caller
// passes 1/log2(b) as \p Log2BaseInverted (e.g. ln(2) for natural log).
2350  double Log2BaseInverted) const {
2351  EVT VT = Op.getValueType();
2352 
2353  SDLoc SL(Op);
2354  SDValue Operand = Op.getOperand(0);
2355  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2356  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2357 
2358  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2359 }
2360 
2361 // exp2(M_LOG2E_F * f);
// exp(x) = exp2(x * log2(e)); the original node's fast-math flags are
// propagated to both the multiply and the exp2.
2363  EVT VT = Op.getValueType();
2364  SDLoc SL(Op);
2365  SDValue Src = Op.getOperand(0);
2366 
2367  const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2368  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2369  return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2370 }
2371 
2372 static bool isCtlzOpc(unsigned Opc) {
2373  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2374 }
2375 
2376 static bool isCttzOpc(unsigned Opc) {
2377  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2378 }
2379 
// Lower 64-bit CTLZ/CTTZ (and their ZERO_UNDEF variants) by splitting the
// input into 32-bit halves and combining two 32-bit FFBH/FFBL operations.
2381  SDLoc SL(Op);
2382  SDValue Src = Op.getOperand(0);
2383  bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2384  Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2385 
2386  unsigned ISDOpc, NewOpc;
2387  if (isCtlzOpc(Op.getOpcode())) {
2388  ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2389  NewOpc = AMDGPUISD::FFBH_U32;
2390  } else if (isCttzOpc(Op.getOpcode())) {
2391  ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2392  NewOpc = AMDGPUISD::FFBL_B32;
2393  } else
2394  llvm_unreachable("Unexpected OPCode!!!");
2395 
2396 
// 32-bit zero-undef case maps directly onto the hardware instruction.
2397  if (ZeroUndef && Src.getValueType() == MVT::i32)
2398  return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2399 
// Split the 64-bit source into its low (element 0) and high (element 1)
// 32-bit halves.
2400  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2401 
2402  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2403  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2404 
2405  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2406  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2407 
2408  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2409  *DAG.getContext(), MVT::i32);
2410 
// For ctlz the "primary" half is the high word; for cttz it is the low word.
2411  SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2412  SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2413 
2414  SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2415  SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2416 
2417  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2418  SDValue Add, NewOpr;
2419  if (isCtlzOpc(Op.getOpcode())) {
2420  Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2421  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2422  NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2423  } else {
2424  Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2425  // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2426  NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2427  }
2428 
2429  if (!ZeroUndef) {
2430  // Test if the full 64-bit input is zero.
2431 
2432  // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2433  // which we probably don't want.
2434  SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2435  SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2436  SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2437 
2438  // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2439  // with the same cycles, otherwise it is slower.
2440  // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2441  // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2442 
// NOTE(review): this shadows the outer Bits32 and actually holds 64 (the
// defined result for a zero input); a clearer name would be Bits64.
2443  const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2444 
2445  // The instruction returns -1 for 0 input, but the defined intrinsic
2446  // behavior is to return the number of bits.
2447  NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2448  SrcIsZero, Bits32, NewOpr);
2449  }
2450 
// The count fits in 32 bits; widen back to the i64 result type.
2451  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2452 }
2453 
// Software i64 -> f32 conversion with round-to-nearest-even; the signed
// case reduces to the unsigned one via conditional negation (see the
// reference OpenCL C below).
2455  bool Signed) const {
2456  // Unsigned
2457  // cul2f(ulong u)
2458  //{
2459  // uint lz = clz(u);
2460  // uint e = (u != 0) ? 127U + 63U - lz : 0;
2461  // u = (u << lz) & 0x7fffffffffffffffUL;
2462  // ulong t = u & 0xffffffffffUL;
2463  // uint v = (e << 23) | (uint)(u >> 40);
2464  // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2465  // return as_float(v + r);
2466  //}
2467  // Signed
2468  // cl2f(long l)
2469  //{
2470  // long s = l >> 63;
2471  // float r = cul2f((l + s) ^ s);
2472  // return s ? -r : r;
2473  //}
2474 
2475  SDLoc SL(Op);
2476  SDValue Src = Op.getOperand(0);
2477  SDValue L = Src;
2478 
2479  SDValue S;
2480  if (Signed) {
// S = all-ones if negative, all-zeros otherwise; (L + S) ^ S computes |L|
// without a branch.
2481  const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2482  S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2483 
2484  SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2485  L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2486  }
2487 
2488  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2489  *DAG.getContext(), MVT::f32);
2490 
2491 
2492  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2493  SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2494  SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2495  LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2496 
// Biased exponent: 127 (f32 bias) + 63 - clz(L); zero input gets e = 0.
2497  SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2498  SDValue E = DAG.getSelect(SL, MVT::i32,
2499  DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2500  DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2501  ZeroI32);
2502 
// Normalize so the leading one sits at bit 62, then drop it.
2503  SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2504  DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2505  DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2506 
// T = the 40 bits that will be rounded away; V = packed exponent+mantissa.
2507  SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2508  DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2509 
2510  SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2511  U, DAG.getConstant(40, SL, MVT::i64));
2512 
2513  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2514  DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2515  DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
2516 
// Round to nearest, ties to even: add 1 if above the halfway point,
// V & 1 if exactly halfway, 0 otherwise.
2517  SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2518  SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2519  SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2520 
2521  SDValue One = DAG.getConstant(1, SL, MVT::i32);
2522 
2523  SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2524 
2525  SDValue R = DAG.getSelect(SL, MVT::i32,
2526  RCmp,
2527  One,
2528  DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2529  R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2530  R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2531 
2532  if (!Signed)
2533  return R;
2534 
// Signed: restore the sign stripped off above.
2535  SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2536  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2537 }
2538 
// i64 -> f64 conversion: convert the two 32-bit halves separately and
// combine as convert(hi) * 2^32 + convert(lo).  Only the high half uses
// the signed conversion; the low half is always unsigned.
2540  bool Signed) const {
2541  SDLoc SL(Op);
2542  SDValue Src = Op.getOperand(0);
2543 
2544  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2545 
2547  DAG.getConstant(0, SL, MVT::i32));
2549  DAG.getConstant(1, SL, MVT::i32));
2550 
2552  SL, MVT::f64, Hi);
2553 
2554  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2555 
// Scale the converted high half by 2^32 via ldexp.
2556  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2557  DAG.getConstant(32, SL, MVT::i32));
2558  // TODO: Should this propagate fast-math-flags?
2559  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2560 }
2561 
// Custom UINT_TO_FP lowering: i16 sources are promoted to i32; i64 sources
// are dispatched to the f32/f64 software expansions below.
2563  SelectionDAG &DAG) const {
2564  // TODO: Factor out code common with LowerSINT_TO_FP.
2565  EVT DestVT = Op.getValueType();
2566  SDValue Src = Op.getOperand(0);
2567  EVT SrcVT = Src.getValueType();
2568 
2569  if (SrcVT == MVT::i16) {
// i16 -> f16 is handled natively by instruction selection.
2570  if (DestVT == MVT::f16)
2571  return Op;
2572  SDLoc DL(Op);
2573 
2574  // Promote src to i32
2576  return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2577  }
2578 
2579  assert(SrcVT == MVT::i64 && "operation should be legal");
2580 
// i64 -> f16: go through f32 and round down to f16.
2581  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2582  SDLoc DL(Op);
2583 
2584  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2585  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2586  SDValue FPRound =
2587  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2588 
2589  return FPRound;
2590  }
2591 
2592  if (DestVT == MVT::f32)
2593  return LowerINT_TO_FP32(Op, DAG, false);
2594 
2595  assert(DestVT == MVT::f64);
2596  return LowerINT_TO_FP64(Op, DAG, false);
2597 }
2598 
// Custom SINT_TO_FP lowering; mirrors LowerUINT_TO_FP with the signed
// variants of the promotions and expansions.
2600  SelectionDAG &DAG) const {
2601  EVT DestVT = Op.getValueType();
2602 
2603  SDValue Src = Op.getOperand(0);
2604  EVT SrcVT = Src.getValueType();
2605 
2606  if (SrcVT == MVT::i16) {
// i16 -> f16 is handled natively by instruction selection.
2607  if (DestVT == MVT::f16)
2608  return Op;
2609 
2610  SDLoc DL(Op);
2611  // Promote src to i32
2613  return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2614  }
2615 
2616  assert(SrcVT == MVT::i64 && "operation should be legal");
2617 
2618  // TODO: Factor out code common with LowerUINT_TO_FP.
2619 
// i64 -> f16: go through f32 and round down to f16.
2620  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2621  SDLoc DL(Op);
// NOTE(review): this local Src redundantly shadows the identical outer Src.
2622  SDValue Src = Op.getOperand(0);
2623 
2624  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2625  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2626  SDValue FPRound =
2627  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2628 
2629  return FPRound;
2630  }
2631 
2632  if (DestVT == MVT::f32)
2633  return LowerINT_TO_FP32(Op, DAG, true);
2634 
2635  assert(DestVT == MVT::f64);
2636  return LowerINT_TO_FP64(Op, DAG, true);
2637 }
2638 
2640  bool Signed) const {
2641  SDLoc SL(Op);
2642 
2643  SDValue Src = Op.getOperand(0);
2644  EVT SrcVT = Src.getValueType();
2645 
2646  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2647 
2648  // The basic idea of converting a floating point number into a pair of 32-bit
2649  // integers is illustrated as follows:
2650  //
2651  // tf := trunc(val);
2652  // hif := floor(tf * 2^-32);
2653  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2654  // hi := fptoi(hif);
2655  // lo := fptoi(lof);
2656  //
2657  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2658  SDValue Sign;
2659  if (Signed && SrcVT == MVT::f32) {
2660  // However, a 32-bit floating point number has only 23 bits mantissa and
2661  // it's not enough to hold all the significant bits of `lof` if val is
2662  // negative. To avoid the loss of precision, We need to take the absolute
2663  // value after truncating and flip the result back based on the original
2664  // signedness.
2665  Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2666  DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2667  DAG.getConstant(31, SL, MVT::i32));
2668  Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2669  }
2670 
2671  SDValue K0, K1;
2672  if (SrcVT == MVT::f64) {
2673  K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2674  SL, SrcVT);
2675  K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2676  SL, SrcVT);
2677  } else {
2678  K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2679  SrcVT);
2680  K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2681  SrcVT);
2682  }
2683  // TODO: Should this propagate fast-math-flags?
2684  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2685 
2686  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2687 
2688  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2689 
2690  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2691  : ISD::FP_TO_UINT,
2692  SL, MVT::i32, FloorMul);
2693  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2694 
2695  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2696  DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2697 
2698  if (Signed && SrcVT == MVT::f32) {
2699  assert(Sign);
2700  // Flip the result based on the signedness, which is either all 0s or 1s.
2701  Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2702  DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2703  // r := xor(r, sign) - sign;
2704  Result =
2705  DAG.getNode(ISD::SUB, SL, MVT::i64,
2706  DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2707  }
2708 
2709  return Result;
2710 }
2711 
// FP_TO_FP16 lowering.  f32 sources map to the target node; f64 sources are
// converted by bit-level manipulation with round-to-nearest-even semantics.
2713  SDLoc DL(Op);
2714  SDValue N0 = Op.getOperand(0);
2715 
2716  // Convert to target node to get known bits
2717  if (N0.getValueType() == MVT::f32)
2718  return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2719 
2720  if (getTargetMachine().Options.UnsafeFPMath) {
2721  // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2722  return SDValue();
2723  }
2724 
2726 
2727  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2728  const unsigned ExpMask = 0x7ff;
2729  const unsigned ExpBiasf64 = 1023;
2730  const unsigned ExpBiasf16 = 15;
2731  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2732  SDValue One = DAG.getConstant(1, DL, MVT::i32);
// U = low 32 bits of the f64 pattern, UH = high 32 bits (sign/exp/mantissa
// top bits).
2733  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2734  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2735  DAG.getConstant(32, DL, MVT::i64));
2736  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2737  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2738  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2739  DAG.getConstant(20, DL, MVT::i64));
2740  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2741  DAG.getConstant(ExpMask, DL, MVT::i32));
2742  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2743  // add the f16 bias (15) to get the biased exponent for the f16 format.
2744  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2745  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2746 
// M = top mantissa bits positioned for the f16 significand (bit 0 left
// clear for the round bit ORed in below).
2747  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2748  DAG.getConstant(8, DL, MVT::i32));
2749  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2750  DAG.getConstant(0xffe, DL, MVT::i32));
2751 
// Sticky bit: set if any of the low 40 mantissa bits are nonzero.
2752  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2753  DAG.getConstant(0x1ff, DL, MVT::i32));
2754  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2755 
2756  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2757  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2758 
2759  // (M != 0 ? 0x0200 : 0) | 0x7c00;
// I = the f16 Inf/NaN pattern, preserving NaN-ness via the quiet bit.
2760  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2761  DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2762  Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2763 
2764  // N = M | (E << 12);
2765  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2766  DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2767  DAG.getConstant(12, DL, MVT::i32)));
2768 
2769  // B = clamp(1-E, 0, 13);
2770  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2771  One, E);
2772  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2773  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2774  DAG.getConstant(13, DL, MVT::i32));
2775 
// Denormal path: restore the implicit leading one, shift right by B, and
// set a sticky bit if any ones were shifted out (D0 != SigSetHigh).
2776  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2777  DAG.getConstant(0x1000, DL, MVT::i32));
2778 
2779  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2780  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2781  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2782  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2783 
// Pick the denormal (E < 1) or normal encoding, then round to nearest even
// using the two guard bits kept at the bottom of V.
2784  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2785  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2786  DAG.getConstant(0x7, DL, MVT::i32));
2787  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2788  DAG.getConstant(2, DL, MVT::i32));
2789  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2790  One, Zero, ISD::SETEQ);
2791  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2792  One, Zero, ISD::SETGT);
2793  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2794  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2795 
// E > 30 overflows the f16 exponent: return infinity (0x7c00).
// E == 1039 (0x7ff - 1023 + 15, the rebiased all-ones exponent) means the
// source was Inf/NaN: return the Inf/NaN pattern I built above.
2796  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2797  DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2798  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2799  I, V, ISD::SETEQ);
2800 
2801  // Extract the sign bit.
2802  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2803  DAG.getConstant(16, DL, MVT::i32));
2804  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2805  DAG.getConstant(0x8000, DL, MVT::i32));
2806 
2807  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2808  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2809 }
2810 
// Common custom lowering for FP_TO_SINT / FP_TO_UINT: handles i16 results
// by truncating an i32 conversion, f16 sources by extending an i32
// conversion, and i64 results via LowerFP_TO_INT64.
2812  SelectionDAG &DAG) const {
2813  SDValue Src = Op.getOperand(0);
2814  unsigned OpOpcode = Op.getOpcode();
2815  EVT SrcVT = Src.getValueType();
2816  EVT DestVT = Op.getValueType();
2817 
2818  // Will be selected natively
2819  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2820  return Op;
2821 
2822  // Promote i16 to i32
2823  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2824  SDLoc DL(Op);
2825 
2826  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2827  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2828  }
2829 
// f16 source (possibly disguised as fpext from f16): an i32 conversion
// covers the full f16 range, so just extend it to i64.
2830  if (SrcVT == MVT::f16 ||
2831  (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2832  SDLoc DL(Op);
2833 
2834  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2835  unsigned Ext =
2837  return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2838  }
2839 
2840  if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2841  return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2842 
2843  return SDValue();
2844 }
2845 
// Lower vector SIGN_EXTEND_INREG by scalarizing: extract every element,
// sign-extend each in-register, and rebuild the vector.
2847  SelectionDAG &DAG) const {
2848  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2849  MVT VT = Op.getSimpleValueType();
2850  MVT ScalarVT = VT.getScalarType();
2851 
2852  assert(VT.isVector());
2853 
2854  SDValue Src = Op.getOperand(0);
2855  SDLoc DL(Op);
2856 
2857  // TODO: Don't scalarize on Evergreen?
2858  unsigned NElts = VT.getVectorNumElements();
2860  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2861 
2862  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2863  for (unsigned I = 0; I < NElts; ++I)
2864  Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2865 
2866  return DAG.getBuildVector(VT, DL, Args);
2867 }
2868 
2869 //===----------------------------------------------------------------------===//
2870 // Custom DAG optimizations
2871 //===----------------------------------------------------------------------===//
2872 
2873 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2874  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2875 }
2876 
2877 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2878  EVT VT = Op.getValueType();
2879  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2880  // as unsigned 24-bit values.
2882 }
2883 
// Simplify a 24-bit multiply node (either the target MUL_*24 node or the
// amdgcn_mul_{i,u}24 intrinsic) by shrinking its operands: only the low
// 24 bits of each operand are demanded.
2886  SelectionDAG &DAG = DCI.DAG;
2887  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2888  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2889 
// Intrinsic form carries the intrinsic ID in operand 0, so the multiply
// operands shift up by one.
2890  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2891  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2892  unsigned NewOpcode = Node24->getOpcode();
2893  if (IsIntrin) {
2894  unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2895  NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2897  }
2898 
2899  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2900 
2901  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2902  // the operands to have other uses, but will only perform simplifications that
2903  // involve bypassing some nodes for this user.
2904  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2905  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2906  if (DemandedLHS || DemandedRHS)
2907  return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2908  DemandedLHS ? DemandedLHS : LHS,
2909  DemandedRHS ? DemandedRHS : RHS);
2910 
2911  // Now try SimplifyDemandedBits which can simplify the nodes used by our
2912  // operands if this node is the only user.
2913  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2914  return SDValue(Node24, 0);
2915  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2916  return SDValue(Node24, 0);
2917 
2918  return SDValue();
2919 }
2920 
2921 template <typename IntTy>
// Constant-fold a 32-bit bitfield extract of \p Src0 at [Offset,
// Offset+Width).  IntTy selects signed (arithmetic shift => sign-extended
// field) or unsigned extraction.
2923  uint32_t Width, const SDLoc &DL) {
// Field ends below bit 31: shift it to the top, then shift back down so
// the IntTy right-shift performs the sign/zero extension.
2924  if (Width + Offset < 32) {
2925  uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2926  IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2927  return DAG.getConstant(Result, DL, MVT::i32);
2928  }
2929 
// Field reaches bit 31: a plain shift by Offset extracts it exactly.
2930  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2931 }
2932 
2933 static bool hasVolatileUser(SDNode *Val) {
2934  for (SDNode *U : Val->uses()) {
2935  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2936  if (M->isVolatile())
2937  return true;
2938  }
2939  }
2940 
2941  return false;
2942 }
2943 
// Decide whether a load/store of \p VT should be rewritten as an access of
// an equivalent i32-based type (see getEquivalentMemType).
2945  // i32 vectors are the canonical memory type.
2946  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2947  return false;
2948 
2949  if (!VT.isByteSized())
2950  return false;
2951 
2952  unsigned Size = VT.getStoreSize();
2953 
// Scalar 1/2/4-byte accesses are already fine as-is.
2954  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2955  return false;
2956 
// Reject sizes that don't round-trip through i32 elements (3 bytes, or
// larger sizes that aren't a multiple of 4).
2957  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2958  return false;
2959 
2960  return true;
2961 }
2962 
2963 // Replace load of an illegal type with a store of a bitcast to a friendlier
2964 // type.
// (Also expands sufficiently-slow unaligned loads before legalization.)
2966  DAGCombinerInfo &DCI) const {
2967  if (!DCI.isBeforeLegalize())
2968  return SDValue();
2969 
// Only simple, normal, non-volatile-observed loads are safe to rewrite.
2970  LoadSDNode *LN = cast<LoadSDNode>(N);
2971  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2972  return SDValue();
2973 
2974  SDLoc SL(N);
2975  SelectionDAG &DAG = DCI.DAG;
2976  EVT VT = LN->getMemoryVT();
2977 
2978  unsigned Size = VT.getStoreSize();
2979  Align Alignment = LN->getAlign();
2980  if (Alignment < Size && isTypeLegal(VT)) {
2981  bool IsFast;
2982  unsigned AS = LN->getAddressSpace();
2983 
2984  // Expand unaligned loads earlier than legalization. Due to visitation order
2985  // problems during legalization, the emitted instructions to pack and unpack
2986  // the bytes again are not eliminated in the case of an unaligned copy.
2988  VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2989  SDValue Ops[2];
2990 
2991  if (VT.isVector())
2992  std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
2993  else
2994  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2995 
2996  return DAG.getMergeValues(Ops, SDLoc(N));
2997  }
2998 
2999  if (!IsFast)
3000  return SDValue();
3001  }
3002 
3003  if (!shouldCombineMemoryType(VT))
3004  return SDValue();
3005 
// Load as the equivalent i32-based type, then bitcast back to VT.
3006  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3007 
3008  SDValue NewLoad
3009  = DAG.getLoad(NewVT, SL, LN->getChain(),
3010  LN->getBasePtr(), LN->getMemOperand());
3011 
// CombineTo rewires both the value and the chain result of the old load.
3012  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3013  DCI.CombineTo(N, BC, NewLoad.getValue(1));
3014  return SDValue(N, 0);
3015 }
3016 
3017 // Replace store of an illegal type with a store of a bitcast to a friendlier
3018 // type.
// (Also expands sufficiently-slow unaligned stores before legalization.)
3020  DAGCombinerInfo &DCI) const {
3021  if (!DCI.isBeforeLegalize())
3022  return SDValue();
3023 
3024  StoreSDNode *SN = cast<StoreSDNode>(N);
3025  if (!SN->isSimple() || !ISD::isNormalStore(SN))
3026  return SDValue();
3027 
3028  EVT VT = SN->getMemoryVT();
3029  unsigned Size = VT.getStoreSize();
3030 
3031  SDLoc SL(N);
3032  SelectionDAG &DAG = DCI.DAG;
3033  Align Alignment = SN->getAlign();
3034  if (Alignment < Size && isTypeLegal(VT)) {
3035  bool IsFast;
3036  unsigned AS = SN->getAddressSpace();
3037 
3038  // Expand unaligned stores earlier than legalization. Due to visitation
3039  // order problems during legalization, the emitted instructions to pack and
3040  // unpack the bytes again are not eliminated in the case of an unaligned
3041  // copy.
3043  VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3044  if (VT.isVector())
3045  return scalarizeVectorStore(SN, DAG);
3046 
3047  return expandUnalignedStore(SN, DAG);
3048  }
3049 
3050  if (!IsFast)
3051  return SDValue();
3052  }
3053 
3054  if (!shouldCombineMemoryType(VT))
3055  return SDValue();
3056 
3057  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3058  SDValue Val = SN->getValue();
3059 
3060  //DCI.AddToWorklist(Val.getNode());
3061 
// If the stored value has other users, rewrite them through a bitcast
// round-trip so the original-typed value is still available.
3062  bool OtherUses = !Val.hasOneUse();
3063  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3064  if (OtherUses) {
3065  SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3066  DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3067  }
3068 
3069  return DAG.getStore(SN->getChain(), SL, CastVal,
3070  SN->getBasePtr(), SN->getMemOperand());
3071 }
3072 
3073 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3074 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3075 // issues.
// Hoist an AssertZext/AssertSext above a truncate when the pre-truncate
// type can carry the assertion.
3077  DAGCombinerInfo &DCI) const {
3078  SelectionDAG &DAG = DCI.DAG;
3079  SDValue N0 = N->getOperand(0);
3080 
3081  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3082  // (vt2 (truncate (assertzext vt0:x, vt1)))
3083  if (N0.getOpcode() == ISD::TRUNCATE) {
3084  SDValue N1 = N->getOperand(1);
3085  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3086  SDLoc SL(N);
3087 
3088  SDValue Src = N0.getOperand(0);
3089  EVT SrcVT = Src.getValueType();
// Only valid when the source type is at least as wide as the asserted type.
3090  if (SrcVT.bitsGE(ExtVT)) {
3091  SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3092  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3093  }
3094  }
3095 
3096  return SDValue();
3097 }
3098 
// DAG combine for chainless AMDGPU intrinsics: shrink 24-bit multiplies and
// fold the listed unary math intrinsics of undef to undef.
3100  SDNode *N, DAGCombinerInfo &DCI) const {
3101  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3102  switch (IID) {
3103  case Intrinsic::amdgcn_mul_i24:
3104  case Intrinsic::amdgcn_mul_u24:
3105  return simplifyMul24(N, DCI);
3106  case Intrinsic::amdgcn_fract:
3107  case Intrinsic::amdgcn_rsq:
3108  case Intrinsic::amdgcn_rcp_legacy:
3109  case Intrinsic::amdgcn_rsq_legacy:
3110  case Intrinsic::amdgcn_rsq_clamp:
3111  case Intrinsic::amdgcn_ldexp: {
3112  // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3113  SDValue Src = N->getOperand(1);
3114  return Src.isUndef() ? Src : SDValue();
3115  }
3116  default:
3117  return SDValue();
3118  }
3119 }
3120 
3121 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3122 /// binary operation \p Opc to it with the corresponding constant operands.
3124  DAGCombinerInfo &DCI, const SDLoc &SL,
3125  unsigned Opc, SDValue LHS,
3126  uint32_t ValLo, uint32_t ValHi) const {
3127  SelectionDAG &DAG = DCI.DAG;
3128  SDValue Lo, Hi;
3129  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3130 
3131  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3132  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3133 
// Apply the operation to each half independently.
3134  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3135  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3136 
3137  // Re-visit the ands. It's possible we eliminated one of them and it could
3138  // simplify the vector.
3139  DCI.AddToWorklist(Lo.getNode());
3140  DCI.AddToWorklist(Hi.getNode());
3141 
// Reassemble the halves into an i64 result.
3142  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3143  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3144 }
3145 
3147  DAGCombinerInfo &DCI) const {
3148  EVT VT = N->getValueType(0);
3149 
3150  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3151  if (!RHS)
3152  return SDValue();
3153 
3154  SDValue LHS = N->getOperand(0);
3155  unsigned RHSVal = RHS->getZExtValue();
3156  if (!RHSVal)
3157  return LHS;
3158 
3159  SDLoc SL(N);
3160  SelectionDAG &DAG = DCI.DAG;
3161 
3162  switch (LHS->getOpcode()) {
3163  default:
3164  break;
3165  case ISD::ZERO_EXTEND:
3166  case ISD::SIGN_EXTEND:
3167  case ISD::ANY_EXTEND: {
3168  SDValue X = LHS->getOperand(0);
3169 
3170  if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3172  // Prefer build_vector as the canonical form if packed types are legal.
3173  // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
3174  SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3175  { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3176  return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3177  }
3178 
3179  // shl (ext x) => zext (shl x), if shift does not overflow int
3180  if (VT != MVT::i64)
3181  break;
3182  KnownBits Known = DAG.computeKnownBits(X);
3183  unsigned LZ = Known.countMinLeadingZeros();
3184  if (LZ < RHSVal)
3185  break;
3186  EVT XVT = X.getValueType();
3187  SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3188  return DAG.getZExtOrTrunc(Shl, SL, VT);
3189  }
3190  }
3191 
3192  if (VT != MVT::i64)
3193  return SDValue();
3194 
3195  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3196 
3197  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3198  // common case, splitting this into a move and a 32-bit shift is faster and
3199  // the same code size.
3200  if (RHSVal < 32)
3201  return SDValue();
3202 
3203  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3204 
3205  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3206  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3207 
3208  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3209 
3210  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3211  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3212 }
3213 
// DAG combine for i64 SRA by the two special constants 32 and 63, replacing
// the wide shift with 32-bit operations on the high half.
3215  DAGCombinerInfo &DCI) const {
3216  if (N->getValueType(0) != MVT::i64)
3217  return SDValue();
3218 
3219  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3220  if (!RHS)
3221  return SDValue();
3222 
3223  SelectionDAG &DAG = DCI.DAG;
3224  SDLoc SL(N);
3225  unsigned RHSVal = RHS->getZExtValue();
3226 
3227  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3228  if (RHSVal == 32) {
3229  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3230  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3231  DAG.getConstant(31, SL, MVT::i32));
3232 
3233  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3234  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3235  }
3236 
3237  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
// Shifting by 63 broadcasts the sign bit into both halves.
3238  if (RHSVal == 63) {
3239  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3240  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3241  DAG.getConstant(31, SL, MVT::i32));
3242  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3243  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3244  }
3245 
3246  return SDValue();
3247 }
3248 
3250  DAGCombinerInfo &DCI) const {
3251  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3252  if (!RHS)
3253  return SDValue();
3254 
3255  EVT VT = N->getValueType(0);
3256  SDValue LHS = N->getOperand(0);
3257  unsigned ShiftAmt = RHS->getZExtValue();
3258  SelectionDAG &DAG = DCI.DAG;
3259  SDLoc SL(N);
3260 
3261  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3262  // this improves the ability to match BFE patterns in isel.
3263  if (LHS.getOpcode() == ISD::AND) {
3264  if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3265  if (Mask->getAPIntValue().isShiftedMask() &&
3266  Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3267  return DAG.getNode(
3268  ISD::AND, SL, VT,
3269  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3270  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3271  }
3272  }
3273  }
3274 
3275  if (VT != MVT::i64)
3276  return SDValue();
3277 
3278  if (ShiftAmt < 32)
3279  return SDValue();
3280 
3281  // srl i64:x, C for C >= 32
3282  // =>
3283  // build_pair (srl hi_32(x), C - 32), 0
3284  SDValue One = DAG.getConstant(1, SL, MVT::i32);
3285  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3286 
3287  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3288  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3289 
3290  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3291  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3292 
3293  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3294 
3295  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3296 }
3297 
3299  SDNode *N, DAGCombinerInfo &DCI) const {
3300  SDLoc SL(N);
3301  SelectionDAG &DAG = DCI.DAG;
3302  EVT VT = N->getValueType(0);
3303  SDValue Src = N->getOperand(0);
3304 
3305  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3306  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3307  SDValue Vec = Src.getOperand(0);
3308  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3309  SDValue Elt0 = Vec.getOperand(0);
3310  EVT EltVT = Elt0.getValueType();
3311  if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3312  if (EltVT.isFloatingPoint()) {
3313  Elt0 = DAG.getNode(ISD::BITCAST, SL,
3314  EltVT.changeTypeToInteger(), Elt0);
3315  }
3316 
3317  return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3318  }
3319  }
3320  }
3321 
3322  // Equivalent of above for accessing the high element of a vector as an
3323  // integer operation.
3324  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3325  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3326  if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3327  if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3328  SDValue BV = stripBitcast(Src.getOperand(0));
3329  if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3330  BV.getValueType().getVectorNumElements() == 2) {
3331  SDValue SrcElt = BV.getOperand(1);
3332  EVT SrcEltVT = SrcElt.getValueType();
3333  if (SrcEltVT.isFloatingPoint()) {
3334  SrcElt = DAG.getNode(ISD::BITCAST, SL,
3335  SrcEltVT.changeTypeToInteger(), SrcElt);
3336  }
3337 
3338  return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3339  }
3340  }
3341  }
3342  }
3343 
3344  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3345  //
3346  // i16 (trunc (srl i64:x, K)), K <= 16 ->
3347  // i16 (trunc (srl (i32 (trunc x), K)))
3348  if (VT.getScalarSizeInBits() < 32) {
3349  EVT SrcVT = Src.getValueType();
3350  if (SrcVT.getScalarSizeInBits() > 32 &&
3351  (Src.getOpcode() == ISD::SRL ||
3352  Src.getOpcode() == ISD::SRA ||
3353  Src.getOpcode() == ISD::SHL)) {
3354  SDValue Amt = Src.getOperand(1);
3355  KnownBits Known = DAG.computeKnownBits(Amt);
3356  unsigned Size = VT.getScalarSizeInBits();
3357  if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3358  (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3359  EVT MidVT = VT.isVector() ?
3362 
3363  EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3364  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3365  Src.getOperand(0));
3366  DCI.AddToWorklist(Trunc.getNode());
3367 
3368  if (Amt.getValueType() != NewShiftVT) {
3369  Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3370  DCI.AddToWorklist(Amt.getNode());
3371  }
3372 
3373  SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3374  Trunc, Amt);
3375  return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3376  }
3377  }
3378  }
3379 
3380  return SDValue();
3381 }
3382 
3383 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3384 // instructions. If we only match on the legalized i64 mul expansion,
3385 // SimplifyDemandedBits will be unable to remove them because there will be
3386 // multiple uses due to the separate mul + mulh[su].
3387 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3388  SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3389  if (Size <= 32) {
3390  unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3391  return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3392  }
3393 
3394  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3395  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3396 
3397  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3398  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3399 
3400  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3401 }
3402 
3404  DAGCombinerInfo &DCI) const {
3405  EVT VT = N->getValueType(0);
3406 
3407  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3408  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3409  // unnecessarily). isDivergent() is used as an approximation of whether the
3410  // value is in an SGPR.
3411  if (!N->isDivergent())
3412  return SDValue();
3413 
3414  unsigned Size = VT.getSizeInBits();
3415  if (VT.isVector() || Size > 64)
3416  return SDValue();
3417 
3418  // There are i16 integer mul/mad.
3419  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3420  return SDValue();
3421 
3422  SelectionDAG &DAG = DCI.DAG;
3423  SDLoc DL(N);
3424 
3425  SDValue N0 = N->getOperand(0);
3426  SDValue N1 = N->getOperand(1);
3427 
3428  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3429  // in the source into any_extends if the result of the mul is truncated. Since
3430  // we can assume the high bits are whatever we want, use the underlying value
3431  // to avoid the unknown high bits from interfering.
3432  if (N0.getOpcode() == ISD::ANY_EXTEND)
3433  N0 = N0.getOperand(0);
3434 
3435  if (N1.getOpcode() == ISD::ANY_EXTEND)
3436  N1 = N1.getOperand(0);
3437 
3438  SDValue Mul;
3439 
3440  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3441  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3442  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3443  Mul = getMul24(DAG, DL, N0, N1, Size, false);
3444  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3445  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3446  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3447  Mul = getMul24(DAG, DL, N0, N1, Size, true);
3448  } else {
3449  return SDValue();
3450  }
3451 
3452  // We need to use sext even for MUL_U24, because MUL_U24 is used
3453  // for signed multiply of 8 and 16-bit types.
3454  return DAG.getSExtOrTrunc(Mul, DL, VT);
3455 }
3456 
3458  DAGCombinerInfo &DCI) const {
3459  EVT VT = N->getValueType(0);
3460 
3461  if (!Subtarget->hasMulI24() || VT.isVector())
3462  return SDValue();
3463 
3464  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3465  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3466  // unnecessarily). isDivergent() is used as an approximation of whether the
3467  // value is in an SGPR.
3468  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3469  // valu op anyway)
3470  if (Subtarget->hasSMulHi() && !N->isDivergent())
3471  return SDValue();
3472 
3473  SelectionDAG &DAG = DCI.DAG;
3474  SDLoc DL(N);
3475 
3476  SDValue N0 = N->getOperand(0);
3477  SDValue N1 = N->getOperand(1);
3478 
3479  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3480  return SDValue();
3481 
3482  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3483  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3484 
3485  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3486  DCI.AddToWorklist(Mulhi.getNode());
3487  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3488 }
3489 
3491  DAGCombinerInfo &DCI) const {
3492  EVT VT = N->getValueType(0);
3493 
3494  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3495  return SDValue();
3496 
3497  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3498  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3499  // unnecessarily). isDivergent() is used as an approximation of whether the
3500  // value is in an SGPR.
3501  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3502  // valu op anyway)
3503  if (Subtarget->hasSMulHi() && !N->isDivergent())
3504  return SDValue();
3505 
3506  SelectionDAG &DAG = DCI.DAG;
3507  SDLoc DL(N);
3508 
3509  SDValue N0 = N->getOperand(0);
3510  SDValue N1 = N->getOperand(1);
3511 
3512  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3513  return SDValue();
3514 
3515  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3516  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3517 
3518  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3519  DCI.AddToWorklist(Mulhi.getNode());
3520  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3521 }
3522 
3523 static bool isNegativeOne(SDValue Val) {
3524  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3525  return C->isAllOnesValue();
3526  return false;
3527 }
3528 
3529 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3530  SDValue Op,
3531  const SDLoc &DL,
3532  unsigned Opc) const {
3533  EVT VT = Op.getValueType();
3534  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3535  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3536  LegalVT != MVT::i16))
3537  return SDValue();
3538 
3539  if (VT != MVT::i32)
3541 
3542  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3543  if (VT != MVT::i32)
3544  FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3545 
3546  return FFBX;
3547 }
3548 
3549 // The native instructions return -1 on 0 input. Optimize out a select that
3550 // produces -1 on 0.
3551 //
3552 // TODO: If zero is not undef, we could also do this if the output is compared
3553 // against the bitwidth.
3554 //
3555 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3557  SDValue LHS, SDValue RHS,
3558  DAGCombinerInfo &DCI) const {
3559  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3560  if (!CmpRhs || !CmpRhs->isNullValue())
3561  return SDValue();
3562 
3563  SelectionDAG &DAG = DCI.DAG;
3564  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3565  SDValue CmpLHS = Cond.getOperand(0);
3566 
3567  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3568  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3569  if (CCOpcode == ISD::SETEQ &&
3570  (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3571  RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3572  unsigned Opc =
3574  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3575  }
3576 
3577  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3578  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3579  if (CCOpcode == ISD::SETNE &&
3580  (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3581  LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3582  unsigned Opc =
3584 
3585  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3586  }
3587 
3588  return SDValue();
3589 }
3590 
3592  unsigned Op,
3593  const SDLoc &SL,
3594  SDValue Cond,
3595  SDValue N1,
3596  SDValue N2) {
3597  SelectionDAG &DAG = DCI.DAG;
3598  EVT VT = N1.getValueType();
3599 
3600  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3601  N1.getOperand(0), N2.getOperand(0));
3602  DCI.AddToWorklist(NewSelect.getNode());
3603  return DAG.getNode(Op, SL, VT, NewSelect);
3604 }
3605 
3606 // Pull a free FP operation out of a select so it may fold into uses.
3607 //
3608 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3609 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3610 //
3611 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3612 // select c, (fabs x), +k -> fabs (select c, x, k)
3614  SDValue N) {
3615  SelectionDAG &DAG = DCI.DAG;
3616  SDValue Cond = N.getOperand(0);
3617  SDValue LHS = N.getOperand(1);
3618  SDValue RHS = N.getOperand(2);
3619 
3620  EVT VT = N.getValueType();
3621  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3622  (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3623  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3624  SDLoc(N), Cond, LHS, RHS);
3625  }
3626 
3627  bool Inv = false;
3628  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3629  std::swap(LHS, RHS);
3630  Inv = true;
3631  }
3632 
3633  // TODO: Support vector constants.
3634  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3635  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3636  SDLoc SL(N);
3637  // If one side is an fneg/fabs and the other is a constant, we can push the
3638  // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3639  SDValue NewLHS = LHS.getOperand(0);
3640  SDValue NewRHS = RHS;
3641 
3642  // Careful: if the neg can be folded up, don't try to pull it back down.
3643  bool ShouldFoldNeg = true;
3644 
3645  if (NewLHS.hasOneUse()) {
3646  unsigned Opc = NewLHS.getOpcode();
3647  if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3648  ShouldFoldNeg = false;
3649  if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3650  ShouldFoldNeg = false;
3651  }
3652 
3653  if (ShouldFoldNeg) {
3654  if (LHS.getOpcode() == ISD::FNEG)
3655  NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3656  else if (CRHS->isNegative())
3657  return SDValue();
3658 
3659  if (Inv)
3660  std::swap(NewLHS, NewRHS);
3661 
3662  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3663  Cond, NewLHS, NewRHS);
3664  DCI.AddToWorklist(NewSelect.getNode());
3665  return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3666  }
3667  }
3668 
3669  return SDValue();
3670 }
3671 
3672 
3674  DAGCombinerInfo &DCI) const {
3675  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3676  return Folded;
3677 
3678  SDValue Cond = N->getOperand(0);
3679  if (Cond.getOpcode() != ISD::SETCC)
3680  return SDValue();
3681 
3682  EVT VT = N->getValueType(0);
3683  SDValue LHS = Cond.getOperand(0);
3684  SDValue RHS = Cond.getOperand(1);
3685  SDValue CC = Cond.getOperand(2);
3686