//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
  "amdgpu-bypass-slow-div",
  cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
  cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
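  // e.g. an f32 store takes the i32 path above, while a 64-bit v4f16 store
  // uses v2i32.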
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
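  // e.g. an i32 value known to lie in [-2^23, 2^23) has at least 9 sign
  // bits, so this returns at most 23.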
  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
  }

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
  }

  // This is totally unsupported, just custom lower to produce an error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
  setOperationAction(ISD::FEXP2, MVT::f32, Legal);
  setOperationAction(ISD::FPOW, MVT::f32, Legal);
  setOperationAction(ISD::FLOG2, MVT::f32, Legal);
  setOperationAction(ISD::FABS, MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FRINT, MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction(ISD::ADDC, VT, Legal);
    setOperationAction(ISD::SUBC, VT, Legal);
    setOperationAction(ISD::ADDE, VT, Legal);
    setOperationAction(ISD::SUBE, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};

  for (MVT VT : FloatVectorTypes) {
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  // There are no libcalls of any kind.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);

  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::MULHU);
  setTargetDAGCombine(ISD::MULHS);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FNEG);
  setTargetDAGCombine(ISD::FABS);
  setTargetDAGCombine(ISD::AssertZext);
  setTargetDAGCombine(ISD::AssertSext);
  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOp(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READNONE
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return N->getNumOperands() > 2 || VT == MVT::f64;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READNONE
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::SELECT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and each use would require a VOP3 encoding, there will be
  // a code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
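  // e.g. i33 and i48 both round up to i64; i20 takes the i32 path above.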
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) &&
        AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  bool Fast = false;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
// FIXME: These should really have the size as a parameter.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    switch (IntrID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
    return false;
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF.getTarget(), Fn);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(MemArgTy);
    MaxAlign = max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5);
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument corresponding to this frame
  // index.
  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
                            UE = DAG.getEntryNode().getNode()->use_end();
       U != UE; ++U) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG:
    return LowerFLOG(Op, DAG, numbers::ln2f);
  case ISD::FLOG10:
    return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
  case ISD::FEXP:
    return lowerFEXP(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  default:
    return;
  }
}

static bool hasDefinedInitializer(const GlobalValue *GV) {
  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer())
    return false;

  return !isa<UndefValue>(GVar->getInitializer());
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;

  EVT VT = Op.getValueType();
  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    SDLoc SL(Op);
    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));

    SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  // For these types, we have some TableGen patterns except if the index is 1
  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
       (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
      Start != 1)
    return Op;

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

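// Split a 64-bit value into its 32-bit lo and hi halves by bitcasting it to
// v2i32 and extracting both elements.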
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
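  // e.g. 3 elements split as (2, scalar), 5 as (4, scalar), and 8 evenly
  // as (4, 4).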
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::make_pair(LoVT, HiVT);
}

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  unsigned BaseAlign = Load->getAlignment();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  unsigned BaseAlign = Load->getAlignment();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < 8 &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

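  // e.g. a sufficiently aligned v3i32 load is emitted as a single v4i32
  // load, and the extra element is dropped by the EXTRACT_SUBVECTOR below.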
  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  unsigned BaseAlign = Store->getAlignment();
  unsigned Size = LoMemVT.getStoreSize();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}

// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;
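  // With at least 9 sign bits on each 32-bit operand, the magnitudes fit in
  // 23 bits, so DivBits is at most 24 and the f32 significand (24 bits) can
  // represent the quotient exactly.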

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
                    (unsigned)ISD::FMA :
                    !MFI->getMode().allFP32Denormals() ?
                    (unsigned)ISD::FMAD :
                    (unsigned)AMDGPUISD::FMAD_FTZ;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}

void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS = Op.getOperand(0);
  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);

  SDValue RHS = Op.getOperand(1);
  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);

  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    MachineFunction &MF = DAG.getMachineFunction();
    const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

    // Compute denominator reciprocal.
    unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
                    (unsigned)ISD::FMA :
                    !MFI->getMode().allFP32Denormals() ?
                    (unsigned)ISD::FMAD :
                    (unsigned)AMDGPUISD::FMAD_FTZ;

    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
        DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
        Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
        DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
        DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
        DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
        Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

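    // The f32 constants above are power-of-two bit patterns: 0x4f800000 is
    // 2^32, 0x2f800000 is 2^-32, 0xcf800000 is -2^32, and 0x5f7ffffc is just
    // below 2^64. In effect the sequence computes a fixed-point estimate of
    // 2^64 / RHS, with Rcp_Hi and Rcp_Lo holding its two 32-bit halves.
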
    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64 = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    Zero);
    SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    One);

    SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
    SDValue Add1 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    Zero);
    SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    One);

    SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
                                   Mulhi2_Hi, Add1_Lo.getValue(1));
    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
                                  Zero, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
  // Get Speculative values
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

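  // The loop below performs classic restoring long division: each iteration
  // shifts the next dividend bit into REM and, whenever REM >= RHS, subtracts
  // RHS and sets the corresponding quotient bit in DIV_Lo.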
  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}

SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;
  }

  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);

  // One round of UNR.
  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
                  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
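
  // In effect this is one Newton-Raphson step on the fixed-point reciprocal:
  // with Z ~= 2^32/Y, the residual 2^32 - Y*Z equals NegY*Z modulo 2^32, and
  // mulhu(Z, NegYZ) adds the scaled correction Z*(2^32 - Y*Z) / 2^32.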
2060 
2061  // Quotient/remainder estimate.
2062  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2063  SDValue R =
2064  DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2065 
2066  // First quotient/remainder refinement.
2067  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2068  SDValue One = DAG.getConstant(1, DL, VT);
2069  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2070  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2071  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2072  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2073  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2074 
2075  // Second quotient/remainder refinement.
2076  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2077  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2078  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2079  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2080  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2081 
2082  return DAG.getMergeValues({Q, R}, DL);
2083 }
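// A hedged host-side sketch of the expansion above. udiv32 is a hypothetical
// helper; the exact floor(2^32 / Y) seed stands in for the approximate
// AMDGPUISD::URECIP estimate (assumes Y != 0):
//
//   #include <cstdint>
//   static uint32_t udiv32(uint32_t X, uint32_t Y, uint32_t &R) {
//     uint32_t Z = uint32_t(UINT64_C(0x100000000) / Y); // initial inv(Y)
//     uint32_t NegYZ = 0u - Y * Z;
//     Z += uint32_t((uint64_t(Z) * NegYZ) >> 32);       // one UNR round
//     uint32_t Q = uint32_t((uint64_t(X) * Z) >> 32);   // quotient estimate
//     R = X - Q * Y;                                    // remainder estimate
//     for (int I = 0; I < 2; ++I)                       // two fixup rounds
//       if (R >= Y) { ++Q; R -= Y; }
//     return Q;
//   }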
2084 
2085 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2086  SelectionDAG &DAG) const {
2087  SDLoc DL(Op);
2088  EVT VT = Op.getValueType();
2089 
2090  SDValue LHS = Op.getOperand(0);
2091  SDValue RHS = Op.getOperand(1);
2092 
2093  SDValue Zero = DAG.getConstant(0, DL, VT);
2094  SDValue NegOne = DAG.getConstant(-1, DL, VT);
2095 
2096  if (VT == MVT::i32) {
2097  if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2098  return Res;
2099  }
2100 
2101  if (VT == MVT::i64 &&
2102  DAG.ComputeNumSignBits(LHS) > 32 &&
2103  DAG.ComputeNumSignBits(RHS) > 32) {
2104  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2105 
2106  // Hi/Lo split
2107  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2108  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2109  SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2110  LHS_Lo, RHS_Lo);
2111  SDValue Res[2] = {
2112  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2113  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2114  };
2115  return DAG.getMergeValues(Res, DL);
2116  }
2117 
2118  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2119  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2120  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2121  SDValue RSign = LHSign; // Remainder sign is the same as LHS
2122 
2123  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2124  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2125 
2126  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2127  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2128 
2129  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2130  SDValue Rem = Div.getValue(1);
2131 
2132  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2133  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2134 
2135  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2136  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2137 
2138  SDValue Res[2] = {
2139  Div,
2140  Rem
2141  };
2142  return DAG.getMergeValues(Res, DL);
2143 }
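// The sign fixups above are the branch-free identities |v| = (v + s) ^ s and
// v = (u ^ s) - s, where s = v >> 31 is all ones iff v is negative. A scalar
// sketch (sdiv32 is a hypothetical helper; assumes RHS != 0 and that
// INT_MIN / -1 does not occur):
//
//   #include <cstdint>
//   static int32_t sdiv32(int32_t LHS, int32_t RHS, int32_t &Rem) {
//     uint32_t LSign = uint32_t(LHS >> 31);
//     uint32_t RSign = uint32_t(RHS >> 31);
//     uint32_t DSign = LSign ^ RSign;                // quotient sign mask
//     uint32_t UL = (uint32_t(LHS) + LSign) ^ LSign; // |LHS|
//     uint32_t UR = (uint32_t(RHS) + RSign) ^ RSign; // |RHS|
//     uint32_t Q = UL / UR, R = UL % UR;
//     Rem = int32_t((R ^ LSign) - LSign);            // remainder takes LHS sign
//     return int32_t((Q ^ DSign) - DSign);
//   }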
2144 
2145 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2146 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2147  SDLoc SL(Op);
2148  EVT VT = Op.getValueType();
2149  auto Flags = Op->getFlags();
2150  SDValue X = Op.getOperand(0);
2151  SDValue Y = Op.getOperand(1);
2152 
2153  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2154  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2155  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2156  // TODO: For f32 use FMAD instead if !hasFastFMA32?
2157  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2158 }
2159 
2160 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2161  SDLoc SL(Op);
2162  SDValue Src = Op.getOperand(0);
2163 
2164  // result = trunc(src)
2165  // if (src > 0.0 && src != result)
2166  // result += 1.0
2167 
2168  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2169 
2170  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2171  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2172 
2173  EVT SetCCVT =
2174  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2175 
2176  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2177  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2178  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2179 
2180  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2181  // TODO: Should this propagate fast-math-flags?
2182  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2183 }
2184 
2185 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2186  SelectionDAG &DAG) {
2187  const unsigned FractBits = 52;
2188  const unsigned ExpBits = 11;
2189 
2190  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2191  Hi,
2192  DAG.getConstant(FractBits - 32, SL, MVT::i32),
2193  DAG.getConstant(ExpBits, SL, MVT::i32));
2194  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2195  DAG.getConstant(1023, SL, MVT::i32));
2196 
2197  return Exp;
2198 }
2199 
2200 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2201  SDLoc SL(Op);
2202  SDValue Src = Op.getOperand(0);
2203 
2204  assert(Op.getValueType() == MVT::f64);
2205 
2206  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2207 
2208  // Extract the upper half, since this is where we will find the sign and
2209  // exponent.
2210  SDValue Hi = getHiHalf64(Src, DAG);
2211 
2212  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2213 
2214  const unsigned FractBits = 52;
2215 
2216  // Extract the sign bit.
2217  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2218  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2219 
2220  // Extend back to 64-bits.
2221  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2222  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2223 
2224  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2225  const SDValue FractMask
2226  = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2227 
2228  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2229  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2230  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2231 
2232  EVT SetCCVT =
2233  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2234 
2235  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2236 
2237  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2238  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2239 
2240  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2241  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2242 
2243  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2244 }
2245 
2246 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2247  SDLoc SL(Op);
2248  SDValue Src = Op.getOperand(0);
2249 
2250  assert(Op.getValueType() == MVT::f64);
2251 
2252  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2253  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2254  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2255 
2256  // TODO: Should this propagate fast-math-flags?
2257 
2258  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2259  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2260 
2261  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2262 
2263  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2264  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2265 
2266  EVT SetCCVT =
2267  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2268  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2269 
2270  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2271 }
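// The copysign(2^52, src) add/subtract pair above works because once
// |x| + 2^52 is formed, an f64 has no fraction bits below 1.0 left, so the
// FP add itself rounds to the nearest integer (ties to even in the default
// environment). Worked examples, with 2^52 = 4503599627370496:
//
//   x = 2.5: (2.5 + 4503599627370496.0) - 4503599627370496.0 == 2.0
//   x = 3.5: (3.5 + 4503599627370496.0) - 4503599627370496.0 == 4.0
//
// The final select returns the input unchanged when |x| > 0x1.fffffffffffffp+51,
// where the value is already integral and the trick is unnecessary.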
2272 
2273 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2274  // FNEARBYINT and FRINT are the same, except in their handling of FP
2275  // exceptions. Those aren't really meaningful for us, and OpenCL only has
2276  // rint, so just treat them as equivalent.
2277  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2278 }
2279 
2280 // XXX - May require not supporting f32 denormals?
2281 
2282 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2283 // compare and vselect end up producing worse code than scalarizing the whole
2284 // operation.
2285 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2286  SDLoc SL(Op);
2287  SDValue X = Op.getOperand(0);
2288  EVT VT = Op.getValueType();
2289 
2290  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2291 
2292  // TODO: Should this propagate fast-math-flags?
2293 
2294  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2295 
2296  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2297 
2298  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2299  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2300  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2301 
2302  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2303 
2304  EVT SetCCVT =
2305  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2306 
2307  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2308 
2309  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2310 
2311  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2312 }
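// In scalar form, the sequence above computes round-half-away-from-zero. A
// hedged sketch that mirrors the DAG nodes (illustrative only):
//
//   #include <cmath>
//   static double roundImpl(double X) {
//     double T = std::trunc(X);
//     double AbsDiff = std::fabs(X - T);
//     // Step by copysign(1, X) when at least half-way to the next integer.
//     return T + (AbsDiff >= 0.5 ? std::copysign(1.0, X) : 0.0);
//   }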
2313 
2314 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2315  SDLoc SL(Op);
2316  SDValue Src = Op.getOperand(0);
2317 
2318  // result = trunc(src);
2319  // if (src < 0.0 && src != result)
2320  // result += -1.0.
2321 
2322  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2323 
2324  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2325  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2326 
2327  EVT SetCCVT =
2328  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2329 
2330  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2331  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2332  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2333 
2334  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2335  // TODO: Should this propagate fast-math-flags?
2336  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2337 }
2338 
2339 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2340  double Log2BaseInverted) const {
2341  EVT VT = Op.getValueType();
2342 
2343  SDLoc SL(Op);
2344  SDValue Operand = Op.getOperand(0);
2345  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2346  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2347 
2348  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2349 }
2350 
2351 // exp2(M_LOG2E_F * f);
2352 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2353  EVT VT = Op.getValueType();
2354  SDLoc SL(Op);
2355  SDValue Src = Op.getOperand(0);
2356 
2357  const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2358  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2359  return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2360 }
2361 
2362 static bool isCtlzOpc(unsigned Opc) {
2363  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2364 }
2365 
2366 static bool isCttzOpc(unsigned Opc) {
2367  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2368 }
2369 
2370 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2371  SDLoc SL(Op);
2372  SDValue Src = Op.getOperand(0);
2373 
2374  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2375  bool Ctlz = isCtlzOpc(Op.getOpcode());
2376  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2377 
2378  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2379  Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2380 
2381  if (Src.getValueType() == MVT::i32) {
2382  // (ctlz hi:lo) -> (umin (ffbh src), 32)
2383  // (cttz hi:lo) -> (umin (ffbl src), 32)
2384  // (ctlz_zero_undef src) -> (ffbh src)
2385  // (cttz_zero_undef src) -> (ffbl src)
2386  SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2387  if (!ZeroUndef) {
2388  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2389  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2390  }
2391  return NewOpr;
2392  }
2393 
2394  SDValue Lo, Hi;
2395  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2396 
2397  SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2398  SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2399 
2400  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2401  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2402  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2403  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2404 
2405  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2406  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2407  if (Ctlz)
2408  OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2409  else
2410  OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2411 
2412  SDValue NewOpr;
2413  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2414  if (!ZeroUndef) {
2415  const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2416  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2417  }
2418 
2419  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2420 }
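// Scalar view of the 64-bit composition above; ffbh/ffbl return -1 (i.e.
// 0xFFFFFFFF) on a zero input, which is why the non-zero-undef forms use
// uaddsat before the final clamp:
//
//   ctlz(hi:lo) = umin(umin(ffbh(hi), uaddsat(ffbh(lo), 32)), 64)
//   cttz(hi:lo) = umin(umin(uaddsat(ffbl(hi), 32), ffbl(lo)), 64)
//
// e.g. ctlz(0x00000000000000F0): ffbh(hi = 0) = 0xFFFFFFFF and
// ffbh(lo) + 32 = 24 + 32 = 56, so the result is umin(0xFFFFFFFF, 56, 64) = 56.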
2421 
2422 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2423  bool Signed) const {
2424  // The regular method of converting a 64-bit integer to a float roughly consists of
2425  // 2 steps: normalization and rounding. In fact, after normalization, the
2426  // conversion from a 64-bit integer to a float is essentially the same as the
2427  // one from a 32-bit integer. The only difference is that it has more
2428  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2429  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2430  // converted into the correct float number. The basic steps for the unsigned
2431  // conversion are illustrated in the following pseudo code:
2432  //
2433  // f32 uitofp(i64 u) {
2434  // i32 hi, lo = split(u);
2435  // // Only count the leading zeros in hi as we have native support of the
2436  // // conversion from i32 to f32. If hi is all 0s, the conversion is
2437  // // reduced to a 32-bit one automatically.
2438  // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2439  // u <<= shamt;
2440  // hi, lo = split(u);
2441  // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2442  // // convert it as a 32-bit integer and scale the result back.
2443  // return uitofp(hi) * 2^(32 - shamt);
2444  // }
2445  //
2446  // The signed one follows the same principle but uses 'ffbh_i32' to count its
2447  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2448  // converted instead, followed by negation based on its sign bit.
2449 
2450  SDLoc SL(Op);
2451  SDValue Src = Op.getOperand(0);
2452 
2453  SDValue Lo, Hi;
2454  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2455  SDValue Sign;
2456  SDValue ShAmt;
2457  if (Signed && Subtarget->isGCN()) {
2458  // We also need to consider the sign bit in Lo if Hi has just sign bits,
2459  // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2460  // account. That is, the maximal shift is
2461  // - 32 if Lo and Hi have opposite signs;
2462  // - 33 if Lo and Hi have the same sign.
2463  //
2464  // Or, MaxShAmt = 33 + OppositeSign, where
2465  //
2466  // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2467  // - -1 if Lo and Hi have opposite signs; and
2468  // - 0 otherwise.
2469  //
2470  // All in all, ShAmt is calculated as
2471  //
2472  // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2473  //
2474  // or
2475  //
2476  // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2477  //
2478  // to reduce the critical path.
2479  SDValue OppositeSign = DAG.getNode(
2480  ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2481  DAG.getConstant(31, SL, MVT::i32));
2482  SDValue MaxShAmt =
2483  DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2484  OppositeSign);
2485  // Count the leading sign bits.
2486  ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2487  // Different from unsigned conversion, the shift should be one bit less to
2488  // preserve the sign bit.
2489  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2490  DAG.getConstant(1, SL, MVT::i32));
2491  ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2492  } else {
2493  if (Signed) {
2494  // Without 'ffbh_i32', only leading zeros could be counted. Take the
2495  // absolute value first.
2496  Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2497  DAG.getConstant(63, SL, MVT::i64));
2498  SDValue Abs =
2499  DAG.getNode(ISD::XOR, SL, MVT::i64,
2500  DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2501  std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2502  }
2503  // Count the leading zeros.
2504  ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2505  // The shift amount for signed integers is [0, 32].
2506  }
2507  // Normalize the given 64-bit integer.
2508  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2509  // Split it again.
2510  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2511  // Calculate the adjust bit for rounding.
2512  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2513  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2514  DAG.getConstant(1, SL, MVT::i32), Lo);
2515  // Get the 32-bit normalized integer.
2516  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2517  // Convert the normalized 32-bit integer into f32.
2518  unsigned Opc =
2519  (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2520  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2521 
2522  // Finally, need to scale back the converted floating number as the original
2523  // 64-bit integer is converted as a 32-bit one.
2524  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2525  ShAmt);
2526  // On GCN, use LDEXP directly.
2527  if (Subtarget->isGCN())
2528  return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2529 
2530  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2531  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2532  // exponent is enough to avoid overflowing into the sign bit.
2533  SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2534  DAG.getConstant(23, SL, MVT::i32));
2535  SDValue IVal =
2536  DAG.getNode(ISD::ADD, SL, MVT::i32,
2537  DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2538  if (Signed) {
2539  // Set the sign bit.
2540  Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2541  DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2542  DAG.getConstant(31, SL, MVT::i32));
2543  IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2544  }
2545  return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2546 }
2547 
2548 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2549  bool Signed) const {
2550  SDLoc SL(Op);
2551  SDValue Src = Op.getOperand(0);
2552 
2553  SDValue Lo, Hi;
2554  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2555 
2556  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2557  SL, MVT::f64, Hi);
2558 
2559  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2560 
2561  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2562  DAG.getConstant(32, SL, MVT::i32));
2563  // TODO: Should this propagate fast-math-flags?
2564  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2565 }
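// Scalar equivalent of the split above for the unsigned case (a hedged
// sketch; u64ToF64 is a hypothetical helper). Both halves convert exactly,
// so the single FP add performs the only rounding step:
//
//   #include <cmath>
//   #include <cstdint>
//   static double u64ToF64(uint64_t U) {
//     double Hi = double(uint32_t(U >> 32)); // SINT_TO_FP in the signed form
//     double Lo = double(uint32_t(U));
//     return std::ldexp(Hi, 32) + Lo;        // ldexp(Hi, 32) == Hi * 2^32
//   }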
2566 
2567 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2568  SelectionDAG &DAG) const {
2569  // TODO: Factor out code common with LowerSINT_TO_FP.
2570  EVT DestVT = Op.getValueType();
2571  SDValue Src = Op.getOperand(0);
2572  EVT SrcVT = Src.getValueType();
2573 
2574  if (SrcVT == MVT::i16) {
2575  if (DestVT == MVT::f16)
2576  return Op;
2577  SDLoc DL(Op);
2578 
2579  // Promote src to i32
2580  SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2581  return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2582  }
2583 
2584  assert(SrcVT == MVT::i64 && "operation should be legal");
2585 
2586  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2587  SDLoc DL(Op);
2588 
2589  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2590  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2591  SDValue FPRound =
2592  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2593 
2594  return FPRound;
2595  }
2596 
2597  if (DestVT == MVT::f32)
2598  return LowerINT_TO_FP32(Op, DAG, false);
2599 
2600  assert(DestVT == MVT::f64);
2601  return LowerINT_TO_FP64(Op, DAG, false);
2602 }
2603 
2604 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2605  SelectionDAG &DAG) const {
2606  EVT DestVT = Op.getValueType();
2607 
2608  SDValue Src = Op.getOperand(0);
2609  EVT SrcVT = Src.getValueType();
2610 
2611  if (SrcVT == MVT::i16) {
2612  if (DestVT == MVT::f16)
2613  return Op;
2614 
2615  SDLoc DL(Op);
2616  // Promote src to i32
2617  SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2618  return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2619  }
2620 
2621  assert(SrcVT == MVT::i64 && "operation should be legal");
2622 
2623  // TODO: Factor out code common with LowerUINT_TO_FP.
2624 
2625  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2626  SDLoc DL(Op);
2627  SDValue Src = Op.getOperand(0);
2628 
2629  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2630  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2631  SDValue FPRound =
2632  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2633 
2634  return FPRound;
2635  }
2636 
2637  if (DestVT == MVT::f32)
2638  return LowerINT_TO_FP32(Op, DAG, true);
2639 
2640  assert(DestVT == MVT::f64);
2641  return LowerINT_TO_FP64(Op, DAG, true);
2642 }
2643 
2644 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2645  bool Signed) const {
2646  SDLoc SL(Op);
2647 
2648  SDValue Src = Op.getOperand(0);
2649  EVT SrcVT = Src.getValueType();
2650 
2651  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2652 
2653  // The basic idea of converting a floating point number into a pair of 32-bit
2654  // integers is illustrated as follows:
2655  //
2656  // tf := trunc(val);
2657  // hif := floor(tf * 2^-32);
2658  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2659  // hi := fptoi(hif);
2660  // lo := fptoi(lof);
2661  //
2662  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2663  SDValue Sign;
2664  if (Signed && SrcVT == MVT::f32) {
2665  // However, a 32-bit floating point number has only a 23-bit mantissa and
2666  // that is not enough to hold all the significant bits of `lof` if val is
2667  // negative. To avoid the loss of precision, we need to take the absolute
2668  // value after truncating and flip the result back based on the original
2669  // signedness.
2670  Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2671  DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2672  DAG.getConstant(31, SL, MVT::i32));
2673  Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2674  }
2675 
2676  SDValue K0, K1;
2677  if (SrcVT == MVT::f64) {
2678  K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2679  SL, SrcVT);
2680  K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2681  SL, SrcVT);
2682  } else {
2683  K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2684  SrcVT);
2685  K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2686  SrcVT);
2687  }
2688  // TODO: Should this propagate fast-math-flags?
2689  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2690 
2691  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2692 
2693  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2694 
2695  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2696  : ISD::FP_TO_UINT,
2697  SL, MVT::i32, FloorMul);
2698  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2699 
2700  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2701  DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2702 
2703  if (Signed && SrcVT == MVT::f32) {
2704  assert(Sign);
2705  // Flip the result based on the signedness, which is either all 0s or 1s.
2706  Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2707  DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2708  // r := xor(r, sign) - sign;
2709  Result =
2710  DAG.getNode(ISD::SUB, SL, MVT::i64,
2711  DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2712  }
2713 
2714  return Result;
2715 }
2716 
2717 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2718  SDLoc DL(Op);
2719  SDValue N0 = Op.getOperand(0);
2720 
2721  // Convert to target node to get known bits
2722  if (N0.getValueType() == MVT::f32)
2723  return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2724 
2725  if (getTargetMachine().Options.UnsafeFPMath) {
2726  // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2727  return SDValue();
2728  }
2729 
2730  assert(N0.getSimpleValueType() == MVT::f64);
2731 
2732  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2733  const unsigned ExpMask = 0x7ff;
2734  const unsigned ExpBiasf64 = 1023;
2735  const unsigned ExpBiasf16 = 15;
2736  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2737  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2738  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2739  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2740  DAG.getConstant(32, DL, MVT::i64));
2741  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2742  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2743  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2744  DAG.getConstant(20, DL, MVT::i64));
2745  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2746  DAG.getConstant(ExpMask, DL, MVT::i32));
2747  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2748  // add the f16 bias (15) to get the biased exponent for the f16 format.
2749  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2750  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2751 
2752  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2753  DAG.getConstant(8, DL, MVT::i32));
2754  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2755  DAG.getConstant(0xffe, DL, MVT::i32));
2756 
2757  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2758  DAG.getConstant(0x1ff, DL, MVT::i32));
2759  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2760 
2761  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2762  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2763 
2764  // (M != 0 ? 0x0200 : 0) | 0x7c00;
2765  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2766  DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2767  Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2768 
2769  // N = M | (E << 12);
2770  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2771  DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2772  DAG.getConstant(12, DL, MVT::i32)));
2773 
2774  // B = clamp(1-E, 0, 13);
2775  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2776  One, E);
2777  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2778  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2779  DAG.getConstant(13, DL, MVT::i32));
2780 
2781  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2782  DAG.getConstant(0x1000, DL, MVT::i32));
2783 
2784  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2785  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2786  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2787  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2788 
2789  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2790  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2791  DAG.getConstant(0x7, DL, MVT::i32));
2792  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2793  DAG.getConstant(2, DL, MVT::i32));
2794  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2795  One, Zero, ISD::SETEQ);
2796  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2797  One, Zero, ISD::SETGT);
2798  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2799  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2800 
2801  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2802  DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2803  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2804  I, V, ISD::SETEQ);
2805 
2806  // Extract the sign bit.
2807  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2808  DAG.getConstant(16, DL, MVT::i32));
2809  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2810  DAG.getConstant(0x8000, DL, MVT::i32));
2811 
2812  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2813  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2814 }
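// The rebias above maps the f64 exponent field (bias 1023) onto the f16
// field (bias 15): E = E_raw - 1023 + 15. For example, 1.0 has raw exponent
// 1023, giving E = 15, the f16 exponent field of 1.0. When E < 1 the denormal
// path is taken, and B = clamp(1 - E, 0, 13) selects how far the significand
// (with the implicit bit set via the 0x1000 OR) is shifted right; the D0/D1
// comparison re-ORs a sticky bit so the later rounding still sees any bits
// shifted out.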
2815 
2816 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2817  SelectionDAG &DAG) const {
2818  SDValue Src = Op.getOperand(0);
2819  unsigned OpOpcode = Op.getOpcode();
2820  EVT SrcVT = Src.getValueType();
2821  EVT DestVT = Op.getValueType();
2822 
2823  // Will be selected natively
2824  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2825  return Op;
2826 
2827  // Promote i16 to i32
2828  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2829  SDLoc DL(Op);
2830 
2831  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2832  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2833  }
2834 
2835  if (SrcVT == MVT::f16 ||
2836  (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2837  SDLoc DL(Op);
2838 
2839  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2840  unsigned Ext =
2841  OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2842  return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2843  }
2844 
2845  if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2846  return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2847 
2848  return SDValue();
2849 }
2850 
2851 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2852  SelectionDAG &DAG) const {
2853  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2854  MVT VT = Op.getSimpleValueType();
2855  MVT ScalarVT = VT.getScalarType();
2856 
2857  assert(VT.isVector());
2858 
2859  SDValue Src = Op.getOperand(0);
2860  SDLoc DL(Op);
2861 
2862  // TODO: Don't scalarize on Evergreen?
2863  unsigned NElts = VT.getVectorNumElements();
2864  SmallVector<SDValue, 8> Args;
2865  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2866 
2867  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2868  for (unsigned I = 0; I < NElts; ++I)
2869  Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2870 
2871  return DAG.getBuildVector(VT, DL, Args);
2872 }
2873 
2874 //===----------------------------------------------------------------------===//
2875 // Custom DAG optimizations
2876 //===----------------------------------------------------------------------===//
2877 
2878 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2879  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2880 }
2881 
2882 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2883  EVT VT = Op.getValueType();
2884  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2885  // as unsigned 24-bit values.
2886  AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2887 }
2888 
2889 SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
2890  DAGCombinerInfo &DCI) const {
2891  SelectionDAG &DAG = DCI.DAG;
2892  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2893  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2894 
2895  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2896  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2897  unsigned NewOpcode = Node24->getOpcode();
2898  if (IsIntrin) {
2899  unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2900  NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2901  AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2902  }
2903 
2904  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2905 
2906  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2907  // the operands to have other uses, but will only perform simplifications that
2908  // involve bypassing some nodes for this user.
2909  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2910  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2911  if (DemandedLHS || DemandedRHS)
2912  return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2913  DemandedLHS ? DemandedLHS : LHS,
2914  DemandedRHS ? DemandedRHS : RHS);
2915 
2916  // Now try SimplifyDemandedBits which can simplify the nodes used by our
2917  // operands if this node is the only user.
2918  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2919  return SDValue(Node24, 0);
2920  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2921  return SDValue(Node24, 0);
2922 
2923  return SDValue();
2924 }
2925 
2926 template <typename IntTy>
2927 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2928  uint32_t Width, const SDLoc &DL) {
2929  if (Width + Offset < 32) {
2930  uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2931  IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2932  return DAG.getConstant(Result, DL, MVT::i32);
2933  }
2934 
2935  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2936 }
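// The fold implements BFE (bit-field extract) semantics: take Width bits of
// Src0 starting at bit Offset, then sign- or zero-extend according to IntTy.
// For example, with IntTy = int32_t, Src0 = 0xFF00, Offset = 8, Width = 8:
// Shl = 0xFF00 << 16 = 0xFF000000, and the arithmetic shift right by 24
// yields 0xFFFFFFFF, i.e. the extracted field sign-extends to -1.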
2937 
2938 static bool hasVolatileUser(SDNode *Val) {
2939  for (SDNode *U : Val->uses()) {
2940  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2941  if (M->isVolatile())
2942  return true;
2943  }
2944  }
2945 
2946  return false;
2947 }
2948 
2949 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2950  // i32 vectors are the canonical memory type.
2951  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2952  return false;
2953 
2954  if (!VT.isByteSized())
2955  return false;
2956 
2957  unsigned Size = VT.getStoreSize();
2958 
2959  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2960  return false;
2961 
2962  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2963  return false;
2964 
2965  return true;
2966 }
2967 
2968 // Replace load of an illegal type with a store of a bitcast to a friendlier
2969 // type.
2970 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2971  DAGCombinerInfo &DCI) const {
2972  if (!DCI.isBeforeLegalize())
2973  return SDValue();
2974 
2975  LoadSDNode *LN = cast<LoadSDNode>(N);
2976  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2977  return SDValue();
2978 
2979  SDLoc SL(N);
2980  SelectionDAG &DAG = DCI.DAG;
2981  EVT VT = LN->getMemoryVT();
2982 
2983  unsigned Size = VT.getStoreSize();
2984  Align Alignment = LN->getAlign();
2985  if (Alignment < Size && isTypeLegal(VT)) {
2986  bool IsFast;
2987  unsigned AS = LN->getAddressSpace();
2988 
2989  // Expand unaligned loads earlier than legalization. Due to visitation order
2990  // problems during legalization, the emitted instructions to pack and unpack
2991  // the bytes again are not eliminated in the case of an unaligned copy.
2992  if (!allowsMisalignedMemoryAccesses(
2993  VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2994  SDValue Ops[2];
2995 
2996  if (VT.isVector())
2997  std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
2998  else
2999  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3000 
3001  return DAG.getMergeValues(Ops, SDLoc(N));
3002  }
3003 
3004  if (!IsFast)
3005  return SDValue();
3006  }
3007 
3008  if (!shouldCombineMemoryType(VT))
3009  return SDValue();
3010 
3011  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3012 
3013  SDValue NewLoad
3014  = DAG.getLoad(NewVT, SL, LN->getChain(),
3015  LN->getBasePtr(), LN->getMemOperand());
3016 
3017  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3018  DCI.CombineTo(N, BC, NewLoad.getValue(1));
3019  return SDValue(N, 0);
3020 }
3021 
3022 // Replace store of an illegal type with a store of a bitcast to a friendlier
3023 // type.
3024 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3025  DAGCombinerInfo &DCI) const {
3026  if (!DCI.isBeforeLegalize())
3027  return SDValue();
3028 
3029  StoreSDNode *SN = cast<StoreSDNode>(N);
3030  if (!SN->isSimple() || !ISD::isNormalStore(SN))
3031  return SDValue();
3032 
3033  EVT VT = SN->getMemoryVT();
3034  unsigned Size = VT.getStoreSize();
3035 
3036  SDLoc SL(N);
3037  SelectionDAG &DAG = DCI.DAG;
3038  Align Alignment = SN->getAlign();
3039  if (Alignment < Size && isTypeLegal(VT)) {
3040  bool IsFast;
3041  unsigned AS = SN->getAddressSpace();
3042 
3043  // Expand unaligned stores earlier than legalization. Due to visitation
3044  // order problems during legalization, the emitted instructions to pack and
3045  // unpack the bytes again are not eliminated in the case of an unaligned
3046  // copy.
3047  if (!allowsMisalignedMemoryAccesses(
3048  VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3049  if (VT.isVector())
3050  return scalarizeVectorStore(SN, DAG);
3051 
3052  return expandUnalignedStore(SN, DAG);
3053  }
3054 
3055  if (!IsFast)
3056  return SDValue();
3057  }
3058 
3059  if (!shouldCombineMemoryType(VT))
3060  return SDValue();
3061 
3062  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3063  SDValue Val = SN->getValue();
3064 
3065  //DCI.AddToWorklist(Val.getNode());
3066 
3067  bool OtherUses = !Val.hasOneUse();
3068  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3069  if (OtherUses) {
3070  SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3071  DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3072  }
3073 
3074  return DAG.getStore(SN->getChain(), SL, CastVal,
3075  SN->getBasePtr(), SN->getMemOperand());
3076 }
3077 
3078 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3079 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3080 // issues.
3081 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3082  DAGCombinerInfo &DCI) const {
3083  SelectionDAG &DAG = DCI.DAG;
3084  SDValue N0 = N->getOperand(0);
3085 
3086  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3087  // (vt2 (truncate (assertzext vt0:x, vt1)))
3088  if (N0.getOpcode() == ISD::TRUNCATE) {
3089  SDValue N1 = N->getOperand(1);
3090  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3091  SDLoc SL(N);
3092 
3093  SDValue Src = N0.getOperand(0);
3094  EVT SrcVT = Src.getValueType();
3095  if (SrcVT.bitsGE(ExtVT)) {
3096  SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3097  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3098  }
3099  }
3100 
3101  return SDValue();
3102 }
3103 
3104 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3105  SDNode *N, DAGCombinerInfo &DCI) const {
3106  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3107  switch (IID) {
3108  case Intrinsic::amdgcn_mul_i24:
3109  case Intrinsic::amdgcn_mul_u24:
3110  return simplifyMul24(N, DCI);
3111  case Intrinsic::amdgcn_fract:
3112  case Intrinsic::amdgcn_rsq:
3113  case Intrinsic::amdgcn_rcp_legacy:
3114  case Intrinsic::amdgcn_rsq_legacy:
3115  case Intrinsic::amdgcn_rsq_clamp:
3116  case Intrinsic::amdgcn_ldexp: {
3117  // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3118  SDValue Src = N->getOperand(1);
3119  return Src.isUndef() ? Src : SDValue();
3120  }
3121  default:
3122  return SDValue();
3123  }
3124 }
3125 
3126 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3127 /// binary operation \p Opc to it with the corresponding constant operands.
3128 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3129  DAGCombinerInfo &DCI, const SDLoc &SL,
3130  unsigned Opc, SDValue LHS,
3131  uint32_t ValLo, uint32_t ValHi) const {
3132  SelectionDAG &DAG = DCI.DAG;
3133  SDValue Lo, Hi;
3134  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3135 
3136  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3137  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3138 
3139  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3140  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3141 
3142  // Re-visit the ands. It's possible we eliminated one of them and it could
3143  // simplify the vector.
3144  DCI.AddToWorklist(Lo.getNode());
3145  DCI.AddToWorklist(Hi.getNode());
3146 
3147  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3148  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3149 }
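// Worked example: Opc = ISD::AND with ValLo = 0xFFFFFFFF and ValHi = 0 turns
// (and i64:x, 0x00000000FFFFFFFF) into
// build_pair (and lo_32(x), -1), (and hi_32(x), 0); both halves then fold
// trivially (x & -1 == x, x & 0 == 0), which is why the halves are re-added
// to the worklist above.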
3150 
3151 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3152  DAGCombinerInfo &DCI) const {
3153  EVT VT = N->getValueType(0);
3154 
3155  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3156  if (!RHS)
3157  return SDValue();
3158 
3159  SDValue LHS = N->getOperand(0);
3160  unsigned RHSVal = RHS->getZExtValue();
3161  if (!RHSVal)
3162  return LHS;
3163 
3164  SDLoc SL(N);
3165  SelectionDAG &DAG = DCI.DAG;
3166 
3167  switch (LHS->getOpcode()) {
3168  default:
3169  break;
3170  case ISD::ZERO_EXTEND:
3171  case ISD::SIGN_EXTEND:
3172  case ISD::ANY_EXTEND: {
3173  SDValue X = LHS->getOperand(0);
3174 
3175  if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3176  isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3177  // Prefer build_vector as the canonical form if packed types are legal.
3178  // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
3179  SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3180  { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3181  return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3182  }
3183 
3184  // shl (ext x) => zext (shl x), if shift does not overflow int
3185  if (VT != MVT::i64)
3186  break;
3187  KnownBits Known = DAG.computeKnownBits(X);
3188  unsigned LZ = Known.countMinLeadingZeros();
3189  if (LZ < RHSVal)
3190  break;
3191  EVT XVT = X.getValueType();
3192  SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3193  return DAG.getZExtOrTrunc(Shl, SL, VT);
3194  }
3195  }
3196 
3197  if (VT != MVT::i64)
3198  return SDValue();
3199 
3200  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3201 
3202  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3203  // common case, splitting this into a move and a 32-bit shift is faster and
3204  // the same code size.
3205  if (RHSVal < 32)
3206  return SDValue();
3207 
3208  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3209 
3210  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3211  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3212 
3213  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3214 
3215  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3216  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3217 }
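// Worked example of the RHSVal >= 32 case: (shl i64:x, 40) becomes
// build_pair 0, (shl (i32 (trunc x)), 8). The low result word is zero and
// only a 32-bit shift of the low input word survives, which matches the
// 64-bit shift bit for bit.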
3218 
3219 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3220  DAGCombinerInfo &DCI) const {
3221  if (N->getValueType(0) != MVT::i64)
3222  return SDValue();
3223 
3224  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3225  if (!RHS)
3226  return SDValue();
3227 
3228  SelectionDAG &DAG = DCI.DAG;
3229  SDLoc SL(N);
3230  unsigned RHSVal = RHS->getZExtValue();
3231 
3232  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3233  if (RHSVal == 32) {
3234  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3235  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3236  DAG.getConstant(31, SL, MVT::i32));
3237 
3238  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3239  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3240  }
3241 
3242  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3243  if (RHSVal == 63) {
3244  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3245  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3246  DAG.getConstant(31, SL, MVT::i32));
3247  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3248  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3249  }
3250 
3251  return SDValue();
3252 }
3253 
3254 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3255  DAGCombinerInfo &DCI) const {
3256  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3257  if (!RHS)
3258  return SDValue();
3259 
3260  EVT VT = N->getValueType(0);
3261  SDValue LHS = N->getOperand(0);
3262  unsigned ShiftAmt = RHS->getZExtValue();
3263  SelectionDAG &DAG = DCI.DAG;
3264  SDLoc SL(N);
3265 
3266  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3267  // this improves the ability to match BFE patterns in isel.
3268  if (LHS.getOpcode() == ISD::AND) {
3269  if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3270  if (Mask->getAPIntValue().isShiftedMask() &&
3271  Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3272  return DAG.getNode(
3273  ISD::AND, SL, VT,
3274  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3275  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3276  }
3277  }
3278  }
3279 
3280  if (VT != MVT::i64)
3281  return SDValue();
3282 
3283  if (ShiftAmt < 32)
3284  return SDValue();
3285 
3286  // srl i64:x, C for C >= 32
3287  // =>
3288  // build_pair (srl hi_32(x), C - 32), 0
3289  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3290 
3291  SDValue Hi = getHiHalf64(LHS, DAG);
3292 
3293  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3294  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3295 
3296  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3297 
3298  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3299 }
3300 
3301 SDValue AMDGPUTargetLowering::performTruncateCombine(
3302  SDNode *N, DAGCombinerInfo &DCI) const {
3303  SDLoc SL(N);
3304  SelectionDAG &DAG = DCI.DAG;
3305  EVT VT = N->getValueType(0);
3306  SDValue Src = N->getOperand(0);
3307 
3308  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3309  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3310  SDValue Vec = Src.getOperand(0);
3311  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3312  SDValue Elt0 = Vec.getOperand(0);
3313  EVT EltVT = Elt0.getValueType();
3314  if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3315  if (EltVT.isFloatingPoint()) {
3316  Elt0 = DAG.getNode(ISD::BITCAST, SL,
3317  EltVT.changeTypeToInteger(), Elt0);
3318  }
3319 
3320  return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3321  }
3322  }
3323  }
3324 
3325  // Equivalent of above for accessing the high element of a vector as an
3326  // integer operation.
3327  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3328  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3329  if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3330  if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3331  SDValue BV = stripBitcast(Src.getOperand(0));
3332  if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3333  BV.getValueType().getVectorNumElements() == 2) {
3334  SDValue SrcElt = BV.getOperand(1);
3335  EVT SrcEltVT = SrcElt.getValueType();
3336  if (SrcEltVT.isFloatingPoint()) {
3337  SrcElt = DAG.getNode(ISD::BITCAST, SL,
3338  SrcEltVT.changeTypeToInteger(), SrcElt);
3339  }
3340 
3341  return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3342  }
3343  }
3344  }
3345  }
3346 
3347  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3348  //
3349  // i16 (trunc (srl i64:x, K)), K <= 16 ->
3350  // i16 (trunc (srl (i32 (trunc x), K)))
3351  if (VT.getScalarSizeInBits() < 32) {
3352  EVT SrcVT = Src.getValueType();
3353  if (SrcVT.getScalarSizeInBits() > 32 &&
3354  (Src.getOpcode() == ISD::SRL ||
3355  Src.getOpcode() == ISD::SRA ||
3356  Src.getOpcode() == ISD::SHL)) {
3357  SDValue Amt = Src.getOperand(1);
3358  KnownBits Known = DAG.computeKnownBits(Amt);
3359  unsigned Size = VT.getScalarSizeInBits();
3360  if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3361  (Known.countMaxActiveBits() <= Log2_32(Size))) {
3362  EVT MidVT = VT.isVector() ?
3363  EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3364  VT.getVectorNumElements()) : MVT::i32;
3365 
3366  EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3367  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3368  Src.getOperand(0));
3369  DCI.AddToWorklist(Trunc.getNode());
3370 
3371  if (Amt.getValueType() != NewShiftVT) {
3372  Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3373  DCI.AddToWorklist(Amt.getNode());
3374  }
3375 
3376  SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3377  Trunc, Amt);
3378  return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3379  }
3380  }
3381  }
3382 
3383  return SDValue();
3384 }
3385 
3386 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3387 // instructions. If we only match on the legalized i64 mul expansion,
3388 // SimplifyDemandedBits will be unable to remove them because there will be
3389 // multiple uses due to the separate mul + mulh[su].
3390 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3391  SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3392  if (Size <= 32) {
3393  unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3394  return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3395  }
3396 
3397  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3398  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3399 
3400  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3401  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3402 
3403  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3404 }
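// For a 64-bit result the pair reproduces the full 48-bit product of two
// 24-bit operands: mul24 supplies the low 32 bits and mulhi24 the high 32.
// For example, with a = b = 0x800000 (2^23): mul24(a, b) = 0 and
// mulhi24(a, b) = 0x4000, so build_pair yields 0x0000400000000000 = 2^46.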
3405 
3406 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3407  DAGCombinerInfo &DCI) const {
3408  EVT VT = N->getValueType(0);
3409 
3410  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3411  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3412  // unnecessarily). isDivergent() is used as an approximation of whether the
3413  // value is in an SGPR.
3414  if (!N->isDivergent())
3415  return SDValue();
3416 
3417  unsigned Size = VT.getSizeInBits();
3418  if (VT.isVector() || Size > 64)
3419  return SDValue();
3420 
3421  // There are i16 integer mul/mad.
3422  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3423  return SDValue();
3424 
3425  SelectionDAG &DAG = DCI.DAG;
3426  SDLoc DL(N);
3427 
3428  SDValue N0 = N->getOperand(0);
3429  SDValue N1 = N->getOperand(1);
3430 
3431  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3432  // in the source into any_extends if the result of the mul is truncated. Since
3433  // we can assume the high bits are whatever we want, use the underlying value
3434  // to avoid the unknown high bits from interfering.
3435  if (N0.getOpcode() == ISD::ANY_EXTEND)
3436  N0 = N0.getOperand(0);
3437 
3438  if (N1.getOpcode() == ISD::ANY_EXTEND)
3439  N1 = N1.getOperand(0);
3440 
3441  SDValue Mul;
3442 
3443  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3444  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3445  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3446  Mul = getMul24(DAG, DL, N0, N1, Size, false);
3447  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3448  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3449  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3450  Mul = getMul24(DAG, DL, N0, N1, Size, true);
3451  } else {
3452  return SDValue();
3453  }
3454 
3455  // We need to use sext even for MUL_U24, because MUL_U24 is used
3456  // for signed multiply of 8 and 16-bit types.
3457  return DAG.getSExtOrTrunc(Mul, DL, VT);
3458 }
3459 
3460 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3461  DAGCombinerInfo &DCI) const {
3462  EVT VT = N->getValueType(0);
3463 
3464  if (!Subtarget->hasMulI24() || VT.isVector())
3465  return SDValue();
3466 
3467  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3468  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3469  // unnecessarily). isDivergent() is used as an approximation of whether the
3470  // value is in an SGPR.
3471  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3472  // valu op anyway)
3473  if (Subtarget->hasSMulHi() && !N->isDivergent())
3474  return SDValue();
3475 
3476  SelectionDAG &DAG = DCI.DAG;
3477  SDLoc DL(N);
3478 
3479  SDValue N0 = N->getOperand(0);
3480  SDValue N1 = N->getOperand(1);
3481 
3482  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3483  return SDValue();
3484 
3485  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3486  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3487 
3488  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3489  DCI.AddToWorklist(Mulhi.getNode());
3490  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3491 }
3492 
3493 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3494  DAGCombinerInfo &DCI) const {
3495  EVT VT = N->getValueType(0);
3496 
3497  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3498  return SDValue();
3499 
3500  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3501  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3502  // unnecessarily). isDivergent() is used as an approximation of whether the
3503  // value is in an SGPR.
3504  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3505  // valu op anyway)
3506  if (Subtarget->hasSMulHi() && !N->isDivergent())
3507  return SDValue();
3508 
3509  SelectionDAG &DAG = DCI.DAG;
3510  SDLoc DL(N);
3511 
3512  SDValue N0 = N->getOperand(0);
3513  SDValue N1 = N->getOperand(1);
3514 
3515  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3516  return SDValue();
3517 
3518  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3519  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3520 
3521  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3522  DCI.AddToWorklist(Mulhi.getNode());
3523  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3524 }
3525 
3526 static bool isNegativeOne(SDValue Val) {
3527  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3528  return C->isAllOnes();
3529  return false;
3530 }
3531 
3532 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3533  SDValue Op,
3534  const SDLoc &DL,
3535  unsigned Opc) const {
3536  EVT VT = Op.getValueType();
3537  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3538  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3539  LegalVT != MVT::i16))
3540  return SDValue();
3541 
3542  if (VT != MVT::i32)
3543  Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3544 
3545  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3546  if (VT != MVT::i32)
3547  FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3548 
3549  return FFBX;
3550 }
3551 
3552 // The native instructions return -1 on 0 input. Optimize out a select that
3553 // produces -1 on 0.
3554 //
3555 // TODO: If zero is not undef, we could also do this if the output is compared
3556 // against the bitwidth.
3557 //
3558 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3559 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3560  SDValue LHS, SDValue RHS,
3561  DAGCombinerInfo &DCI) const {
3562  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3563  if (!CmpRhs || !CmpRhs->isZero())
3564  return SDValue();
3565 
3566  SelectionDAG &DAG = DCI.DAG;
3567  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3568  SDValue CmpLHS = Cond.getOperand(0);
3569 
3570  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3571  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3572  if (CCOpcode == ISD::SETEQ &&
3573  (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3574  RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3575  unsigned Opc =
3576  isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3577  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3578  }
3579 
3580  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3581  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3582  if (CCOpcode == ISD::SETNE &&
3583  (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3584  LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3585  unsigned Opc =
3586  isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3587 
3588  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3589  }
3590 
3591  return SDValue();
3592 }
3593 
3594 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3595  unsigned Op,
3596  const SDLoc &SL,
3597  SDValue Cond,
3598  SDValue N1,
3599  SDValue N2) {
3600  SelectionDAG &DAG = DCI.DAG;
3601  EVT VT = N1.getValueType();
3602 
3603  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3604  N1.getOperand(0), N2.getOperand(0));
3605  DCI.AddToWorklist(NewSelect.getNode());
3606  return DAG.getNode(Op, SL, VT, NewSelect);
3607 }
3608 
3609 // Pull a free FP operation out of a select so it may fold into uses.
3610 //
3611 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3612 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3613 //
3614 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3615 // select c, (fabs x), +k -> fabs (select c, x, k)
3616 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3617  SDValue N) {
3618  SelectionDAG &DAG = DCI.DAG;
3619  SDValue Cond = N.getOperand(0);
3620  SDValue LHS = N.getOperand(1);
3621  SDValue RHS = N.getOperand(2);
3622 
3623  EVT VT = N.getValueType();
3624  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3625  (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3626  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3627  SDLoc(N), Cond, LHS, RHS);
3628  }
3629 
3630  bool Inv = false;
3631  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3632  std::swap(LHS, RHS);
3633  Inv = true;
3634  }
3635 
3636  // TODO: Support vector constants.
3637  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3638  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3639  SDLoc SL(N);
3640  // If one side is an fneg/fabs and the other is a constant, we can push the
3641  // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3642  SDValue NewLHS = LHS.getOperand(0);
3643  SDValue NewRHS = RHS;
3644 
3645  // Careful: if the neg can be folded up, don't try to pull it back down.
3646  bool ShouldFoldNeg = true;
3647 
3648  if (NewLHS.hasOneUse()) {
3649  unsigned Opc = NewLHS.getOpcode();
3650  if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3651  ShouldFoldNeg = false;
3652  if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3653  ShouldFoldNeg = false;
3654  }
3655 
3656  if (ShouldFoldNeg) {
3657  if (LHS.getOpcode() == ISD::FNEG)
3658  NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3659  else if (CRHS->isNegative())
3660  return SDValue();
3661 
3662  if (Inv)
3663  std::swap(NewLHS, NewRHS);
3664 
3665  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3666  Cond, NewLHS, NewRHS);
3667  DCI.AddToWorklist(NewSelect.getNode());
3668  return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3669  }
3670  }
3671 
3672  return SDValue();
3673 }
3674 
3675 
3676 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3677  DAGCombinerInfo &DCI) const {
3678  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3679  return Folded;
3680 
3681  SDValue Cond = N->getOperand(0);
3682  if (Cond.getOpcode() != ISD::SETCC)
3683  return SDValue();
3684 
3685  EVT VT = N->getValueType(0);
3686  SDValue LHS = Cond.getOperand(0);
3687  SDValue RHS = Cond.getOperand(1);
3688  SDValue CC = Cond.getOperand(2);
3689 
3690  SDValue True = N->getOperand(1);
3691  SDValue False = N->getOperand(2);
3692 
3693  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3694  SelectionDAG &DAG = DCI.DAG;
3695  if (DAG.isConstantValueOfAnyType(True) &&
3696  !DAG.isConstantValueOfAnyType(False)) {
3697  // Swap cmp + select pair to move constant to false input.
3698  // This will allow using VOPC cndmasks more often.
3699  // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3700 
3701  SDLoc SL(N);
3702  ISD::CondCode NewCC =
3703  getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3704 
3705  SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3706  return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3707  }
3708 
3709  if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3710  SDValue MinMax
3711  = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3712  // Revisit this node so we can catch min3/max3/med3 patterns.
3713  //DCI.AddToWorklist(MinMax.getNode());
3714  return MinMax;
3715  }
3716  }
3717 
3718  // There's no reason to not do this if the condition has other uses.
3719  return performCtlz_