//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
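// For example, a 64-bit v4i16 becomes v2i32, while a 16-bit v2i8 becomes i16.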
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  KnownBits Known = DAG.computeKnownBits(Op);
  return VT.getSizeInBits() - Known.countMinLeadingZeros();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
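  // Equivalently, an i32 value fits in a signed 24-bit type exactly when it
  // has at least 32 - 24 + 1 = 9 sign bits.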
  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
  }

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
  }

  // This is totally unsupported, just custom lower to produce an error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.

  // Expand to fneg + fadd.

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction(ISD::ADDC, VT, Legal);
    setOperationAction(ISD::SUBC, VT, Legal);
    setOperationAction(ISD::ADDE, VT, Legal);
    setOperationAction(ISD::SUBE, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  static const MVT::SimpleValueType VectorIntTypes[] = {
    MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
  };

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.

  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
    MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
  };

  for (MVT VT : FloatVectorTypes) {

  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  // There are no libcalls of any kind.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy  = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset  = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOp(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READNONE
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return N->getNumOperands() > 2 || VT == MVT::f64;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READNONE
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::SELECT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and using a source modifier would force each of them into
  // a VOP3 encoding, there will be a code size increase. Try to avoid
  // increasing code size unless we know it will save on the instruction count.
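  // For example, negating the operand of a lone FMA user costs nothing (FMA
  // is already VOP3), but doing the same for several FADD users would grow
  // each FADD from a VOP2 to a VOP3 encoding.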
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
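  // For example, an i40 return value is rounded up to i64 and returned in a
  // pair of 32-bit registers.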
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) &&
        AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
      !N->isDivergent())
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  bool Fast = false;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
// FIXME: These should really have the size as a parameter.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    switch (IntrID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
    return false;
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16) ||
         (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(getTargetMachine(), Fn);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(MemArgTy);
    MaxAlign = max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
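      // (v3 rounds up to v4 and v5 rounds up to v8.)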
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5);
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  //       "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument corresponding to this call.
  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
                            UE = DAG.getEntryNode().getNode()->use_end();
       U != UE; ++U) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG:
    return LowerFLOG(Op, DAG, numbers::ln2f);
  case ISD::FLOG10:
    return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
  case ISD::FEXP:
    return lowerFEXP(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  default:
    return;
  }
}

static bool hasDefinedInitializer(const GlobalValue *GV) {
  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer())
    return false;

  return !isa<UndefValue>(GVar->getInitializer());
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction()) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    if (!hasDefinedInitializer(GV)) {
      unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
      return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
    }
  }

  const Function &Fn = DAG.getMachineFunction().getFunction();
  DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(BadInit);
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;

  EVT VT = Op.getValueType();
  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    SDLoc SL(Op);
    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));

    SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
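// For example, v3i32 splits into v2i32 + i32, and v8i16 splits into two
// v4i16 halves.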
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::make_pair(LoVT, HiVT);
}

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
             (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
         N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  unsigned BaseAlign = Load->getAlignment();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  unsigned BaseAlign = Load->getAlignment();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < 8 &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  unsigned BaseAlign = Store->getAlignment();
  unsigned Size = LoMemVT.getStoreSize();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}

// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
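// (An f32 significand carries 24 bits, so with at least 9 sign/leading-zero
// bits on each 32-bit operand the magnitudes fit in 24 bits and the f32
// division below cannot lose integer precision.)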
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
                    (unsigned)ISD::FMA :
                    !MFI->getMode().allFP32Denormals() ?
                    (unsigned)ISD::FMAD :
                    (unsigned)AMDGPUISD::FMAD_FTZ;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}

void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS = Op.getOperand(0);
  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);

  SDValue RHS = Op.getOperand(1);
  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);

  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    MachineFunction &MF = DAG.getMachineFunction();
    const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

    // Compute denominator reciprocal.
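    // The f32 bit patterns used below: 0x4f800000 is 2.0^32, 0x5f7ffffc is
    // just below 2.0^64, 0x2f800000 is 2.0^-32, and 0xcf800000 is -2.0^32.
    // They scale the f32 estimate of 1/RHS into a 64-bit fixed-point
    // reciprocal, which is then refined with two Newton-Raphson style
    // correction steps before forming the quotient.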
    unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
                    (unsigned)ISD::FMA :
                    !MFI->getMode().allFP32Denormals() ?
                    (unsigned)ISD::FMAD :
                    (unsigned)AMDGPUISD::FMAD_FTZ;

    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
        DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
        Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
        DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
        DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
        DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
        Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64 = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    Zero);
    SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    One);

    SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
    SDValue Add1 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    Zero);
    SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    One);

    SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
                                   Mulhi2_Hi, Add1_Lo.getValue(1));
    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
                                  Zero, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
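  // (Restoring binary long division: a speculative 32-bit udiv/urem handles
  // the high half, then the loop below recovers one quotient bit of the low
  // half per iteration.)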
  // Get Speculative values
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}

SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;
  }

  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.
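  // Briefly: URECIP produces a fixed-point estimate Z of 1/Y; one
  // Newton-Raphson round refines it; Q = mulhu(X, Z) is the quotient
  // estimate; two conditional adjustments then correct Q and the remainder.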

  // Initial estimate of inv(y).
  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);

  // One round of UNR.
  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
                  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));

  // Quotient/remainder estimate.
  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
  SDValue R =
      DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));

  // First quotient/remainder refinement.
  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  return DAG.getMergeValues({Q, R}, DL);
}
2034 
2035 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2036  SelectionDAG &DAG) const {
2037  SDLoc DL(Op);
2038  EVT VT = Op.getValueType();
2039 
2040  SDValue LHS = Op.getOperand(0);
2041  SDValue RHS = Op.getOperand(1);
2042 
2043  SDValue Zero = DAG.getConstant(0, DL, VT);
2044  SDValue NegOne = DAG.getConstant(-1, DL, VT);
2045 
2046  if (VT == MVT::i32) {
2047  if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2048  return Res;
2049  }
2050 
2051  if (VT == MVT::i64 &&
2052  DAG.ComputeNumSignBits(LHS) > 32 &&
2053  DAG.ComputeNumSignBits(RHS) > 32) {
2054  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2055 
2056  //HiLo split
2057  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2058  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2059  SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2060  LHS_Lo, RHS_Lo);
2061  SDValue Res[2] = {
2062  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2063  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2064  };
2065  return DAG.getMergeValues(Res, DL);
2066  }
2067 
2068  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2069  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2070  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2071  SDValue RSign = LHSign; // Remainder sign is the same as LHS
2072 
2073  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2074  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2075 
2076  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2077  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2078 
2079  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2080  SDValue Rem = Div.getValue(1);
2081 
2082  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2083  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2084 
2085  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2086  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2087 
2088  SDValue Res[2] = {
2089  Div,
2090  Rem
2091  };
2092  return DAG.getMergeValues(Res, DL);
2093 }
2094 
2095 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2096 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2097  SDLoc SL(Op);
2098  EVT VT = Op.getValueType();
2099  auto Flags = Op->getFlags();
2100  SDValue X = Op.getOperand(0);
2101  SDValue Y = Op.getOperand(1);
2102 
2103  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2104  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2105  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2106  // TODO: For f32 use FMAD instead if !hasFastFMA32?
2107  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2108 }
2109 
2110 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2111  SDLoc SL(Op);
2112  SDValue Src = Op.getOperand(0);
2113 
2114  // result = trunc(src)
2115  // if (src > 0.0 && src != result)
2116  // result += 1.0
2117 
2118  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2119 
2120  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2121  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2122 
2123  EVT SetCCVT =
2124  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2125 
2126  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2127  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2128  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2129 
2130  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2131  // TODO: Should this propagate fast-math-flags?
2132  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2133 }
2134 
2135 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2136  SelectionDAG &DAG) {
2137  const unsigned FractBits = 52;
2138  const unsigned ExpBits = 11;
2139 
2140  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2141  Hi,
2142  DAG.getConstant(FractBits - 32, SL, MVT::i32),
2143  DAG.getConstant(ExpBits, SL, MVT::i32));
2144  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2145  DAG.getConstant(1023, SL, MVT::i32));
2146 
2147  return Exp;
2148 }
2149 
2150 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2151  SDLoc SL(Op);
2152  SDValue Src = Op.getOperand(0);
2153 
2154  assert(Op.getValueType() == MVT::f64);
2155 
2156  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2157  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2158 
2159  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2160 
2161  // Extract the upper half, since this is where we will find the sign and
2162  // exponent.
2163  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
2164 
2165  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2166 
2167  const unsigned FractBits = 52;
2168 
2169  // Extract the sign bit.
2170  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2171  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2172 
2173  // Extend back to 64-bits.
2174  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2175  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2176 
2177  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2178  const SDValue FractMask
2179  = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2180 
2181  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2182  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2183  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2184 
2185  EVT SetCCVT =
2186  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2187 
2188  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2189 
2190  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2191  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2192 
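      // Exp < 0: |Src| < 1.0, so truncation yields a signed zero (SignBit64).
      // Exp > 51: every fraction bit is already integral, so Src is exact.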
2193  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2194  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2195 
2196  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2197 }
2198 
2199 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2200  SDLoc SL(Op);
2201  SDValue Src = Op.getOperand(0);
2202 
2203  assert(Op.getValueType() == MVT::f64);
2204 
2205  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2206  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2207  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2208 
2209  // TODO: Should this propagate fast-math-flags?
2210 
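      // Adding then subtracting copysign(2^52, Src) forces a round to integer
      // in the default round-to-nearest-even mode, since f64 has 52 fraction
      // bits. Values already at least that large are returned unchanged below.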
2211  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2212  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2213 
2214  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2215 
2216  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2217  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2218 
2219  EVT SetCCVT =
2220  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2221  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2222 
2223  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2224 }
2225 
2226 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2227  // FNEARBYINT and FRINT are the same, except in their handling of FP
2228  // exceptions. Those aren't really meaningful for us, and OpenCL only has
2229  // rint, so just treat them as equivalent.
2230  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2231 }
2232 
2233 // XXX - May require not supporting f32 denormals?
2234 
2235 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2236 // compare and vselect end up producing worse code than scalarizing the whole
2237 // operation.
2238 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2239  SDLoc SL(Op);
2240  SDValue X = Op.getOperand(0);
2241  EVT VT = Op.getValueType();
2242 
2243  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2244 
2245  // TODO: Should this propagate fast-math-flags?
2246 
2247  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2248 
2249  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2250 
2251  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2252  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2253  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2254 
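      // round(x) = trunc(x) + (|x - trunc(x)| >= 0.5 ? copysign(1.0, x) : 0.0)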
2255  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2256 
2257  EVT SetCCVT =
2258  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2259 
2260  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2261 
2262  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2263 
2264  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2265 }
2266 
2267 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2268  SDLoc SL(Op);
2269  SDValue Src = Op.getOperand(0);
2270 
2271  // result = trunc(src);
2272  // if (src < 0.0 && src != result)
2273  // result += -1.0.
2274 
2275  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2276 
2277  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2278  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2279 
2280  EVT SetCCVT =
2281  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2282 
2283  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2284  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2285  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2286 
2287  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2288  // TODO: Should this propagate fast-math-flags?
2289  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2290 }
2291 
2292 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2293  double Log2BaseInverted) const {
2294  EVT VT = Op.getValueType();
2295 
2296  SDLoc SL(Op);
2297  SDValue Operand = Op.getOperand(0);
2298  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2299  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2300 
2301  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2302 }
2303 
2304 // exp2(M_LOG2E_F * f);
2305 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2306  EVT VT = Op.getValueType();
2307  SDLoc SL(Op);
2308  SDValue Src = Op.getOperand(0);
2309 
2310  const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2311  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2312  return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2313 }
2314 
2315 static bool isCtlzOpc(unsigned Opc) {
2316  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2317 }
2318 
2319 static bool isCttzOpc(unsigned Opc) {
2320  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2321 }
2322 
2323 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2324  SDLoc SL(Op);
2325  SDValue Src = Op.getOperand(0);
2326  bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2327  Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2328 
2329  unsigned ISDOpc, NewOpc;
2330  if (isCtlzOpc(Op.getOpcode())) {
2331  ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2332  NewOpc = AMDGPUISD::FFBH_U32;
2333  } else if (isCttzOpc(Op.getOpcode())) {
2334  ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2335  NewOpc = AMDGPUISD::FFBL_B32;
2336  } else
2337  llvm_unreachable("Unexpected OPCode!!!");
2338 
2339 
2340  if (ZeroUndef && Src.getValueType() == MVT::i32)
2341  return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2342 
2343  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2344 
2345  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2346  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2347 
2348  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2349  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2350 
2351  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2352  *DAG.getContext(), MVT::i32);
2353 
2354  SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2355  SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2356 
2357  SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2358  SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2359 
2360  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2361  SDValue Add, NewOpr;
2362  if (isCtlzOpc(Op.getOpcode())) {
2363  Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2364  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2365  NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2366  } else {
2367  Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2368  // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2369  NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2370  }
2371 
2372  if (!ZeroUndef) {
2373  // Test if the full 64-bit input is zero.
2374 
2375  // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2376  // which we probably don't want.
2377  SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2378  SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2379  SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2380 
2381  // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2382  // with the same cycles, otherwise it is slower.
2383  // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2384  // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2385 
2386  const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2387 
2388  // The instruction returns -1 for 0 input, but the defined intrinsic
2389  // behavior is to return the number of bits.
2390  NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2391  SrcIsZero, Bits32, NewOpr);
2392  }
2393 
2394  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2395 }
2396 
2397 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2398  bool Signed) const {
2399  // Unsigned
2400  // cul2f(ulong u)
2401  //{
2402  // uint lz = clz(u);
2403  // uint e = (u != 0) ? 127U + 63U - lz : 0;
2404  // u = (u << lz) & 0x7fffffffffffffffUL;
2405  // ulong t = u & 0xffffffffffUL;
2406  // uint v = (e << 23) | (uint)(u >> 40);
2407  // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2408  // return as_float(v + r);
2409  //}
2410  // Signed
2411  // cl2f(long l)
2412  //{
2413  // long s = l >> 63;
2414  // float r = cul2f((l + s) ^ s);
2415  // return s ? -r : r;
2416  //}
2417 
2418  SDLoc SL(Op);
2419  SDValue Src = Op.getOperand(0);
2420  SDValue L = Src;
2421 
2422  SDValue S;
2423  if (Signed) {
2424  const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2425  S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2426 
2427  SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2428  L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2429  }
2430 
2431  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2432  *DAG.getContext(), MVT::f32);
2433 
2434 
2435  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2436  SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2437  SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2438  LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2439 
2440  SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2441  SDValue E = DAG.getSelect(SL, MVT::i32,
2442  DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2443  DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2444  ZeroI32);
2445 
2446  SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2447  DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2448  DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2449 
2450  SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2451  DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2452 
2453  SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2454  U, DAG.getConstant(40, SL, MVT::i64));
2455 
2456  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2457  DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2458  DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
2459 
2460  SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2461  SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2462  SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2463 
2464  SDValue One = DAG.getConstant(1, SL, MVT::i32);
2465 
2466  SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2467 
2468  SDValue R = DAG.getSelect(SL, MVT::i32,
2469  RCmp,
2470  One,
2471  DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2472  R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2473  R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2474 
2475  if (!Signed)
2476  return R;
2477 
2478  SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2479  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2480 }
2481 
2482 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2483  bool Signed) const {
2484  SDLoc SL(Op);
2485  SDValue Src = Op.getOperand(0);
2486 
2487  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2488 
2489  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2490  DAG.getConstant(0, SL, MVT::i32));
2491  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2492  DAG.getConstant(1, SL, MVT::i32));
2493 
2494  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2495  SL, MVT::f64, Hi);
2496 
2497  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2498 
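      // Result = (f64)Hi * 2^32 + (f64)Lo, with the 2^32 scale applied
      // exactly via ldexp.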
2499  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2500  DAG.getConstant(32, SL, MVT::i32));
2501  // TODO: Should this propagate fast-math-flags?
2502  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2503 }
2504 
2505 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2506  SelectionDAG &DAG) const {
2507  // TODO: Factor out code common with LowerSINT_TO_FP.
2508  EVT DestVT = Op.getValueType();
2509  SDValue Src = Op.getOperand(0);
2510  EVT SrcVT = Src.getValueType();
2511 
2512  if (SrcVT == MVT::i16) {
2513  if (DestVT == MVT::f16)
2514  return Op;
2515  SDLoc DL(Op);
2516 
2517  // Promote src to i32
2518  SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2519  return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2520  }
2521 
2522  assert(SrcVT == MVT::i64 && "operation should be legal");
2523 
2524  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2525  SDLoc DL(Op);
2526 
2527  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2528  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2529  SDValue FPRound =
2530  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2531 
2532  return FPRound;
2533  }
2534 
2535  if (DestVT == MVT::f32)
2536  return LowerINT_TO_FP32(Op, DAG, false);
2537 
2538  assert(DestVT == MVT::f64);
2539  return LowerINT_TO_FP64(Op, DAG, false);
2540 }
2541 
2542 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2543  SelectionDAG &DAG) const {
2544  EVT DestVT = Op.getValueType();
2545 
2546  SDValue Src = Op.getOperand(0);
2547  EVT SrcVT = Src.getValueType();
2548 
2549  if (SrcVT == MVT::i16) {
2550  if (DestVT == MVT::f16)
2551  return Op;
2552 
2553  SDLoc DL(Op);
2554  // Promote src to i32
2555  SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2556  return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2557  }
2558 
2559  assert(SrcVT == MVT::i64 && "operation should be legal");
2560 
2561  // TODO: Factor out code common with LowerUINT_TO_FP.
2562 
2563  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2564  SDLoc DL(Op);
2565  SDValue Src = Op.getOperand(0);
2566 
2567  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2568  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2569  SDValue FPRound =
2570  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2571 
2572  return FPRound;
2573  }
2574 
2575  if (DestVT == MVT::f32)
2576  return LowerINT_TO_FP32(Op, DAG, true);
2577 
2578  assert(DestVT == MVT::f64);
2579  return LowerINT_TO_FP64(Op, DAG, true);
2580 }
2581 
2582 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2583  bool Signed) const {
2584  SDLoc SL(Op);
2585 
2586  SDValue Src = Op.getOperand(0);
2587 
2588  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2589 
2590  SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2591  MVT::f64);
2592  SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2593  MVT::f64);
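      // K0 = 2^-32 and K1 = -2^32: Hi below is floor(Trunc * 2^-32), and the
      // fma reconstructs the low word exactly as Trunc - Hi * 2^32.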
2594  // TODO: Should this propagate fast-math-flags?
2595  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2596 
2597  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2598 
2599 
2600  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2601 
2602  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2603  MVT::i32, FloorMul);
2604  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2605 
2606  SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2607 
2608  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2609 }
2610 
2611 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2612  SDLoc DL(Op);
2613  SDValue N0 = Op.getOperand(0);
2614 
2615  // Convert to target node to get known bits
2616  if (N0.getValueType() == MVT::f32)
2617  return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2618 
2619  if (getTargetMachine().Options.UnsafeFPMath) {
2620  // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2621  return SDValue();
2622  }
2623 
2624  SDLoc DL(Op);
2625 
2626  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2627  const unsigned ExpMask = 0x7ff;
2628  const unsigned ExpBiasf64 = 1023;
2629  const unsigned ExpBiasf16 = 15;
2630  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2631  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2632  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2633  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2634  DAG.getConstant(32, DL, MVT::i64));
2635  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2636  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
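      // UH now holds the sign, exponent and top 20 fraction bits; U holds the
      // low 32 fraction bits.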
2637  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2638  DAG.getConstant(20, DL, MVT::i64));
2639  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2640  DAG.getConstant(ExpMask, DL, MVT::i32));
2641  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2642  // add the f16 bias (15) to get the biased exponent for the f16 format.
2643  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2644  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2645 
2646  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2647  DAG.getConstant(8, DL, MVT::i32));
2648  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2649  DAG.getConstant(0xffe, DL, MVT::i32));
2650 
2651  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2652  DAG.getConstant(0x1ff, DL, MVT::i32));
2653  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2654 
2655  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2656  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2657 
2658  // (M != 0 ? 0x0200 : 0) | 0x7c00;
2659  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2660  DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2661  Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2662 
2663  // N = M | (E << 12);
2664  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2665  DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2666  DAG.getConstant(12, DL, MVT::i32)));
2667 
2668  // B = clamp(1-E, 0, 13);
2669  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2670  One, E);
2671  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2672  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2673  DAG.getConstant(13, DL, MVT::i32));
2674 
2675  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2676  DAG.getConstant(0x1000, DL, MVT::i32));
2677 
2678  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2679  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2680  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2681  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2682 
2683  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2684  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2685  DAG.getConstant(0x7, DL, MVT::i32));
2686  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2687  DAG.getConstant(2, DL, MVT::i32));
2688  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2689  One, Zero, ISD::SETEQ);
2690  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2691  One, Zero, ISD::SETGT);
2692  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2693  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2694 
2695  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2696  DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2697  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2698  I, V, ISD::SETEQ);
2699 
2700  // Extract the sign bit.
2701  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2702  DAG.getConstant(16, DL, MVT::i32));
2703  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2704  DAG.getConstant(0x8000, DL, MVT::i32));
2705 
2706  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2707  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2708 }
2709 
2710 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2711  SelectionDAG &DAG) const {
2712  SDValue Src = Op.getOperand(0);
2713 
2714  // TODO: Factor out code common with LowerFP_TO_UINT.
2715 
2716  EVT SrcVT = Src.getValueType();
2717  if (SrcVT == MVT::f16 ||
2718  (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2719  SDLoc DL(Op);
2720 
2721  SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
2722  return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32);
2723  }
2724 
2725  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2726  return LowerFP64_TO_INT(Op, DAG, true);
2727 
2728  return SDValue();
2729 }
2730 
2731 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2732  SelectionDAG &DAG) const {
2733  SDValue Src = Op.getOperand(0);
2734 
2735  // TODO: Factor out code common with LowerFP_TO_SINT.
2736 
2737  EVT SrcVT = Src.getValueType();
2738  if (SrcVT == MVT::f16 ||
2739  (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2740  SDLoc DL(Op);
2741 
2742  SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
2743  return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32);
2744  }
2745 
2746  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2747  return LowerFP64_TO_INT(Op, DAG, false);
2748 
2749  return SDValue();
2750 }
2751 
2752 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2753  SelectionDAG &DAG) const {
2754  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2755  MVT VT = Op.getSimpleValueType();
2756  MVT ScalarVT = VT.getScalarType();
2757 
2758  assert(VT.isVector());
2759 
2760  SDValue Src = Op.getOperand(0);
2761  SDLoc DL(Op);
2762 
2763  // TODO: Don't scalarize on Evergreen?
2764  unsigned NElts = VT.getVectorNumElements();
2765  SmallVector<SDValue, 8> Args;
2766  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2767 
2768  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2769  for (unsigned I = 0; I < NElts; ++I)
2770  Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2771 
2772  return DAG.getBuildVector(VT, DL, Args);
2773 }
2774 
2775 //===----------------------------------------------------------------------===//
2776 // Custom DAG optimizations
2777 //===----------------------------------------------------------------------===//
2778 
2779 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2780  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2781 }
2782 
2783 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2784  EVT VT = Op.getValueType();
2785  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2786  // as unsigned 24-bit values.
2787  AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2788 }
2789 
2790 static SDValue simplifyMul24(SDNode *Node24,
2791  TargetLowering::DAGCombinerInfo &DCI) {
2792  SelectionDAG &DAG = DCI.DAG;
2793  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2794  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2795 
2796  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2797  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2798  unsigned NewOpcode = Node24->getOpcode();
2799  if (IsIntrin) {
2800  unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2801  NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2802  AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2803  }
2804 
2805  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
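      // Only the low 24 bits of each operand feed the 24-bit multiply; all
      // higher bits are don't-cares.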
2806 
2807  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2808  // the operands to have other uses, but will only perform simplifications that
2809  // involve bypassing some nodes for this user.
2810  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2811  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2812  if (DemandedLHS || DemandedRHS)
2813  return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2814  DemandedLHS ? DemandedLHS : LHS,
2815  DemandedRHS ? DemandedRHS : RHS);
2816 
2817  // Now try SimplifyDemandedBits which can simplify the nodes used by our
2818  // operands if this node is the only user.
2819  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2820  return SDValue(Node24, 0);
2821  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2822  return SDValue(Node24, 0);
2823 
2824  return SDValue();
2825 }
2826 
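// Constant fold a bitfield extract: isolate the Width-bit field of Src0 at
// Offset and extend it according to IntTy (signed for BFE_I32, unsigned for
// BFE_U32).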
2827 template <typename IntTy>
2828 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2829  uint32_t Width, const SDLoc &DL) {
2830  if (Width + Offset < 32) {
2831  uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2832  IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2833  return DAG.getConstant(Result, DL, MVT::i32);
2834  }
2835 
2836  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2837 }
2838 
2839 static bool hasVolatileUser(SDNode *Val) {
2840  for (SDNode *U : Val->uses()) {
2841  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2842  if (M->isVolatile())
2843  return true;
2844  }
2845  }
2846 
2847  return false;
2848 }
2849 
2850 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2851  // i32 vectors are the canonical memory type.
2852  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2853  return false;
2854 
2855  if (!VT.isByteSized())
2856  return false;
2857 
2858  unsigned Size = VT.getStoreSize();
2859 
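      // Scalars of 1, 2 or 4 bytes already use natural memory types; sizes
      // that do not fill whole 32-bit words cannot be repacked into i32
      // vectors.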
2860  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2861  return false;
2862 
2863  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2864  return false;
2865 
2866  return true;
2867 }
2868 
2869 // Replace load of an illegal type with a store of a bitcast to a friendlier
2870 // type.
2871 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2872  DAGCombinerInfo &DCI) const {
2873  if (!DCI.isBeforeLegalize())
2874  return SDValue();
2875 
2876  LoadSDNode *LN = cast<LoadSDNode>(N);
2877  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2878  return SDValue();
2879 
2880  SDLoc SL(N);
2881  SelectionDAG &DAG = DCI.DAG;
2882  EVT VT = LN->getMemoryVT();
2883 
2884  unsigned Size = VT.getStoreSize();
2885  Align Alignment = LN->getAlign();
2886  if (Alignment < Size && isTypeLegal(VT)) {
2887  bool IsFast;
2888  unsigned AS = LN->getAddressSpace();
2889 
2890  // Expand unaligned loads earlier than legalization. Due to visitation order
2891  // problems during legalization, the emitted instructions to pack and unpack
2892  // the bytes again are not eliminated in the case of an unaligned copy.
2893  if (!allowsMisalignedMemoryAccesses(
2894  VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2895  SDValue Ops[2];
2896 
2897  if (VT.isVector())
2898  std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
2899  else
2900  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2901 
2902  return DAG.getMergeValues(Ops, SDLoc(N));
2903  }
2904 
2905  if (!IsFast)
2906  return SDValue();
2907  }
2908 
2909  if (!shouldCombineMemoryType(VT))
2910  return SDValue();
2911 
2912  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2913 
2914  SDValue NewLoad
2915  = DAG.getLoad(NewVT, SL, LN->getChain(),
2916  LN->getBasePtr(), LN->getMemOperand());
2917 
2918  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2919  DCI.CombineTo(N, BC, NewLoad.getValue(1));
2920  return SDValue(N, 0);
2921 }
2922 
2923 // Replace store of an illegal type with a store of a bitcast to a friendlier
2924 // type.
2925 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2926  DAGCombinerInfo &DCI) const {
2927  if (!DCI.isBeforeLegalize())
2928  return SDValue();
2929 
2930  StoreSDNode *SN = cast<StoreSDNode>(N);
2931  if (!SN->isSimple() || !ISD::isNormalStore(SN))
2932  return SDValue();
2933 
2934  EVT VT = SN->getMemoryVT();
2935  unsigned Size = VT.getStoreSize();
2936 
2937  SDLoc SL(N);
2938  SelectionDAG &DAG = DCI.DAG;
2939  Align Alignment = SN->getAlign();
2940  if (Alignment < Size && isTypeLegal(VT)) {
2941  bool IsFast;
2942  unsigned AS = SN->getAddressSpace();
2943 
2944  // Expand unaligned stores earlier than legalization. Due to visitation
2945  // order problems during legalization, the emitted instructions to pack and
2946  // unpack the bytes again are not eliminated in the case of an unaligned
2947  // copy.
2948  if (!allowsMisalignedMemoryAccesses(
2949  VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
2950  if (VT.isVector())
2951  return scalarizeVectorStore(SN, DAG);
2952 
2953  return expandUnalignedStore(SN, DAG);
2954  }
2955 
2956  if (!IsFast)
2957  return SDValue();
2958  }
2959 
2960  if (!shouldCombineMemoryType(VT))
2961  return SDValue();
2962 
2963  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2964  SDValue Val = SN->getValue();
2965 
2966  //DCI.AddToWorklist(Val.getNode());
2967 
2968  bool OtherUses = !Val.hasOneUse();
2969  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2970  if (OtherUses) {
2971  SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2972  DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2973  }
2974 
2975  return DAG.getStore(SN->getChain(), SL, CastVal,
2976  SN->getBasePtr(), SN->getMemOperand());
2977 }
2978 
2979 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2980 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2981 // issues.
2982 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2983  DAGCombinerInfo &DCI) const {
2984  SelectionDAG &DAG = DCI.DAG;
2985  SDValue N0 = N->getOperand(0);
2986 
2987  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2988  // (vt2 (truncate (assertzext vt0:x, vt1)))
2989  if (N0.getOpcode() == ISD::TRUNCATE) {
2990  SDValue N1 = N->getOperand(1);
2991  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2992  SDLoc SL(N);
2993 
2994  SDValue Src = N0.getOperand(0);
2995  EVT SrcVT = Src.getValueType();
2996  if (SrcVT.bitsGE(ExtVT)) {
2997  SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2998  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2999  }
3000  }
3001 
3002  return SDValue();
3003 }
3004 
3005 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3006  SDNode *N, DAGCombinerInfo &DCI) const {
3007  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3008  switch (IID) {
3009  case Intrinsic::amdgcn_mul_i24:
3010  case Intrinsic::amdgcn_mul_u24:
3011  return simplifyMul24(N, DCI);
3012  case Intrinsic::amdgcn_fract:
3013  case Intrinsic::amdgcn_rsq:
3014  case Intrinsic::amdgcn_rcp_legacy:
3015  case Intrinsic::amdgcn_rsq_legacy:
3016  case Intrinsic::amdgcn_rsq_clamp:
3017  case Intrinsic::amdgcn_ldexp: {
3018  // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3019  SDValue Src = N->getOperand(1);
3020  return Src.isUndef() ? Src : SDValue();
3021  }
3022  default:
3023  return SDValue();
3024  }
3025 }
3026 
3027 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3028 /// binary operation \p Opc to it with the corresponding constant operands.
3029 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3030  DAGCombinerInfo &DCI, const SDLoc &SL,
3031  unsigned Opc, SDValue LHS,
3032  uint32_t ValLo, uint32_t ValHi) const {
3033  SelectionDAG &DAG = DCI.DAG;
3034  SDValue Lo, Hi;
3035  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3036 
3037  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3038  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3039 
3040  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3041  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3042 
3043  // Re-visit the ands. It's possible we eliminated one of them and it could
3044  // simplify the vector.
3045  DCI.AddToWorklist(Lo.getNode());
3046  DCI.AddToWorklist(Hi.getNode());
3047 
3048  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3049  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3050 }
3051 
3052 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3053  DAGCombinerInfo &DCI) const {
3054  EVT VT = N->getValueType(0);
3055 
3056  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3057  if (!RHS)
3058  return SDValue();
3059 
3060  SDValue LHS = N->getOperand(0);
3061  unsigned RHSVal = RHS->getZExtValue();
3062  if (!RHSVal)
3063  return LHS;
3064 
3065  SDLoc SL(N);
3066  SelectionDAG &DAG = DCI.DAG;
3067 
3068  switch (LHS->getOpcode()) {
3069  default:
3070  break;
3071  case ISD::ZERO_EXTEND:
3072  case ISD::SIGN_EXTEND:
3073  case ISD::ANY_EXTEND: {
3074  SDValue X = LHS->getOperand(0);
3075 
3076  if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3077  isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3078  // Prefer build_vector as the canonical form if packed types are legal.
3079  // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3080  SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3081  { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3082  return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3083  }
3084 
3085  // shl (ext x) => zext (shl x), if shift does not overflow int
3086  if (VT != MVT::i64)
3087  break;
3088  KnownBits Known = DAG.computeKnownBits(X);
3089  unsigned LZ = Known.countMinLeadingZeros();
3090  if (LZ < RHSVal)
3091  break;
3092  EVT XVT = X.getValueType();
3093  SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3094  return DAG.getZExtOrTrunc(Shl, SL, VT);
3095  }
3096  }
3097 
3098  if (VT != MVT::i64)
3099  return SDValue();
3100 
3101  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3102 
3103  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3104  // common case, splitting this into a move and a 32-bit shift is faster and
3105  // the same code size.
3106  if (RHSVal < 32)
3107  return SDValue();
3108 
3109  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3110 
3111  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3112  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3113 
3114  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3115 
3116  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3117  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3118 }
3119 
3120 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3121  DAGCombinerInfo &DCI) const {
3122  if (N->getValueType(0) != MVT::i64)
3123  return SDValue();
3124 
3125  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3126  if (!RHS)
3127  return SDValue();
3128 
3129  SelectionDAG &DAG = DCI.DAG;
3130  SDLoc SL(N);
3131  unsigned RHSVal = RHS->getZExtValue();
3132 
3133  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3134  if (RHSVal == 32) {
3135  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3136  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3137  DAG.getConstant(31, SL, MVT::i32));
3138 
3139  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3140  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3141  }
3142 
3143  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3144  if (RHSVal == 63) {
3145  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3146  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3147  DAG.getConstant(31, SL, MVT::i32));
3148  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3149  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3150  }
3151 
3152  return SDValue();
3153 }
3154 
3155 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3156  DAGCombinerInfo &DCI) const {
3157  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3158  if (!RHS)
3159  return SDValue();
3160 
3161  EVT VT = N->getValueType(0);
3162  SDValue LHS = N->getOperand(0);
3163  unsigned ShiftAmt = RHS->getZExtValue();
3164  SelectionDAG &DAG = DCI.DAG;
3165  SDLoc SL(N);
3166 
3167  // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
3168  // this improves the ability to match BFE patterns in isel.
3169  if (LHS.getOpcode() == ISD::AND) {
3170  if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3171  if (Mask->getAPIntValue().isShiftedMask() &&
3172  Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3173  return DAG.getNode(
3174  ISD::AND, SL, VT,
3175  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3176  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3177  }
3178  }
3179  }
3180 
3181  if (VT != MVT::i64)
3182  return SDValue();
3183 
3184  if (ShiftAmt < 32)
3185  return SDValue();
3186 
3187  // srl i64:x, C for C >= 32
3188  // =>
3189  // build_pair (srl hi_32(x), C - 32), 0
3190  SDValue One = DAG.getConstant(1, SL, MVT::i32);
3191  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3192 
3193  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3194  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3195 
3196  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3197  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3198 
3199  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3200 
3201  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3202 }
3203 
3204 SDValue AMDGPUTargetLowering::performTruncateCombine(
3205  SDNode *N, DAGCombinerInfo &DCI) const {
3206  SDLoc SL(N);
3207  SelectionDAG &DAG = DCI.DAG;
3208  EVT VT = N->getValueType(0);
3209  SDValue Src = N->getOperand(0);
3210 
3211  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3212  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3213  SDValue Vec = Src.getOperand(0);
3214  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3215  SDValue Elt0 = Vec.getOperand(0);
3216  EVT EltVT = Elt0.getValueType();
3217  if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3218  if (EltVT.isFloatingPoint()) {
3219  Elt0 = DAG.getNode(ISD::BITCAST, SL,
3220  EltVT.changeTypeToInteger(), Elt0);
3221  }
3222 
3223  return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3224  }
3225  }
3226  }
3227 
3228  // Equivalent of above for accessing the high element of a vector as an
3229  // integer operation.
3230  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3231  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3232  if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3233  if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3234  SDValue BV = stripBitcast(Src.getOperand(0));
3235  if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3236  BV.getValueType().getVectorNumElements() == 2) {
3237  SDValue SrcElt = BV.getOperand(1);
3238  EVT SrcEltVT = SrcElt.getValueType();
3239  if (SrcEltVT.isFloatingPoint()) {
3240  SrcElt = DAG.getNode(ISD::BITCAST, SL,
3241  SrcEltVT.changeTypeToInteger(), SrcElt);
3242  }
3243 
3244  return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3245  }
3246  }
3247  }
3248  }
3249 
3250  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3251  //
3252  // i16 (trunc (srl i64:x, K)), K <= 16 ->
3253  // i16 (trunc (srl (i32 (trunc x), K)))
3254  if (VT.getScalarSizeInBits() < 32) {
3255  EVT SrcVT = Src.getValueType();
3256  if (SrcVT.getScalarSizeInBits() > 32 &&
3257  (Src.getOpcode() == ISD::SRL ||
3258  Src.getOpcode() == ISD::SRA ||
3259  Src.getOpcode() == ISD::SHL)) {
3260  SDValue Amt = Src.getOperand(1);
3261  KnownBits Known = DAG.computeKnownBits(Amt);
3262  unsigned Size = VT.getScalarSizeInBits();
3263  if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3264  (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3265  EVT MidVT = VT.isVector() ?
3266  EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3267  VT.getVectorNumElements()) : MVT::i32;
3268 
3269  EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3270  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3271  Src.getOperand(0));
3272  DCI.AddToWorklist(Trunc.getNode());
3273 
3274  if (Amt.getValueType() != NewShiftVT) {
3275  Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3276  DCI.AddToWorklist(Amt.getNode());
3277  }
3278 
3279  SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3280  Trunc, Amt);
3281  return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3282  }
3283  }
3284  }
3285 
3286  return SDValue();
3287 }
3288 
3289 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3290 // instructions. If we only match on the legalized i64 mul expansion,
3291 // SimplifyDemandedBits will be unable to remove them because there will be
3292 // multiple uses due to the separate mul + mulh[su].
3293 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3294  SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3295  if (Size <= 32) {
3296  unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3297  return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3298  }
3299 
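      // For results wider than 32 bits, pair the 24-bit multiply's low and
      // high 32-bit words into a 64-bit value.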
3300  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3301  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3302 
3303  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3304  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3305 
3306  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3307 }
3308 
3309 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3310  DAGCombinerInfo &DCI) const {
3311  EVT VT = N->getValueType(0);
3312 
3313  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3314  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3315  // unnecessarily). isDivergent() is used as an approximation of whether the
3316  // value is in an SGPR.
3317  if (!N->isDivergent())
3318  return SDValue();
3319 
3320  unsigned Size = VT.getSizeInBits();
3321  if (VT.isVector() || Size > 64)
3322  return SDValue();
3323 
3324  // There are i16 integer mul/mad.
3325  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3326  return SDValue();
3327 
3328  SelectionDAG &DAG = DCI.DAG;
3329  SDLoc DL(N);
3330 
3331  SDValue N0 = N->getOperand(0);
3332  SDValue N1 = N->getOperand(1);
3333 
3334  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3335  // in the source into any_extends if the result of the mul is truncated. Since
3336  // we can assume the high bits are whatever we want, use the underlying value
3337  // to avoid the unknown high bits from interfering.
3338  if (N0.getOpcode() == ISD::ANY_EXTEND)
3339  N0 = N0.getOperand(0);
3340 
3341  if (N1.getOpcode() == ISD::ANY_EXTEND)
3342  N1 = N1.getOperand(0);
3343 
3344  SDValue Mul;
3345 
3346  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3347  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3348  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3349  Mul = getMul24(DAG, DL, N0, N1, Size, false);
3350  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3351  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3352  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3353  Mul = getMul24(DAG, DL, N0, N1, Size, true);
3354  } else {
3355  return SDValue();
3356  }
3357 
3358  // We need to use sext even for MUL_U24, because MUL_U24 is used
3359  // for signed multiply of 8 and 16-bit types.
3360  return DAG.getSExtOrTrunc(Mul, DL, VT);
3361 }
3362 
3363 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3364  DAGCombinerInfo &DCI) const {
3365  EVT VT = N->getValueType(0);
3366 
3367  if (!Subtarget->hasMulI24() || VT.isVector())
3368  return SDValue();
3369 
3370  SelectionDAG &DAG = DCI.DAG;
3371  SDLoc DL(N);
3372 
3373  SDValue N0 = N->getOperand(0);
3374  SDValue N1 = N->getOperand(1);
3375 
3376  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3377  return SDValue();
3378 
3379  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3380  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3381 
3382  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3383  DCI.AddToWorklist(Mulhi.getNode());
3384  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3385 }
3386 
3387 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3388  DAGCombinerInfo &DCI) const {
3389  EVT VT = N->getValueType(0);
3390 
3391  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3392  return SDValue();
3393 
3394  SelectionDAG &DAG = DCI.DAG;
3395  SDLoc DL(N);
3396 
3397  SDValue N0 = N->getOperand(0);
3398  SDValue N1 = N->getOperand(1);
3399 
3400  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3401  return SDValue();
3402 
3403  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3404  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3405 
3406  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3407  DCI.AddToWorklist(Mulhi.getNode());
3408  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3409 }
3410 
3411 static bool isNegativeOne(SDValue Val) {
3412  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3413  return C->isAllOnesValue();
3414  return false;
3415 }
3416 
3417 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3418  SDValue Op,
3419  const SDLoc &DL,
3420  unsigned Opc) const {
3421  EVT VT = Op.getValueType();
3422  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3423  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3424  LegalVT != MVT::i16))
3425  return SDValue();
3426 
3427  if (VT != MVT::i32)
3428  Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3429 
3430  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3431  if (VT != MVT::i32)
3432  FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3433 
3434  return FFBX;
3435 }
3436 
3437 // The native instructions return -1 on 0 input. Optimize out a select that
3438 // produces -1 on 0.
3439 //
3440 // TODO: If zero is not undef, we could also do this if the output is compared
3441 // against the bitwidth.
3442 //
3443 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3444 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3445  SDValue LHS, SDValue RHS,
3446  DAGCombinerInfo &DCI) const {
3447  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3448  if (!CmpRhs || !CmpRhs->isNullValue())
3449  return SDValue();
3450 
3451  SelectionDAG &DAG = DCI.DAG;
3452  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3453  SDValue CmpLHS = Cond.getOperand(0);
3454 
3455  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3456  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3457  if (CCOpcode == ISD::SETEQ &&
3458  (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3459  RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3460  unsigned Opc =
3461  isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3462  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3463  }
3464 
3465  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3466  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3467  if (CCOpcode == ISD::SETNE &&
3468  (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3469  LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3470  unsigned Opc =
3471  isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3472 
3473  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3474  }
3475 
3476  return SDValue();
3477 }
3478 
3479 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3480  unsigned Op,
3481  const SDLoc &SL,
3482  SDValue Cond,
3483  SDValue N1,
3484  SDValue N2) {
3485  SelectionDAG &DAG = DCI.DAG;
3486  EVT VT = N1.getValueType();
3487 
3488  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3489  N1.getOperand(0), N2.getOperand(0));
3490  DCI.AddToWorklist(NewSelect.getNode());
3491  return DAG.getNode(Op, SL, VT, NewSelect);
3492 }
3493 
3494 // Pull a free FP operation out of a select so it may fold into uses.
3495 //
3496 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3497 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3498 //
3499 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3500 // select c, (fabs x), +k -> fabs (select c, x, k)
3501 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3502  SDValue N) {
3503  SelectionDAG &DAG = DCI.DAG;
3504  SDValue Cond = N.getOperand(0);
3505  SDValue LHS = N.getOperand(1);
3506  SDValue RHS = N.getOperand(2);
3507 
3508  EVT VT = N.getValueType();
3509  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3510  (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3511  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3512  SDLoc(N), Cond, LHS, RHS);
3513  }
3514 
3515  bool Inv = false;
3516  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3517  std::swap(LHS, RHS);
3518  Inv = true;
3519  }
3520 
3521  // TODO: Support vector constants.
3522  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3523  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3524  SDLoc SL(N);
3525  // If one side is an fneg/fabs and the other is a constant, we can push the
3526  // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3527  SDValue NewLHS = LHS.getOperand(0);
3528  SDValue NewRHS = RHS;
3529 
3530  // Careful: if the neg can be folded up, don't try to pull it back down.
3531  bool ShouldFoldNeg = true;
3532 
3533  if (NewLHS.hasOneUse()) {
3534  unsigned Opc = NewLHS.getOpcode();
3535  if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3536  ShouldFoldNeg = false;
3537  if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3538  ShouldFoldNeg = false;
3539  }
3540 
3541  if (ShouldFoldNeg) {
3542  if (LHS.getOpcode() == ISD::FNEG)
3543  NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3544  else if (CRHS->isNegative())
3545  return SDValue();
3546 
3547  if (Inv)
3548  std::swap(NewLHS, NewRHS);
3549 
3550  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3551  Cond, NewLHS, NewRHS);
3552  DCI.AddToWorklist(NewSelect.getNode());
3553  return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3554  }
3555  }
3556 
3557  return SDValue();
3558 }
3559 
3560 
3561 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3562  DAGCombinerInfo &DCI) const {
3563  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3564  return Folded;
3565 
3566  SDValue Cond = N->getOperand(0);
3567  if (Cond.getOpcode() != ISD::SETCC)
3568  return SDValue();
3569 
3570  EVT VT = N->getValueType(0);
3571  SDValue LHS = Cond.getOperand(0);
3572  SDValue RHS = Cond.getOperand(1);
3573  SDValue CC = Cond.getOperand(2);
3574 
3575  SDValue True = N->getOperand(1);
3576  SDValue False = N->getOperand(2);
3577 
3578  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3579  SelectionDAG &DAG = DCI.DAG;
3580  if (DAG.isConstantValueOfAnyType(True) &&
3581  !DAG.isConstantValueOfAnyType(False)) {
3582  // Swap cmp + select pair to move constant to false input.
3583  // This will allow using VOPC cndmasks more often.
3584  // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3585 
3586  SDLoc SL(N);
3587  ISD::CondCode NewCC =
3588  getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3589 
3590  SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3591  return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3592  }
3593 
3594  if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3595  SDValue MinMax
3596  = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3597  // Revisit this node so we can catch min3/max3/med3 patterns.
3598  //DCI.AddToWorklist(MinMax.getNode());
3599  return MinMax;
3600  }
3601  }
3602 
3603  // There's no reason to not do this if the condition has other uses.
3604  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3605 }
3606 
3607 static bool isInv2Pi(const APFloat &APF) {
3608  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3609  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3610  static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3611 
3612  return APF.bitwiseIsEqual(KF16) ||
3613  APF.bitwiseIsEqual(KF32) ||
3614  APF.bitwiseIsEqual(KF64);
3615 }
3616 
3617 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
3618 // additional cost to negate them.
3619 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3620  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3621  if (C->isZero() && !C->isNegative())
3622  return true;
3623 
3624  if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3625  return true;
3626  }
3627 
3628  return false;
3629 }
3630 
3631 static unsigned inverseMinMax(unsigned Opc) {
3632  switch (Opc) {
3633  case ISD::FMAXNUM:
3634  return ISD::FMINNUM;
3635  case ISD::FMINNUM:
3636  return ISD::FMAXNUM;
3637  case ISD::FMAXNUM_IEEE:
3638  return ISD::FMINNUM_IEEE;
3639  case ISD::FMINNUM_IEEE:
3640  return ISD::FMAXNUM_IEEE;
3641  case AMDGPUISD::FMAX_LEGACY:
3642  return AMDGPUISD::FMIN_LEGACY;
3643  case AMDGPUISD::FMIN_LEGACY:
3644  return AMDGPUISD::FMAX_LEGACY;
3645  default:
3646  llvm_unreachable("invalid min/max opcode");
3647  }
3648 }
3649 
3650 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3651  DAGCombinerInfo &DCI) const {
3652  SelectionDAG &DAG = DCI.DAG;
3653  SDValue N0 = N->getOperand(0);
3654  EVT VT = N->getValueType(0);
3655 
3656  unsigned Opc = N0.getOpcode();
3657 
3658  // If the input has multiple uses and we can either fold the negate down, or
3659  // the other uses cannot, give up. This both prevents unprofitable
3660  // transformations and infinite loops: we won't repeatedly try to fold around
3661  // a negate that has no 'good' form.
3662  if (N0.hasOneUse()) {
3663  // This may be able to fold into the source, but at a code size cost. Don't
3664  // fold if the fold into the user is free.
3665  if (allUsesHaveSourceMods(N, 0))
3666  return SDValue();
3667  } else {
3668  if (fnegFoldsIntoOp(Opc) &&
3669  (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3670  return SDValue();
3671  }
3672 
3673  SDLoc SL(N);
3674  switch (Opc) {
3675  case ISD::FADD: {
3676  if (!mayIgnoreSignedZero(N0))
3677  return SDValue();
3678 
3679  // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3680  SDValue LHS = N0.getOperand(0);
3681  SDValue RHS = N0.getOperand(1);
3682 
3683  if (LHS.getOpcode() != ISD::FNEG)
3684  LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3685  else
3686  LHS = LHS.getOperand(0);
3687 
3688  if (RHS.getOpcode() != ISD::FNEG)
3689  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3690  else
3691  RHS = RHS.getOperand(0);
3692 
3693  SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3694  if (Res.getOpcode() != ISD::FADD)
3695  return SDValue(); // Op got folded away.
3696  if (!N0.hasOneUse())
3697  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3698  return Res;
3699  }
3700  case ISD::FMUL:
3701  case AMDGPUISD::FMUL_LEGACY: {
3702  // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3703  // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3704  SDValue LHS = N0.getOperand(0);
3705  SDValue RHS = N0.getOperand(1);
3706 
3707  if (LHS.getOpcode() == ISD::FNEG)
3708  LHS = LHS.getOperand(0);
3709  else if (RHS.getOpcode() == ISD::FNEG)
3710  RHS = RHS.getOperand(0);
3711  else
3712  RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3713 
3714  SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3715  if (Res.getOpcode() != Opc)
3716  return SDValue(); // Op got folded away.
3717  if (!N0.hasOneUse())
3718  DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3719  return Res;
3720  }
3721  case ISD::FMA:
3722  case ISD::FMAD: {
3723  // TODO: handle llvm.amdgcn.fma.legacy
3724  if (!mayIgnoreSignedZero(N0))
3725  return SDValue();
3726 
3727  // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3728  SDValue LHS = N0.getOperand(0);
3729  SDValue MHS = N0.getOperand(1);
3730  SDValue RHS = N0.getOperand(2);
3731 
3732  if (LHS.getOpcode() == ISD::FNEG)
3733  LHS = LHS.getOperand(0);
3734  else if (MHS.getOpcode() == ISD::FNEG)
3735  MHS = MHS.getOperand(0);
3736  else
3737  MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3738 
3739  if (RHS.getOpcode() !=