//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
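
// For example, v2f16 (32 bits in memory) maps to i32, and v2f64 (128 bits)
// maps to v4i32.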

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  // This is totally unsupported, just custom lower to produce an error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::POW, ISD::FTRUNC, ISD::FRINT, ISD::FABS,
                      ISD::FFLOOR, ISD::FROUNDEVEN, ISD::FMINNUM,
                      ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(ISD::IS_FPCLASS,
                     {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16,
                      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
                      MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
                      MVT::v16f64},
                     Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);
  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f16,  MVT::v2i16,  MVT::v4f16,  MVT::v4i16,  MVT::v2f32,
       MVT::v2i32,  MVT::v3f32,  MVT::v3i32,  MVT::v4f32,  MVT::v4i32,
       MVT::v5f32,  MVT::v5i32,  MVT::v6f32,  MVT::v6i32,  MVT::v7f32,
       MVT::v7i32,  MVT::v8f32,  MVT::v8i32,  MVT::v9f32,  MVT::v9i32,
       MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
       MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
       MVT::v32f32, MVT::v32i32, MVT::v2f64,  MVT::v2i64,  MVT::v3f64,
       MVT::v3i64,  MVT::v4f64,  MVT::v4i64,  MVT::v8f64,  MVT::v8i64,
       MVT::v16f64, MVT::v16i64},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);

  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD,        ISD::AND,     ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
                        ISD::MULHS,      ISD::OR,      ISD::SHL,
                        ISD::SRA,        ISD::SRL,     ISD::ROTL,
                        ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
                        ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT,     ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR,        ISD::BSWAP,   ISD::CTPOP,
                        ISD::CTTZ,       ISD::CTLZ,    ISD::VECTOR_SHUFFLE,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS,    ISD::FMINNUM,   ISD::FMAXNUM,   ISD::FADD,
         ISD::FCEIL,   ISD::FCOS,      ISD::FDIV,      ISD::FEXP2,
         ISD::FEXP,    ISD::FLOG2,     ISD::FREM,      ISD::FLOG,
         ISD::FLOG10,  ISD::FPOW,      ISD::FFLOOR,    ISD::FTRUNC,
         ISD::FMUL,    ISD::FMA,       ISD::FRINT,     ISD::FNEARBYINT,
         ISD::FSQRT,   ISD::FSIN,      ISD::FSUB,      ISD::FNEG,
         ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
         ISD::SETCC,   ISD::FCANONICALIZE},
        VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  // There are no libcalls of any kind.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
                       ISD::SRA,        ISD::SRL,
                       ISD::TRUNCATE,   ISD::MUL,
                       ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
                       ISD::MULHU,      ISD::MULHS,
                       ISD::SELECT,     ISD::SELECT_CC,
                       ISD::STORE,      ISD::FADD,
                       ISD::FSUB,       ISD::FNEG,
                       ISD::FABS,       ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOp(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READNONE
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return N->getNumOperands() > 2 || VT == MVT::f64;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READNONE
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::SELECT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and each one will necessitate using VOP3, there will be a
  // code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}
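
// For example, an i48 return value is rounded up and extended to i64, while
// i96 is already a multiple of 32 bits and stays i96.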

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    switch (IntrID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
    return false;
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}
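
// For example, truncating i64 to i32 is free (it just reads the low 32-bit
// subregister), while truncating i64 to i16 is not.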

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
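///
/// As another illustration (hypothetical types, assuming
/// getRegisterTypeForCallingConv picks v2f16 here): a v8f16 argument split
/// into four v2f16 registers takes the "same scalar type, fewer elements"
/// path below and deduces a memory type of v2f16 for each part.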
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5 ||
               (MemVT.getVectorNumElements() >= 9 &&
                MemVT.getVectorNumElements() <= 12));
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument corresponding to the clobbered
  // frame index.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG:
    return LowerFLOG(Op, DAG, numbers::ln2f);
  case ISD::FLOG10:
    return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
  case ISD::FEXP:
    return lowerFEXP(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  default:
    return;
  }
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;

  EVT VT = Op.getValueType();
  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    SDLoc SL(Op);
    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));

    SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  // For these types, we have some TableGen patterns except if the index is 1
  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
       (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
      Start != 1)
    return Op;

  if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
       (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
      (Start == 0 || Start == 4))
    return Op;

  if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
       (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
      (Start == 0 || Start == 8))
    return Op;

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}
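
// For example, the ordered fmin idiom select(setcc(x, y, olt), x, y) maps to
// FMIN_LEGACY(x, y) through the SETOLT case above.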

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::make_pair(LoVT, HiVT);
}
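
// For example, v3i32 splits into (v2i32, i32), v5f32 splits into (v4f32, f32),
// and an even v8i16 splits into (v4i16, v4i16).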

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}

// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
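// Sketch of why this works: when |a| and |b| fit in 24 bits, float(a) and
// float(b) are exact, so trunc(float(a) * rcp(float(b))) recovers the quotient
// up to a one-ulp error; the jq term computed below supplies the correction.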
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  MachineFunction &MF = DAG.getMachineFunction();

  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz = MFI->getMode().allFP32Denormals();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
                                 : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}

void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS = Op.getOperand(0);
  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);

  SDValue RHS = Op.getOperand(1);
  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);

  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    // The algorithm here is based on ideas from "Software Integer Division",
    // Tom Rodeheffer, August 2008.
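    // The f32 constants below are bit patterns: 0x4f800000 is 2^32,
    // 0x2f800000 is 2^-32, 0xcf800000 is -(2^32), and 0x5f7ffffc is just
    // below 2^64; they assemble a single-precision reciprocal estimate of the
    // 64-bit denominator from its two 32-bit halves.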

    MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    // Compute denominator reciprocal.
    unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
                    (unsigned)ISD::FMA :
                    !MFI->getMode().allFP32Denormals() ?
                    (unsigned)ISD::FMAD :
                    (unsigned)AMDGPUISD::FMAD_FTZ;

    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
      DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
      Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
      DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
      DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
      DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
      Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64 = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    // First round of UNR (Unsigned integer Newton-Raphson).
    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    Zero);
    SDValue Mulhi1_Hi =
        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
    SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    SDValue Add1 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    // Second round of UNR.
    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    Zero);
    SDValue Mulhi2_Hi =
        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
    SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
                                  Mulhi2_Hi, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));

    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
  // Get Speculative values
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

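  // The loop below is classic restoring long division: it produces one
  // quotient bit per iteration, shifting the remainder left and conditionally
  // subtracting the divisor.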
  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}
1992 
1993 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1994  SelectionDAG &DAG) const {
1995  SDLoc DL(Op);
1996  EVT VT = Op.getValueType();
1997 
1998  if (VT == MVT::i64) {
1999  SmallVector<SDValue, 2> Results;
2000  LowerUDIVREM64(Op, DAG, Results);
2001  return DAG.getMergeValues(Results, DL);
2002  }
2003 
2004  if (VT == MVT::i32) {
2005  if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2006  return Res;
2007  }
2008 
2009  SDValue X = Op.getOperand(0);
2010  SDValue Y = Op.getOperand(1);
2011 
2012  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2013  // algorithm used here.
2014 
2015  // Initial estimate of inv(y).
2016  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2017 
2018  // One round of UNR.
2019  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2020  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2021  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2022  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2023 
2024  // Quotient/remainder estimate.
2025  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2026  SDValue R =
2027  DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2028 
2029  // First quotient/remainder refinement.
2030  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2031  SDValue One = DAG.getConstant(1, DL, VT);
2032  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2033  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2034  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2035  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2036  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2037 
2038  // Second quotient/remainder refinement.
2039  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2040  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2041  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2042  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2043  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2044 
2045  return DAG.getMergeValues({Q, R}, DL);
2046 }
2047 
2048 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2049  SelectionDAG &DAG) const {
2050  SDLoc DL(Op);
2051  EVT VT = Op.getValueType();
2052 
2053  SDValue LHS = Op.getOperand(0);
2054  SDValue RHS = Op.getOperand(1);
2055 
2056  SDValue Zero = DAG.getConstant(0, DL, VT);
2057  SDValue NegOne = DAG.getConstant(-1, DL, VT);
2058 
2059  if (VT == MVT::i32) {
2060  if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2061  return Res;
2062  }
2063 
2064  if (VT == MVT::i64 &&
2065  DAG.ComputeNumSignBits(LHS) > 32 &&
2066  DAG.ComputeNumSignBits(RHS) > 32) {
2067  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2068 
2069  //HiLo split
2070  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2071  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2072  SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2073  LHS_Lo, RHS_Lo);
2074  SDValue Res[2] = {
2075  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2076  DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2077  };
2078  return DAG.getMergeValues(Res, DL);
2079  }
2080 
2081  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2082  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2083  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2084  SDValue RSign = LHSign; // Remainder sign is the same as LHS
2085 
2086  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2087  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2088 
2089  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2090  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2091 
2092  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2093  SDValue Rem = Div.getValue(1);
2094 
2095  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2096  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2097 
2098  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2099  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2100 
2101  SDValue Res[2] = {
2102  Div,
2103  Rem
2104  };
2105  return DAG.getMergeValues(Res, DL);
2106 }
2107 
2108 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
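// Worked example of the identity above: frem(7.5, 2.0) computes fdiv = 3.75,
// ftrunc = 3.0, and fma(-3.0, 2.0, 7.5) = 1.5, the remainder of truncating
// division.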
2109 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2110  SDLoc SL(Op);
2111  EVT VT = Op.getValueType();
2112  auto Flags = Op->getFlags();
2113  SDValue X = Op.getOperand(0);
2114  SDValue Y = Op.getOperand(1);
2115 
2116  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2117  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2118  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2119  // TODO: For f32 use FMAD instead if !hasFastFMA32?
2120  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2121 }
2122 
2123 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2124  SDLoc SL(Op);
2125  SDValue Src = Op.getOperand(0);
2126 
2127  // result = trunc(src)
2128  // if (src > 0.0 && src != result)
2129  // result += 1.0
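  // E.g. ceil(2.5): trunc gives 2.0; since 2.5 > 0.0 and 2.5 != 2.0 the
  // select adds 1.0, giving 3.0. For ceil(-2.5) the truncation -2.0 is
  // already the ceiling, so nothing is added.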
2130 
2131  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2132 
2133  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2134  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2135 
2136  EVT SetCCVT =
2137  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2138 
2139  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2140  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2141  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2142 
2143  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2144  // TODO: Should this propagate fast-math-flags?
2145  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2146 }
2147 
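// Extracts the 11-bit biased exponent from the high word of an f64 and
// subtracts the IEEE-754 bias of 1023. E.g. for 1.0 (Hi = 0x3ff00000) the
// BFE returns 1023 and the result is 0.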
2148 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2149  SelectionDAG &DAG) {
2150  const unsigned FractBits = 52;
2151  const unsigned ExpBits = 11;
2152 
2153  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2154  Hi,
2155  DAG.getConstant(FractBits - 32, SL, MVT::i32),
2156  DAG.getConstant(ExpBits, SL, MVT::i32));
2157  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2158  DAG.getConstant(1023, SL, MVT::i32));
2159 
2160  return Exp;
2161 }
2162 
2163 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2164  SDLoc SL(Op);
2165  SDValue Src = Op.getOperand(0);
2166 
2167  assert(Op.getValueType() == MVT::f64);
2168 
2169  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2170 
2171  // Extract the upper half, since this is where we will find the sign and
2172  // exponent.
2173  SDValue Hi = getHiHalf64(Src, DAG);
2174 
2175  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2176 
2177  const unsigned FractBits = 52;
2178 
2179  // Extract the sign bit.
2180  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2181  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2182 
2183  // Extend back to 64-bits.
2184  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2185  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2186 
2187  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2188  const SDValue FractMask
2189  = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2190 
2191  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2192  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2193  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2194 
2195  EVT SetCCVT =
2196  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2197 
2198  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2199 
2200  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2201  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2202 
2203  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2204  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2205 
2206  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2207 }
2208 
2209 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2210  SDLoc SL(Op);
2211  SDValue Src = Op.getOperand(0);
2212 
2213  assert(Op.getValueType() == MVT::f64);
2214 
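  // This lowers FRINT with the classic add/subtract trick: adding
  // copysign(2^52, src) pushes the fraction bits out of the f64 significand,
  // so the hardware rounds src to an integer, and subtracting the constant
  // back recovers it. Magnitudes above 0x1.fffffffffffffp+51 are already
  // integers, so the final select returns the original value for them.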
2215  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2216  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2217  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2218 
2219  // TODO: Should this propagate fast-math-flags?
2220 
2221  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2222  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2223 
2224  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2225 
2226  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2227  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2228 
2229  EVT SetCCVT =
2230  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2231  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2232 
2233  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2234 }
2235 
2236 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2237  // FNEARBYINT and FRINT are the same, except in their handling of FP
2238  // exceptions. Those aren't really meaningful for us, and OpenCL only has
2239  // rint, so just treat them as equivalent.
2240  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2241 }
2242 
2243 // XXX - May require not supporting f32 denormals?
2244 
2245 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2246 // compare and vselect end up producing worse code than scalarizing the whole
2247 // operation.
2248 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2249  SDLoc SL(Op);
2250  SDValue X = Op.getOperand(0);
2251  EVT VT = Op.getValueType();
2252 
2253  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2254 
2255  // TODO: Should this propagate fast-math-flags?
2256 
2257  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2258 
2259  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2260 
2261  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2262  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2263  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2264 
2265  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2266 
2267  EVT SetCCVT =
2268  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2269 
2270  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2271 
2272  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2273 
2274  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2275 }
2276 
2277 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2278  SDLoc SL(Op);
2279  SDValue Src = Op.getOperand(0);
2280 
2281  // result = trunc(src);
2282  // if (src < 0.0 && src != result)
2283  // result += -1.0.
2284 
2285  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2286 
2287  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2288  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2289 
2290  EVT SetCCVT =
2291  getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2292 
2293  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2294  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2295  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2296 
2297  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2298  // TODO: Should this propagate fast-math-flags?
2299  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2300 }
2301 
2302 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2303  double Log2BaseInverted) const {
2304  EVT VT = Op.getValueType();
2305 
2306  SDLoc SL(Op);
2307  SDValue Operand = Op.getOperand(0);
2308  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2309  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2310 
2311  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2312 }
2313 
2314 // exp2(M_LOG2E_F * f);
2315 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2316  EVT VT = Op.getValueType();
2317  SDLoc SL(Op);
2318  SDValue Src = Op.getOperand(0);
2319 
2320  const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2321  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2322  return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2323 }
2324 
2325 static bool isCtlzOpc(unsigned Opc) {
2326  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2327 }
2328 
2329 static bool isCttzOpc(unsigned Opc) {
2330  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2331 }
2332 
2333 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2334  SDLoc SL(Op);
2335  SDValue Src = Op.getOperand(0);
2336 
2337  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2338  bool Ctlz = isCtlzOpc(Op.getOpcode());
2339  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2340 
2341  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2342  Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2343 
2344  if (Src.getValueType() == MVT::i32) {
2345  // (ctlz hi:lo) -> (umin (ffbh src), 32)
2346  // (cttz hi:lo) -> (umin (ffbl src), 32)
2347  // (ctlz_zero_undef src) -> (ffbh src)
2348  // (cttz_zero_undef src) -> (ffbl src)
2349  SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2350  if (!ZeroUndef) {
2351  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2352  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2353  }
2354  return NewOpr;
2355  }
2356 
2357  SDValue Lo, Hi;
2358  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2359 
2360  SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2361  SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2362 
2363  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2364  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2365  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2366  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
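  // Worked example: ctlz of src = 0x0000000100000000 has hi = 1, lo = 0.
  // ffbh(hi) = 31, while ffbh(lo) returns -1 (all ones) for the zero input
  // and the saturating add keeps it at 0xffffffff, so
  // umin(umin(0xffffffff, 31), 64) = 31, the expected leading-zero count.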
2367 
2368  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2369  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2370  if (Ctlz)
2371  OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2372  else
2373  OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2374 
2375  SDValue NewOpr;
2376  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2377  if (!ZeroUndef) {
2378  const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2379  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2380  }
2381 
2382  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2383 }
2384 
2385 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2386  bool Signed) const {
2387  // The regular method converting a 64-bit integer to float roughly consists of
2388  // 2 steps: normalization and rounding. In fact, after normalization, the
2389  // conversion from a 64-bit integer to a float is essentially the same as the
2390  // one from a 32-bit integer. The only difference is that it has more
2391  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2392  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2393  // converted into the correct float number. The basic steps for the unsigned
2394  // conversion are illustrated in the following pseudo code:
2395  //
2396  // f32 uitofp(i64 u) {
2397  // i32 hi, lo = split(u);
2398  // // Only count the leading zeros in hi as we have native support of the
2399  // // conversion from i32 to f32. If hi is all 0s, the conversion is
2400  // // reduced to a 32-bit one automatically.
2401  // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2402  // u <<= shamt;
2403  // hi, lo = split(u);
2404  // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2405  // // convert it as a 32-bit integer and scale the result back.
2406  // return uitofp(hi) * 2^(32 - shamt);
2407  // }
2408  //
2409  // The signed one follows the same principle but uses 'ffbh_i32' to count its
2410  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2411  // converted instead, followed by negation based on its sign bit.
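  // Worked example with u = 2^40 + 1: hi = 0x100, so shamt = clz(hi) = 23
  // and u << shamt = 2^63 + 2^23. Re-splitting gives hi = 0x80000000 and
  // lo = 0x00800000 != 0, so hi |= 1 yields 0x80000001, which converts to
  // 2^31 in f32; scaling by 2^(32 - 23) = 2^9 produces 2^40, the correctly
  // rounded f32 result for 2^40 + 1.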
2412 
2413  SDLoc SL(Op);
2414  SDValue Src = Op.getOperand(0);
2415 
2416  SDValue Lo, Hi;
2417  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2418  SDValue Sign;
2419  SDValue ShAmt;
2420  if (Signed && Subtarget->isGCN()) {
2421  // We also need to consider the sign bit in Lo if Hi has just sign bits,
2422  // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2423  // account. That is, the maximal shift is
2424  // - 32 if Lo and Hi have opposite signs;
2425  // - 33 if Lo and Hi have the same sign.
2426  //
2427  // Or, MaxShAmt = 33 + OppositeSign, where
2428  //
2429  // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2430  // - -1 if Lo and Hi have opposite signs; and
2431  // - 0 otherwise.
2432  //
2433  // All in all, ShAmt is calculated as
2434  //
2435  // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2436  //
2437  // or
2438  //
2439  // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2440  //
2441  // to reduce the critical path.
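  // E.g. Src = -2: Lo = 0xfffffffe and Hi = 0xffffffff, so OppositeSign = 0
  // and MaxShAmt = 32. ffbh_i32 finds no bit differing from the sign bit in
  // Hi and returns -1, so the umin clamps ShAmt to 32: the whole value lands
  // in the high word and is converted as the 32-bit integer -2.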
2442  SDValue OppositeSign = DAG.getNode(
2443  ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2444  DAG.getConstant(31, SL, MVT::i32));
2445  SDValue MaxShAmt =
2446  DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2447  OppositeSign);
2448  // Count the leading sign bits.
2449  ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2450  // Different from unsigned conversion, the shift should be one bit less to
2451  // preserve the sign bit.
2452  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2453  DAG.getConstant(1, SL, MVT::i32));
2454  ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2455  } else {
2456  if (Signed) {
2457  // Without 'ffbh_i32', only leading zeros could be counted. Take the
2458  // absolute value first.
2459  Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2460  DAG.getConstant(63, SL, MVT::i64));
2461  SDValue Abs =
2462  DAG.getNode(ISD::XOR, SL, MVT::i64,
2463  DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2464  std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2465  }
2466  // Count the leading zeros.
2467  ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2468  // The shift amount for signed integers is [0, 32].
2469  }
2470  // Normalize the given 64-bit integer.
2471  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2472  // Split it again.
2473  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2474  // Calculate the adjust bit for rounding.
2475  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2476  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2477  DAG.getConstant(1, SL, MVT::i32), Lo);
2478  // Get the 32-bit normalized integer.
2479  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2480  // Convert the normalized 32-bit integer into f32.
2481  unsigned Opc =
2482  (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2483  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2484 
2485  // Finally, need to scale back the converted floating number as the original
2486  // 64-bit integer is converted as a 32-bit one.
2487  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2488  ShAmt);
2489  // On GCN, use LDEXP directly.
2490  if (Subtarget->isGCN())
2491  return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2492 
2493  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2494  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2495  // exponent is enough to avoid overflowing into the sign bit.
2496  SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2497  DAG.getConstant(23, SL, MVT::i32));
2498  SDValue IVal =
2499  DAG.getNode(ISD::ADD, SL, MVT::i32,
2500  DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2501  if (Signed) {
2502  // Set the sign bit.
2503  Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2504  DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2505  DAG.getConstant(31, SL, MVT::i32));
2506  IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2507  }
2508  return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2509 }
2510 
2511 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2512  bool Signed) const {
2513  SDLoc SL(Op);
2514  SDValue Src = Op.getOperand(0);
2515 
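  // Each 32-bit half converts to f64 exactly (f64 has a 53-bit significand):
  // the high half is converted and scaled by 2^32 with ldexp, so the single
  // fadd with the converted low half is the only rounding step.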
2516  SDValue Lo, Hi;
2517  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2518 
2519  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2520  SL, MVT::f64, Hi);
2521 
2522  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2523 
2524  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2525  DAG.getConstant(32, SL, MVT::i32));
2526  // TODO: Should this propagate fast-math-flags?
2527  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2528 }
2529 
2530 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2531  SelectionDAG &DAG) const {
2532  // TODO: Factor out code common with LowerSINT_TO_FP.
2533  EVT DestVT = Op.getValueType();
2534  SDValue Src = Op.getOperand(0);
2535  EVT SrcVT = Src.getValueType();
2536 
2537  if (SrcVT == MVT::i16) {
2538  if (DestVT == MVT::f16)
2539  return Op;
2540  SDLoc DL(Op);
2541 
2542  // Promote src to i32
2543  SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2544  return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2545  }
2546 
2547  assert(SrcVT == MVT::i64 && "operation should be legal");
2548 
2549  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2550  SDLoc DL(Op);
2551 
2552  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2553  SDValue FPRoundFlag =
2554  DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2555  SDValue FPRound =
2556  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2557 
2558  return FPRound;
2559  }
2560 
2561  if (DestVT == MVT::f32)
2562  return LowerINT_TO_FP32(Op, DAG, false);
2563 
2564  assert(DestVT == MVT::f64);
2565  return LowerINT_TO_FP64(Op, DAG, false);
2566 }
2567 
2568 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2569  SelectionDAG &DAG) const {
2570  EVT DestVT = Op.getValueType();
2571 
2572  SDValue Src = Op.getOperand(0);
2573  EVT SrcVT = Src.getValueType();
2574 
2575  if (SrcVT == MVT::i16) {
2576  if (DestVT == MVT::f16)
2577  return Op;
2578 
2579  SDLoc DL(Op);
2580  // Promote src to i32
2581  SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2582  return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2583  }
2584 
2585  assert(SrcVT == MVT::i64 && "operation should be legal");
2586 
2587  // TODO: Factor out code common with LowerUINT_TO_FP.
2588 
2589  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2590  SDLoc DL(Op);
2591  SDValue Src = Op.getOperand(0);
2592 
2593  SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2594  SDValue FPRoundFlag =
2595  DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2596  SDValue FPRound =
2597  DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2598 
2599  return FPRound;
2600  }
2601 
2602  if (DestVT == MVT::f32)
2603  return LowerINT_TO_FP32(Op, DAG, true);
2604 
2605  assert(DestVT == MVT::f64);
2606  return LowerINT_TO_FP64(Op, DAG, true);
2607 }
2608 
2609 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2610  bool Signed) const {
2611  SDLoc SL(Op);
2612 
2613  SDValue Src = Op.getOperand(0);
2614  EVT SrcVT = Src.getValueType();
2615 
2616  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2617 
2618  // The basic idea of converting a floating point number into a pair of 32-bit
2619  // integers is illustrated as follows:
2620  //
2621  // tf := trunc(val);
2622  // hif := floor(tf * 2^-32);
2623  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2624  // hi := fptoi(hif);
2625  // lo := fptoi(lof);
2626  //
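  // Worked example with val = 2^33 + 7 (exactly representable in f64):
  // tf = val, hif = floor(tf * 2^-32) = 2.0 and lof = tf - 2.0 * 2^32 = 7.0,
  // so hi = 2 and lo = 7 reassemble to 2 * 2^32 + 7 = val.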
2627  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2628  SDValue Sign;
2629  if (Signed && SrcVT == MVT::f32) {
2630  // However, a 32-bit floating point number has only 23 bits mantissa and
2631  // it's not enough to hold all the significant bits of `lof` if val is
2632  // negative. To avoid the loss of precision, we need to take the absolute
2633  // value after truncating and flip the result back based on the original
2634  // signedness.
2635  Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2636  DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2637  DAG.getConstant(31, SL, MVT::i32));
2638  Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2639  }
2640 
2641  SDValue K0, K1;
2642  if (SrcVT == MVT::f64) {
2643  K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2644  SL, SrcVT);
2645  K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2646  SL, SrcVT);
2647  } else {
2648  K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2649  SrcVT);
2650  K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2651  SrcVT);
2652  }
2653  // TODO: Should this propagate fast-math-flags?
2654  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2655 
2656  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2657 
2658  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2659 
2660  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2661  : ISD::FP_TO_UINT,
2662  SL, MVT::i32, FloorMul);
2663  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2664 
2665  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2666  DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2667 
2668  if (Signed && SrcVT == MVT::f32) {
2669  assert(Sign);
2670  // Flip the result based on the signedness, which is either all 0s or 1s.
2671  Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2672  DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2673  // r := xor(r, sign) - sign;
2674  Result =
2675  DAG.getNode(ISD::SUB, SL, MVT::i64,
2676  DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2677  }
2678 
2679  return Result;
2680 }
2681 
2682 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2683  SDLoc DL(Op);
2684  SDValue N0 = Op.getOperand(0);
2685 
2686  // Convert to target node to get known bits
2687  if (N0.getValueType() == MVT::f32)
2688  return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2689 
2690  if (getTargetMachine().Options.UnsafeFPMath) {
2691  // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2692  return SDValue();
2693  }
2694 
2695  assert(N0.getSimpleValueType() == MVT::f64);
2696 
2697  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2698  const unsigned ExpMask = 0x7ff;
2699  const unsigned ExpBiasf64 = 1023;
2700  const unsigned ExpBiasf16 = 15;
2701  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2702  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2703  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2704  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2705  DAG.getConstant(32, DL, MVT::i64));
2706  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2707  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2708  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2709  DAG.getConstant(20, DL, MVT::i64));
2710  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2711  DAG.getConstant(ExpMask, DL, MVT::i32));
2712  // Subtract the fp64 exponent bias (1023) to get the real exponent and
2713  // add the f16 bias (15) to get the biased exponent for the f16 format.
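  // E.g. for 1.0 the f64 exponent field holds 1023, so E becomes
  // 1023 - 1023 + 15 = 15, the biased f16 exponent of 1.0.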
2714  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2715  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2716 
2717  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2718  DAG.getConstant(8, DL, MVT::i32));
2719  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2720  DAG.getConstant(0xffe, DL, MVT::i32));
2721 
2722  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2723  DAG.getConstant(0x1ff, DL, MVT::i32));
2724  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2725 
2726  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2727  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2728 
2729  // (M != 0 ? 0x0200 : 0) | 0x7c00;
2730  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2731  DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2732  Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2733 
2734  // N = M | (E << 12);
2735  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2736  DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2737  DAG.getConstant(12, DL, MVT::i32)));
2738 
2739  // B = clamp(1-E, 0, 13);
2740  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2741  One, E);
2742  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2743  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2744  DAG.getConstant(13, DL, MVT::i32));
2745 
2746  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2747  DAG.getConstant(0x1000, DL, MVT::i32));
2748 
2749  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2750  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2751  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2752  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2753 
2754  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2755  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2756  DAG.getConstant(0x7, DL, MVT::i32));
2757  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2758  DAG.getConstant(2, DL, MVT::i32));
2759  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2760  One, Zero, ISD::SETEQ);
2761  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2762  One, Zero, ISD::SETGT);
2763  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2764  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2765 
2766  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2767  DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2768  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2769  I, V, ISD::SETEQ);
2770 
2771  // Extract the sign bit.
2772  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2773  DAG.getConstant(16, DL, MVT::i32));
2774  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2775  DAG.getConstant(0x8000, DL, MVT::i32));
2776 
2777  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2778  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2779 }
2780 
2781 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2782  SelectionDAG &DAG) const {
2783  SDValue Src = Op.getOperand(0);
2784  unsigned OpOpcode = Op.getOpcode();
2785  EVT SrcVT = Src.getValueType();
2786  EVT DestVT = Op.getValueType();
2787 
2788  // Will be selected natively
2789  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2790  return Op;
2791 
2792  // Promote i16 to i32
2793  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2794  SDLoc DL(Op);
2795 
2796  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2797  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2798  }
2799 
2800  if (SrcVT == MVT::f16 ||
2801  (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2802  SDLoc DL(Op);
2803 
2804  SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2805  unsigned Ext =
2806  OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2807  return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2808  }
2809 
2810  if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2811  return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2812 
2813  return SDValue();
2814 }
2815 
2816 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2817  SelectionDAG &DAG) const {
2818  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2819  MVT VT = Op.getSimpleValueType();
2820  MVT ScalarVT = VT.getScalarType();
2821 
2822  assert(VT.isVector());
2823 
2824  SDValue Src = Op.getOperand(0);
2825  SDLoc DL(Op);
2826 
2827  // TODO: Don't scalarize on Evergreen?
2828  unsigned NElts = VT.getVectorNumElements();
2829  SmallVector<SDValue, 8> Args;
2830  DAG.ExtractVectorElements(Src, Args, 0, NElts);
2831 
2832  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2833  for (unsigned I = 0; I < NElts; ++I)
2834  Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2835 
2836  return DAG.getBuildVector(VT, DL, Args);
2837 }
2838 
2839 //===----------------------------------------------------------------------===//
2840 // Custom DAG optimizations
2841 //===----------------------------------------------------------------------===//
2842 
2843 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2844  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2845 }
2846 
2847 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2848  EVT VT = Op.getValueType();
2849  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2850  // as unsigned 24-bit values.
2851  AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2852 }
2853 
2854 SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
2855  TargetLowering::DAGCombinerInfo &DCI) const {
2856  SelectionDAG &DAG = DCI.DAG;
2857  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2858  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2859 
2860  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2861  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2862  unsigned NewOpcode = Node24->getOpcode();
2863  if (IsIntrin) {
2864  unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2865  switch (IID) {
2866  case Intrinsic::amdgcn_mul_i24:
2867  NewOpcode = AMDGPUISD::MUL_I24;
2868  break;
2869  case Intrinsic::amdgcn_mul_u24:
2870  NewOpcode = AMDGPUISD::MUL_U24;
2871  break;
2872  case Intrinsic::amdgcn_mulhi_i24:
2873  NewOpcode = AMDGPUISD::MULHI_I24;
2874  break;
2875  case Intrinsic::amdgcn_mulhi_u24:
2876  NewOpcode = AMDGPUISD::MULHI_U24;
2877  break;
2878  default:
2879  llvm_unreachable("Expected 24-bit mul intrinsic");
2880  }
2881  }
2882 
2883  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2884 
2885  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2886  // the operands to have other uses, but will only perform simplifications that
2887  // involve bypassing some nodes for this user.
2888  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2889  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2890  if (DemandedLHS || DemandedRHS)
2891  return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2892  DemandedLHS ? DemandedLHS : LHS,
2893  DemandedRHS ? DemandedRHS : RHS);
2894 
2895  // Now try SimplifyDemandedBits which can simplify the nodes used by our
2896  // operands if this node is the only user.
2897  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2898  return SDValue(Node24, 0);
2899  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2900  return SDValue(Node24, 0);
2901 
2902  return SDValue();
2903 }
2904 
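// Constant-folds a bitfield extract of Width bits at Offset: the field is
// shifted to the top of the 32-bit lane and shifted back down, which
// sign-extends for a signed IntTy. E.g. Src0 = 0xf0, Offset = 4, Width = 4
// extracts 0xf, which folds to -1 when IntTy is int32_t and to 15 when it
// is uint32_t.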
2905 template <typename IntTy>
2906 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2907  uint32_t Width, const SDLoc &DL) {
2908  if (Width + Offset < 32) {
2909  uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2910  IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2911  return DAG.getConstant(Result, DL, MVT::i32);
2912  }
2913 
2914  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2915 }
2916 
2917 static bool hasVolatileUser(SDNode *Val) {
2918  for (SDNode *U : Val->uses()) {
2919  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2920  if (M->isVolatile())
2921  return true;
2922  }
2923  }
2924 
2925  return false;
2926 }
2927 
2928 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2929  // i32 vectors are the canonical memory type.
2930  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2931  return false;
2932 
2933  if (!VT.isByteSized())
2934  return false;
2935 
2936  unsigned Size = VT.getStoreSize();
2937 
2938  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2939  return false;
2940 
2941  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2942  return false;
2943 
2944  return true;
2945 }
2946 
2947 // Replace load of an illegal type with a load of a bitcast to a friendlier
2948 // type.
2949 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2950  DAGCombinerInfo &DCI) const {
2951  if (!DCI.isBeforeLegalize())
2952  return SDValue();
2953 
2954  LoadSDNode *LN = cast<LoadSDNode>(N);
2955  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2956  return SDValue();
2957 
2958  SDLoc SL(N);
2959  SelectionDAG &DAG = DCI.DAG;
2960  EVT VT = LN->getMemoryVT();
2961 
2962  unsigned Size = VT.getStoreSize();
2963  Align Alignment = LN->getAlign();
2964  if (Alignment < Size && isTypeLegal(VT)) {
2965  unsigned IsFast;
2966  unsigned AS = LN->getAddressSpace();
2967 
2968  // Expand unaligned loads earlier than legalization. Due to visitation order
2969  // problems during legalization, the emitted instructions to pack and unpack
2970  // the bytes again are not eliminated in the case of an unaligned copy.
2971  if (!allowsMisalignedMemoryAccesses(
2972  VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2973  if (VT.isVector())
2974  return SplitVectorLoad(SDValue(LN, 0), DAG);
2975 
2976  SDValue Ops[2];
2977  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2978 
2979  return DAG.getMergeValues(Ops, SDLoc(N));
2980  }
2981 
2982  if (!IsFast)
2983  return SDValue();
2984  }
2985 
2986  if (!shouldCombineMemoryType(VT))
2987  return SDValue();
2988 
2989  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2990 
2991  SDValue NewLoad
2992  = DAG.getLoad(NewVT, SL, LN->getChain(),
2993  LN->getBasePtr(), LN->getMemOperand());
2994 
2995  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2996  DCI.CombineTo(N, BC, NewLoad.getValue(1));
2997  return SDValue(N, 0);
2998 }
2999 
3000 // Replace store of an illegal type with a store of a bitcast to a friendlier
3001 // type.
3002 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3003  DAGCombinerInfo &DCI) const {
3004  if (!DCI.isBeforeLegalize())
3005  return SDValue();
3006 
3007  StoreSDNode *SN = cast<StoreSDNode>(N);
3008  if (!SN->isSimple() || !ISD::isNormalStore(SN))
3009  return SDValue();
3010 
3011  EVT VT = SN->getMemoryVT();
3012  unsigned Size = VT.getStoreSize();
3013 
3014  SDLoc SL(N);
3015  SelectionDAG &DAG = DCI.DAG;
3016  Align Alignment = SN->getAlign();
3017  if (Alignment < Size && isTypeLegal(VT)) {
3018  unsigned IsFast;
3019  unsigned AS = SN->getAddressSpace();
3020 
3021  // Expand unaligned stores earlier than legalization. Due to visitation
3022  // order problems during legalization, the emitted instructions to pack and
3023  // unpack the bytes again are not eliminated in the case of an unaligned
3024  // copy.
3025  if (!allowsMisalignedMemoryAccesses(
3026  VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3027  if (VT.isVector())
3028  return SplitVectorStore(SDValue(SN, 0), DAG);
3029 
3030  return expandUnalignedStore(SN, DAG);
3031  }
3032 
3033  if (!IsFast)
3034  return SDValue();
3035  }
3036 
3037  if (!shouldCombineMemoryType(VT))
3038  return SDValue();
3039 
3040  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3041  SDValue Val = SN->getValue();
3042 
3043  //DCI.AddToWorklist(Val.getNode());
3044 
3045  bool OtherUses = !Val.hasOneUse();
3046  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3047  if (OtherUses) {
3048  SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3049  DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3050  }
3051 
3052  return DAG.getStore(SN->getChain(), SL, CastVal,
3053  SN->getBasePtr(), SN->getMemOperand());
3054 }
3055 
3056 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3057 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3058 // issues.
3059 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3060  DAGCombinerInfo &DCI) const {
3061  SelectionDAG &DAG = DCI.DAG;
3062  SDValue N0 = N->getOperand(0);
3063 
3064  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3065  // (vt2 (truncate (assertzext vt0:x, vt1)))
3066  if (N0.getOpcode() == ISD::TRUNCATE) {
3067  SDValue N1 = N->getOperand(1);
3068  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3069  SDLoc SL(N);
3070 
3071  SDValue Src = N0.getOperand(0);
3072  EVT SrcVT = Src.getValueType();
3073  if (SrcVT.bitsGE(ExtVT)) {
3074  SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3075  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3076  }
3077  }
3078 
3079  return SDValue();
3080 }
3081 
3082 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3083  SDNode *N, DAGCombinerInfo &DCI) const {
3084  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3085  switch (IID) {
3086  case Intrinsic::amdgcn_mul_i24:
3087  case Intrinsic::amdgcn_mul_u24:
3088  case Intrinsic::amdgcn_mulhi_i24:
3089  case Intrinsic::amdgcn_mulhi_u24:
3090  return simplifyMul24(N, DCI);
3091  case Intrinsic::amdgcn_fract:
3092  case Intrinsic::amdgcn_rsq:
3093  case Intrinsic::amdgcn_rcp_legacy:
3094  case Intrinsic::amdgcn_rsq_legacy:
3095  case Intrinsic::amdgcn_rsq_clamp:
3096  case Intrinsic::amdgcn_ldexp: {
3097  // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3098  SDValue Src = N->getOperand(1);
3099  return Src.isUndef() ? Src : SDValue();
3100  }
3101  default:
3102  return SDValue();
3103  }
3104 }
3105 
3106 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3107 /// binary operation \p Opc to it with the corresponding constant operands.
3108 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3109  DAGCombinerInfo &DCI, const SDLoc &SL,
3110  unsigned Opc, SDValue LHS,
3111  uint32_t ValLo, uint32_t ValHi) const {
3112  SelectionDAG &DAG = DCI.DAG;
3113  SDValue Lo, Hi;
3114  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3115 
3116  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3117  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3118 
3119  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3120  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3121 
3122  // Re-visit the ands. It's possible we eliminated one of them and it could
3123  // simplify the vector.
3124  DCI.AddToWorklist(Lo.getNode());
3125  DCI.AddToWorklist(Hi.getNode());
3126 
3127  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3128  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3129 }
3130 
3131 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3132  DAGCombinerInfo &DCI) const {
3133  EVT VT = N->getValueType(0);
3134 
3135  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3136  if (!RHS)
3137  return SDValue();
3138 
3139  SDValue LHS = N->getOperand(0);
3140  unsigned RHSVal = RHS->getZExtValue();
3141  if (!RHSVal)
3142  return LHS;
3143 
3144  SDLoc SL(N);
3145  SelectionDAG &DAG = DCI.DAG;
3146 
3147  switch (LHS->getOpcode()) {
3148  default:
3149  break;
3150  case ISD::ZERO_EXTEND:
3151  case ISD::SIGN_EXTEND:
3152  case ISD::ANY_EXTEND: {
3153  SDValue X = LHS->getOperand(0);
3154 
3155  if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3156  isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3157  // Prefer build_vector as the canonical form if packed types are legal.
3158  // (shl ([asz]ext i16:x), 16) -> (build_vector 0, x)
3159  SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3160  { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3161  return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3162  }
3163 
3164  // shl (ext x) => zext (shl x), if shift does not overflow int
3165  if (VT != MVT::i64)
3166  break;
3167  KnownBits Known = DAG.computeKnownBits(X);
3168  unsigned LZ = Known.countMinLeadingZeros();
3169  if (LZ < RHSVal)
3170  break;
3171  EVT XVT = X.getValueType();
3172  SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3173  return DAG.getZExtOrTrunc(Shl, SL, VT);
3174  }
3175  }
3176 
3177  if (VT != MVT::i64)
3178  return SDValue();
3179 
3180  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3181 
3182  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3183  // common case, splitting this into a move and a 32-bit shift is faster and
3184  // the same code size.
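  // E.g. (shl i64:x, 40) becomes build_vector 0, (shl (trunc x), 8), bitcast
  // back to i64: the low word is known zero and only one 32-bit shift
  // remains.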
3185  if (RHSVal < 32)
3186  return SDValue();
3187 
3188  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3189 
3190  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3191  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3192 
3193  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3194 
3195  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3196  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3197 }
3198 
3199 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3200  DAGCombinerInfo &DCI) const {
3201  if (N->getValueType(0) != MVT::i64)
3202  return SDValue();
3203 
3204  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3205  if (!RHS)
3206  return SDValue();
3207 
3208  SelectionDAG &DAG = DCI.DAG;
3209  SDLoc SL(N);
3210  unsigned RHSVal = RHS->getZExtValue();
3211 
3212  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3213  if (RHSVal == 32) {
3214  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3215  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3216  DAG.getConstant(31, SL, MVT::i32));
3217 
3218  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3219  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3220  }
3221 
3222  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3223  if (RHSVal == 63) {
3224  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3225  SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3226  DAG.getConstant(31, SL, MVT::i32));
3227  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3228  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3229  }
3230 
3231  return SDValue();
3232 }
3233 
3234 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3235  DAGCombinerInfo &DCI) const {
3236  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3237  if (!RHS)
3238  return SDValue();
3239 
3240  EVT VT = N->getValueType(0);
3241  SDValue LHS = N->getOperand(0);
3242  unsigned ShiftAmt = RHS->getZExtValue();
3243  SelectionDAG &DAG = DCI.DAG;
3244  SDLoc SL(N);
3245 
3246  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3247  // this improves the ability to match BFE patterns in isel.
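  // E.g. (srl (and x, 0xff0), 4) -> (and (srl x, 4), 0xff), exposing an
  // 8-bit field at offset 4 in the form the BFE patterns expect.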
3248  if (LHS.getOpcode() == ISD::AND) {
3249  if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3250  unsigned MaskIdx, MaskLen;
3251  if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3252  MaskIdx == ShiftAmt) {
3253  return DAG.getNode(
3254  ISD::AND, SL, VT,
3255  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3256  DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3257  }
3258  }
3259  }
3260 
3261  if (VT != MVT::i64)
3262  return SDValue();
3263 
3264  if (ShiftAmt < 32)
3265  return SDValue();
3266 
3267  // srl i64:x, C for C >= 32
3268  // =>
3269  // build_pair (srl hi_32(x), C - 32), 0
3270  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3271 
3272  SDValue Hi = getHiHalf64(LHS, DAG);
3273 
3274  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3275  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3276 
3277  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3278 
3279  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3280 }
3281 
3282 SDValue AMDGPUTargetLowering::performTruncateCombine(
3283  SDNode *N, DAGCombinerInfo &DCI) const {
3284  SDLoc SL(N);
3285  SelectionDAG &DAG = DCI.DAG;
3286  EVT VT = N->getValueType(0);
3287  SDValue Src = N->getOperand(0);
3288 
3289  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3290  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3291  SDValue Vec = Src.getOperand(0);
3292  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3293  SDValue Elt0 = Vec.getOperand(0);
3294  EVT EltVT = Elt0.getValueType();
3295  if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3296  if (EltVT.isFloatingPoint()) {
3297  Elt0 = DAG.getNode(ISD::BITCAST, SL,
3298  EltVT.changeTypeToInteger(), Elt0);
3299  }
3300 
3301  return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3302  }
3303  }
3304  }
3305 
3306  // Equivalent of above for accessing the high element of a vector as an
3307  // integer operation.
3308  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3309  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3310  if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3311  if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3312  SDValue BV = stripBitcast(Src.getOperand(0));
3313  if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3314  BV.getValueType().getVectorNumElements() == 2) {
3315  SDValue SrcElt = BV.getOperand(1);
3316  EVT SrcEltVT = SrcElt.getValueType();
3317  if (SrcEltVT.isFloatingPoint()) {
3318  SrcElt = DAG.getNode(ISD::BITCAST, SL,
3319  SrcEltVT.changeTypeToInteger(), SrcElt);
3320  }
3321 
3322  return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3323  }
3324  }
3325  }
3326  }
3327 
3328  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3329  //
3330  // i16 (trunc (srl i64:x, K)), K <= 16 ->
3331  // i16 (trunc (srl (i32 (trunc x), K)))
3332  if (VT.getScalarSizeInBits() < 32) {
3333  EVT SrcVT = Src.getValueType();
3334  if (SrcVT.getScalarSizeInBits() > 32 &&
3335  (Src.getOpcode() == ISD::SRL ||
3336  Src.getOpcode() == ISD::SRA ||
3337  Src.getOpcode() == ISD::SHL)) {
3338  SDValue Amt = Src.getOperand(1);
3339  KnownBits Known = DAG.computeKnownBits(Amt);
3340 
3341  // - For left shifts, do the transform as long as the shift
3342  // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
3343  // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
3344  // losing information stored in the high bits when truncating.
3345  const unsigned MaxCstSize =
3346  (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
3347  if (Known.getMaxValue().ule(MaxCstSize)) {
3348  EVT MidVT = VT.isVector() ?
3349  EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3350  VT.getVectorNumElements()) : MVT::i32;
3351 
3352  EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3353  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3354  Src.getOperand(0));
3355  DCI.AddToWorklist(Trunc.getNode());
3356 
3357  if (Amt.getValueType() != NewShiftVT) {
3358  Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3359  DCI.AddToWorklist(Amt.getNode());
3360  }
3361 
3362  SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3363  Trunc, Amt);
3364  return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3365  }
3366  }
3367  }
3368 
3369  return SDValue();
3370 }
3371 
3372 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3373 // instructions. If we only match on the legalized i64 mul expansion,
3374 // SimplifyDemandedBits will be unable to remove them because there will be
3375 // multiple uses due to the separate mul + mulh[su].
3376 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3377  SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3378  if (Size <= 32) {
3379  unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3380  return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3381  }
3382 
3383  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3384  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3385 
3386  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3387  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3388 
3389  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3390 }
3391 
3392 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3393  DAGCombinerInfo &DCI) const {
3394  EVT VT = N->getValueType(0);
3395 
3396  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3397  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3398  // unnecessarily). isDivergent() is used as an approximation of whether the
3399  // value is in an SGPR.
3400  if (!N->isDivergent())
3401  return SDValue();
3402 
3403  unsigned Size = VT.getSizeInBits();
3404  if (VT.isVector() || Size > 64)
3405  return SDValue();
3406 
3407  // There are i16 integer mul/mad.
3408  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3409  return SDValue();
3410 
3411  SelectionDAG &DAG = DCI.DAG;
3412  SDLoc DL(N);
3413 
3414  SDValue N0 = N->getOperand(0);
3415  SDValue N1 = N->getOperand(1);
3416 
3417  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3418  // in the source into any_extends if the result of the mul is truncated. Since
3419  // we can assume the high bits are whatever we want, use the underlying value
3420  // to keep the unknown high bits from interfering.
3421  if (N0.getOpcode() == ISD::ANY_EXTEND)
3422  N0 = N0.getOperand(0);
3423 
3424  if (N1.getOpcode() == ISD::ANY_EXTEND)
3425  N1 = N1.getOperand(0);
3426 
3427  SDValue Mul;
3428 
3429  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3430  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3431  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3432  Mul = getMul24(DAG, DL, N0, N1, Size, false);
3433  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3434  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3435  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3436  Mul = getMul24(DAG, DL, N0, N1, Size, true);
3437  } else {
3438  return SDValue();
3439  }
3440 
3441  // We need to use sext even for MUL_U24, because MUL_U24 is used
3442  // for signed multiply of 8 and 16-bit types.
3443  return DAG.getSExtOrTrunc(Mul, DL, VT);
3444 }
3445 
3446 SDValue
3447 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3448  DAGCombinerInfo &DCI) const {
3449  if (N->getValueType(0) != MVT::i32)
3450  return SDValue();
3451 
3452  SelectionDAG &DAG = DCI.DAG;
3453  SDLoc DL(N);
3454 
3455  SDValue N0 = N->getOperand(0);
3456  SDValue N1 = N->getOperand(1);
3457 
3458  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3459  // in the source into any_extends if the result of the mul is truncated. Since
3460  // we can assume the high bits are whatever we want, use the underlying value
3461  // to keep the unknown high bits from interfering.
3462  if (N0.getOpcode() == ISD::ANY_EXTEND)
3463  N0 = N0.getOperand(0);
3464  if (N1.getOpcode() == ISD::ANY_EXTEND)
3465  N1 = N1.getOperand(0);
3466 
3467  // Try to use two fast 24-bit multiplies (one for each half of the result)
3468  // instead of one slow extending multiply.
3469  unsigned LoOpcode, HiOpcode;
3470  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3471  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3472  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3473  LoOpcode = AMDGPUISD::MUL_U24;
3474  HiOpcode = AMDGPUISD::MULHI_U24;
3475  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3476  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3477  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3478  LoOpcode = AMDGPUISD::MUL_I24;
3479  HiOpcode = AMDGPUISD::MULHI_I24;
3480  } else {
3481  return SDValue();
3482  }
3483 
3484  SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3485  SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3486  DCI.CombineTo(N, Lo, Hi);
3487  return SDValue(N, 0);
3488 }
3489 
3490 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3491  DAGCombinerInfo &DCI) const {
3492  EVT VT = N->getValueType(0);
3493 
3494  if (!Subtarget->hasMulI24() || VT.isVector())
3495  return SDValue();
3496 
3497  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3498  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3499  // unnecessarily). isDivergent() is used as an approximation of whether the
3500  // value is in an SGPR.
3501  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3502  // valu op anyway)
3503  if (Subtarget->hasSMulHi() && !N->isDivergent())
3504  return SDValue();
3505 
3506  SelectionDAG &DAG = DCI.DAG;
3507  SDLoc DL(N);
3508 
3509  SDValue N0 = N->getOperand(0);
3510  SDValue N1 = N->getOperand(1);
3511 
3512  if (!isI24(N0, DAG) || !isI24(N1, DAG))
3513  return SDValue();
3514 
3515  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3516  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3517 
3518  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3519  DCI.AddToWorklist(Mulhi.getNode());
3520  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3521 }
3522 
3523 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3524  DAGCombinerInfo &DCI) const {
3525  EVT VT = N->getValueType(0);
3526 
3527  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3528  return SDValue();
3529 
3530  // Don't generate 24-bit multiplies on values that are in SGPRs, since
3531  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3532  // unnecessarily). isDivergent() is used as an approximation of whether the
3533  // value is in an SGPR.
3534  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3535  // valu op anyway)
3536  if (Subtarget->hasSMulHi() && !N->isDivergent())
3537  return SDValue();
3538 
3539  SelectionDAG &DAG = DCI.DAG;
3540  SDLoc DL(N);
3541 
3542  SDValue N0 = N->getOperand(0);
3543  SDValue N1 = N->getOperand(1);
3544 
3545  if (!isU24(N0, DAG) || !isU24(N1, DAG))
3546  return SDValue();
3547 
3548  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3549  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3550 
3551  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3552  DCI.AddToWorklist(Mulhi.getNode());
3553  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3554 }
3555 
3556 static bool isNegativeOne(SDValue Val) {
3557  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3558  return C->isAllOnes();
3559  return false;
3560 }
3561 
3562 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3563  SDValue Op,
3564  const SDLoc &DL,
3565  unsigned Opc) const {
3566  EVT VT = Op.getValueType();
3567  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3568  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3569  LegalVT != MVT::i16))
3570  return SDValue();
3571 
3572  if (VT != MVT::i32)
3573  Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3574 
3575  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3576  if (VT != MVT::i32)
3577  FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3578 
3579  return FFBX;
3580 }
3581 
3582 // The native instructions return -1 on 0 input. Optimize out a select that
3583 // produces -1 on 0.
3584 //
3585 // TODO: If zero is not undef, we could also do this if the output is compared
3586 // against the bitwidth.
3587 //
3588 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3589 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3590  SDValue LHS, SDValue RHS,
3591  DAGCombinerInfo &DCI) const {
3592  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3593  if (!CmpRhs || !CmpRhs->isZero())
3594  return SDValue();
3595 
3596  SelectionDAG &DAG = DCI.DAG;
3597  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3598  SDValue CmpLHS = Cond.getOperand(0);
3599 
3600  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3601  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3602  if (CCOpcode == ISD::SETEQ &&
3603  (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3604  RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3605  unsigned Opc =
3606  isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3607  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3608  }
3609 
3610  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3611  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3612  if (CCOpcode == ISD::SETNE &&
3613  (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3614  LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3615  unsigned Opc =
3616  isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3617 
3618  return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3619  }
3620 
3621  return SDValue();
3622 }
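
A scalar model of why the matched select is redundant; ffbhU32 is an illustrative stand-in for the hardware semantics, and __builtin_clz assumes a GCC/Clang-style compiler. The native ffbh/ffbl instructions already return -1 for a zero input, which is exactly the value the select supplies:

#include <cassert>
#include <cstdint>

// Hardware-style ffbh_u32: count of leading zeros, -1 when the input is 0.
static int32_t ffbhU32(uint32_t X) {
  if (X == 0)
    return -1;
  int32_t N = 0;
  for (uint32_t Bit = 0x80000000u; !(X & Bit); Bit >>= 1)
    ++N;
  return N;
}

int main() {
  for (uint32_t X : {0u, 1u, 0x80000000u, 0x00f00000u}) {
    // select (setcc x, 0, eq), -1, (ctlz_zero_undef x)
    int32_t Sel = (X == 0) ? -1 : __builtin_clz(X); // clz reached only if X != 0
    assert(Sel == ffbhU32(X)); // ...equals a bare ffbh_u32 x
  }
  return 0;
}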
3623 
3624 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3625  unsigned Op,
3626  const SDLoc &SL,
3627  SDValue Cond,
3628  SDValue N1,
3629  SDValue N2) {
3630  SelectionDAG &DAG = DCI.DAG;
3631  EVT VT = N1.getValueType();
3632 
3633  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3634  N1.getOperand(0), N2.getOperand(0));
3635  DCI.AddToWorklist(NewSelect.getNode());
3636  return DAG.getNode(Op, SL, VT, NewSelect);
3637 }
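
A numeric sanity check of the distribution this helper performs (a minimal sketch on host floating point, not source): pulling fneg or fabs through a select does not change the result.

#include <cassert>
#include <cmath>

int main() {
  float X = -4.0f, Y = 9.0f;
  for (bool C : {false, true}) {
    // select c, (fabs x), (fabs y)  ==  fabs (select c, x, y)
    assert((C ? std::fabs(X) : std::fabs(Y)) == std::fabs(C ? X : Y));
    // select c, (fneg x), (fneg y)  ==  fneg (select c, x, y)
    assert((C ? -X : -Y) == -(C ? X : Y));
  }
  return 0;
}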
3638 
3639 // Pull a free FP operation out of a select so it may fold into uses.
3640 //
3641 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3642 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3643 //
3644 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3645 // select c, (fabs x), +k -> fabs (select c, x, k)
3646 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3647  SDValue N) {
3648  SelectionDAG &DAG = DCI.DAG;
3649  SDValue Cond = N.getOperand(0);
3650  SDValue LHS = N.getOperand(1);
3651  SDValue RHS = N.getOperand(2);
3652 
3653  EVT VT = N.getValueType();
3654  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3655  (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3656  return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3657  SDLoc(N), Cond, LHS, RHS);
3658  }
3659 
3660  bool Inv = false;
3661  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3662  std::swap(LHS, RHS);
3663  Inv = true;
3664  }
3665 
3666  // TODO: Support vector constants.
3667  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3668  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3669  SDLoc SL(N);
3670  // If one side is an fneg/fabs and the other is a constant, we can push the
3671  // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3672  SDValue NewLHS = LHS.getOperand(0);
3673  SDValue NewRHS = RHS;
3674 
3675  // Careful: if the neg can be folded up, don't try to pull it back down.
3676  bool ShouldFoldNeg = true;
3677 
3678  if (NewLHS.hasOneUse()) {
3679  unsigned Opc = NewLHS.getOpcode();
3680  if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3681  ShouldFoldNeg = false;
3682  if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3683  ShouldFoldNeg = false;
3684  }
3685 
3686  if (ShouldFoldNeg) {
3687  if (LHS.getOpcode() == ISD::FNEG)
3688  NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3689  else if (CRHS->isNegative())
3690  return SDValue();
3691 
3692  if (Inv)
3693  std::swap(NewLHS, NewRHS);
3694 
3695  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3696  Cond, NewLHS, NewRHS);
3697  DCI.AddToWorklist(NewSelect.getNode());
3698  return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3699  }
3700  }
3701 
3702  return SDValue();
3703 }
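
The constant-operand case handled above, checked numerically (a minimal sketch, not source): the fneg is pushed below the select by negating the constant arm.

#include <cassert>

int main() {
  // select c, (fneg x), k  ==  fneg (select c, x, (fneg k))
  float X = 2.5f, K = 3.0f;
  for (bool C : {false, true}) {
    float Before = C ? -X : K;
    float After = -(C ? X : -K);
    assert(Before == After);
  }
  return 0;
}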
3704 
3705 
3706 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3707  DAGCombinerInfo &DCI) const {
3708  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3709  return Folded;
3710 
3711  SDValue Cond = N->getOperand(0);
3712  if (Cond.getOpcode() != ISD::SETCC)
3713  return SDValue();
3714 
3715  EVT VT = N->getValueType(0);
3716  SDValue LHS = Cond.getOperand(0);
3717  SDValue RHS = Cond.getOperand(1);
3718  SDValue CC = Cond.getOperand(2);
3719 
3720  SDValue True = N->getOperand(1);
3721  SDValue False = N->getOperand(2);
3722 
3723  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3724  SelectionDAG &DAG = DCI.DAG;
3725  if (DAG.isConstantValueOfAnyType(True) &&
3726  !DAG.isConstantValueOfAnyType(False)) {
3727  // Swap cmp + select pair to move constant to false input.
3728  // This will allow using VOPC cndmasks more often.
3729  // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3730 
3731  SDLoc SL(N);
3732  ISD::CondCode NewCC =
3733  getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3734 
3735  SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3736  return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3737  }
3738 
3739  if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3740  SDValue MinMax
3741  = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3742  // Revisit this node so we can catch min3/max3/med3 patterns.
3743  //DCI.AddToWorklist(MinMax.getNode());
3744  return MinMax;
3745  }
3746  }
3747 
3748  // There's no reason not to do this if the condition has other uses.
3749  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3750 }
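
An illustration of the cmp/select swap above (a minimal sketch on scalar ints): inverting the compare while exchanging the select arms preserves the result but moves the constant to the false input, where v_cndmask-style selection wants it.

#include <cassert>

int main() {
  int X = 3, Y = 7, K = 42;
  int A = (X < Y) ? K : X;  // select (setcc x, y, lt), k, x
  int B = (X >= Y) ? X : K; // select (setcc x, y, ge), x, k  (ge inverts lt)
  assert(A == B);
  return 0;
}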
3751 
3752 static bool isInv2Pi(const APFloat &APF) {
3753  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3754  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3755  static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3756 
3757  return APF.bitwiseIsEqual(KF16) ||
3758  APF.bitwiseIsEqual(KF32) ||
3759  APF.bitwiseIsEqual(KF64);
3760 }
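
A quick check (a standalone sketch, not source) that the KF32 pattern above is the single-precision encoding of 1/(2*pi); the C++17 hexadecimal float literal below spells out that exact value.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  float Inv2Pi = 0x1.45f306p-3f; // nearest float to 1/(2*pi), ~0.15915494
  uint32_t Bits;
  std::memcpy(&Bits, &Inv2Pi, sizeof(Bits));
  assert(Bits == 0x3e22f983u); // matches the APInt used to build KF32 above
  return 0;
}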
3761 
3762 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
3763 // additional cost to negate them.
3764 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3765  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3766  if (C->isZero() && !C->isNegative())
3767  return true;
3768 
3769  if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3770  return true;
3771  }
3772 
3773  return false;
3774 }
3775 
3776 static unsigned inverseMinMax(unsigned Opc) {
3777  switch (Opc) {
3778  case ISD::FMAXNUM:
3779  return ISD::FMINNUM;
3780  case ISD::FMINNUM:
3781  return ISD::FMAXNUM;
3782  case ISD::FMAXNUM_IEEE:
3783  return ISD::FMINNUM_IEEE;
3784  case ISD::FMINNUM_IEEE:
3785  return ISD::FMAXNUM_IEEE;
3786  case AMDGPUISD::FMAX_LEGACY:
3787  return AMDGPUISD::FMIN_LEGACY;
3788  case AMDGPUISD::FMIN_LEGACY:
3789  return AMDGPUISD::FMAX_LEGACY;
3790  default:
3791  llvm_unreachable("invalid min/max opcode");
3792  }
3793 }
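
Why an inverse opcode is useful (a minimal sketch; std::fmin/std::fmax model only the non-NaN behavior of FMINNUM/FMAXNUM): negation exchanges min and max, which is what lets performFNegCombine below push an fneg through these nodes.

#include <cassert>
#include <cmath>

int main() {
  float X = 1.5f, Y = -2.25f;
  // fneg (fminnum x, y) == fmaxnum (fneg x), (fneg y)
  assert(-std::fmin(X, Y) == std::fmax(-X, -Y));
  // fneg (fmaxnum x, y) == fminnum (fneg x), (fneg y)
  assert(-std::fmax(X, Y) == std::fmin(-X, -Y));
  return 0;
}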
3794 
3795 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3796  DAGCombinerInfo &DCI) const {
3797  SelectionDAG &DAG = DCI.DAG;
3798  SDValue N0 = N->getOperand(0);
3799  EVT VT = N->getValueType(0);
3800 
3801  unsigned Opc = N0.getOpcode();
3802 
3803  // If the input has multiple uses and we can either fold the negate down, or
3804  // the other uses cannot, give up. This both prevents unprofitable
3805  // transformations and infinite loops: we won't repeatedly try to fold around
3806  // a negate that has no 'good' form.
3807  if (N0.hasOneUse()) {
3808  // This may be able to fold into the source, but at a code size cost. Don't
3809  // fold if the fold into the user is free.
3810  if (allUsesHaveSourceMods(N, 0))
3811  return SDValue();
3812  } else {
3813  if (fnegFoldsIntoOp(Opc) &&
3814  (allUsesHaveSourceMods(N) || !hasSourceMods(N0.getNode())))
3815  return SDValue();
3816  }
3817 
3818  SDLoc SL(N);
3819  switch (Opc) {
3820  case ISD::FADD: {
3821  if (!mayIgnoreSignedZero(N0))
3822  return SDValue();
3823 
3824  // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3825  SDValue LHS = N0.getOperand(0);
3826  SDValue RHS = N0.getOperand(1);
3827 
3828  if (LHS.getOpcode() != ISD::FNEG)
3829  LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3830  else
3831  LHS = LHS.getOperand(0);
3832 
3833  if (RHS.getOpcode() != ISD::FNEG)
3834  RHS = DAG.getNode(ISD::FNEG, SL, VT,