//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
  "amdgpu-bypass-slow-div",
  cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
  cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
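// A worked example of the mapping above (illustrative only, not from the
// original source): v4i16 stores as 64 bits, which is over 32 and a multiple
// of 32, so the equivalent memory type is a two-element i32 vector; a 16-bit
// scalar stays an integer type of the same width.
//
//   getEquivalentMemType(Ctx, MVT::v4i16); // -> v2i32 (64 / 32 == 2 lanes)
//   getEquivalentMemType(Ctx, MVT::i16);   // -> i16   (StoreSize <= 32)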

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}
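// Illustrative example (not from the original source): for Op = (sext i8 X)
// used as an i32, ComputeMaxSignificantBits(Op) is at most 8, so a caller
// checking numBitsSigned(Op, DAG) <= 24 can safely select a 24-bit multiply
// for it.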

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
  // ... (the matching LOAD promotions for the remaining FP scalar and vector
  // types follow the same pattern and are elided here)

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::fp_valuetypes())
    for (auto MemVT : {/* ... FP memory types elided ... */})
      setLoadExtAction(ISD::EXTLOAD, VT, MemVT,
                       Expand);

  // ... (a long block of STORE promotions and truncating-store / extending-load
  // actions for the scalar and vector types, elided)

  // This is totally unsupported, just custom lower to produce an error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({/* ... FP library ops elided ... */},
                     MVT::f32, Legal);

  // ... (five more one-line operation-action settings elided)
  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(ISD::IS_FPCLASS,
                     {/* ... the vector FP types, elided ... */},
                     Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  // ... (two multi-type setOperationAction(..., Custom) calls and a few more
  // lines elided; CONCAT_VECTORS and EXTRACT_SUBVECTOR are custom-lowered
  // below)

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    // ... (one more operation-action setting elided)

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  // ... (several more operation-action settings elided)

  static const MVT::SimpleValueType VectorIntTypes[] = {
      /* ... the 32-bit integer vector types, elided ... */};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({/* ... many integer ops elided ... */ ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      /* ... the 32-bit FP vector types, elided ... */};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction({/* ... many FP ops elided ... */},
                       VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  // ... (the matching SELECT promotions for the wider FP vector types elided)

  // There are no libcalls of any kind.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  // ... (a few more lowering-policy settings elided)

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({/* ... the target DAG-combine opcodes, elided ... */});
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOp(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  // ... (one ISD case elided)
  case AMDGPUISD::RCP:
  // ... (several AMDGPUISD cases elided)
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  default:
    return false;
  }
}

/// Returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READNONE
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
/// type for ISD::SELECT.
LLVM_READNONE
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READNONE
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
    return false;
  // ... (one case elided)

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, each of which would necessitate using VOP3, there will be
  // a code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}
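// Illustrative example (not from the original source): an i40 extended return
// value becomes EVT::getIntegerVT(Context, 32 * ((40 + 31) / 32)), i.e. i64;
// anything 32 bits or narrower is widened to a full i32.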

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPU::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    switch (IntrID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
    return false;
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}
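// Illustrative examples (not from the original source): truncating i64 to i32
// just reads the low 32-bit subregister of the register pair, so it is free;
// i32 to i16 is also free when 16-bit instructions exist; i32 to i8 is not,
// since 8 is neither 16-with-16-bit-insts nor a multiple of 32.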

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}
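// Illustrative example (not from the original source): zext i32 -> i64 only
// needs a "mov 0" into the high half of the register pair alongside the
// existing 32-bit value, so combines that narrow 64-bit arithmetic to 32 bits
// and re-extend the result are modeled as free.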

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode* N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5 ||
               (MemVT.getVectorNumElements() >= 9 &&
                MemVT.getVectorNumElements() <= 12));
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}
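// Worked example, restating the doc comment above (illustrative only): a
// kernel argument of type v8i8 arrives as NumRegs == 8 Ins entries, each with
// ArgVT == v8i8; the branch on ArgVT.getVectorNumElements() == NumRegs then
// deduces MemVT == i8 per part, and addLoc() records i8 as the LocVT with the
// register type as the ValVT.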

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
1166
1167//===---------------------------------------------------------------------===//
1168// Target specific lowering
1169//===---------------------------------------------------------------------===//
1170
1171/// Selects the correct CCAssignFn for a given CallingConvention value.
1173 bool IsVarArg) {
1174 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1175}
1176
1178 bool IsVarArg) {
1180}
1181
1183 SelectionDAG &DAG,
1184 MachineFrameInfo &MFI,
1185 int ClobberedFI) const {
1186 SmallVector<SDValue, 8> ArgChains;
1187 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1188 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1189
1190 // Include the original chain at the beginning of the list. When this is
1191 // used by target LowerCall hooks, this helps legalize find the
1192 // CALLSEQ_BEGIN node.
1193 ArgChains.push_back(Chain);
1194
1195 // Add a chain value for each stack argument corresponding
1196 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1197 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1198 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1199 if (FI->getIndex() < 0) {
1200 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1201 int64_t InLastByte = InFirstByte;
1202 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1203
1204 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1205 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1206 ArgChains.push_back(SDValue(L, 1));
1207 }
1208 }
1209 }
1210 }
1211
1212 // Build a tokenfactor for all the chains.
1213 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1214}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG:
    return LowerFLOG(Op, DAG, numbers::ln2f);
  case ISD::FLOG10:
    return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
  case ISD::FEXP:
    return lowerFEXP(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  default:
    return;
  }
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(*DAG.getContext(),
                                                         MVT::i32, NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(NewIn, Args);
        else
          Args.push_back(NewIn);
      }

      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    }
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  // For these types, we have some TableGen patterns except if the index is 1
  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
       (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
      Start != 1)
    return Op;

  if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
       (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
      (Start == 0 || Start == 4))
    return Op;

  if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
       (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
      (Start == 0 || Start == 8))
    return Op;

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

// TODO: Handle fabs too
static SDValue peekFNeg(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    return Val.getOperand(0);

  return Val;
}

SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
  SDValue NegTrue = peekFNeg(True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(CRHS->getValueAPF());
    if (NegRHS == CFalse->getValueAPF()) {
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(ISD::FNEG, DL, VT, Combined);
      return SDValue();
    }
  }

  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(EVT VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::pair(LoVT, HiVT);
}
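// Illustrative examples (not from the original source):
//   v3f32 -> LoNumElts = PowerOf2Ceil((3 + 1) / 2) = 2 -> {v2f32, f32}
//   v8i32 -> LoNumElts = PowerOf2Ceil((8 + 1) / 2) = 4 -> {v4i32, v4i32}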

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
             (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}
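// Illustrative example (not from the original source): an 8-byte-aligned
// v3i32 load is emitted as a single v4i32 load followed by an
// EXTRACT_SUBVECTOR back to v3i32; an under-aligned, non-dereferenceable
// v3i32 load is instead split into v2i32 and i32 parts by SplitVectorLoad.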

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}

// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib, (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  MachineFunction &MF = DAG.getMachineFunction();

  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz = MFI->getMode().allFP32Denormals();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
                                 : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}
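// A minimal standalone sketch of the same trick in plain C++ (illustrative
// only, not part of the backend; assumes arithmetic right shift and signed
// inputs that fit in 24 bits):
//
//   int div24(int a, int b) {
//     int jq = ((a ^ b) >> 30) | 1;              // sign of the quotient: +1/-1
//     float fb = (float)b;
//     float fq = truncf((float)a * (1.0f / fb)); // truncated f32 quotient
//     float fr = fabsf(fmaf(-fq, fb, (float)a)); // |a - fq*b|
//     return (int)fq + (fr >= fabsf(fb) ? jq : 0);
//   }
//
// The remainder is then recomputed as a - div24(a, b) * b, exactly as the
// DAG sequence above does.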

void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  //HiLo split
  SDValue LHS = Op.getOperand(0);
  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);

  SDValue RHS = Op.getOperand(1);
  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);

  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    // The algorithm here is based on ideas from "Software Integer Division",
    // Tom Rodeheffer, August 2008.
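    //
    // In outline (summary added for clarity, not in the original source):
    // compute a 2^64-scaled reciprocal estimate z of RHS from the f32 rcp,
    // then apply two Newton-Raphson rounds z' = z + mulhu(z, -RHS * z), each
    // of which roughly doubles the number of correct bits, before forming the
    // quotient estimate and the correction steps below.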

    MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    // Compute denominator reciprocal.
    unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
                    (unsigned)ISD::FMA :
                    !MFI->getMode().allFP32Denormals() ?
                    (unsigned)ISD::FMAD :
                    (unsigned)AMDGPUISD::FMAD_FTZ;

    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
      DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
      Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
      DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
      DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
      DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
      Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64 = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    // First round of UNR (Unsigned integer Newton-Raphson).
    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    Zero);
    SDValue Mulhi1_Hi =
        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
    SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    SDValue Add1 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    // Second round of UNR.
    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    Zero);
    SDValue Mulhi2_Hi =
        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
    SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
                                  Mulhi2_Hi, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));

    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
                       DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
  // Get Speculative values
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}

SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;
  }

  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.
2104
2105 // Initial estimate of inv(y).
2106 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2107
2108 // One round of UNR (unsigned Newton-Raphson refinement).
2109 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2110 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2111 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2112 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2113
2114 // Quotient/remainder estimate.
2115 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2116 SDValue R =
2117 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2118
2119 // First quotient/remainder refinement.
2120 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2121 SDValue One = DAG.getConstant(1, DL, VT);
2122 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2123 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2124 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2125 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2126 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2127
2128 // Second quotient/remainder refinement.
2129 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2130 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2131 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2132 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2133 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2134
2135 return DAG.getMergeValues({Q, R}, DL);
2136}
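// For reference (illustrative, not part of the lowering): URECIP produces an
// estimate z of 2^32 / y, and the single UNR step refines it to
//   z' = z + mulhu(z, -y * z).
// The quotient estimate q = mulhu(x, z') may still be slightly low, which is
// why two rounds of conditional "if (r >= y) { q += 1; r -= y; }" follow.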
2137
2138SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2139 SelectionDAG &DAG) const {
2140 SDLoc DL(Op);
2141 EVT VT = Op.getValueType();
2142
2143 SDValue LHS = Op.getOperand(0);
2144 SDValue RHS = Op.getOperand(1);
2145
2146 SDValue Zero = DAG.getConstant(0, DL, VT);
2147 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2148
2149 if (VT == MVT::i32) {
2150 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2151 return Res;
2152 }
2153
2154 if (VT == MVT::i64 &&
2155 DAG.ComputeNumSignBits(LHS) > 32 &&
2156 DAG.ComputeNumSignBits(RHS) > 32) {
2157 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2158
2159 // Hi/Lo split
2160 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2161 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2162 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2163 LHS_Lo, RHS_Lo);
2164 SDValue Res[2] = {
2165 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2166 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2167 };
2168 return DAG.getMergeValues(Res, DL);
2169 }
2170
2171 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2172 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2173 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2174 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2175
2176 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2177 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2178
2179 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2180 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2181
2182 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2183 SDValue Rem = Div.getValue(1);
2184
2185 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2186 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2187
2188 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2189 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2190
2191 SDValue Res[2] = {
2192 Div,
2193 Rem
2194 };
2195 return DAG.getMergeValues(Res, DL);
2196}
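// For reference (illustrative only): with sign = -1 for negative inputs and
// 0 otherwise, abs(x) = (x + sign) ^ sign, and (y ^ sign) - sign reapplies a
// sign. E.g. for -7 / 2: udivrem(7, 2) = {3, 1}, DSign = RSign = -1, so
// Div = (3 ^ -1) - (-1) = -3 and Rem = (1 ^ -1) - (-1) = -1, matching C's
// truncating division.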
2197
2198// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2199SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2200 SDLoc SL(Op);
2201 EVT VT = Op.getValueType();
2202 auto Flags = Op->getFlags();
2203 SDValue X = Op.getOperand(0);
2204 SDValue Y = Op.getOperand(1);
2205
2206 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2207 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2208 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2209 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2210 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2211}
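// Worked example (illustrative only) of the expansion above: frem(5.5, 2.0)
//   fdiv(5.5, 2.0)      = 2.75
//   ftrunc(2.75)        = 2.0
//   fma(-2.0, 2.0, 5.5) = 1.5,
// the same remainder fmod(5.5, 2.0) would produce.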
2212
2213SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2214 SDLoc SL(Op);
2215 SDValue Src = Op.getOperand(0);
2216
2217 // result = trunc(src)
2218 // if (src > 0.0 && src != result)
2219 // result += 1.0
2220
2221 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2222
2223 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2224 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2225
2226 EVT SetCCVT =
2227 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2228
2229 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2230 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2231 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2232
2233 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2234 // TODO: Should this propagate fast-math-flags?
2235 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2236}
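// Worked example (illustrative only): ceil(2.3) -> trunc = 2.0, src > 0 and
// src != trunc, so 1.0 is added, giving 3.0. For ceil(-2.3), trunc = -2.0 is
// already the ceiling, the condition fails, and nothing is added.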
2237
2238static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2239 SelectionDAG &DAG) {
2240 const unsigned FractBits = 52;
2241 const unsigned ExpBits = 11;
2242
2243 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2244 Hi,
2245 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2246 DAG.getConstant(ExpBits, SL, MVT::i32));
2247 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2248 DAG.getConstant(1023, SL, MVT::i32));
2249
2250 return Exp;
2251}
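// For reference: an IEEE-754 double keeps a biased 11-bit exponent in bits
// [62:52], i.e. at offset 52 - 32 = 20 of the high word. E.g. for 8.0 = 2^3
// the field holds 1026, and 1026 - 1023 = 3.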
2252
2253SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2254 SDLoc SL(Op);
2255 SDValue Src = Op.getOperand(0);
2256
2257 assert(Op.getValueType() == MVT::f64);
2258
2259 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2260
2261 // Extract the upper half, since this is where we will find the sign and
2262 // exponent.
2263 SDValue Hi = getHiHalf64(Src, DAG);
2264
2265 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2266
2267 const unsigned FractBits = 52;
2268
2269 // Extract the sign bit.
2270 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2271 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2272
2273 // Extend back to 64-bits.
2274 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2275 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2276
2277 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2278 const SDValue FractMask
2279 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2280
2281 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2282 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2283 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2284
2285 EVT SetCCVT =
2286 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2287
2288 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2289
2290 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2291 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2292
2293 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2294 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2295
2296 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2297}
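// Sketch of the idea (illustrative only): for an unbiased exponent
// 0 <= e <= 51, the fractional part occupies the low 52 - e mantissa bits,
// and FractMask >> e has exactly those bits set. Masking them off truncates
// toward zero; e < 0 yields a signed zero, and e > 51 means the value is
// already an integer, so the input passes through unchanged.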
2298
2299SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2300 SDLoc SL(Op);
2301 SDValue Src = Op.getOperand(0);
2302
2303 assert(Op.getValueType() == MVT::f64);
2304
2305 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2306 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2307 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2308
2309 // TODO: Should this propagate fast-math-flags?
2310
2311 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2312 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2313
2314 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2315
2316 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2317 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2318
2319 EVT SetCCVT =
2320 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2321 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2322
2323 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2324}
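// Worked example (illustrative only) of the 2^52 trick: 2.5 + 2^52 rounds
// (to nearest even) to 4503599627370498.0, and subtracting 2^52 again leaves
// 2.0. Inputs with |x| > 0x1.fffffffffffffp+51 are already integral and are
// returned unchanged by the final select.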
2325
2326SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2327 // FNEARBYINT and FRINT are the same, except in their handling of FP
2328 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2329 // rint, so just treat them as equivalent.
2330 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2331}
2332
2333SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2334 SelectionDAG &DAG) const {
2335 auto VT = Op.getValueType();
2336 auto Arg = Op.getOperand(0u);
2337 return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Arg);
2338}
2339
2340// XXX - May require not supporting f32 denormals?
2341
2342// Don't handle v2f16. The extra instructions to scalarize and repack around the
2343// compare and vselect end up producing worse code than scalarizing the whole
2344// operation.
2345SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2346 SDLoc SL(Op);
2347 SDValue X = Op.getOperand(0);
2348 EVT VT = Op.getValueType();
2349
2350 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2351
2352 // TODO: Should this propagate fast-math-flags?
2353
2354 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2355
2356 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2357
2358 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2359 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2360 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2361
2362 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2363
2364 EVT SetCCVT =
2365 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2366
2367 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2368
2369 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2370
2371 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2372}
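// Worked example (illustrative only): round(-2.5)
//   T = -2.0, Diff = -0.5, AbsDiff = 0.5 >= 0.5, SignOne = -1.0,
//   result = -2.0 + -1.0 = -3.0,
// i.e. halfway cases round away from zero, unlike FROUNDEVEN above.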
2373
2374SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2375 SDLoc SL(Op);
2376 SDValue Src = Op.getOperand(0);
2377
2378 // result = trunc(src);
2379 // if (src < 0.0 && src != result)
2380 // result += -1.0.
2381
2382 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2383
2384 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2385 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2386
2387 EVT SetCCVT =
2388 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2389
2390 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2391 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2392 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2393
2394 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2395 // TODO: Should this propagate fast-math-flags?
2396 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2397}
2398
2399SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2400 double Log2BaseInverted) const {
2401 EVT VT = Op.getValueType();
2402
2403 SDLoc SL(Op);
2404 SDValue Operand = Op.getOperand(0);
2405 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2406 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2407
2408 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2409}
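// For reference: this is the change-of-base identity
//   log_b(x) = log2(x) * (1 / log2(b)),
// e.g. 1/log2(e) = ln(2) ~ 0.693147 for log and 1/log2(10) = log10(2)
// ~ 0.301030 for log10.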
2410
2411// exp2(M_LOG2E_F * f);
2412SDValue AMDGPUTargetLowering::LowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2413 EVT VT = Op.getValueType();
2414 SDLoc SL(Op);
2415 SDValue Src = Op.getOperand(0);
2416
2417 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2418 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2419 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2420}
2421
2422static bool isCtlzOpc(unsigned Opc) {
2423 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2424}
2425
2426static bool isCttzOpc(unsigned Opc) {
2427 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2428}
2429
2430SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2431 SDLoc SL(Op);
2432 SDValue Src = Op.getOperand(0);
2433
2434 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2435 bool Ctlz = isCtlzOpc(Op.getOpcode());
2436 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2437
2438 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2439 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2440
2441 if (Src.getValueType() == MVT::i32) {
2442 // (ctlz hi:lo) -> (umin (ffbh src), 32)
2443 // (cttz hi:lo) -> (umin (ffbl src), 32)
2444 // (ctlz_zero_undef src) -> (ffbh src)
2445 // (cttz_zero_undef src) -> (ffbl src)
2446 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2447 if (!ZeroUndef) {
2448 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2449 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2450 }
2451 return NewOpr;
2452 }
2453
2454 SDValue Lo, Hi;
2455 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2456
2457 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2458 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2459
2460 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2461 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2462 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2463 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2464
2465 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2466 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2467 if (Ctlz)
2468 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2469 else
2470 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2471
2472 SDValue NewOpr;
2473 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2474 if (!ZeroUndef) {
2475 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2476 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2477 }
2478
2479 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2480}
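// Worked example (illustrative only): ctlz(i64 0x0000000180000000)
//   Hi = 0x1 -> ffbh(Hi) = 31; Lo = 0x80000000 -> uaddsat(ffbh(Lo), 32) = 32;
//   umin(31, 32) = 31 and umin(31, 64) = 31, the expected count.
// UADDSAT (rather than ADD) keeps the offset term saturated when that half
// is zero (assuming ffbh yields -1 there), so an all-zero input still clamps
// to 64 in the final umin.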
2481
2482SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2483 bool Signed) const {
2484 // The regular method converting a 64-bit integer to float roughly consists of
2485 // 2 steps: normalization and rounding. In fact, after normalization, the
2486 // conversion from a 64-bit integer to a float is essentially the same as the
2487 // one from a 32-bit integer. The only difference is that it has more
2488 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2489 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2490 // converted into the correct float number. The basic steps for the unsigned
2491 // conversion are illustrated in the following pseudo code:
2492 //
2493 // f32 uitofp(i64 u) {
2494 // i32 hi, lo = split(u);
2495 // // Only count the leading zeros in hi as we have native support of the
2496 // // conversion from i32 to f32. If hi is all 0s, the conversion is
2497 // // reduced to a 32-bit one automatically.
2498 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2499 // u <<= shamt;
2500 // hi, lo = split(u);
2501 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2502 // // convert it as a 32-bit integer and scale the result back.
2503 // return uitofp(hi) * 2^(32 - shamt);
2504 // }
2505 //
2506 // The signed one follows the same principle but uses 'ffbh_i32' to count its
2507 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2508 // converted instead, followed by negation based on its sign bit.
2509
2510 SDLoc SL(Op);
2511 SDValue Src = Op.getOperand(0);
2512
2513 SDValue Lo, Hi;
2514 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2515 SDValue Sign;
2516 SDValue ShAmt;
2517 if (Signed && Subtarget->isGCN()) {
2518 // We also need to consider the sign bit in Lo if Hi has just sign bits,
2519 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2520 // account. That is, the maximal shift is
2521 // - 32 if Lo and Hi have opposite signs;
2522 // - 33 if Lo and Hi have the same sign.
2523 //
2524 // Or, MaxShAmt = 33 + OppositeSign, where
2525 //
2526 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2527 // - -1 if Lo and Hi have opposite signs; and
2528 // - 0 otherwise.
2529 //
2530 // All in all, ShAmt is calculated as
2531 //
2532 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2533 //
2534 // or
2535 //
2536 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2537 //
2538 // to reduce the critical path.
2539 SDValue OppositeSign = DAG.getNode(
2540 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2541 DAG.getConstant(31, SL, MVT::i32));
2542 SDValue MaxShAmt =
2543 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2544 OppositeSign);
2545 // Count the leading sign bits.
2546 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2547 // Different from unsigned conversion, the shift should be one bit less to
2548 // preserve the sign bit.
2549 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2550 DAG.getConstant(1, SL, MVT::i32));
2551 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2552 } else {
2553 if (Signed) {
2554 // Without 'ffbh_i32', only leading zeros could be counted. Take the
2555 // absolute value first.
2556 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2557 DAG.getConstant(63, SL, MVT::i64));
2558 SDValue Abs =
2559 DAG.getNode(ISD::XOR, SL, MVT::i64,
2560 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2561 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2562 }
2563 // Count the leading zeros.
2564 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2565 // The shift amount for signed integers is [0, 32].
2566 }
2567 // Normalize the given 64-bit integer.
2568 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2569 // Split it again.
2570 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2571 // Calculate the adjust bit for rounding.
2572 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2573 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2574 DAG.getConstant(1, SL, MVT::i32), Lo);
2575 // Get the 32-bit normalized integer.
2576 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2577 // Convert the normalized 32-bit integer into f32.
2578 unsigned Opc =
2579 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2580 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2581
2582 // Finally, we need to scale the converted floating-point number back, as the
2583 // original 64-bit integer was converted as a 32-bit one.
2584 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2585 ShAmt);
2586 // On GCN, use LDEXP directly.
2587 if (Subtarget->isGCN())
2588 return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2589
2590 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2591 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2592 // exponent is enough to avoid overflowing into the sign bit.
2593 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2594 DAG.getConstant(23, SL, MVT::i32));
2595 SDValue IVal =
2596 DAG.getNode(ISD::ADD, SL, MVT::i32,
2597 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2598 if (Signed) {
2599 // Set the sign bit.
2600 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2601 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2602 DAG.getConstant(31, SL, MVT::i32));
2603 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2604 }
2605 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2606}
2607
2608SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2609 bool Signed) const {
2610 SDLoc SL(Op);
2611 SDValue Src = Op.getOperand(0);
2612
2613 SDValue Lo, Hi;
2614 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2615
2616 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2617 SL, MVT::f64, Hi);
2618
2619 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2620
2621 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2622 DAG.getConstant(32, SL, MVT::i32));
2623 // TODO: Should this propagate fast-math-flags?
2624 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2625}
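// For reference (illustrative only): this computes
//   (f64)x = ldexp((f64)hi_32(x), 32) + (f64)lo_32(x).
// Each 32-bit half converts exactly into f64's 53-bit mantissa, so the only
// rounding happens in the final fadd.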
2626
2627SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2628 SelectionDAG &DAG) const {
2629 // TODO: Factor out code common with LowerSINT_TO_FP.
2630 EVT DestVT = Op.getValueType();
2631 SDValue Src = Op.getOperand(0);
2632 EVT SrcVT = Src.getValueType();
2633
2634 if (SrcVT == MVT::i16) {
2635 if (DestVT == MVT::f16)
2636 return Op;
2637 SDLoc DL(Op);
2638
2639 // Promote src to i32
2640 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2641 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2642 }
2643
2644 assert(SrcVT == MVT::i64 && "operation should be legal");
2645
2646 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2647 SDLoc DL(Op);
2648
2649 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2650 SDValue FPRoundFlag =
2651 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2652 SDValue FPRound =
2653 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2654
2655 return FPRound;
2656 }
2657
2658 if (DestVT == MVT::f32)
2659 return LowerINT_TO_FP32(Op, DAG, false);
2660
2661 assert(DestVT == MVT::f64);
2662 return LowerINT_TO_FP64(Op, DAG, false);
2663}
2664
2665SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2666 SelectionDAG &DAG) const {
2667 EVT DestVT = Op.getValueType();
2668
2669 SDValue Src = Op.getOperand(0);
2670 EVT SrcVT = Src.getValueType();
2671
2672 if (SrcVT == MVT::i16) {
2673 if (DestVT == MVT::f16)
2674 return Op;
2675
2676 SDLoc DL(Op);
2677 // Promote src to i32
2678 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2679 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2680 }
2681
2682 assert(SrcVT == MVT::i64 && "operation should be legal");
2683
2684 // TODO: Factor out code common with LowerUINT_TO_FP.
2685
2686 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2687 SDLoc DL(Op);
2688 SDValue Src = Op.getOperand(0);
2689
2690 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2691 SDValue FPRoundFlag =
2692 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
2693 SDValue FPRound =
2694 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2695
2696 return FPRound;
2697 }
2698
2699 if (DestVT == MVT::f32)
2700 return LowerINT_TO_FP32(Op, DAG, true);
2701
2702 assert(DestVT == MVT::f64);
2703 return LowerINT_TO_FP64(Op, DAG, true);
2704}
2705
2706SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2707 bool Signed) const {
2708 SDLoc SL(Op);
2709
2710 SDValue Src = Op.getOperand(0);
2711 EVT SrcVT = Src.getValueType();
2712
2713 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2714
2715 // The basic idea of converting a floating point number into a pair of 32-bit
2716 // integers is illustrated as follows:
2717 //
2718 // tf := trunc(val);
2719 // hif := floor(tf * 2^-32);
2720 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2721 // hi := fptoi(hif);
2722 // lo := fptoi(lof);
2723 //
2724 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2725 SDValue Sign;
2726 if (Signed && SrcVT == MVT::f32) {
2727 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2728 // is not enough to hold all the significant bits of `lof` if val is
2729 // negative. To avoid the loss of precision, we need to take the absolute
2730 // value after truncating and flip the result back based on the original
2731 // signedness.
2732 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2733 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2734 DAG.getConstant(31, SL, MVT::i32));
2735 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2736 }
2737
2738 SDValue K0, K1;
2739 if (SrcVT == MVT::f64) {
2740 K0 = DAG.getConstantFP(
2741 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
2742 SrcVT);
2743 K1 = DAG.getConstantFP(
2744 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
2745 SrcVT);
2746 } else {
2747 K0 = DAG.getConstantFP(
2748 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
2749 K1 = DAG.getConstantFP(
2750 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
2751 }
2752 // TODO: Should this propagate fast-math-flags?
2753 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2754
2755 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2756
2757 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2758
2759 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2760 : ISD::FP_TO_UINT,
2761 SL, MVT::i32, FloorMul);
2762 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2763
2764 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2765 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2766
2767 if (Signed && SrcVT == MVT::f32) {
2768 assert(Sign);
2769 // Flip the result based on the signedness, which is either all 0s or 1s.
2770 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2771 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2772 // r := xor(r, sign) - sign;
2773 Result =
2774 DAG.getNode(ISD::SUB, SL, MVT::i64,
2775 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2776 }
2777
2778 return Result;
2779}
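// Worked example (illustrative only): fp_to_uint(f64 8589934597.0), i.e.
// 2^33 + 5:
//   Trunc * 2^-32 = 2.0000...   -> ffloor = 2.0   -> Hi = 2
//   fma(2.0, -2^32, Trunc) = 5.0                  -> Lo = 5
//   result = (2 << 32) | 5 = 8589934597.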
2780
2781SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2782 SDLoc DL(Op);
2783 SDValue N0 = Op.getOperand(0);
2784
2785 // Convert to target node to get known bits
2786 if (N0.getValueType() == MVT::f32)
2787 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2788
2789 if (getTargetMachine().Options.UnsafeFPMath) {
2790 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2791 return SDValue();
2792 }
2793
2794 assert(N0.getSimpleValueType() == MVT::f64);
2795
2796 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2797 const unsigned ExpMask = 0x7ff;
2798 const unsigned ExpBiasf64 = 1023;
2799 const unsigned ExpBiasf16 = 15;
2800 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2801 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2802 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2803 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2804 DAG.getConstant(32, DL, MVT::i64));
2805 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2806 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2807 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2808 DAG.getConstant(20, DL, MVT::i64));
2809 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2810 DAG.getConstant(ExpMask, DL, MVT::i32));
2811 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2812 // add the f16 bias (15) to get the biased exponent for the f16 format.
2813 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2814 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2815
2816 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2817 DAG.getConstant(8, DL, MVT::i32));
2818 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2819 DAG.getConstant(0xffe, DL, MVT::i32));
2820
2821 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2822 DAG.getConstant(0x1ff, DL, MVT::i32));
2823 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2824
2825 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2826 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2827
2828 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2829 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2830 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2831 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2832
2833 // N = M | (E << 12);
2834 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2835 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2836 DAG.getConstant(12, DL, MVT::i32)));
2837
2838 // B = clamp(1-E, 0, 13);
2839 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2840 One, E);
2841 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2842 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2843 DAG.getConstant(13, DL, MVT::i32));
2844
2845 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2846 DAG.getConstant(0x1000, DL, MVT::i32));
2847
2848 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2849 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2850 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2851 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2852
2853 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2854 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2855 DAG.getConstant(0x7, DL, MVT::i32));
2856 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2857 DAG.getConstant(2, DL, MVT::i32));
2858 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2859 One, Zero, ISD::SETEQ);
2860 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2861 One, Zero, ISD::SETGT);
2862 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2863 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2864
2865 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2866 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2867 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2868 I, V, ISD::SETEQ);
2869
2870 // Extract the sign bit.
2871 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2872 DAG.getConstant(16, DL, MVT::i32));
2873 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2874 DAG.getConstant(0x8000, DL, MVT::i32));
2875
2876 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2877 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2878}
2879
2880SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2881 SelectionDAG &DAG) const {
2882 SDValue Src = Op.getOperand(0);
2883 unsigned OpOpcode = Op.getOpcode();
2884 EVT SrcVT = Src.getValueType();
2885 EVT DestVT = Op.getValueType();
2886
2887 // Will be selected natively
2888 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2889 return Op;
2890
2891 // Promote i16 to i32
2892 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2893 SDLoc DL(Op);
2894
2895 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2896 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2897 }
2898
2899 if (SrcVT == MVT::f16 ||
2900 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2901 SDLoc DL(Op);
2902
2903 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2904 unsigned Ext =
2905 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2906 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2907 }
2908
2909 if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2910 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2911
2912 return SDValue();
2913}
2914
2915SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2916 SelectionDAG &DAG) const {
2917 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2918 MVT VT = Op.getSimpleValueType();
2919 MVT ScalarVT = VT.getScalarType();
2920
2921 assert(VT.isVector());
2922
2923 SDValue Src = Op.getOperand(0);
2924 SDLoc DL(Op);
2925
2926 // TODO: Don't scalarize on Evergreen?
2927 unsigned NElts = VT.getVectorNumElements();
2928 SmallVector<SDValue, 8> Args;
2929 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2930
2931 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2932 for (unsigned I = 0; I < NElts; ++I)
2933 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2934
2935 return DAG.getBuildVector(VT, DL, Args);
2936}
2937
2938//===----------------------------------------------------------------------===//
2939// Custom DAG optimizations
2940//===----------------------------------------------------------------------===//
2941
2942static bool isU24(SDValue Op, SelectionDAG &DAG) {
2943 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2944}
2945
2946static bool isI24(SDValue Op, SelectionDAG &DAG) {
2947 EVT VT = Op.getValueType();
2948 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2949 // as unsigned 24-bit values.
2950 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2951}
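// For reference: the 24-bit multiply instructions read only the low 24 bits
// of each operand, so these predicates use known-bits (resp. significant-
// bits) analysis to prove every input fits in 24 bits as an unsigned
// (resp. signed) value.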
2952
2953static SDValue simplifyMul24(SDNode *Node24,
2954 TargetLowering::DAGCombinerInfo &DCI) {
2955 SelectionDAG &DAG = DCI.DAG;
2956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2957 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2958
2959 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2960 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2961 unsigned NewOpcode = Node24->getOpcode();
2962 if (IsIntrin) {
2963 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2964 switch (IID) {
2965 case Intrinsic::amdgcn_mul_i24:
2966 NewOpcode = AMDGPUISD::MUL_I24;
2967 break;
2968 case Intrinsic::amdgcn_mul_u24:
2969 NewOpcode = AMDGPUISD::MUL_U24;
2970 break;
2971 case Intrinsic::amdgcn_mulhi_i24:
2972 NewOpcode = AMDGPUISD::MULHI_I24;
2973 break;
2974 case Intrinsic::amdgcn_mulhi_u24:
2975 NewOpcode = AMDGPUISD::MULHI_U24;
2976 break;
2977 default:
2978 llvm_unreachable("Expected 24-bit mul intrinsic");
2979 }
2980 }
2981
2982 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2983
2984 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2985 // the operands to have other uses, but will only perform simplifications that
2986 // involve bypassing some nodes for this user.
2987 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2988 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2989 if (DemandedLHS || DemandedRHS)
2990 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2991 DemandedLHS ? DemandedLHS : LHS,
2992 DemandedRHS ? DemandedRHS : RHS);
2993
2994 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2995 // operands if this node is the only user.
2996 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2997 return SDValue(Node24, 0);
2998 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2999 return SDValue(Node24, 0);
3000
3001 return SDValue();
3002}
3003
3004template <typename IntTy>
3005static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3006 uint32_t Width, const SDLoc &DL) {
3007 if (Width + Offset < 32) {
3008 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3009 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3010 return DAG.getConstant(Result, DL, MVT::i32);
3011 }
3012
3013 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3014}
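// Worked example (illustrative only) with IntTy = int32_t (a signed BFE),
// Src0 = 0xFF80, Offset = 0, Width = 8:
//   Shl = 0xFF80 << 24 = 0x80000000,
//   Result = int32_t(0x80000000) >> 24 = -128,
// i.e. the extracted byte 0x80 is sign-extended.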
3015
3016static bool hasVolatileUser(SDNode *Val) {
3017 for (SDNode *U : Val->uses()) {
3018 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3019 if (M->isVolatile())
3020 return true;
3021 }
3022 }
3023
3024 return false;
3025}
3026
3027bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3028 // i32 vectors are the canonical memory type.
3029 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3030 return false;
3031
3032 if (!VT.isByteSized())
3033 return false;
3034
3035 unsigned Size = VT.getStoreSize();
3036
3037 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3038 return false;
3039
3040 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3041 return false;
3042
3043 return true;
3044}
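// For reference: e.g. a v4i8 access (store size 32, assuming v4i8 is not a
// legal type) is rewritten as an i32 access plus a bitcast by the combines
// below, while scalar i8/i16/i32 and already-legal types are left untouched.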
3045
3046// Replace load of an illegal type with a store of a bitcast to a friendlier
3047// type.
3048SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3049 DAGCombinerInfo &DCI) const {
3050 if (!DCI.isBeforeLegalize())
3051 return SDValue();
3052
3053 LoadSDNode *LN = cast<LoadSDNode>(N);
3054 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3055 return SDValue();
3056
3057 SDLoc SL(N);
3058 SelectionDAG &DAG = DCI.DAG;
3059 EVT VT = LN->getMemoryVT();
3060
3061 unsigned Size = VT.getStoreSize();
3062 Align Alignment = LN->getAlign();
3063 if (Alignment < Size && isTypeLegal(VT)) {
3064 unsigned IsFast;
3065 unsigned AS = LN->getAddressSpace();
3066
3067 // Expand unaligned loads earlier than legalization. Due to visitation order
3068 // problems during legalization, the emitted instructions to pack and unpack
3069 // the bytes again are not eliminated in the case of an unaligned copy.
3070 if (!allowsMisalignedMemoryAccesses(
3071 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3072 if (VT.isVector())
3073 return SplitVectorLoad(SDValue(LN, 0), DAG);
3074
3075 SDValue Ops[2];
3076 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3077
3078 return DAG.getMergeValues(Ops, SDLoc(N));
3079 }
3080
3081 if (!IsFast)
3082 return SDValue();
3083 }
3084
3085 if (!shouldCombineMemoryType(VT))
3086 return SDValue();
3087
3088 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3089
3090 SDValue NewLoad
3091 = DAG.getLoad(NewVT, SL, LN->getChain(),
3092 LN->getBasePtr(), LN->getMemOperand());
3093
3094 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3095 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3096 return SDValue(N, 0);
3097}
3098
3099// Replace store of an illegal type with a store of a bitcast to a friendlier
3100// type.
3101SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3102 DAGCombinerInfo &DCI) const {
3103 if (!DCI.isBeforeLegalize())
3104 return SDValue();
3105
3106 StoreSDNode *SN = cast<StoreSDNode>(N);
3107 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3108 return SDValue();
3109
3110 EVT VT = SN->getMemoryVT();
3111 unsigned Size = VT.getStoreSize();
3112
3113 SDLoc SL(N);
3114 SelectionDAG &DAG = DCI.DAG;
3115 Align Alignment = SN->getAlign();
3116 if (Alignment < Size && isTypeLegal(VT)) {
3117 unsigned IsFast;
3118 unsigned AS = SN->getAddressSpace();
3119
3120 // Expand unaligned stores earlier than legalization. Due to visitation
3121 // order problems during legalization, the emitted instructions to pack and
3122 // unpack the bytes again are not eliminated in the case of an unaligned
3123 // copy.
3124 if (!allowsMisalignedMemoryAccesses(
3125 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3126 if (VT.isVector())
3127 return SplitVectorStore(SDValue(SN, 0), DAG);
3128
3129 return expandUnalignedStore(SN, DAG);
3130 }
3131
3132 if (!IsFast)
3133 return SDValue();
3134 }
3135
3136 if (!shouldCombineMemoryType(VT))
3137 return SDValue();
3138
3139 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3140 SDValue Val = SN->getValue();
3141
3142 //DCI.AddToWorklist(Val.getNode());
3143
3144 bool OtherUses = !Val.hasOneUse();
3145 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3146 if (OtherUses) {
3147 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3148 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3149 }
3150
3151 return DAG.getStore(SN->getChain(), SL, CastVal,
3152 SN->getBasePtr(), SN->getMemOperand());
3153}
3154
3155// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3156// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3157// issues.
3158SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3159 DAGCombinerInfo &DCI) const {
3160 SelectionDAG &DAG = DCI.DAG;
3161 SDValue N0 = N->getOperand(0);
3162
3163 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3164 // (vt2 (truncate (assertzext vt0:x, vt1)))
3165 if (N0.getOpcode() == ISD::TRUNCATE) {
3166 SDValue N1 = N->getOperand(1);
3167 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3168 SDLoc SL(N);
3169
3170 SDValue Src = N0.getOperand(0);
3171 EVT SrcVT = Src.getValueType();
3172 if (SrcVT.bitsGE(ExtVT)) {
3173 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3174 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3175 }
3176 }
3177
3178 return SDValue();
3179}
3180
3181SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3182 SDNode *N, DAGCombinerInfo &DCI) const {
3183 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3184 switch (IID) {
3185 case Intrinsic::amdgcn_mul_i24:
3186 case Intrinsic::amdgcn_mul_u24:
3187 case Intrinsic::amdgcn_mulhi_i24:
3188 case Intrinsic::amdgcn_mulhi_u24:
3189 return simplifyMul24(N, DCI);
3190 case Intrinsic::amdgcn_fract:
3191 case Intrinsic::amdgcn_rsq:
3192 case Intrinsic::amdgcn_rcp_legacy:
3193 case Intrinsic::amdgcn_rsq_legacy:
3194 case Intrinsic::amdgcn_rsq_clamp:
3195 case Intrinsic::amdgcn_ldexp: {
3196 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3197 SDValue Src = N->getOperand(1);
3198 return Src.isUndef() ? Src : SDValue();
3199 }
3200 default:
3201 return SDValue();
3202 }
3203}
3204
3205/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3206/// binary operation \p Opc to it with the corresponding constant operands.
3208 DAGCombinerInfo &DCI, const SDLoc &SL,
3209 unsigned Opc, SDValue LHS,
3210 uint32_t ValLo, uint32_t ValHi) const {
3211 SelectionDAG &DAG = DCI.DAG;
3212 SDValue Lo, Hi;
3213 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3214
3215 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3216 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3217
3218 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3219 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3220
3221 // Re-visit the ands. It's possible we eliminated one of them and it could
3222 // simplify the vector.
3223 DCI.AddToWorklist(Lo.getNode());
3224 DCI.AddToWorklist(Hi.getNode());
3225
3226 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3227 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3228}
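// Worked example (illustrative only): (and i64:x, 0x00000000FFFFFFFF) splits
// into (and lo_32(x), -1) and (and hi_32(x), 0); the worklist re-visit then
// folds these to lo_32(x) and 0, so no 64-bit AND survives.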
3229
3230SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3231 DAGCombinerInfo &DCI) const {
3232 EVT VT = N->getValueType(0);
3233
3234 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3235 if (!RHS)
3236 return SDValue();
3237
3238 SDValue LHS = N->getOperand(0);
3239 unsigned RHSVal = RHS->getZExtValue();
3240 if (!RHSVal)
3241 return LHS;
3242
3243 SDLoc SL(N);
3244 SelectionDAG &DAG = DCI.DAG;
3245
3246 switch (LHS->getOpcode()) {
3247 default:
3248 break;
3249 case ISD::ZERO_EXTEND:
3250 case ISD::SIGN_EXTEND:
3251 case ISD::ANY_EXTEND: {
3252 SDValue X = LHS->getOperand(0);
3253
3254 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3255 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3256 // Prefer build_vector as the canonical form if packed types are legal.
3257 // (shl ([asz]ext i16:x), 16) -> (build_vector 0, x)
3258 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3259 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3260 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3261 }
3262
3263 // shl (ext x) => zext (shl x), if shift does not overflow int
3264 if (VT != MVT::i64)
3265 break;
3266 KnownBits Known = DAG.computeKnownBits(X);
3267 unsigned LZ = Known.countMinLeadingZeros();
3268 if (LZ < RHSVal)
3269 break;
3270 EVT XVT = X.getValueType();
3271 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3272 return DAG.getZExtOrTrunc(Shl, SL, VT);
3273 }
3274 }
3275
3276 if (VT != MVT::i64)
3277 return SDValue();
3278
3279 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3280
3281 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3282 // common case, splitting this into a move and a 32-bit shift is faster and
3283 // the same code size.
3284 if (RHSVal < 32)
3285 return SDValue();
3286
3287 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3288
3289 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3290 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3291
3292 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3293
3294 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3295 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3296}
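// Worked example (illustrative only): (shl i64:x, 40) becomes
//   bitcast (build_vector 0, (shl (i32 trunc x), 8)) to i64,
// replacing a slow 64-bit shift with a single 32-bit shift and a zero low
// half.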
3297
3298SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3299 DAGCombinerInfo &DCI) const {
3300 if (N->getValueType(0) != MVT::i64)
3301 return SDValue();
3302
3303 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3304 if (!RHS)
3305 return SDValue();
3306
3307 SelectionDAG &DAG = DCI.DAG;
3308 SDLoc SL(N);
3309 unsigned RHSVal = RHS->getZExtValue();
3310
3311 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3312 if (RHSVal == 32) {
3313 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3314 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3315 DAG.getConstant(31, SL, MVT::i32));
3316
3317 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3318 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3319 }
3320
3321 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3322 if (RHSVal == 63) {
3323 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3324 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3325 DAG.getConstant(31, SL, MVT::i32));
3326 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3327 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3328 }
3329
3330 return SDValue();
3331}
3332
3333SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3334 DAGCombinerInfo &DCI) const {
3335 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3336 if (!RHS)
3337 return SDValue();
3338
3339 EVT VT = N->getValueType(0);
3340 SDValue LHS = N->getOperand(0);
3341 unsigned ShiftAmt = RHS->getZExtValue();
3342 SelectionDAG &DAG = DCI.DAG;
3343 SDLoc SL(N);
3344
3345 // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
3346 // This improves the ability to match BFE patterns in isel.
3347 if (LHS.getOpcode() == ISD::AND) {
3348 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3349 unsigned MaskIdx, MaskLen;
3350 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3351 MaskIdx == ShiftAmt) {
3352 return DAG.getNode(
3353 ISD::AND, SL, VT,
3354 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3355 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3356 }
3357 }
3358 }
3359
3360 if (VT != MVT::i64)
3361 return SDValue();
3362
3363 if (ShiftAmt < 32)
3364 return SDValue();
3365
3366 // srl i64:x, C for C >= 32
3367 // =>
3368 // build_pair (srl hi_32(x), C - 32), 0
3369 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3370
3371 SDValue Hi = getHiHalf64(LHS, DAG);
3372
3373 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3374 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3375
3376 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3377
3378 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3379}
3380
3381SDValue AMDGPUTargetLowering::performTruncateCombine(
3382 SDNode *N, DAGCombinerInfo &DCI) const {
3383 SDLoc SL(N);
3384 SelectionDAG &DAG = DCI.DAG;
3385 EVT VT = N->getValueType(0);
3386 SDValue Src = N->getOperand(0);
3387
3388 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3389 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3390 SDValue Vec = Src.getOperand(0);
3391 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3392 SDValue Elt0 = Vec.getOperand(0);
3393 EVT EltVT = Elt0.getValueType();
3394 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3395 if (EltVT.isFloatingPoint()) {
3396 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3397 EltVT.changeTypeToInteger(), Elt0);
3398 }
3399
3400 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3401 }
3402 }
3403 }
3404
3405 // Equivalent of above for accessing the high element of a vector as an
3406 // integer operation.
3407 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3408 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3409 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3410 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3411 SDValue BV = stripBitcast(Src.getOperand(0));
3412 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3413 BV.getValueType().getVectorNumElements() == 2) {
3414 SDValue SrcElt = BV.getOperand(1);
3415 EVT SrcEltVT = SrcElt.getValueType();
3416 if (SrcEltVT.isFloatingPoint()) {
3417 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3418 SrcEltVT.changeTypeToInteger(), SrcElt);
3419 }
3420
3421 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3422 }
3423 }
3424 }
3425 }
3426
3427 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3428 //
3429 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3430 // i16 (trunc (srl (i32 (trunc x), K)))
3431 if (VT.getScalarSizeInBits() < 32) {
3432 EVT SrcVT = Src.getValueType();
3433 if (SrcVT.getScalarSizeInBits() > 32 &&
3434 (Src.getOpcode() == ISD::SRL ||
3435 Src.getOpcode() == ISD::SRA ||
3436 Src.getOpcode() == ISD::SHL)) {
3437 SDValue Amt = Src.getOperand(1);
3438 KnownBits Known = DAG.computeKnownBits(Amt);
3439
3440 // - For left shifts, do the transform as long as the shift
3441 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
3442 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
3443 // losing information stored in the high bits when truncating.
3444 const unsigned MaxCstSize =
3445 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
3446 if (Known.getMaxValue().ule(MaxCstSize)) {
3448 EVT MidVT = VT.isVector() ?
3449 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3450 VT.getVectorNumElements()) : MVT::i32;
3451 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3452 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3453 Src.getOperand(0));
3454 DCI.AddToWorklist(Trunc.getNode());
3455
3456 if (Amt.getValueType() != NewShiftVT) {
3457 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3458 DCI.AddToWorklist(Amt.getNode());
3459 }
3460
3461 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3462 Trunc, Amt);
3463 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3464 }
3465 }
3466 }
3467
3468 return SDValue();
3469}
3470
3471// We need to specifically handle i64 mul here to avoid unnecessary conversion
3472// instructions. If we only match on the legalized i64 mul expansion,
3473// SimplifyDemandedBits will be unable to remove them because there will be
3474// multiple uses due to the separate mul + mulh[su].
3475static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3476 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3477 if (Size <= 32) {
3478 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3479 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3480 }
3481
3482 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3483 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3484
3485 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3486 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3487
3488 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3489}
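// Worked example (illustrative only): for two 24-bit inputs a and b, a
// 64-bit multiply becomes
//   build_pair (mul_u24 a, b), (mulhi_u24 a, b),
// where mul_u24 yields the low 32 bits and mulhi_u24 the high bits of the
// (at most 48-bit) product, avoiding the full 64-bit mul expansion.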
3490
3491SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3492 DAGCombinerInfo &DCI) const {
3493 EVT VT = N->getValueType(0);
3494
3495 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3496 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3497 // unnecessarily). isDivergent() is used as an approximation of whether the
3498 // value is in an SGPR.
3499 if (!N->isDivergent())
3500 return SDValue();
3501
3502 unsigned Size = VT.getSizeInBits();
3503 if (VT.isVector() || Size > 64)
3504 return SDValue();
3505
3506 // There are i16 integer mul/mad.
3507 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3508 return SDValue();
3509
3510 SelectionDAG &DAG = DCI.DAG;
3511 SDLoc DL(N);
3512
3513 SDValue N0 = N->getOperand(0);
3514 SDValue N1 = N->getOperand(1);
3515
3516 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3517 // in the source into any_extends if the result of the mul is truncated. Since
3518 // we can assume the high bits are whatever we want, use the underlying value
3519 // to keep the unknown high bits from interfering.
3520 if (N0.getOpcode() == ISD::ANY_EXTEND)
3521 N0 = N0.getOperand(0);
3522
3523 if (N1.getOpcode() == ISD::ANY_EXTEND)
3524 N1 = N1.getOperand(0);
3525
3526 SDValue Mul;
3527
3528 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3529 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3530 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3531 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3532 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3533 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3534 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3535 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3536 } else {
3537 return SDValue();
3538 }
3539
3540 // We need to use sext even for MUL_U24, because MUL_U24 is used
3541 // for signed multiplies of 8- and 16-bit types.
3542 return DAG.getSExtOrTrunc(Mul, DL, VT);
3543}
3544
3545SDValue
3546AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3547 DAGCombinerInfo &DCI) const {
3548 if (N->getValueType(0) != MVT::i32)
3549 return SDValue();
3550
3551 SelectionDAG &DAG = DCI.DAG;
3552 SDLoc DL(N);
3553
3554 SDValue N0 = N->getOperand(0);
3555 SDValue N1 = N->getOperand(1);
3556
3557 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3558 // in the source into any_extends if the result of the mul is truncated. Since
3559 // we can assume the high bits are whatever we want, use the underlying value
3560 // to keep the unknown high bits from interfering.
3561 if (N0.getOpcode() == ISD::ANY_EXTEND)
3562 N0 = N0.getOperand(0);
3563 if (N1.getOpcode() == ISD::ANY_EXTEND)
3564 N1 = N1.getOperand(0);
3565
3566 // Try to use two fast 24-bit multiplies (one for each half of the result)
3567 // instead of one slow extending multiply.
3568 unsigned LoOpcode, HiOpcode;
3569 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3570 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3571 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3572 LoOpcode = AMDGPUISD::MUL_U24;
3573 HiOpcode = AMDGPUISD::MULHI_U24;
3574 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3575 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3576 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3577 LoOpcode = AMDGPUISD::MUL_I24;
3578 HiOpcode = AMDGPUISD::MULHI_I24;
3579 } else {
3580 return SDValue();
3581 }
3582
3583 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3584 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3585 DCI.CombineTo(N, Lo, Hi);
3586 return SDValue(N, 0);
3587}
3588
3589SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3590 DAGCombinerInfo &DCI) const {
3591 EVT VT = N->getValueType(0);