#include "llvm/IR/IntrinsicsX86.h"

#define DEBUG_TYPE "x86tti"
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))

  if (isa<ConstantAggregateZero>(Mask))

  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();

  if (isa<ConstantAggregateZero>(Mask)) {

  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  bool LogicalShift = false;
  bool ShiftLeft = false;
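  // Decode the intrinsic: the PSRA* forms are arithmetic right shifts
  // (LogicalShift stays false), PSRL*/PSLL* are logical, and only the PSLL*
  // forms shift left.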
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:

  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:

  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:

  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:

  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto VT = cast<FixedVectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  auto AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();
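  // Shift by a 32-bit immediate: once the amount is known to be in range it
  // can be splatted across the vector and lowered to a generic IR shift.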
  assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
    Amt = Builder.CreateVectorSplat(VWidth, Amt);
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));

    return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");
  unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
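  // Shift by the scalar in the low 64 bits of a 128-bit vector: if element 0
  // is known to be in range and the remaining low bits are zero, splat
  // element 0 and use a generic shift.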
    Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  auto CDV = dyn_cast<ConstantDataVector>(Amt);

  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  return Builder.CreateShl(Vec, ShiftVec);

  return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
  bool LogicalShift = false;
  bool ShiftLeft = false;
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:

  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto VT = cast<FixedVectorType>(II.getType());
  auto SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();
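  // If every per-element shift amount is known to be smaller than the element
  // bit width, the intrinsic behaves exactly like a generic IR shift.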
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));

  auto *CShift = dyn_cast<Constant>(Amt);

  bool AnyOutOfRange = false;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);

    APInt ShiftVal = COp->getValue();

    AnyOutOfRange = LogicalShift;
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };

  for (int Idx : ShiftAmts) {

    assert(LogicalShift && "Logical shift expected");

  for (int Idx : ShiftAmts) {

  return Builder.CreateShl(Vec, ShiftVec);

  return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());

  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;

  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))

  APInt MinValue, MaxValue;

  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);

  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  return Builder.CreateTrunc(Shuffle, ResTy);
  if (isa<UndefValue>(Arg))

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());

  unsigned NumElts = ArgTy->getNumElements();
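  // Expand MOVMSK: test the sign bit of each element, bitcast the i1 vector
  // to an integer, then zero-extend/truncate to the result type.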
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);

         "Unexpected types for x86 addcarry");
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,

    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);

    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

    uint8_t Imm = CInt->getZExtValue();
    uint8_t ZMask = Imm & 0xf;
    uint8_t DestLane = (Imm >> 4) & 0x3;
    uint8_t SourceLane = (Imm >> 6) & 0x3;
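    // INSERTPS immediate: bits [3:0] are the zero mask, bits [5:4] select the
    // destination lane, and bits [7:6] select the source lane.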
    int ShuffleMask[4] = {0, 1, 2, 3};

        (ZMask & (1 << DestLane))) {

      ShuffleMask[DestLane] = SourceLane;

      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;

      ShuffleMask[DestLane] = SourceLane + 4;

  auto LowConstantHighUndef = [&](uint64_t Val) {

  Constant *C0 = dyn_cast<Constant>(Op0);

  if (CILength && CIIndex) {

    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    unsigned End = Index + Length;
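    // EXTRQ/EXTRQI copies Length bits starting at bit Index out of the low
    // 64 bits of Op0; a byte-aligned extraction can be lowered to a byte
    // shuffle.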
    if ((Length % 8) == 0 && (Index % 8) == 0) {

      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

          Builder.CreateBitCast(Op0, ShufTy),

  Value *Args[] = {Op0, CILength, CIIndex};

    return LowConstantHighUndef(0);

    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    unsigned End = Index + Length;
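    // INSERTQ/INSERTQI inserts the low Length bits of Op1 into Op0 starting
    // at bit Index; the upper 64 bits of the result are undefined.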
    if ((Length % 8) == 0 && (Index % 8) == 0) {

        ShuffleMask.push_back(i);
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = Index + Length; i != 8; ++i)
        ShuffleMask.push_back(i);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

          Builder.CreateBitCast(Op1, ShufTy),

  Constant *C0 = dyn_cast<Constant>(Op0);

      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))

    APInt Val = V00 | V10;

  Value *Args[] = {Op0, Op1, CILength, CIIndex};

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");
  for (unsigned I = 0; I < NumElts; ++I) {

    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))

    if (isa<UndefValue>(COp)) {

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
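  // VPERMILVAR selects elements within each 128-bit lane. For the PD variants
  // the selector sits in bit 1 of the control element, so it is shifted down
  // before being used as an index.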
  for (unsigned I = 0; I < NumElts; ++I) {

    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))

    if (isa<UndefValue>(COp)) {

    APInt Index = cast<ConstantInt>(COp)->getValue();

      Index.lshrInPlace(1);

    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
         "Unexpected shuffle mask size");
  for (unsigned I = 0; I < Size; ++I) {

    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))

    if (isa<UndefValue>(COp)) {

  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {

  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:

      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
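      // BEXTR control: bits [7:0] give the starting bit and bits [15:8] the
      // length of the extracted field; with a constant source the whole
      // extract can be folded.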
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;

        Result &= maskTrailingOnes<uint64_t>(Length);

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:

      uint64_t Index = C->getZExtValue() & 0xff;
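      // BZHI zeroes every bit at position Index and above; an Index of zero
      // gives zero and an out-of-range Index leaves the source untouched.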
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);

  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {

      if (MaskC->isAllOnesValue()) {

      if (MaskC->getValue().isShiftedMask()) {
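        // A single contiguous run of mask bits just selects a bit field:
        // mask the source and shift it down into place.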
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

          if (BitToTest & Src)

  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {

      if (MaskC->isAllOnesValue()) {

      if (MaskC->getValue().isShiftedMask()) {
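        // For a contiguous mask, PDEP just shifts the low bits of the source
        // into position and masks off everything else.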
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

          if (BitToTest & Src)

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
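    // These scalar conversions only read element 0 of their vector operand,
    // so only the low element needs to be demanded.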
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {

  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:

  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    bool MadeChange = false;

    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {

  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:

    if (R->getValue() == 4) {
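      // A rounding-control value of 4 (CUR_DIRECTION) requests the default
      // rounding environment, so the intrinsic can be replaced by an ordinary
      // IR FP binary operator.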
      case Intrinsic::x86_avx512_add_ps_512:
      case Intrinsic::x86_avx512_add_pd_512:

      case Intrinsic::x86_avx512_sub_ps_512:
      case Intrinsic::x86_avx512_sub_pd_512:

      case Intrinsic::x86_avx512_mul_ps_512:
      case Intrinsic::x86_avx512_mul_pd_512:

      case Intrinsic::x86_avx512_div_ps_512:
      case Intrinsic::x86_avx512_div_pd_512:

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:

    if (R->getValue() == 4) {

      case Intrinsic::x86_avx512_mask_add_ss_round:
      case Intrinsic::x86_avx512_mask_add_sd_round:

      case Intrinsic::x86_avx512_mask_sub_ss_round:
      case Intrinsic::x86_avx512_mask_sub_sd_round:

      case Intrinsic::x86_avx512_mask_mul_ss_round:
      case Intrinsic::x86_avx512_mask_mul_sd_round:

      case Intrinsic::x86_avx512_mask_div_ss_round:
      case Intrinsic::x86_avx512_mask_div_sd_round:

      auto *C = dyn_cast<ConstantInt>(Mask);

      if (!C || !C->getValue()[0]) {
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:

  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {

           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {

  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    unsigned Imm = C->getZExtValue();

    bool MadeChange = false;

        cast<FixedVectorType>(Arg0->getType())->getNumElements();

    APInt UndefElts1(VWidth, 0);
    APInt DemandedElts1 =

    APInt UndefElts2(VWidth, 0);
    APInt DemandedElts2 =
  case Intrinsic::x86_sse41_insertps:

  case Intrinsic::x86_sse4a_extrq: {
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
           VWidth1 == 16 && "Unexpected operand sizes");

        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))

    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {

    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {

  case Intrinsic::x86_sse4a_extrqi: {
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
           "Unexpected operand size");

    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {

  case Intrinsic::x86_sse4a_insertq: {
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))

    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {

  case Intrinsic::x86_sse4a_insertqi: {
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
           VWidth1 == 2 && "Unexpected operand sizes");

    if (CILength && CIIndex) {

    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {

    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {

  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
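    // blendv picks each result element from one of the two source operands
    // based on the sign bit of the corresponding mask element.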
    if (isa<ConstantAggregateZero>(Mask)) {

    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
    assert(Mask->getType()->getPrimitiveSizeInBits() ==
               II.getType()->getPrimitiveSizeInBits() &&
           "Not expecting mask and operands with different sizes");
    unsigned NumMaskElts =
        cast<FixedVectorType>(Mask->getType())->getNumElements();
    unsigned NumOperandElts =
        cast<FixedVectorType>(II.getType())->getNumElements();
    if (NumMaskElts == NumOperandElts) {

    if (NumMaskElts < NumOperandElts) {

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:

  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:

    bool &KnownBitsComputed) const {

  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {

    auto ArgType = cast<FixedVectorType>(Arg->getType());
    ArgWidth = ArgType->getNumElements();
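    // MOVMSK produces one bit per vector element, so every higher bit of the
    // scalar result is known to be zero.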
  KnownBitsComputed = true;

    simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();

  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:

    if (!DemandedElts[0]) {

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    UndefElts = UndefElts[0];

  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    if (!DemandedElts[0]) {

  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    if (!DemandedElts[0]) {

    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    APInt DemandedElts2 = DemandedElts;

    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);

    if (!DemandedElts[0]) {

    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    UndefElts |= UndefElts2[0];

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    if (!DemandedElts[0]) {

    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    if (!UndefElts2[0] || !UndefElts3[0])

  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
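    // ADDSUB subtracts in the even elements and adds in the odd elements; if
    // only one kind of element is demanded, the whole operation is a single
    // fsub or fadd.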
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");

          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
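    // Map each demanded result element back to the corresponding per-lane
    // element of the source operand that produces it.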
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);

      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);

  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi: