X86InstCombineIntrinsic.cpp
//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
    // the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}
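
// Illustrative example (not part of the original source): a masked load whose
// mask is a sign-extended bool vector, e.g.
//   %m = sext <4 x i1> %b to <4 x i32>
//   %r = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %p, <4 x i32> %m)
// would be rewritten by the routine above into roughly
//   %c = bitcast i8* %p to <4 x float>*
//   %r = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %c,
//            i32 1, <4 x i1> %b, <4 x float> zeroinitializer)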

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses only the first 64 bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}
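
// Illustrative example (not part of the original source): with an in-range
// immediate such as
//   %r = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 3)
// the helper above returns the generic IR shift
//   %r = ashr <8 x i16> %v, <8 x i16 splat of 3>
// while an out-of-range logical shift amount folds to zero and an arithmetic
// one is clamped to BitWidth - 1.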

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  KnownBits KnownAmt =
      llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}
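
// Illustrative example (not part of the original source): when every lane's
// shift amount is known to be smaller than the element width, e.g.
//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %v,
//                                              <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
// the helper above returns
//   %r = lshr <4 x i32> %v, <i32 0, i32 1, i32 2, i32 3>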

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}
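
// Illustrative expansion (not part of the original source) for a 128-bit
// PACKSSDW with two constant <4 x i32> operands: each input is clamped to
// [-32768, 32767] with icmp/select, the two clamped vectors are interleaved
// per 128-bit lane with a shufflevector, and the result is truncated to
// <8 x i16>; constant folding then collapses the whole sequence.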

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  //   %cmp = icmp slt <16 x i8> %x, zeroinitializer
  //   %int = bitcast <16 x i1> %cmp to i16
  //   %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}
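
// Illustrative example (not part of the original source): with a zero
// carry-in,
//   %s = call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %a, i32 %b)
// is rebuilt from @llvm.uadd.with.overflow.i32, with the overflow bit
// zero-extended to i8 so the { i8, i32 } result type still matches.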

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  // [3:0] - zero mask for each 32-bit lane
  // [5:4] - select one 32-bit destination lane
  // [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}
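
// Illustrative example (not part of the original source): insertps with
// immediate 0x10 (source lane 0, destination lane 1, empty zero mask) becomes
//   shufflevector <4 x float> %a, <4 x float> %b,
//                 <4 x i32> <i32 0, i32 4, i32 2, i32 3>
// i.e. element 0 of the second operand is inserted into lane 1 of the first.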

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length; other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are extracting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}
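
// Illustrative example (not part of the original source): an EXTRQI with
// Length = 16 and Index = 32 extracts bits [47:32]. Both values are byte
// aligned, so the helper above emits a <16 x i8> shuffle that moves bytes
// 4-5 of the source into the low two result bytes, zero fills the rest of
// the low 64 bits, and leaves the upper 64 bits undefined.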

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length; other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *C1 = dyn_cast<Constant>(Op1);
  auto *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  auto *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}
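
// Illustrative example (not part of the original source): a 128-bit PSHUFB
// whose constant control is <16 x i8> zeroinitializer becomes a shufflevector
// that splats byte 0; any control byte with its sign bit set instead selects
// the corresponding byte of the all-zeros second shuffle operand.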

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
}
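
// Illustrative example (not part of the original source): vpermilvar.ps with
// the constant control <4 x i32> <i32 3, i32 2, i32 1, i32 0> becomes a
// single-operand shufflevector with mask <3, 2, 1, 0>, i.e. a reversal of
// the four float lanes.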

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
}
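
// Illustrative example (not part of the original source): vpermd of an
// <8 x i32> vector with the constant index vector <7, 6, 5, 4, 3, 2, 1, 0>
// becomes a single-operand shufflevector with that same mask; each index is
// first masked by (Size - 1), matching the hardware's modulo behaviour.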

Optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO: should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;
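
  // Illustrative constant fold for the BEXTR case above (not part of the
  // original source): bextr(0x12345678, 0x0408) has Shift = 8 and Length = 4,
  // so the result is (0x12345678 >> 8) & 0xF = 0x6.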

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
        return IC.replaceInstUsesWith(II, Shifted);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
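
  // Illustrative constant folds for the PEXT/PDEP cases above (not part of
  // the original source): with Mask = 0b11010, pext packs bits 1, 3 and 4 of
  // the source into the low bits (pext(0b10110, 0b11010) == 0b101), while
  // pdep scatters the low source bits into those positions
  // (pdep(0b101, 0b11010) == 0b10010).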

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the element as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }

  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();

      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If either input's demanded elements are undef, the result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operand and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }
1500 
1501  case Intrinsic::x86_sse4a_extrqi: {
1502  // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
1503  // bits of the lower 64-bits. The upper 64-bits are undefined.
1504  Value *Op0 = II.getArgOperand(0);
1505  unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1506  assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1507  "Unexpected operand size");
1508 
1509  // See if we're dealing with constant values.
1510  auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
1511  auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
1512 
1513  // Attempt to simplify to a constant or shuffle vector.
1514  if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1515  return IC.replaceInstUsesWith(II, V);
1516  }
1517 
1518  // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
1519  // operand.
1520  if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1521  return IC.replaceOperand(II, 0, V);
1522  }
1523  break;
1524  }
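// Worked example (illustrative, not from the original source): for
// extrqi(x, Length = 8, Index = 16) the low 64 bits of the result are
// (x.lo64 >> 16) & 0xFF zero-extended to 64 bits, i.e. byte 2 of the low
// half, while the upper 64 bits of the result are left undefined.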
1525 
1526  case Intrinsic::x86_sse4a_insertq: {
1527  Value *Op0 = II.getArgOperand(0);
1528  Value *Op1 = II.getArgOperand(1);
1529  unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1530  assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1531  Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1532  cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
1533  "Unexpected operand size");
1534 
1535  // See if we're dealing with constant values.
1536  auto *C1 = dyn_cast<Constant>(Op1);
1537  auto *CI11 =
1538  C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1539  : nullptr;
1540 
1541  // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
1542  if (CI11) {
1543  const APInt &V11 = CI11->getValue();
1544  APInt Len = V11.zextOrTrunc(6);
1545  APInt Idx = V11.lshr(8).zextOrTrunc(6);
1546  if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1547  return IC.replaceInstUsesWith(II, V);
1548  }
1549  }
1550 
1551  // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
1552  // operand.
1553  if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1554  return IC.replaceOperand(II, 0, V);
1555  }
1556  break;
1557  }
1558 
1559  case Intrinsic::x86_sse4a_insertqi: {
1560  // INSERTQI: Extract lowest Length bits from lower half of second source and
1561  // insert over first source starting at Index bit. The upper 64-bits are
1562  // undefined.
1563  Value *Op0 = II.getArgOperand(0);
1564  Value *Op1 = II.getArgOperand(1);
1565  unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1566  unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1567  assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1568  Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1569  VWidth1 == 2 && "Unexpected operand sizes");
1570 
1571  // See if we're dealing with constant values.
1572  auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
1573  auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
1574 
1575  // Attempt to simplify to a constant or shuffle vector.
1576  if (CILength && CIIndex) {
1577  APInt Len = CILength->getValue().zextOrTrunc(6);
1578  APInt Idx = CIIndex->getValue().zextOrTrunc(6);
1579  if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1580  return IC.replaceInstUsesWith(II, V);
1581  }
1582  }
1583 
1584  // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
1585  // operands.
1586  bool MadeChange = false;
1587  if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1588  IC.replaceOperand(II, 0, V);
1589  MadeChange = true;
1590  }
1591  if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
1592  IC.replaceOperand(II, 1, V);
1593  MadeChange = true;
1594  }
1595  if (MadeChange) {
1596  return &II;
1597  }
1598  break;
1599  }
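// Worked example (illustrative, not from the original source): for
// insertqi(dst, src, Length = 8, Index = 16) bits [23:16] of dst's low
// 64 bits are replaced by bits [7:0] of src's low 64 bits, the remaining low
// bits of dst are preserved, and the upper 64 bits of the result are
// undefined.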
1600 
1601  case Intrinsic::x86_sse41_pblendvb:
1602  case Intrinsic::x86_sse41_blendvps:
1603  case Intrinsic::x86_sse41_blendvpd:
1604  case Intrinsic::x86_avx_blendv_ps_256:
1605  case Intrinsic::x86_avx_blendv_pd_256:
1606  case Intrinsic::x86_avx2_pblendvb: {
1607  // fold (blend A, A, Mask) -> A
1608  Value *Op0 = II.getArgOperand(0);
1609  Value *Op1 = II.getArgOperand(1);
1610  Value *Mask = II.getArgOperand(2);
1611  if (Op0 == Op1) {
1612  return IC.replaceInstUsesWith(II, Op0);
1613  }
1614 
1615  // Zero Mask - select 1st argument.
1616  if (isa<ConstantAggregateZero>(Mask)) {
1617  return IC.replaceInstUsesWith(II, Op0);
1618  }
1619 
1620  // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
1621  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
1622  Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
1623  return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
1624  }
1625 
1626  // Convert to a vector select if we can bypass casts and find a boolean
1627  // vector condition value.
1628  Value *BoolVec;
1629  Mask = InstCombiner::peekThroughBitcast(Mask);
1630  if (match(Mask, m_SExt(m_Value(BoolVec))) &&
1631  BoolVec->getType()->isVectorTy() &&
1632  BoolVec->getType()->getScalarSizeInBits() == 1) {
1633  assert(Mask->getType()->getPrimitiveSizeInBits() ==
1634  II.getType()->getPrimitiveSizeInBits() &&
1635  "Not expecting mask and operands with different sizes");
1636 
1637  unsigned NumMaskElts =
1638  cast<FixedVectorType>(Mask->getType())->getNumElements();
1639  unsigned NumOperandElts =
1640  cast<FixedVectorType>(II.getType())->getNumElements();
1641  if (NumMaskElts == NumOperandElts) {
1642  return SelectInst::Create(BoolVec, Op1, Op0);
1643  }
1644 
1645  // If the mask has fewer elements than the operands, each mask bit maps to
1646  // multiple elements of the operands. Bitcast back and forth.
1647  if (NumMaskElts < NumOperandElts) {
1648  Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
1649  Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
1650  Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
1651  return new BitCastInst(Sel, II.getType());
1652  }
1653  }
1654 
1655  break;
1656  }
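// Illustrative IR sketch (hypothetical values, not from the original source):
// a blendvps whose mask is the sign-extension of a bool vector, e.g.
//   %m  = sext <4 x i1> %cond to <4 x i32>
//   %mf = bitcast <4 x i32> %m to <4 x float>
//   %r  = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a,
//                                                   <4 x float> %b,
//                                                   <4 x float> %mf)
// is rewritten above into a target-independent select:
//   %r  = select <4 x i1> %cond, <4 x float> %b, <4 x float> %a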
1657 
1658  case Intrinsic::x86_ssse3_pshuf_b_128:
1659  case Intrinsic::x86_avx2_pshuf_b:
1660  case Intrinsic::x86_avx512_pshuf_b_512:
1661  if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
1662  return IC.replaceInstUsesWith(II, V);
1663  }
1664  break;
1665 
1666  case Intrinsic::x86_avx_vpermilvar_ps:
1667  case Intrinsic::x86_avx_vpermilvar_ps_256:
1668  case Intrinsic::x86_avx512_vpermilvar_ps_512:
1669  case Intrinsic::x86_avx_vpermilvar_pd:
1670  case Intrinsic::x86_avx_vpermilvar_pd_256:
1671  case Intrinsic::x86_avx512_vpermilvar_pd_512:
1672  if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
1673  return IC.replaceInstUsesWith(II, V);
1674  }
1675  break;
1676 
1677  case Intrinsic::x86_avx2_permd:
1678  case Intrinsic::x86_avx2_permps:
1679  case Intrinsic::x86_avx512_permvar_df_256:
1680  case Intrinsic::x86_avx512_permvar_df_512:
1681  case Intrinsic::x86_avx512_permvar_di_256:
1682  case Intrinsic::x86_avx512_permvar_di_512:
1683  case Intrinsic::x86_avx512_permvar_hi_128:
1684  case Intrinsic::x86_avx512_permvar_hi_256:
1685  case Intrinsic::x86_avx512_permvar_hi_512:
1686  case Intrinsic::x86_avx512_permvar_qi_128:
1687  case Intrinsic::x86_avx512_permvar_qi_256:
1688  case Intrinsic::x86_avx512_permvar_qi_512:
1689  case Intrinsic::x86_avx512_permvar_sf_512:
1690  case Intrinsic::x86_avx512_permvar_si_512:
1691  if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
1692  return IC.replaceInstUsesWith(II, V);
1693  }
1694  break;
1695 
1696  case Intrinsic::x86_avx_maskload_ps:
1697  case Intrinsic::x86_avx_maskload_pd:
1698  case Intrinsic::x86_avx_maskload_ps_256:
1699  case Intrinsic::x86_avx_maskload_pd_256:
1700  case Intrinsic::x86_avx2_maskload_d:
1701  case Intrinsic::x86_avx2_maskload_q:
1702  case Intrinsic::x86_avx2_maskload_d_256:
1703  case Intrinsic::x86_avx2_maskload_q_256:
1704  if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
1705  return I;
1706  }
1707  break;
1708 
1709  case Intrinsic::x86_sse2_maskmov_dqu:
1710  case Intrinsic::x86_avx_maskstore_ps:
1711  case Intrinsic::x86_avx_maskstore_pd:
1712  case Intrinsic::x86_avx_maskstore_ps_256:
1713  case Intrinsic::x86_avx_maskstore_pd_256:
1714  case Intrinsic::x86_avx2_maskstore_d:
1715  case Intrinsic::x86_avx2_maskstore_q:
1716  case Intrinsic::x86_avx2_maskstore_d_256:
1717  case Intrinsic::x86_avx2_maskstore_q_256:
1718  if (simplifyX86MaskedStore(II, IC)) {
1719  return nullptr;
1720  }
1721  break;
1722 
1723  case Intrinsic::x86_addcarry_32:
1724  case Intrinsic::x86_addcarry_64:
1725  if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
1726  return IC.replaceInstUsesWith(II, V);
1727  }
1728  break;
1729 
1730  default:
1731  break;
1732  }
1733  return None;
1734 }
1735 
1736 Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
1737  InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
1738  bool &KnownBitsComputed) const {
1739  switch (II.getIntrinsicID()) {
1740  default:
1741  break;
1742  case Intrinsic::x86_mmx_pmovmskb:
1743  case Intrinsic::x86_sse_movmsk_ps:
1744  case Intrinsic::x86_sse2_movmsk_pd:
1745  case Intrinsic::x86_sse2_pmovmskb_128:
1746  case Intrinsic::x86_avx_movmsk_ps_256:
1747  case Intrinsic::x86_avx_movmsk_pd_256:
1748  case Intrinsic::x86_avx2_pmovmskb: {
1749  // MOVMSK copies the vector elements' sign bits to the low bits
1750  // and zeros the high bits.
1751  unsigned ArgWidth;
1752  if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
1753  ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
1754  } else {
1755  auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
1756  ArgWidth = ArgType->getNumElements();
1757  }
1758 
1759  // If none of the low bits are needed then return zero; we already know
1760  // that DemandedMask is non-zero, so only the zeroed high bits are demanded.
1761  APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
1762  Type *VTy = II.getType();
1763  if (DemandedElts.isZero()) {
1764  return ConstantInt::getNullValue(VTy);
1765  }
1766 
1767  // We know that the upper bits are set to zero.
1768  Known.Zero.setBitsFrom(ArgWidth);
1769  KnownBitsComputed = true;
1770  break;
1771  }
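// Worked example (illustrative, not from the original source): for
// @llvm.x86.sse2.pmovmskb.128 the result is an i32 whose low 16 bits hold the
// byte sign bits, so Known.Zero.setBitsFrom(16) records that bits 16..31 are
// always zero; if a caller demands only those high bits, the call folds to
// the constant zero returned above.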
1772  }
1773  return None;
1774 }
1775 
1776 Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1777  InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1778  APInt &UndefElts2, APInt &UndefElts3,
1779  std::function<void(Instruction *, unsigned, APInt, APInt &)>
1780  simplifyAndSetOp) const {
1781  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
1782  switch (II.getIntrinsicID()) {
1783  default:
1784  break;
1785  case Intrinsic::x86_xop_vfrcz_ss:
1786  case Intrinsic::x86_xop_vfrcz_sd:
1787  // The instructions for these intrinsics are specified to zero the upper
1788  // bits rather than pass them through like other scalar intrinsics, so we
1789  // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for other
1790  // intrinsics. Instead we should return a zero vector.
1791  if (!DemandedElts[0]) {
1792  IC.addToWorklist(&II);
1793  return ConstantAggregateZero::get(II.getType());
1794  }
1795 
1796  // Only the lower element is used.
1797  DemandedElts = 1;
1798  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1799 
1800  // Only the lower element may be undefined. The high elements are zero.
1801  UndefElts = UndefElts[0];
1802  break;
1803 
1804  // Unary scalar-as-vector operations that work column-wise.
1805  case Intrinsic::x86_sse_rcp_ss:
1806  case Intrinsic::x86_sse_rsqrt_ss:
1807  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1808 
1809  // If lowest element of a scalar op isn't used then use Arg0.
1810  if (!DemandedElts[0]) {
1811  IC.addToWorklist(&II);
1812  return II.getArgOperand(0);
1813  }
1814  // TODO: If only the low element is used, lower SQRT to FSQRT (with
1815  // rounding/exception checks).
1816  break;
1817 
1818  // Binary scalar-as-vector operations that work column-wise. The high
1819  // elements come from operand 0. The low element is a function of both
1820  // operands.
1821  case Intrinsic::x86_sse_min_ss:
1822  case Intrinsic::x86_sse_max_ss:
1823  case Intrinsic::x86_sse_cmp_ss:
1824  case Intrinsic::x86_sse2_min_sd:
1825  case Intrinsic::x86_sse2_max_sd:
1826  case Intrinsic::x86_sse2_cmp_sd: {
1827  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1828 
1829  // If lowest element of a scalar op isn't used then use Arg0.
1830  if (!DemandedElts[0]) {
1831  IC.addToWorklist(&II);
1832  return II.getArgOperand(0);
1833  }
1834 
1835  // Only lower element is used for operand 1.
1836  DemandedElts = 1;
1837  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1838 
1839  // Lower element is undefined if both lower elements are undefined.
1840  // Consider things like undef&0. The result is known zero, not undef.
1841  if (!UndefElts2[0])
1842  UndefElts.clearBit(0);
1843 
1844  break;
1845  }
1846 
1847  // Binary scalar-as-vector operations that work column-wise. The high
1848  // elements come from operand 0 and the low element comes from operand 1.
1849  case Intrinsic::x86_sse41_round_ss:
1850  case Intrinsic::x86_sse41_round_sd: {
1851  // Don't use the low element of operand 0.
1852  APInt DemandedElts2 = DemandedElts;
1853  DemandedElts2.clearBit(0);
1854  simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
1855 
1856  // If lowest element of a scalar op isn't used then use Arg0.
1857  if (!DemandedElts[0]) {
1858  IC.addToWorklist(&II);
1859  return II.getArgOperand(0);
1860  }
1861 
1862  // Only lower element is used for operand 1.
1863  DemandedElts = 1;
1864  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1865 
1866  // Take the high undef elements from operand 0 and take the lower element
1867  // from operand 1.
1868  UndefElts.clearBit(0);
1869  UndefElts |= UndefElts2[0];
1870  break;
1871  }
1872 
1873  // Three input scalar-as-vector operations that work column-wise. The high
1874  // elements come from operand 0 and the low element is a function of all
1875  // three inputs.
1876  case Intrinsic::x86_avx512_mask_add_ss_round:
1877  case Intrinsic::x86_avx512_mask_div_ss_round:
1878  case Intrinsic::x86_avx512_mask_mul_ss_round:
1879  case Intrinsic::x86_avx512_mask_sub_ss_round:
1880  case Intrinsic::x86_avx512_mask_max_ss_round:
1881  case Intrinsic::x86_avx512_mask_min_ss_round:
1882  case Intrinsic::x86_avx512_mask_add_sd_round:
1883  case Intrinsic::x86_avx512_mask_div_sd_round:
1884  case Intrinsic::x86_avx512_mask_mul_sd_round:
1885  case Intrinsic::x86_avx512_mask_sub_sd_round:
1886  case Intrinsic::x86_avx512_mask_max_sd_round:
1887  case Intrinsic::x86_avx512_mask_min_sd_round:
1888  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1889 
1890  // If lowest element of a scalar op isn't used then use Arg0.
1891  if (!DemandedElts[0]) {
1892  IC.addToWorklist(&II);
1893  return II.getArgOperand(0);
1894  }
1895 
1896  // Only lower element is used for operand 1 and 2.
1897  DemandedElts = 1;
1898  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1899  simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
1900 
1901  // Lower element is undefined if all three lower elements are undefined.
1902  // Consider things like undef&0. The result is known zero, not undef.
1903  if (!UndefElts2[0] || !UndefElts3[0])
1904  UndefElts.clearBit(0);
1905  break;
1906 
1907  // TODO: Add fmaddsub support?
1908  case Intrinsic::x86_sse3_addsub_pd:
1909  case Intrinsic::x86_sse3_addsub_ps:
1910  case Intrinsic::x86_avx_addsub_pd_256:
1911  case Intrinsic::x86_avx_addsub_ps_256: {
1912  // If none of the even or none of the odd lanes are required, turn this
1913  // into a generic FP math instruction.
1914  APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
1915  APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
1916  bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
1917  bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
1918  if (IsSubOnly || IsAddOnly) {
1919  assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
1920  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1921  IC.Builder.SetInsertPoint(&II);
1922  Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
1923  return IC.Builder.CreateBinOp(
1924  IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
1925  }
1926 
1927  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1928  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1929  UndefElts &= UndefElts2;
1930  break;
1931  }
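// Illustrative sketch (hypothetical values, not from the original source):
// for @llvm.x86.sse3.addsub.ps only the odd lanes are additions, so if just
// those lanes are demanded (mask 0b1010 for <4 x float>) the call becomes
//   %r = fadd <4 x float> %a, %b
// and, symmetrically, fsub when only the even (subtract) lanes are demanded.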
1932 
1933  // General per-element vector operations.
1934  case Intrinsic::x86_avx2_psllv_d:
1935  case Intrinsic::x86_avx2_psllv_d_256:
1936  case Intrinsic::x86_avx2_psllv_q:
1937  case Intrinsic::x86_avx2_psllv_q_256:
1938  case Intrinsic::x86_avx2_psrlv_d:
1939  case Intrinsic::x86_avx2_psrlv_d_256:
1940  case Intrinsic::x86_avx2_psrlv_q:
1941  case Intrinsic::x86_avx2_psrlv_q_256:
1942  case Intrinsic::x86_avx2_psrav_d:
1943  case Intrinsic::x86_avx2_psrav_d_256: {
1944  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1945  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1946  UndefElts &= UndefElts2;
1947  break;
1948  }
1949 
1950  case Intrinsic::x86_sse2_packssdw_128:
1951  case Intrinsic::x86_sse2_packsswb_128:
1952  case Intrinsic::x86_sse2_packuswb_128:
1953  case Intrinsic::x86_sse41_packusdw:
1954  case Intrinsic::x86_avx2_packssdw:
1955  case Intrinsic::x86_avx2_packsswb:
1956  case Intrinsic::x86_avx2_packusdw:
1957  case Intrinsic::x86_avx2_packuswb:
1958  case Intrinsic::x86_avx512_packssdw_512:
1959  case Intrinsic::x86_avx512_packsswb_512:
1960  case Intrinsic::x86_avx512_packusdw_512:
1961  case Intrinsic::x86_avx512_packuswb_512: {
1962  auto *Ty0 = II.getArgOperand(0)->getType();
1963  unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
1964  assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
1965 
1966  unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
1967  unsigned VWidthPerLane = VWidth / NumLanes;
1968  unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
1969 
1970  // Per lane, pack the elements of the first input and then the second.
1971  // e.g.
1972  // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
1973  // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
1974  for (int OpNum = 0; OpNum != 2; ++OpNum) {
1975  APInt OpDemandedElts(InnerVWidth, 0);
1976  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1977  unsigned LaneIdx = Lane * VWidthPerLane;
1978  for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
1979  unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
1980  if (DemandedElts[Idx])
1981  OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
1982  }
1983  }
1984 
1985  // Demand elements from the operand.
1986  APInt OpUndefElts(InnerVWidth, 0);
1987  simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
1988 
1989  // Pack the operand's UNDEF elements, one lane at a time.
1990  OpUndefElts = OpUndefElts.zext(VWidth);
1991  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1992  APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
1993  LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
1994  LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
1995  UndefElts |= LaneElts;
1996  }
1997  }
1998  break;
1999  }
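// Worked example (illustrative, not from the original source): for
// @llvm.x86.sse2.packssdw.128 (v8i16 result from two v4i32 inputs), demanding
// only result element 5 maps to Lane 0, OpNum 1, Elt 1, so operand 1 gets
// demanded element 1 and operand 0 ends up with no demanded elements.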
2000 
2001  // PSHUFB
2002  case Intrinsic::x86_ssse3_pshuf_b_128:
2003  case Intrinsic::x86_avx2_pshuf_b:
2004  case Intrinsic::x86_avx512_pshuf_b_512:
2005  // PERMILVAR
2006  case Intrinsic::x86_avx_vpermilvar_ps:
2007  case Intrinsic::x86_avx_vpermilvar_ps_256:
2008  case Intrinsic::x86_avx512_vpermilvar_ps_512:
2009  case Intrinsic::x86_avx_vpermilvar_pd:
2010  case Intrinsic::x86_avx_vpermilvar_pd_256:
2011  case Intrinsic::x86_avx512_vpermilvar_pd_512:
2012  // PERMV
2013  case Intrinsic::x86_avx2_permd:
2014  case Intrinsic::x86_avx2_permps: {
2015  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
2016  break;
2017  }
2018 
2019  // SSE4A instructions leave the upper 64-bits of the 128-bit result
2020  // in an undefined state.
2021  case Intrinsic::x86_sse4a_extrq:
2022  case Intrinsic::x86_sse4a_extrqi:
2023  case Intrinsic::x86_sse4a_insertq:
2024  case Intrinsic::x86_sse4a_insertqi:
2025  UndefElts.setHighBits(VWidth / 2);
2026  break;
2027  }
2028  return None;
2029 }
llvm::X86TTIImpl::simplifyDemandedVectorEltsIntrinsic
Optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: X86InstCombineIntrinsic.cpp:1776
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:179
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:65
InstCombiner.h
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1410
IntrinsicInst.h
llvm::KnownBits::getMinValue
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition: KnownBits.h:120
llvm::Function
Definition: Function.h:60
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:53
simplifyX86insertq
static Value * simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, APInt APLength, APInt APIndex, InstCombiner::BuilderTy &Builder)
Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant folding or conversion to a shuffle vector.
Definition: X86InstCombineIntrinsic.cpp:709
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:727
simplifyX86vpermilvar
static Value * simplifyX86vpermilvar(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert vpermilvar* to shufflevector if the mask is constant.
Definition: X86InstCombineIntrinsic.cpp:847
llvm::BitCastInst
This class represents a no-op cast from one type to another.
Definition: Instructions.h:5225
llvm::KnownBits::Zero
APInt Zero
Definition: KnownBits.h:24
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:133
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
simplifyX86pack
static Value * simplifyX86pack(IntrinsicInst &II, InstCombiner::BuilderTy &Builder, bool IsSigned)
Definition: X86InstCombineIntrinsic.cpp:437
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:58
llvm::ConstantExpr::getICmp
static Constant * getICmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced=false)
get* - Return some common constants without having to specify the full Instruction::OPCODE identifier...
Definition: Constants.cpp:2527
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::APInt::zextOrTrunc
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:994
llvm::ConstantExpr::getBitCast
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2258
llvm::APInt::getBitsSet
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:241
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:746
llvm::APInt::getSignedMaxValue
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:189
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::ConstantAggregateZero
All zero aggregate value.
Definition: Constants.h:336
llvm::IRBuilderBase::CreateFSub
Value * CreateFSub(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1432
llvm::Optional
Definition: APInt.h:33
llvm::KnownBits::isZero
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:72
llvm::InstCombiner::addToWorklist
void addToWorklist(Instruction *I)
Definition: InstCombiner.h:366
llvm::IRBuilderBase::CreateInsertElement
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2289
llvm::APInt::lshr
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:832
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:237
llvm::APInt::getZero
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:177
KnownBits.h
llvm::IRBuilderBase::CreateBinOp
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1532
llvm::APInt::setHighBits
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1354
llvm::APInt::uge
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1171
simplifyX86pshufb
static Value * simplifyX86pshufb(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert pshufb* to shufflevector if the mask is constant.
Definition: X86InstCombineIntrinsic.cpp:802
llvm::IRBuilderBase::CreateFMul
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1457
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:438
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1605
llvm::SelectInst::Create
static SelectInst * Create(Value *C, Value *S1, Value *S2, const Twine &NameStr="", Instruction *InsertBefore=nullptr, Instruction *MDFrom=nullptr)
Definition: Instructions.h:1768
simplifyX86varShift
static Value * simplifyX86varShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:301
llvm::APInt::setBit
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1292
llvm::APInt::lshrInPlace
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:839
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::APInt::isZero
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:359
getBoolVecFromMask
static Value * getBoolVecFromMask(Value *Mask)
Convert the x86 XMM integer vector mask to a vector of bools based on each element's most significant...
Definition: X86InstCombineIntrinsic.cpp:38
llvm::VectorType::getInteger
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:440
llvm::X86TTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: X86InstCombineIntrinsic.cpp:928
simplifyX86extrq
static Value * simplifyX86extrq(IntrinsicInst &II, Value *Op0, ConstantInt *CILength, ConstantInt *CIIndex, InstCombiner::BuilderTy &Builder)
Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding or conversion to a shuffle...
Definition: X86InstCombineIntrinsic.cpp:618
simplifyX86movmsk
static Value * simplifyX86movmsk(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:503
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:227
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1478
llvm::InstCombiner::SimplifyDemandedVectorElts
virtual Value * SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, unsigned Depth=0, bool AllowMultipleUsers=false)=0
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1769
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:919
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::None
const NoneType None
Definition: None.h:24
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
llvm::IRBuilderBase::CreateAnd
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1350
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:191
llvm::APInt::isSubsetOf
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1207
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:222
getNegativeIsTrueBoolVec
static Constant * getNegativeIsTrueBoolVec(Constant *V)
Return a constant boolean vector that has true elements in all positions where the input constant data vector has an element with the sign bit set.
Definition: X86InstCombineIntrinsic.cpp:28
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::IRBuilderBase::CreateFAdd
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1407
llvm::IRBuilderBase::CreateFDiv
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1482
simplifyX86MaskedStore
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC)
Definition: X86InstCombineIntrinsic.cpp:86
llvm::IRBuilderBase::CreateBitCast
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1952
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::IRBuilderBase::CreateSelect
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:991
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::KnownBits::getMaxValue
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:136
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne...
Definition: ValueTracking.cpp:222
llvm::IRBuilderBase::CreateExtractElement
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2276
llvm::X86TTIImpl::simplifyDemandedUseBitsIntrinsic
Optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
Definition: X86InstCombineIntrinsic.cpp:1736
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::PatternMatch::m_SExt
CastClass_match< OpTy, Instruction::SExt > m_SExt(const OpTy &Op)
Matches SExt.
Definition: PatternMatch.h:1633
llvm::Type::getStructElementType
Type * getStructElementType(unsigned N) const
Definition: DerivedTypes.h:352
X86TargetTransformInfo.h
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Constant::getAggregateElement
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:410
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
llvm::IRBuilderBase::InsertPointGuard
Definition: IRBuilder.h:350
llvm::ConstantVector::get
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1389
llvm::APInt::ult
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1061
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:280
llvm::APInt::clearBit
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1369
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:417
llvm::APInt::zext
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:973
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:240
simplifyX86insertps
static Value * simplifyX86insertps(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:559
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:350
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:197
llvm::APInt::getLoBits
APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition: APInt.cpp:605
llvm::APInt::getSplat
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:612
simplifyX86vpermv
static Value * simplifyX86vpermv(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
Definition: X86InstCombineIntrinsic.cpp:894
llvm::PatternMatch::m_ZeroInt
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:522
llvm::APInt::getSignedMinValue
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:199
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::APInt::sext
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:946
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:46
llvm::makeArrayRef
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition: ArrayRef.h:475
llvm::IRBuilderBase::getInt1Ty
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:486
llvm::Constant::getIntegerValue
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:378
simplifyX86addcarry
static Value * simplifyX86addcarry(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:532
llvm::IRBuilderBase::CreateLShr
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1310
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::IRBuilderBase::CreateMaskedStore
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:551
simplifyX86MaskedLoad
static Instruction * simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC)
Definition: X86InstCombineIntrinsic.cpp:56
llvm::IRBuilderBase::CreateShl
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1289
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:289
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
llvm::APInt::setBitsFrom
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1348
llvm::InstCombiner::peekThroughBitcast
static Value * peekThroughBitcast(Value *V, bool OneUseOnly=false)
Return the source operand of a potentially bitcasted value while optionally checking if it has one use...
Definition: InstCombiner.h:101
llvm::APInt::shl
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:854
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::ConstantAggregateZero::get
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1648
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::IRBuilderBase::CreateMaskedLoad
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:530
simplifyX86immShift
static Value * simplifyX86immShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:119
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:164
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37