#include "llvm/IR/IntrinsicsX86.h"
// ...
#define DEBUG_TYPE "x86-partial-reduction"
// ...
  StringRef getPassName() const override { return "X86 Partial Reduction"; }
// ...

FunctionPass *llvm::createX86PartialReductionPass() {
  return new X86PartialReduction();
}

INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, "X86 Partial Reduction",
                false, false)
static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul,
                                 const DataLayout *DL) {
  if (!ST->hasVNNI() && !ST->hasAVXVNNI())
    return false;
  // ...
  // Put a sign-extended operand, if any, on the RHS.
  if (isa<SExtInst>(LHS))
    std::swap(LHS, RHS);

  // An extend is free to undo if it lives in the same block as the multiply
  // and widens from at most 8 bits; constants can always be re-truncated.
  auto IsFreeTruncation = [&](Value *Op) {
    if (auto *Cast = dyn_cast<CastInst>(Op)) {
      if (Cast->getParent() == Mul->getParent() &&
          (Cast->getOpcode() == Instruction::SExt ||
           Cast->getOpcode() == Instruction::ZExt) &&
          Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 8)
        return true;
    }
    return isa<Constant>(Op);
  };
  // ...
}
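// ---------------------------------------------------------------------------
// Aside (not part of this file): the pattern deliberately left to instruction
// selection above is what vpdpbusd computes. A minimal scalar sketch of one
// 32-bit lane, assuming the documented AVX-VNNI semantics (unsigned bytes
// from the first source, signed bytes from the second, no saturation); the
// helper name is ours.
// ---------------------------------------------------------------------------
#include <cassert>
#include <cstdint>

int32_t vpdpbusd_lane(int32_t Acc, const uint8_t A[4], const int8_t B[4]) {
  for (int j = 0; j != 4; ++j)
    Acc += int32_t(A[j]) * int32_t(B[j]); // zext(A[j]) * sext(B[j])
  return Acc;
}

int main() {
  const uint8_t A[4] = {200, 3, 7, 255};
  const int8_t B[4] = {-2, 5, -7, 1};
  assert(vpdpbusd_lane(0, A, B) == -400 + 15 - 49 + 255); // == -179
  return 0;
}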
bool X86PartialReduction::tryMAddReplacement(Instruction *Op,
                                             bool ReduceInOneBB) {
  // ...
  // Need at least 8 elements.
  if (cast<FixedVectorType>(Op->getType())->getNumElements() < 8)
    return false;

  // Element type should be i32.
  if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
    return false;

  auto *Mul = dyn_cast<BinaryOperator>(Op);
  if (!Mul || Mul->getOpcode() != Instruction::Mul)
    return false;

  Value *LHS = Mul->getOperand(0);
  Value *RHS = Mul->getOperand(1);

  // If the target supports VNNI, leave the whole reduction for ISel to
  // combine into a VNNI instruction instead of splitting it here.
  if (ReduceInOneBB && matchVPDPBUSDPattern(ST, Mul, DL))
    return false;
  if (ST->hasSSE41()) {
    // With SSE4.1 the extends are real instructions, so the operands must be
    // single-use (or twice-used when LHS == RHS) for the rewrite to pay off.
    // ...
  }

  auto CanShrinkOp = [&](Value *Op) {
    auto IsFreeTruncation = [&](Value *Op) {
      if (auto *Cast = dyn_cast<CastInst>(Op)) {
        if (Cast->getParent() == Mul->getParent() &&
            (Cast->getOpcode() == Instruction::SExt ||
             Cast->getOpcode() == Instruction::ZExt) &&
            Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 16)
          return true;
      }
      return isa<Constant>(Op);
    };
    // ...
    if (auto *BO = dyn_cast<BinaryOperator>(Op)) {
      // ...
    }
    // ...
  };

  // Give up unless at least one operand can be narrowed to 16 bits.
  if (!CanShrinkOp(LHS) && !CanShrinkOp(RHS))
    return false;
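// ---------------------------------------------------------------------------
// Aside (not part of this file): the property CanShrinkOp is testing. If a
// 32-bit value carries at most 16 significant bits, multiplying its 16-bit
// truncation and widening the product gives the same result as the full
// 32-bit multiply, which is what makes the pmaddwd rewrite below sound.
// A minimal sketch:
// ---------------------------------------------------------------------------
#include <cassert>
#include <cstdint>

int32_t mul_via_i16(int32_t A, int32_t B) {
  int16_t LoA = static_cast<int16_t>(A); // the "free truncation"
  int16_t LoB = static_cast<int16_t>(B);
  return int32_t(LoA) * int32_t(LoB);    // 16x16->32 multiply, as pmaddwd does
}

int main() {
  for (int32_t A : {-32768, -5, 0, 123, 32767})
    for (int32_t B : {-32768, -1, 7, 32767})
      assert(mul_via_i16(A, B) == A * B); // holds whenever both fit in i16
  return 0;
}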
  auto *MulTy = cast<FixedVectorType>(Op->getType());
  unsigned NumElts = MulTy->getNumElements();

  // Split the multiply into even and odd elements and add the halves; this is
  // the shape SelectionDAG recognizes as (v)pmaddwd.
  SmallVector<int, 16> EvenMask(NumElts / 2);
  SmallVector<int, 16> OddMask(NumElts / 2);
  for (int i = 0, e = NumElts / 2; i != e; ++i) {
    EvenMask[i] = i * 2;
    OddMask[i] = i * 2 + 1;
  }
  // ...
  Value *EvenElts = Builder.CreateShuffleVector(NewMul, NewMul, EvenMask);
  Value *OddElts = Builder.CreateShuffleVector(NewMul, NewMul, OddMask);
  Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);

  // Concatenate zeroes to extend back to the original type.
  SmallVector<int, 32> ConcatMask(NumElts);
  std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
  // ... (pad MAdd back out with zeroes and replace all uses of Mul)
}
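// ---------------------------------------------------------------------------
// Aside (not part of this file): why the even/odd split is the right shape.
// pmaddwd computes a[2i]*b[2i] + a[2i+1]*b[2i+1] per 32-bit lane on
// sign-extended 16-bit inputs, which is exactly EvenElts + OddElts of the
// widened multiply. A scalar sketch:
// ---------------------------------------------------------------------------
#include <cassert>
#include <cstdint>

int32_t pmaddwd_lane(const int16_t A[2], const int16_t B[2]) {
  return int32_t(A[0]) * int32_t(B[0]) + int32_t(A[1]) * int32_t(B[1]);
}

int main() {
  int16_t A[8] = {1, -2, 3, -4, 5, -6, 7, -8};
  int16_t B[8] = {8, 7, 6, 5, 4, 3, 2, 1};

  int32_t Mul[8], EvenPlusOdd[4];
  for (int i = 0; i != 8; ++i)
    Mul[i] = int32_t(A[i]) * int32_t(B[i]); // the widened multiply
  for (int i = 0; i != 4; ++i)              // EvenElts + OddElts
    EvenPlusOdd[i] = Mul[2 * i] + Mul[2 * i + 1];

  for (int i = 0; i != 4; ++i)
    assert(EvenPlusOdd[i] == pmaddwd_lane(&A[2 * i], &B[2 * i]));
  return 0;
}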
bool X86PartialReduction::trySADReplacement(Instruction *Op) {
  // ...
  // Operand should have i32 elements.
  if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
    return false;

  Value *LHS;
  if (match(Op, PatternMatch::m_Intrinsic<Intrinsic::abs>())) {
    LHS = Op->getOperand(0);
  } else {
    // Otherwise the operand must be a select implementing absolute value.
    auto *SI = dyn_cast<SelectInst>(Op);
    // ...
  }

  // Need a subtract of two values.
  auto *Sub = dyn_cast<BinaryOperator>(LHS);
  if (!Sub || Sub->getOpcode() != Instruction::Sub)
    return false;

  // Look through a zero extend from vXi8; anything else disqualifies.
  auto getZeroExtendedVal = [](Value *Op) -> Value * {
    if (auto *ZExt = dyn_cast<ZExtInst>(Op))
      if (cast<VectorType>(ZExt->getOperand(0)->getType())
              ->getElementType()
              ->isIntegerTy(8))
        return ZExt->getOperand(0);
    return nullptr;
  };
  // Both operands of the subtract must be zero-extends from vXi8.
  Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
  Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
  // ...

  auto *OpTy = cast<FixedVectorType>(Op->getType());
  unsigned NumElts = OpTy->getNumElements();
  // Pick the widest psadbw the subtarget offers that the input can fill.
  unsigned IntrinsicNumElts;
  Intrinsic::ID IID;
  if (ST->hasBWI() && NumElts >= 64) {
    IID = Intrinsic::x86_avx512_psad_bw_512;
    IntrinsicNumElts = 64;
  } else if (ST->hasAVX2() && NumElts >= 32) {
    IID = Intrinsic::x86_avx2_psad_bw;
    IntrinsicNumElts = 32;
  } else {
    IID = Intrinsic::x86_sse2_psad_bw;
    IntrinsicNumElts = 16;
  }
  if (NumElts < 16) {
    // Pad the input out to 16 bytes with zero elements.
    SmallVector<int, 32> ConcatMask(16);
    for (unsigned i = 0; i != NumElts; ++i)
      ConcatMask[i] = i;
    for (unsigned i = NumElts; i != 16; ++i)
      ConcatMask[i] = (i % NumElts) + NumElts;

    Value *Zero = Constant::getNullValue(Op0->getType());
    Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
    Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
    NumElts = 16;
  }
  assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
  unsigned NumSplits = NumElts / IntrinsicNumElts;
  // Split the input into IntrinsicNumElts-byte pieces, run psadbw on each,
  // and bitcast the vXi64 results to vXi32 for the surrounding reduction.
  SmallVector<Value *, 4> Ops(NumSplits);
  for (unsigned i = 0; i != NumSplits; ++i) {
    SmallVector<int, 64> ExtractMask(IntrinsicNumElts);
    std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
    Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
    Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask);
    Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
    Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
  }
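// ---------------------------------------------------------------------------
// Aside (not part of this file): what each psadbw call above computes. Per
// 8-byte group, the absolute differences of the unsigned bytes are summed
// into one 64-bit lane. A scalar model of the 16-byte SSE2 form:
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>
#include <cstdlib>

void psadbw_model(const uint8_t A[16], const uint8_t B[16], uint64_t Out[2]) {
  for (int Lane = 0; Lane != 2; ++Lane) {
    uint64_t Sum = 0;
    for (int i = 0; i != 8; ++i)
      Sum += std::abs(int(A[8 * Lane + i]) - int(B[8 * Lane + i]));
    Out[Lane] = Sum; // one i64 sum of absolute differences per 8 bytes
  }
}

int main() {
  uint8_t A[16], B[16];
  for (int i = 0; i != 16; ++i) {
    A[i] = uint8_t(i * 7);
    B[i] = uint8_t(255 - i);
  }
  uint64_t Out[2];
  psadbw_model(A, B, Out);
  printf("%llu %llu\n", (unsigned long long)Out[0], (unsigned long long)Out[1]);
  return 0;
}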
  // Stitch the pieces back together by pairwise concatenation until a single
  // vector remains; the existing add reduction then sums the lanes.
  unsigned Stages = Log2_32(NumSplits);
  for (unsigned s = Stages; s > 0; --s) {
    unsigned NumConcatElts =
        cast<FixedVectorType>(Ops[0]->getType())->getNumElements() * 2;
    for (unsigned i = 0; i != 1U << (s - 1); ++i) {
      SmallVector<int, 64> ConcatMask(NumConcatElts);
      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
      Ops[i] = Builder.CreateShuffleVector(Ops[i * 2], Ops[i * 2 + 1],
                                           ConcatMask);
    }
  }
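// ---------------------------------------------------------------------------
// Aside (not part of this file): the iota-mask shuffle used above is the IR
// idiom for concatenation; element k of the result takes element Mask[k]
// from the two inputs laid end to end. A minimal sketch on plain vectors:
// ---------------------------------------------------------------------------
#include <cassert>
#include <numeric>
#include <vector>

std::vector<int> shuffle_concat(const std::vector<int> &L,
                                const std::vector<int> &R) {
  std::vector<int> Mask(L.size() + R.size());
  std::iota(Mask.begin(), Mask.end(), 0); // 0, 1, ..., 2N-1
  std::vector<int> Out(Mask.size());
  for (size_t k = 0; k != Mask.size(); ++k)
    Out[k] = size_t(Mask[k]) < L.size() ? L[Mask[k]]
                                        : R[Mask[k] - L.size()];
  return Out;
}

int main() {
  assert((shuffle_concat({1, 2}, {3, 4}) == std::vector<int>{1, 2, 3, 4}));
  return 0;
}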
  // The final value is now in Ops[0]; adjust it back to the original type.
  NumElts = cast<FixedVectorType>(OpTy)->getNumElements();
  if (NumElts == 2) {
    // Extract down to 2 elements.
    Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{0, 1});
  } else if (NumElts >= 8) {
    SmallVector<int, 32> ConcatMask(NumElts);
    unsigned SubElts =
        cast<FixedVectorType>(Ops[0]->getType())->getNumElements();
    for (unsigned i = 0; i != SubElts; ++i)
      ConcatMask[i] = i;
    for (unsigned i = SubElts; i != NumElts; ++i)
      ConcatMask[i] = (i % SubElts) + SubElts;

    Value *Zero = Constant::getNullValue(Ops[0]->getType());
    Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
  }

  Op->replaceAllUsesWith(Ops[0]);
  Op->eraseFromParent();

  return true;
}
// Walk backwards from the extractelement over a shuffle+add ladder to see if
// it forms a horizontal add reduction; return the reduced value if so.
static Value *matchAddReduction(const ExtractElementInst &EE,
                                bool &ReduceInOneBB) {
  ReduceInOneBB = true;
  // ...
  if (EE.getParent() != BO->getParent())
    ReduceInOneBB = false;

  unsigned NumElems = cast<FixedVectorType>(BO->getType())->getNumElements();
  // ...
  const Value *Op = BO;
  unsigned Stages = Log2_32(NumElems);
  for (unsigned i = 0; i != Stages; ++i) {
    const auto *BO = dyn_cast<BinaryOperator>(Op);
    // ...
    if (EE.getParent() != BO->getParent())
      ReduceInOneBB = false;

    // Past the first stage the add must have exactly two users: the shuffle
    // and the add matched on the previous iteration.
    if (i != 0 && !BO->hasNUses(2))
      return nullptr;

    Value *LHS = BO->getOperand(0);
    Value *RHS = BO->getOperand(1);

    auto *Shuffle = dyn_cast<ShuffleVectorInst>(LHS);
    if (Shuffle) {
      Op = RHS;
    } else {
      Shuffle = dyn_cast<ShuffleVectorInst>(RHS);
      Op = LHS;
    }

    // The shuffle must read the same value as the other add operand.
    if (!Shuffle || Shuffle->getOperand(0) != Op)
      return nullptr;

    // At stage i the shuffle must move the upper 1 << i elements down.
    unsigned MaskEnd = 1 << i;
    for (unsigned Index = 0; Index != MaskEnd; ++Index)
      if (Shuffle->getMaskValue(Index) != (int)(MaskEnd + Index))
        return nullptr;
  }

  return const_cast<Value *>(Op);
}
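// ---------------------------------------------------------------------------
// Aside (not part of this file): the ladder matchAddReduction recognizes,
// shown forward. Each stage shuffles the upper half down and adds it to the
// lower half; after log2(N) stages lane 0 holds the total, which the
// extractelement then reads. A scalar sketch:
// ---------------------------------------------------------------------------
#include <cassert>
#include <numeric>
#include <vector>

int reduce_ladder(std::vector<int> V) {
  for (size_t Half = V.size() / 2; Half != 0; Half /= 2)
    for (size_t Index = 0; Index != Half; ++Index)
      V[Index] += V[Half + Index]; // shuffle mask value Half + Index
  return V[0];                     // the extractelement of lane 0
}

int main() {
  std::vector<int> V{1, 2, 3, 4, 5, 6, 7, 8};
  assert(reduce_ladder(V) == std::accumulate(V.begin(), V.end(), 0));
  return 0;
}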
// Walk the tree of adds that feeds the final reduction, looking through
// single-use phis, and collect the leaf instructions producing the values
// being summed.
static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
  SmallPtrSet<Value *, 8> Visited;
  SmallVector<Value *, 8> Worklist;
  Worklist.push_back(Root);

  while (!Worklist.empty()) {
    Value *V = Worklist.pop_back_val();
    if (!Visited.insert(V).second)
      continue;

    if (auto *PN = dyn_cast<PHINode>(V)) {
      // A phi may only feed the reduction; the root gets one extra use from
      // the reduction itself.
      if (!PN->hasNUses(PN == Root ? 2 : 1))
        break;
      // ... (queue the incoming values)
    }

    if (auto *BO = dyn_cast<BinaryOperator>(V)) {
      // Simple case: a single-use add; queue its operands.
      if (BO->hasNUses(BO == Root ? 2 : 1)) {
        // ...
      }

      // One extra use is tolerated if it is a phi that loops back to this
      // add (a reduction carried around a loop).
      if (BO->hasNUses(BO == Root ? 3 : 2)) {
        PHINode *PN = nullptr;
        for (auto *U : Root->users())
          if (auto *P = dyn_cast<PHINode>(U))
            if (!Visited.count(P))
              PN = P;
        // ... (verify PN reaches back to this add)
      }
      // ...
    }

    // Anything else is a leaf of the reduction tree.
    if (auto *I = dyn_cast<Instruction>(V)) {
      Leaves.push_back(I);
    }
  }
}
bool X86PartialReduction::runOnFunction(Function &F) {
  // ...
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  auto &TM = TPC->getTM<X86TargetMachine>();
  ST = TM.getSubtargetImpl(F);
  DL = &F.getParent()->getDataLayout();

  bool MadeChange = false;
  // For every extractelement that ends an add reduction, collect the leaves
  // of the reduction tree and try to rewrite each one.
  for (auto &BB : F) {
    for (auto &I : BB) {
      auto *EE = dyn_cast<ExtractElementInst>(&I);
      if (!EE)
        continue;

      // ... (match the reduction and collect its leaves) ...

      for (Instruction *I : Leaves) {
        if (tryMAddReplacement(I, ReduceInOneBB)) {
          MadeChange = true;
          continue;
        }

        // SelectionDAG already matches SAD at the reduction root and makes
        // better code for it, so only try the non-root leaves here.
        if (I != Root && trySADReplacement(I))
          MadeChange = true;
      }
    }
  }
  return MadeChange;
}
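// ---------------------------------------------------------------------------
// Aside (not part of this file): the kind of source the whole pass targets.
// Once vectorized, a loop like this yields zext/sub/abs leaves feeding the
// shuffle+add reduction matched above, which the pass then rewrites so
// instruction selection can emit psadbw. Illustrative sketch only:
// ---------------------------------------------------------------------------
#include <cassert>
#include <cstdint>
#include <cstdlib>

unsigned sad(const uint8_t *A, const uint8_t *B, unsigned N) {
  unsigned Sum = 0;
  for (unsigned i = 0; i != N; ++i)
    Sum += std::abs(int(A[i]) - int(B[i])); // abs(sub(zext, zext))
  return Sum;
}

int main() {
  uint8_t A[16] = {0}, B[16] = {0};
  A[0] = 10;
  B[0] = 3;
  assert(sad(A, B, 16) == 7);
  return 0;
}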