1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
44#include "llvm/Transforms/Utils/InstructionWorklist.h"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// The next instruction to visit in the run loop. It is updated when that
103 /// instruction is erased by RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoad(Instruction &I);
133 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
134 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
135 bool scalarizeExtExtract(Instruction &I);
136 bool foldConcatOfBoolMasks(Instruction &I);
137 bool foldPermuteOfBinops(Instruction &I);
138 bool foldShuffleOfBinops(Instruction &I);
139 bool foldShuffleOfSelects(Instruction &I);
140 bool foldShuffleOfCastops(Instruction &I);
141 bool foldShuffleOfShuffles(Instruction &I);
142 bool foldPermuteOfIntrinsic(Instruction &I);
143 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
144 bool foldShuffleOfIntrinsics(Instruction &I);
145 bool foldShuffleToIdentity(Instruction &I);
146 bool foldShuffleFromReductions(Instruction &I);
147 bool foldShuffleChainsToReduce(Instruction &I);
148 bool foldCastFromReductions(Instruction &I);
149 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
150 bool foldInterleaveIntrinsics(Instruction &I);
151 bool shrinkType(Instruction &I);
152 bool shrinkLoadForShuffles(Instruction &I);
153 bool shrinkPhiOfShuffles(Instruction &I);
154
155 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
156 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
157 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
158 Old.replaceAllUsesWith(&New);
159 if (auto *NewI = dyn_cast<Instruction>(&New)) {
160 New.takeName(&Old);
161 Worklist.pushUsersToWorkList(*NewI);
162 Worklist.pushValue(NewI);
163 }
164 if (Erase && isInstructionTriviallyDead(&Old)) {
165 eraseInstruction(Old);
166 } else {
167 Worklist.push(&Old);
168 }
169 }
170
171 void eraseInstruction(Instruction &I) {
172 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
173 SmallVector<Value *> Ops(I.operands());
174 Worklist.remove(&I);
175 I.eraseFromParent();
176
177 // Push remaining users of the operands and then the operand itself - allows
178 // further folds that were hindered by OneUse limits.
179 SmallPtrSet<Value *, 4> Visited;
180 for (Value *Op : Ops) {
181 if (!Visited.contains(Op)) {
182 if (auto *OpI = dyn_cast<Instruction>(Op)) {
183 if (RecursivelyDeleteTriviallyDeadInstructions(
184 OpI, nullptr, nullptr, [&](Value *V) {
185 if (auto *I = dyn_cast<Instruction>(V)) {
186 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
187 Worklist.remove(I);
188 if (I == NextInst)
189 NextInst = NextInst->getNextNode();
190 Visited.insert(I);
191 }
192 }))
193 continue;
194 Worklist.pushUsersToWorkList(*OpI);
195 Worklist.pushValue(OpI);
196 }
197 }
198 }
199 }
200};
201} // namespace
202
203/// Return the source operand of a potentially bitcasted value. If there is no
204/// bitcast, return the input value itself.
205static Value *peekThroughBitcasts(Value *V) {
206 while (auto *BitCast = dyn_cast<BitCastInst>(V))
207 V = BitCast->getOperand(0);
208 return V;
209}
210
211static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
212 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
213 // The widened load may load data from dirty regions or create data races
214 // non-existent in the source.
215 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
216 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
217 mustSuppressSpeculation(*Load))
218 return false;
219
220 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
221 // sure we have all of our type-based constraints in place for this target.
222 Type *ScalarTy = Load->getType()->getScalarType();
223 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
224 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
225 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
226 ScalarSize % 8 != 0)
227 return false;
228
229 return true;
230}
231
232bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
233 // Match insert into fixed vector of scalar value.
234 // TODO: Handle non-zero insert index.
235 Value *Scalar;
236 if (!match(&I,
237 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
238 return false;
239
240 // Optionally match an extract from another vector.
241 Value *X;
242 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
243 if (!HasExtract)
244 X = Scalar;
245
246 auto *Load = dyn_cast<LoadInst>(X);
247 if (!canWidenLoad(Load, TTI))
248 return false;
249
250 Type *ScalarTy = Scalar->getType();
251 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
252 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
253
254 // Check safety of replacing the scalar load with a larger vector load.
255 // We use minimal alignment (maximum flexibility) because we only care about
256 // the dereferenceable region. When calculating cost and creating a new op,
257 // we may use a larger value based on alignment attributes.
258 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
259 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
260
261 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
262 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
263 unsigned OffsetEltIndex = 0;
264 Align Alignment = Load->getAlign();
265 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
266 &DT)) {
267 // It is not safe to load directly from the pointer, but we can still peek
268 // through gep offsets and check if it safe to load from a base address with
269 // updated alignment. If it is, we can shuffle the element(s) into place
270 // after loading.
271 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
272 APInt Offset(OffsetBitWidth, 0);
273 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
274
275 // We want to shuffle the result down from a high element of a vector, so
276 // the offset must be positive.
277 if (Offset.isNegative())
278 return false;
279
280 // The offset must be a multiple of the scalar element to shuffle cleanly
281 // in the element's size.
282 uint64_t ScalarSizeInBytes = ScalarSize / 8;
283 if (Offset.urem(ScalarSizeInBytes) != 0)
284 return false;
285
286 // If we load MinVecNumElts, will our target element still be loaded?
287 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
288 if (OffsetEltIndex >= MinVecNumElts)
289 return false;
290
291 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
292 &DT))
293 return false;
294
295 // Update alignment with offset value. Note that the offset could be negated
296 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
297 // negation does not change the result of the alignment calculation.
298 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
299 }
300
301 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
302 // Use the greater of the alignment on the load or its source pointer.
303 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
304 Type *LoadTy = Load->getType();
305 unsigned AS = Load->getPointerAddressSpace();
306 InstructionCost OldCost =
307 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
308 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
309 OldCost +=
310 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
311 /* Insert */ true, HasExtract, CostKind);
312
313 // New pattern: load VecPtr
314 InstructionCost NewCost =
315 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
316 // Optionally, we are shuffling the loaded vector element(s) into place.
317 // For the mask set everything but element 0 to undef to prevent poison from
318 // propagating from the extra loaded memory. This will also optionally
319 // shrink/grow the vector from the loaded size to the output size.
320 // We assume this operation has no cost in codegen if there was no offset.
321 // Note that we could use freeze to avoid poison problems, but then we might
322 // still need a shuffle to change the vector size.
323 auto *Ty = cast<FixedVectorType>(I.getType());
324 unsigned OutputNumElts = Ty->getNumElements();
325 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
326 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
327 Mask[0] = OffsetEltIndex;
328 if (OffsetEltIndex)
329 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
330 CostKind);
331
332 // We can aggressively convert to the vector form because the backend can
333 // invert this transform if it does not result in a performance win.
334 if (OldCost < NewCost || !NewCost.isValid())
335 return false;
336
337 // It is safe and potentially profitable to load a vector directly:
338 // inselt undef, load Scalar, 0 --> load VecPtr
339 IRBuilder<> Builder(Load);
340 Value *CastedPtr =
341 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
342 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
343 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
344
345 replaceValue(I, *VecLd);
346 ++NumVecLoad;
347 return true;
348}
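// Illustrative example (assumed IR, not taken from the source or its tests):
// on a target whose minimum vector register is 128 bits, the pattern
//   %s = load float, ptr %p, align 4
//   %i = insertelement <4 x float> undef, float %s, i64 0
// can be rewritten by vectorizeLoadInsert as
//   %wide = load <4 x float>, ptr %p, align 4
//   %i = shufflevector <4 x float> %wide, <4 x float> poison,
//                      <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
// provided the wider load is known dereferenceable and the cost model agrees.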
349
350/// If we are loading a vector and then inserting it into a larger vector with
351/// undefined elements, try to load the larger vector and eliminate the insert.
352/// This removes a shuffle in IR and may allow combining of other loaded values.
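// Illustrative example (assumed IR): if a full <4 x float> is dereferenceable
// at %p, the subvector-insert pattern
//   %half = load <2 x float>, ptr %p, align 8
//   %wide = shufflevector <2 x float> %half, <2 x float> poison,
//                         <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
// can become a single
//   %wide = load <4 x float>, ptr %p, align 8
// because the shuffle is just an identity-with-padding of the loaded value.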
353bool VectorCombine::widenSubvectorLoad(Instruction &I) {
354 // Match subvector insert of fixed vector.
355 auto *Shuf = cast<ShuffleVectorInst>(&I);
356 if (!Shuf->isIdentityWithPadding())
357 return false;
358
359 // Allow a non-canonical shuffle mask that is choosing elements from op1.
360 unsigned NumOpElts =
361 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
362 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
363 return M >= (int)(NumOpElts);
364 });
365
366 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
367 if (!canWidenLoad(Load, TTI))
368 return false;
369
370 // We use minimal alignment (maximum flexibility) because we only care about
371 // the dereferenceable region. When calculating cost and creating a new op,
372 // we may use a larger value based on alignment attributes.
373 auto *Ty = cast<FixedVectorType>(I.getType());
374 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
375 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
376 Align Alignment = Load->getAlign();
377 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
378 return false;
379
380 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
381 Type *LoadTy = Load->getType();
382 unsigned AS = Load->getPointerAddressSpace();
383
384 // Original pattern: insert_subvector (load PtrOp)
385 // This conservatively assumes that the cost of a subvector insert into an
386 // undef value is 0. We could add that cost if the cost model accurately
387 // reflects the real cost of that operation.
388 InstructionCost OldCost =
389 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
390
391 // New pattern: load PtrOp
392 InstructionCost NewCost =
393 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
394
395 // We can aggressively convert to the vector form because the backend can
396 // invert this transform if it does not result in a performance win.
397 if (OldCost < NewCost || !NewCost.isValid())
398 return false;
399
400 IRBuilder<> Builder(Load);
401 Value *CastedPtr =
402 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
403 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
404 replaceValue(I, *VecLd);
405 ++NumVecLoad;
406 return true;
407}
408
409/// Determine which, if any, of the inputs should be replaced by a shuffle
410/// followed by extract from a different index.
411ExtractElementInst *VectorCombine::getShuffleExtract(
412 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
413 unsigned PreferredExtractIndex = InvalidIndex) const {
414 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
415 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
416 assert(Index0C && Index1C && "Expected constant extract indexes");
417
418 unsigned Index0 = Index0C->getZExtValue();
419 unsigned Index1 = Index1C->getZExtValue();
420
421 // If the extract indexes are identical, no shuffle is needed.
422 if (Index0 == Index1)
423 return nullptr;
424
425 Type *VecTy = Ext0->getVectorOperand()->getType();
426 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
427 InstructionCost Cost0 =
428 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
429 InstructionCost Cost1 =
430 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
431
432 // If both costs are invalid, no shuffle is needed.
433 if (!Cost0.isValid() && !Cost1.isValid())
434 return nullptr;
435
436 // We are extracting from 2 different indexes, so one operand must be shuffled
437 // before performing a vector operation and/or extract. The more expensive
438 // extract will be replaced by a shuffle.
439 if (Cost0 > Cost1)
440 return Ext0;
441 if (Cost1 > Cost0)
442 return Ext1;
443
444 // If the costs are equal and there is a preferred extract index, shuffle the
445 // opposite operand.
446 if (PreferredExtractIndex == Index0)
447 return Ext1;
448 if (PreferredExtractIndex == Index1)
449 return Ext0;
450
451 // Otherwise, replace the extract with the higher index.
452 return Index0 > Index1 ? Ext0 : Ext1;
453}
454
455/// Compare the relative costs of 2 extracts followed by scalar operation vs.
456/// vector operation(s) followed by extract. Return true if the existing
457/// instructions are cheaper than a vector alternative. Otherwise, return false
458/// and if one of the extracts should be transformed to a shufflevector, set
459/// \p ConvertToShuffle to that extract instruction.
460bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
461 ExtractElementInst *Ext1,
462 const Instruction &I,
463 ExtractElementInst *&ConvertToShuffle,
464 unsigned PreferredExtractIndex) {
465 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
466 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
467 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
468
469 unsigned Opcode = I.getOpcode();
470 Value *Ext0Src = Ext0->getVectorOperand();
471 Value *Ext1Src = Ext1->getVectorOperand();
472 Type *ScalarTy = Ext0->getType();
473 auto *VecTy = cast<VectorType>(Ext0Src->getType());
474 InstructionCost ScalarOpCost, VectorOpCost;
475
476 // Get cost estimates for scalar and vector versions of the operation.
477 bool IsBinOp = Instruction::isBinaryOp(Opcode);
478 if (IsBinOp) {
479 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
480 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
481 } else {
482 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
483 "Expected a compare");
484 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
485 ScalarOpCost = TTI.getCmpSelInstrCost(
486 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
487 VectorOpCost = TTI.getCmpSelInstrCost(
488 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
489 }
490
491 // Get cost estimates for the extract elements. These costs will factor into
492 // both sequences.
493 unsigned Ext0Index = Ext0IndexC->getZExtValue();
494 unsigned Ext1Index = Ext1IndexC->getZExtValue();
495
496 InstructionCost Extract0Cost =
497 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
498 InstructionCost Extract1Cost =
499 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
500
501 // A more expensive extract will always be replaced by a splat shuffle.
502 // For example, if Ext0 is more expensive:
503 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
504 // extelt (opcode (splat V0, Ext0), V1), Ext1
505 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
506 // check the cost of creating a broadcast shuffle and shuffling both
507 // operands to element 0.
508 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
509 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
510 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
511
512 // Extra uses of the extracts mean that we include those costs in the
513 // vector total because those instructions will not be eliminated.
514 InstructionCost OldCost, NewCost;
515 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
516 // Handle a special case. If the 2 extracts are identical, adjust the
517 // formulas to account for that. The extra use charge allows for either the
518 // CSE'd pattern or an unoptimized form with identical values:
519 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
520 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
521 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
522 OldCost = CheapExtractCost + ScalarOpCost;
523 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
524 } else {
525 // Handle the general case. Each extract is actually a different value:
526 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
527 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
528 NewCost = VectorOpCost + CheapExtractCost +
529 !Ext0->hasOneUse() * Extract0Cost +
530 !Ext1->hasOneUse() * Extract1Cost;
531 }
532
533 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
534 if (ConvertToShuffle) {
535 if (IsBinOp && DisableBinopExtractShuffle)
536 return true;
537
538 // If we are extracting from 2 different indexes, then one operand must be
539 // shuffled before performing the vector operation. The shuffle mask is
540 // poison except for 1 lane that is being translated to the remaining
541 // extraction lane. Therefore, it is a splat shuffle. Ex:
542 // ShufMask = { poison, poison, 0, poison }
543 // TODO: The cost model has an option for a "broadcast" shuffle
544 // (splat-from-element-0), but no option for a more general splat.
545 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
546 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
547 PoisonMaskElem);
548 ShuffleMask[BestInsIndex] = BestExtIndex;
549 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
550 VecTy, VecTy, ShuffleMask, CostKind, 0,
551 nullptr, {ConvertToShuffle});
552 } else {
553 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
554 VecTy, VecTy, {}, CostKind, 0, nullptr,
555 {ConvertToShuffle});
556 }
557 }
558
559 // Aggressively form a vector op if the cost is equal because the transform
560 // may enable further optimization.
561 // Codegen can reverse this transform (scalarize) if it was not profitable.
562 return OldCost < NewCost;
563}
564
565/// Create a shuffle that translates (shifts) 1 element from the input vector
566/// to a new element location.
567static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
568 unsigned NewIndex, IRBuilderBase &Builder) {
569 // The shuffle mask is poison except for 1 lane that is being translated
570 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
571 // ShufMask = { 2, poison, poison, poison }
572 auto *VecTy = cast<FixedVectorType>(Vec->getType());
573 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
574 ShufMask[NewIndex] = OldIndex;
575 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
576}
577
578/// Given an extract element instruction with constant index operand, shuffle
579/// the source vector (shift the scalar element) to a NewIndex for extraction.
580/// Return null if the input can be constant folded, so that we are not creating
581/// unnecessary instructions.
582static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
583 IRBuilderBase &Builder) {
584 // Shufflevectors can only be created for fixed-width vectors.
585 Value *X = ExtElt->getVectorOperand();
586 if (!isa<FixedVectorType>(X->getType()))
587 return nullptr;
588
589 // If the extract can be constant-folded, this code is unsimplified. Defer
590 // to other passes to handle that.
591 Value *C = ExtElt->getIndexOperand();
592 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
593 if (isa<Constant>(X))
594 return nullptr;
595
596 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
597 NewIndex, Builder);
598 return Shuf;
599}
600
601/// Try to reduce extract element costs by converting scalar compares to vector
602/// compares followed by extract.
603/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
604Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
605 Instruction &I) {
606 assert(isa<CmpInst>(&I) && "Expected a compare");
607
608 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
609 // --> extelt (cmp Pred V0, V1), ExtIndex
610 ++NumVecCmp;
611 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
612 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
613 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
614}
615
616/// Try to reduce extract element costs by converting scalar binops to vector
617/// binops followed by extract.
618/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
619Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
620 Instruction &I) {
621 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
622
623 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
624 // --> extelt (bo V0, V1), ExtIndex
625 ++NumVecBO;
626 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
627 V1, "foldExtExtBinop");
628
629 // All IR flags are safe to back-propagate because any potential poison
630 // created in unused vector elements is discarded by the extract.
631 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
632 VecBOInst->copyIRFlags(&I);
633
634 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
635}
636
637/// Match an instruction with extracted vector operands.
638bool VectorCombine::foldExtractExtract(Instruction &I) {
639 // It is not safe to transform things like div, urem, etc. because we may
640 // create undefined behavior when executing those on unknown vector elements.
641 if (!isSafeToSpeculativelyExecute(&I))
642 return false;
643
644 Instruction *I0, *I1;
645 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
646 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
647 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
648 return false;
649
650 Value *V0, *V1;
651 uint64_t C0, C1;
652 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
653 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
654 V0->getType() != V1->getType())
655 return false;
656
657 // If the scalar value 'I' is going to be re-inserted into a vector, then try
658 // to create an extract to that same element. The extract/insert can be
659 // reduced to a "select shuffle".
660 // TODO: If we add a larger pattern match that starts from an insert, this
661 // probably becomes unnecessary.
662 auto *Ext0 = cast<ExtractElementInst>(I0);
663 auto *Ext1 = cast<ExtractElementInst>(I1);
664 uint64_t InsertIndex = InvalidIndex;
665 if (I.hasOneUse())
666 match(I.user_back(),
667 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
668
669 ExtractElementInst *ExtractToChange;
670 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
671 return false;
672
673 Value *ExtOp0 = Ext0->getVectorOperand();
674 Value *ExtOp1 = Ext1->getVectorOperand();
675
676 if (ExtractToChange) {
677 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
678 Value *NewExtOp =
679 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
680 if (!NewExtOp)
681 return false;
682 if (ExtractToChange == Ext0)
683 ExtOp0 = NewExtOp;
684 else
685 ExtOp1 = NewExtOp;
686 }
687
688 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
689 : Ext0->getIndexOperand();
690 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
691 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
692 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
693 Worklist.push(Ext0);
694 Worklist.push(Ext1);
695 replaceValue(I, *NewExt);
696 return true;
697}
698
699/// Try to replace an extract + scalar fneg + insert with a vector fneg +
700/// shuffle.
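// Illustrative example (assumed IR), for equal-length source and destination
// vectors:
//   %e = extractelement <4 x float> %src, i64 1
//   %n = fneg float %e
//   %r = insertelement <4 x float> %dst, float %n, i64 1
// may become
//   %neg = fneg <4 x float> %src
//   %r = shufflevector <4 x float> %dst, <4 x float> %neg,
//                      <4 x i32> <i32 0, i32 5, i32 2, i32 3>
// i.e. a select-shuffle taking only lane 1 from the negated source vector.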
701bool VectorCombine::foldInsExtFNeg(Instruction &I) {
702 // Match an insert (op (extract)) pattern.
703 Value *DstVec;
704 uint64_t ExtIdx, InsIdx;
705 Instruction *FNeg;
706 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
707 m_ConstantInt(InsIdx))))
708 return false;
709
710 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
711 Value *SrcVec;
712 Instruction *Extract;
713 if (!match(FNeg, m_FNeg(m_CombineAnd(
714 m_Instruction(Extract),
715 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
716 return false;
717
718 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
719 auto *DstVecScalarTy = DstVecTy->getScalarType();
720 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
721 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
722 return false;
723
724 // Ignore if insert/extract index is out of bounds or destination vector has
725 // one element
726 unsigned NumDstElts = DstVecTy->getNumElements();
727 unsigned NumSrcElts = SrcVecTy->getNumElements();
728 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
729 return false;
730
731 // We are inserting the negated element into the same lane that we extracted
732 // from. This is equivalent to a select-shuffle that chooses all but the
733 // negated element from the destination vector.
734 SmallVector<int> Mask(NumDstElts);
735 std::iota(Mask.begin(), Mask.end(), 0);
736 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
737 InstructionCost OldCost =
738 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
739 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
740
741 // If the extract has one use, it will be eliminated, so count it in the
742 // original cost. If it has more than one use, ignore the cost because it will
743 // be the same before/after.
744 if (Extract->hasOneUse())
745 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
746
747 InstructionCost NewCost =
748 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
749 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
750 DstVecTy, Mask, CostKind);
751
752 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
753 // If the lengths of the two vectors are not equal,
754 // we need to add a length-change vector. Add this cost.
755 SmallVector<int> SrcMask;
756 if (NeedLenChg) {
757 SrcMask.assign(NumDstElts, PoisonMaskElem);
758 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
759 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
760 DstVecTy, SrcVecTy, SrcMask, CostKind);
761 }
762
763 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
764 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
765 << "\n");
766 if (NewCost > OldCost)
767 return false;
768
769 Value *NewShuf, *LenChgShuf = nullptr;
770 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
771 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
772 if (NeedLenChg) {
773 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
774 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
775 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
776 Worklist.pushValue(LenChgShuf);
777 } else {
778 // shuffle DstVec, (fneg SrcVec), Mask
779 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
780 }
781
782 Worklist.pushValue(VecFNeg);
783 replaceValue(I, *NewShuf);
784 return true;
785}
786
787/// Try to fold insert(binop(x,y),binop(a,b),idx)
788/// --> binop(insert(x,a,idx),insert(y,b,idx))
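// Illustrative example (assumed IR):
//   %vadd = add <4 x i32> %x, %y
//   %sadd = add i32 %a, %b
//   %r = insertelement <4 x i32> %vadd, i32 %sadd, i64 2
// may become
//   %x2 = insertelement <4 x i32> %x, i32 %a, i64 2
//   %y2 = insertelement <4 x i32> %y, i32 %b, i64 2
//   %r = add <4 x i32> %x2, %y2
// when the cost model says the two inserts plus one vector add are no more
// expensive than the original three instructions.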
789bool VectorCombine::foldInsExtBinop(Instruction &I) {
790 BinaryOperator *VecBinOp, *SclBinOp;
791 uint64_t Index;
792 if (!match(&I,
793 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
794 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
795 return false;
796
797 // TODO: Add support for addlike etc.
798 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
799 if (BinOpcode != SclBinOp->getOpcode())
800 return false;
801
802 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
803 if (!ResultTy)
804 return false;
805
806 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
807 // shuffle?
808
809 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
810 TTI.getInstructionCost(VecBinOp, CostKind) +
811 TTI.getInstructionCost(SclBinOp, CostKind);
812 InstructionCost NewCost =
813 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
814 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
815 Index, VecBinOp->getOperand(0),
816 SclBinOp->getOperand(0)) +
817 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
818 Index, VecBinOp->getOperand(1),
819 SclBinOp->getOperand(1));
820
821 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
822 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
823 << "\n");
824 if (NewCost > OldCost)
825 return false;
826
827 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
828 SclBinOp->getOperand(0), Index);
829 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
830 SclBinOp->getOperand(1), Index);
831 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
832
833 // Intersect flags from the old binops.
834 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
835 NewInst->copyIRFlags(VecBinOp);
836 NewInst->andIRFlags(SclBinOp);
837 }
838
839 Worklist.pushValue(NewIns0);
840 Worklist.pushValue(NewIns1);
841 replaceValue(I, *NewBO);
842 return true;
843}
844
845/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
846/// Supports: bitcast, trunc, sext, zext
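// Illustrative example (assumed IR):
//   %zx = zext <4 x i8> %x to <4 x i32>
//   %zy = zext <4 x i8> %y to <4 x i32>
//   %r = and <4 x i32> %zx, %zy
// may become
//   %a = and <4 x i8> %x, %y
//   %r = zext <4 x i8> %a to <4 x i32>
// i.e. the bitwise logic is performed in the narrower source type and only
// cast once.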
847bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
848 // Check if this is a bitwise logic operation
849 auto *BinOp = dyn_cast<BinaryOperator>(&I);
850 if (!BinOp || !BinOp->isBitwiseLogicOp())
851 return false;
852
853 // Get the cast instructions
854 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
855 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
856 if (!LHSCast || !RHSCast) {
857 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
858 return false;
859 }
860
861 // Both casts must be the same type
862 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
863 if (CastOpcode != RHSCast->getOpcode())
864 return false;
865
866 // Only handle supported cast operations
867 switch (CastOpcode) {
868 case Instruction::BitCast:
869 case Instruction::Trunc:
870 case Instruction::SExt:
871 case Instruction::ZExt:
872 break;
873 default:
874 return false;
875 }
876
877 Value *LHSSrc = LHSCast->getOperand(0);
878 Value *RHSSrc = RHSCast->getOperand(0);
879
880 // Source types must match
881 if (LHSSrc->getType() != RHSSrc->getType())
882 return false;
883
884 auto *SrcTy = LHSSrc->getType();
885 auto *DstTy = I.getType();
886 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
887 // Other casts only handle vector types with integer elements.
888 if (CastOpcode != Instruction::BitCast &&
889 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
890 return false;
891
892 // Only integer scalar/vector values are legal for bitwise logic operations.
893 if (!SrcTy->getScalarType()->isIntegerTy() ||
894 !DstTy->getScalarType()->isIntegerTy())
895 return false;
896
897 // Cost Check :
898 // OldCost = bitlogic + 2*casts
899 // NewCost = bitlogic + cast
900
901 // Calculate specific costs for each cast with instruction context
902 InstructionCost LHSCastCost = TTI.getCastInstrCost(
903 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
904 InstructionCost RHSCastCost = TTI.getCastInstrCost(
905 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
906
907 InstructionCost OldCost =
908 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
909 LHSCastCost + RHSCastCost;
910
911 // For new cost, we can't provide an instruction (it doesn't exist yet)
912 InstructionCost GenericCastCost = TTI.getCastInstrCost(
913 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
914
915 InstructionCost NewCost =
916 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
917 GenericCastCost;
918
919 // Account for multi-use casts using specific costs
920 if (!LHSCast->hasOneUse())
921 NewCost += LHSCastCost;
922 if (!RHSCast->hasOneUse())
923 NewCost += RHSCastCost;
924
925 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
926 << " NewCost=" << NewCost << "\n");
927
928 if (NewCost > OldCost)
929 return false;
930
931 // Create the operation on the source type
932 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
933 BinOp->getName() + ".inner");
934 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
935 NewBinOp->copyIRFlags(BinOp);
936
937 Worklist.pushValue(NewOp);
938
939 // Create the cast operation directly to ensure we get a new instruction
940 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
941
942 // Preserve cast instruction flags
943 NewCast->copyIRFlags(LHSCast);
944 NewCast->andIRFlags(RHSCast);
945
946 // Insert the new instruction
947 Value *Result = Builder.Insert(NewCast);
948
949 replaceValue(I, *Result);
950 return true;
951}
952
953/// Match:
954// bitop(castop(x), C) ->
955// bitop(castop(x), castop(InvC)) ->
956// castop(bitop(x, InvC))
957// Supports: bitcast
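// Illustrative example (assumed IR, little-endian target):
//   %b = bitcast <2 x i16> %x to i32
//   %r = and i32 %b, 65535
// may become
//   %a = and <2 x i16> %x, <i16 -1, i16 0>
//   %r = bitcast <2 x i16> %a to i32
// where <i16 -1, i16 0> is the InvC whose bitcast reproduces the constant
// 65535.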
958bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
959 Value *LHS;
960 Constant *C;
961
962 // Check if this is a bitwise logic operation
963 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
964 return false;
965
966 // Get the cast instructions
967 auto *LHSCast = dyn_cast<CastInst>(LHS);
968 if (!LHSCast)
969 return false;
970
971 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
972
973 // Only handle supported cast operations
974 switch (CastOpcode) {
975 case Instruction::BitCast:
976 case Instruction::ZExt:
977 case Instruction::SExt:
978 case Instruction::Trunc:
979 break;
980 default:
981 return false;
982 }
983
984 Value *LHSSrc = LHSCast->getOperand(0);
985
986 auto *SrcTy = LHSSrc->getType();
987 auto *DstTy = I.getType();
988 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
989 // Other casts only handle vector types with integer elements.
990 if (CastOpcode != Instruction::BitCast &&
991 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
992 return false;
993
994 // Only integer scalar/vector values are legal for bitwise logic operations.
995 if (!SrcTy->getScalarType()->isIntegerTy() ||
996 !DstTy->getScalarType()->isIntegerTy())
997 return false;
998
999 // Find the constant InvC, such that castop(InvC) equals to C.
1000 PreservedCastFlags RHSFlags;
1001 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1002 if (!InvC)
1003 return false;
1004
1005 // Cost Check :
1006 // OldCost = bitlogic + cast
1007 // NewCost = bitlogic + cast
1008
1009 // Calculate specific costs for each cast with instruction context
1010 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1011 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1012
1013 InstructionCost OldCost =
1014 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1015
1016 // For new cost, we can't provide an instruction (it doesn't exist yet)
1017 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1018 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1019
1020 InstructionCost NewCost =
1021 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1022 GenericCastCost;
1023
1024 // Account for multi-use casts using specific costs
1025 if (!LHSCast->hasOneUse())
1026 NewCost += LHSCastCost;
1027
1028 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1029 << " NewCost=" << NewCost << "\n");
1030
1031 if (NewCost > OldCost)
1032 return false;
1033
1034 // Create the operation on the source type
1035 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1036 LHSSrc, InvC, I.getName() + ".inner");
1037 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1038 NewBinOp->copyIRFlags(&I);
1039
1040 Worklist.pushValue(NewOp);
1041
1042 // Create the cast operation directly to ensure we get a new instruction
1043 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1044
1045 // Preserve cast instruction flags
1046 if (RHSFlags.NNeg)
1047 NewCast->setNonNeg();
1048 if (RHSFlags.NUW)
1049 NewCast->setHasNoUnsignedWrap();
1050 if (RHSFlags.NSW)
1051 NewCast->setHasNoSignedWrap();
1052
1053 NewCast->andIRFlags(LHSCast);
1054
1055 // Insert the new instruction
1056 Value *Result = Builder.Insert(NewCast);
1057
1058 replaceValue(I, *Result);
1059 return true;
1060}
1061
1062/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1063/// destination type followed by shuffle. This can enable further transforms by
1064/// moving bitcasts or shuffles together.
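// Illustrative example (assumed IR, wide-to-narrow bitcast):
//   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
//                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %r = bitcast <4 x i32> %s to <8 x i16>
// may become
//   %c = bitcast <4 x i32> %v to <8 x i16>
//   %r = shufflevector <8 x i16> %c, <8 x i16> poison,
//        <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
// with the original mask expanded to select the narrower elements.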
1065bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1066 Value *V0, *V1;
1067 ArrayRef<int> Mask;
1068 if (!match(&I, m_BitCast(m_OneUse(
1069 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1070 return false;
1071
1072 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1073 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1074 // mask for scalable type is a splat or not.
1075 // 2) Disallow non-vector casts.
1076 // TODO: We could allow any shuffle.
1077 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1078 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1079 if (!DestTy || !SrcTy)
1080 return false;
1081
1082 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1083 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1084 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1085 return false;
1086
1087 bool IsUnary = isa<UndefValue>(V1);
1088
1089 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1090 // if it won't increase the number of bitcasts.
1091 if (!IsUnary) {
1092 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1093 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1094 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1095 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1096 return false;
1097 }
1098
1099 SmallVector<int, 16> NewMask;
1100 if (DestEltSize <= SrcEltSize) {
1101 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1102 // always be expanded to the equivalent form choosing narrower elements.
1103 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1104 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1105 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1106 } else {
1107 // The bitcast is from narrow elements to wide elements. The shuffle mask
1108 // must choose consecutive elements to allow casting first.
1109 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1110 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1111 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1112 return false;
1113 }
1114
1115 // Bitcast the shuffle src - keep its original width but using the destination
1116 // scalar type.
1117 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1118 auto *NewShuffleTy =
1119 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1120 auto *OldShuffleTy =
1121 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1122 unsigned NumOps = IsUnary ? 1 : 2;
1123
1124 // The new shuffle must not cost more than the old shuffle.
1125 TargetTransformInfo::ShuffleKind SK =
1126 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1127 : TargetTransformInfo::SK_PermuteTwoSrc;
1128
1129 InstructionCost NewCost =
1130 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1131 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1132 TargetTransformInfo::CastContextHint::None,
1133 CostKind));
1134 InstructionCost OldCost =
1135 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1136 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1137 TargetTransformInfo::CastContextHint::None,
1138 CostKind);
1139
1140 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1141 << OldCost << " vs NewCost: " << NewCost << "\n");
1142
1143 if (NewCost > OldCost || !NewCost.isValid())
1144 return false;
1145
1146 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1147 ++NumShufOfBitcast;
1148 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1149 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1150 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1151 replaceValue(I, *Shuf);
1152 return true;
1153}
1154
1155/// VP Intrinsics whose vector operands are both splat values may be simplified
1156/// into the scalar version of the operation and the result splatted. This
1157/// can lead to scalarization down the line.
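// Illustrative example (assumed IR): with %xs and %ys being vector splats of
// i32 %x and i32 %y and an all-true mask,
//   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xs, <4 x i32> %ys,
//            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %evl)
// may be rewritten as a scalar
//   %s = add i32 %x, %y
// followed by splatting %s back into a <4 x i32> vector.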
1158bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1159 if (!isa<VPIntrinsic>(I))
1160 return false;
1161 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1162 Value *Op0 = VPI.getArgOperand(0);
1163 Value *Op1 = VPI.getArgOperand(1);
1164
1165 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1166 return false;
1167
1168 // Check getSplatValue early in this function, to avoid doing unnecessary
1169 // work.
1170 Value *ScalarOp0 = getSplatValue(Op0);
1171 Value *ScalarOp1 = getSplatValue(Op1);
1172 if (!ScalarOp0 || !ScalarOp1)
1173 return false;
1174
1175 // For the binary VP intrinsics supported here, the result on disabled lanes
1176 // is a poison value. For now, only do this simplification if all lanes
1177 // are active.
1178 // TODO: Relax the condition that all lanes are active by using insertelement
1179 // on inactive lanes.
1180 auto IsAllTrueMask = [](Value *MaskVal) {
1181 if (Value *SplattedVal = getSplatValue(MaskVal))
1182 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1183 return ConstValue->isAllOnesValue();
1184 return false;
1185 };
1186 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1187 return false;
1188
1189 // Check to make sure we support scalarization of the intrinsic
1190 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1191 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1192 return false;
1193
1194 // Calculate cost of splatting both operands into vectors and the vector
1195 // intrinsic
1196 VectorType *VecTy = cast<VectorType>(VPI.getType());
1197 SmallVector<int> Mask;
1198 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1199 Mask.resize(FVTy->getNumElements(), 0);
1200 InstructionCost SplatCost =
1201 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1202 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1203 CostKind);
1204
1205 // Calculate the cost of the VP Intrinsic
1206 SmallVector<Type *> Args;
1207 for (Value *V : VPI.args())
1208 Args.push_back(V->getType());
1209 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1210 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1211 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1212
1213 // Determine scalar opcode
1214 std::optional<unsigned> FunctionalOpcode =
1215 VPI.getFunctionalOpcode();
1216 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1217 if (!FunctionalOpcode) {
1218 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1219 if (!ScalarIntrID)
1220 return false;
1221 }
1222
1223 // Calculate cost of scalarizing
1224 InstructionCost ScalarOpCost = 0;
1225 if (ScalarIntrID) {
1226 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1227 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1228 } else {
1229 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1230 VecTy->getScalarType(), CostKind);
1231 }
1232
1233 // The existing splats may be kept around if other instructions use them.
1234 InstructionCost CostToKeepSplats =
1235 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1236 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1237
1238 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1239 << "\n");
1240 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1241 << ", Cost of scalarizing:" << NewCost << "\n");
1242
1243 // We want to scalarize unless the vector variant actually has lower cost.
1244 if (OldCost < NewCost || !NewCost.isValid())
1245 return false;
1246
1247 // Scalarize the intrinsic
1248 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1249 Value *EVL = VPI.getArgOperand(3);
1250
1251 // If the VP op might introduce UB or poison, we can scalarize it provided
1252 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1253 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1254 // scalarizing it.
1255 bool SafeToSpeculate;
1256 if (ScalarIntrID)
1257 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1258 .hasAttribute(Attribute::AttrKind::Speculatable);
1259 else
1260 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1261 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1262 if (!SafeToSpeculate &&
1263 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1264 return false;
1265
1266 Value *ScalarVal =
1267 ScalarIntrID
1268 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1269 {ScalarOp0, ScalarOp1})
1270 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1271 ScalarOp0, ScalarOp1);
1272
1273 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1274 return true;
1275}
1276
1277/// Match a vector op/compare/intrinsic with at least one
1278/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1279/// by insertelement.
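// Illustrative example (assumed IR):
//   %x0 = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %a, i64 0
//   %y0 = insertelement <4 x i32> <i32 5, i32 6, i32 7, i32 8>, i32 %b, i64 0
//   %r = add <4 x i32> %x0, %y0
// may become
//   %s = add i32 %a, %b
//   %r = insertelement <4 x i32> <i32 6, i32 8, i32 10, i32 12>, i32 %s, i64 0
// folding the constant lanes and performing only lane 0 as a scalar add.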
1280bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1281 auto *UO = dyn_cast<UnaryOperator>(&I);
1282 auto *BO = dyn_cast<BinaryOperator>(&I);
1283 auto *CI = dyn_cast<CmpInst>(&I);
1284 auto *II = dyn_cast<IntrinsicInst>(&I);
1285 if (!UO && !BO && !CI && !II)
1286 return false;
1287
1288 // TODO: Allow intrinsics with different argument types
1289 if (II) {
1290 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1291 return false;
1292 for (auto [Idx, Arg] : enumerate(II->args()))
1293 if (Arg->getType() != II->getType() &&
1294 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1295 return false;
1296 }
1297
1298 // Do not convert the vector condition of a vector select into a scalar
1299 // condition. That may cause problems for codegen because of differences in
1300 // boolean formats and register-file transfers.
1301 // TODO: Can we account for that in the cost model?
1302 if (CI)
1303 for (User *U : I.users())
1304 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1305 return false;
1306
1307 // Match constant vectors or scalars being inserted into constant vectors:
1308 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1309 SmallVector<Value *> VecCs, ScalarOps;
1310 std::optional<uint64_t> Index;
1311
1312 auto Ops = II ? II->args() : I.operands();
1313 for (auto [OpNum, Op] : enumerate(Ops)) {
1314 Constant *VecC;
1315 Value *V;
1316 uint64_t InsIdx = 0;
1317 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1318 m_ConstantInt(InsIdx)))) {
1319 // Bail if any inserts are out of bounds.
1320 VectorType *OpTy = cast<VectorType>(Op->getType());
1321 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1322 return false;
1323 // All inserts must have the same index.
1324 // TODO: Deal with mismatched index constants and variable indexes?
1325 if (!Index)
1326 Index = InsIdx;
1327 else if (InsIdx != *Index)
1328 return false;
1329 VecCs.push_back(VecC);
1330 ScalarOps.push_back(V);
1331 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1332 OpNum, &TTI)) {
1333 VecCs.push_back(Op.get());
1334 ScalarOps.push_back(Op.get());
1335 } else if (match(Op.get(), m_Constant(VecC))) {
1336 VecCs.push_back(VecC);
1337 ScalarOps.push_back(nullptr);
1338 } else {
1339 return false;
1340 }
1341 }
1342
1343 // Bail if all operands are constant.
1344 if (!Index.has_value())
1345 return false;
1346
1347 VectorType *VecTy = cast<VectorType>(I.getType());
1348 Type *ScalarTy = VecTy->getScalarType();
1349 assert(VecTy->isVectorTy() &&
1350 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1351 ScalarTy->isPointerTy()) &&
1352 "Unexpected types for insert element into binop or cmp");
1353
1354 unsigned Opcode = I.getOpcode();
1355 InstructionCost ScalarOpCost, VectorOpCost;
1356 if (CI) {
1357 CmpInst::Predicate Pred = CI->getPredicate();
1358 ScalarOpCost = TTI.getCmpSelInstrCost(
1359 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1360 VectorOpCost = TTI.getCmpSelInstrCost(
1361 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1362 } else if (UO || BO) {
1363 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1364 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1365 } else {
1366 IntrinsicCostAttributes ScalarICA(
1367 II->getIntrinsicID(), ScalarTy,
1368 SmallVector<Type *>(II->arg_size(), ScalarTy));
1369 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1370 IntrinsicCostAttributes VectorICA(
1371 II->getIntrinsicID(), VecTy,
1372 SmallVector<Type *>(II->arg_size(), VecTy));
1373 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1374 }
1375
1376 // Fold the vector constants in the original vectors into a new base vector to
1377 // get more accurate cost modelling.
1378 Value *NewVecC = nullptr;
1379 if (CI)
1380 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1381 else if (UO)
1382 NewVecC =
1383 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1384 else if (BO)
1385 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1386 else if (II)
1387 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1388
1389 if (!NewVecC)
1390 return false;
1391
1392 // Get cost estimate for the insert element. This cost will factor into
1393 // both sequences.
1394 InstructionCost OldCost = VectorOpCost;
1395 InstructionCost NewCost =
1396 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1397 CostKind, *Index, NewVecC);
1398
1399 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1400 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1401 II->getIntrinsicID(), Idx, &TTI)))
1402 continue;
1403 InstructionCost InsertCost = TTI.getVectorInstrCost(
1404 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1405 OldCost += InsertCost;
1406 NewCost += !Op->hasOneUse() * InsertCost;
1407 }
1408
1409 // We want to scalarize unless the vector variant actually has lower cost.
1410 if (OldCost < NewCost || !NewCost.isValid())
1411 return false;
1412
1413 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1414 // inselt NewVecC, (scalar_op V0, V1), Index
1415 if (CI)
1416 ++NumScalarCmp;
1417 else if (UO || BO)
1418 ++NumScalarOps;
1419 else
1420 ++NumScalarIntrinsic;
1421
1422 // For constant cases, extract the scalar element, this should constant fold.
1423 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1424 if (!Scalar)
1425 ScalarOps[OpIdx] = ConstantExpr::getExtractElement(
1426 cast<Constant>(VecC), Builder.getInt64(*Index));
1427
1428 Value *Scalar;
1429 if (CI)
1430 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1431 else if (UO || BO)
1432 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1433 else
1434 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1435
1436 Scalar->setName(I.getName() + ".scalar");
1437
1438 // All IR flags are safe to back-propagate. There is no potential for extra
1439 // poison to be created by the scalar instruction.
1440 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1441 ScalarInst->copyIRFlags(&I);
1442
1443 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1444 replaceValue(I, *Insert);
1445 return true;
1446}
1447
1448/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1449/// a vector into vector operations followed by extract. Note: The SLP pass
1450/// may miss this pattern because of implementation problems.
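// Illustrative example (assumed IR):
//   %e0 = extractelement <4 x i32> %x, i64 0
//   %e1 = extractelement <4 x i32> %x, i64 1
//   %c0 = icmp sgt i32 %e0, 10
//   %c1 = icmp sgt i32 %e1, 20
//   %r = and i1 %c0, %c1
// may become a vector icmp of %x against <i32 10, i32 20, i32 poison,
// i32 poison>, a shuffle that lines up the two result lanes, a vector 'and',
// and a single extractelement of the combined lane.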
1451bool VectorCombine::foldExtractedCmps(Instruction &I) {
1452 auto *BI = dyn_cast<BinaryOperator>(&I);
1453
1454 // We are looking for a scalar binop of booleans.
1455 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1456 if (!BI || !I.getType()->isIntegerTy(1))
1457 return false;
1458
1459 // The compare predicates should match, and each compare should have a
1460 // constant operand.
1461 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1462 Instruction *I0, *I1;
1463 Constant *C0, *C1;
1464 CmpPredicate P0, P1;
1465 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1466 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1467 return false;
1468
1469 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1470 if (!MatchingPred)
1471 return false;
1472
1473 // The compare operands must be extracts of the same vector with constant
1474 // extract indexes.
1475 Value *X;
1476 uint64_t Index0, Index1;
1477 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1478 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1479 return false;
1480
1481 auto *Ext0 = cast<ExtractElementInst>(I0);
1482 auto *Ext1 = cast<ExtractElementInst>(I1);
1483 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1484 if (!ConvertToShuf)
1485 return false;
1486 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1487 "Unknown ExtractElementInst");
1488
1489 // The original scalar pattern is:
1490 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1491 CmpInst::Predicate Pred = *MatchingPred;
1492 unsigned CmpOpcode =
1493 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1494 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1495 if (!VecTy)
1496 return false;
1497
1498 InstructionCost Ext0Cost =
1499 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1500 InstructionCost Ext1Cost =
1501 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1502 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1503 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1504 CostKind);
1505
1506 InstructionCost OldCost =
1507 Ext0Cost + Ext1Cost + CmpCost * 2 +
1508 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1509
1510 // The proposed vector pattern is:
1511 // vcmp = cmp Pred X, VecC
1512 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1513 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1514 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1515 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
1516 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1517 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1518 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1519 ShufMask[CheapIndex] = ExpensiveIndex;
1520 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1521 CmpTy, ShufMask, CostKind);
1522 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1523 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1524 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1525 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1526
1527 // Aggressively form vector ops if the cost is equal because the transform
1528 // may enable further optimization.
1529 // Codegen can reverse this transform (scalarize) if it was not profitable.
1530 if (OldCost < NewCost || !NewCost.isValid())
1531 return false;
1532
1533 // Create a vector constant from the 2 scalar constants.
1534 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1535 PoisonValue::get(VecTy->getElementType()));
1536 CmpC[Index0] = C0;
1537 CmpC[Index1] = C1;
1538 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1539 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1540 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1541 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1542 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1543 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1544 replaceValue(I, *NewExt);
1545 ++NumVecCmpBO;
1546 return true;
1547}
1548
1549 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1550 TTI::TargetCostKind CostKind,
1551 const TargetTransformInfo &TTI,
1552 InstructionCost &CostBeforeReduction,
1553 InstructionCost &CostAfterReduction) {
1554 Instruction *Op0, *Op1;
1555 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1556 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1557 unsigned ReductionOpc =
1558 getArithmeticReductionInstruction(II.getIntrinsicID());
1559 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1560 bool IsUnsigned = isa<ZExtInst>(RedOp);
1561 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1562
1563 CostBeforeReduction =
1564 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1565 TTI::CastContextHint::None, CostKind);
1566 CostAfterReduction =
1567 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1568 ExtType, FastMathFlags(), CostKind);
1569 return;
1570 }
1571 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1572 match(RedOp,
1573 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1574 match(Op0, m_ZExtOrSExt(m_Value())) &&
1575 Op0->getOpcode() == Op1->getOpcode() &&
1576 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1577 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1578 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1579 bool IsUnsigned = isa<ZExtInst>(Op0);
1580 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1581 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1582
1583 InstructionCost ExtCost =
1584 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1585 TTI::CastContextHint::None, CostKind);
1586 InstructionCost MulCost =
1587 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1588 InstructionCost Ext2Cost =
1589 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1590 TTI::CastContextHint::None, CostKind);
1591
1592 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1593 CostAfterReduction = TTI.getMulAccReductionCost(
1594 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1595 return;
1596 }
1597 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1598 std::nullopt, CostKind);
1599}
1600
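// Illustrative example (hypothetical IR): two single-use reductions combined
// by a binop can be merged into one reduction of a vector binop:
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
//   %s  = add i32 %r0, %r1
// -->
//   %v = add <4 x i32> %a, %b
//   %s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)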
1601bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1602 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1603 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1604 if (BinOpOpc == Instruction::Sub)
1605 ReductionIID = Intrinsic::vector_reduce_add;
1606 if (ReductionIID == Intrinsic::not_intrinsic)
1607 return false;
1608
1609 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1610 Intrinsic::ID IID) -> Value * {
1611 auto *II = dyn_cast<IntrinsicInst>(V);
1612 if (!II)
1613 return nullptr;
1614 if (II->getIntrinsicID() == IID && II->hasOneUse())
1615 return II->getArgOperand(0);
1616 return nullptr;
1617 };
1618
1619 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1620 if (!V0)
1621 return false;
1622 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1623 if (!V1)
1624 return false;
1625
1626 auto *VTy = cast<VectorType>(V0->getType());
1627 if (V1->getType() != VTy)
1628 return false;
1629 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1630 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1631 unsigned ReductionOpc =
1632 getArithmeticReductionInstruction(II0.getIntrinsicID());
1633
1634 InstructionCost OldCost = 0;
1635 InstructionCost NewCost = 0;
1636 InstructionCost CostOfRedOperand0 = 0;
1637 InstructionCost CostOfRed0 = 0;
1638 InstructionCost CostOfRedOperand1 = 0;
1639 InstructionCost CostOfRed1 = 0;
1640 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1641 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1642 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1643 NewCost =
1644 CostOfRedOperand0 + CostOfRedOperand1 +
1645 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1646 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1647 if (NewCost >= OldCost || !NewCost.isValid())
1648 return false;
1649
1650 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1651 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1652 << "\n");
1653 Value *VectorBO;
1654 if (BinOpOpc == Instruction::Or)
1655 VectorBO = Builder.CreateOr(V0, V1, "",
1656 cast<PossiblyDisjointInst>(I).isDisjoint());
1657 else
1658 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1659
1660 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1661 replaceValue(I, *Rdx);
1662 return true;
1663}
1664
1665 // Check if the memory location is modified between two instructions in the same BB.
1666 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1667 BasicBlock::iterator End,
1668 const MemoryLocation &Loc, AAResults &AA) {
1669 unsigned NumScanned = 0;
1670 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1671 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1672 ++NumScanned > MaxInstrsToScan;
1673 });
1674}
1675
1676namespace {
1677/// Helper class to indicate whether a vector index can be safely scalarized and
1678/// if a freeze needs to be inserted.
1679class ScalarizationResult {
1680 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1681
1682 StatusTy Status;
1683 Value *ToFreeze;
1684
1685 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1686 : Status(Status), ToFreeze(ToFreeze) {}
1687
1688public:
1689 ScalarizationResult(const ScalarizationResult &Other) = default;
1690 ~ScalarizationResult() {
1691 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1692 }
1693
1694 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1695 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1696 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1697 return {StatusTy::SafeWithFreeze, ToFreeze};
1698 }
1699
1700 /// Returns true if the index can be scalarized without requiring a freeze.
1701 bool isSafe() const { return Status == StatusTy::Safe; }
1702 /// Returns true if the index cannot be scalarized.
1703 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1704 /// Returns true if the index can be scalarized, but requires inserting a
1705 /// freeze.
1706 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1707
1708 /// Reset the state to Unsafe and clear ToFreeze if set.
1709 void discard() {
1710 ToFreeze = nullptr;
1711 Status = StatusTy::Unsafe;
1712 }
1713
1714 /// Freeze the ToFreeze value and update the use in \p UserI to use it.
1715 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1716 assert(isSafeWithFreeze() &&
1717 "should only be used when freezing is required");
1718 assert(is_contained(ToFreeze->users(), &UserI) &&
1719 "UserI must be a user of ToFreeze");
1720 IRBuilder<>::InsertPointGuard Guard(Builder);
1721 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1722 Value *Frozen =
1723 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1724 for (Use &U : make_early_inc_range((UserI.operands())))
1725 if (U.get() == ToFreeze)
1726 U.set(Frozen);
1727
1728 ToFreeze = nullptr;
1729 }
1730};
1731} // namespace
1732
1733/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1734/// Idx. \p Idx must access a valid vector element.
1735static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1736 Instruction *CtxI,
1737 AssumptionCache &AC,
1738 const DominatorTree &DT) {
1739 // We do checks for both fixed vector types and scalable vector types.
1740 // This is the number of elements of fixed vector types,
1741 // or the minimum number of elements of scalable vector types.
1742 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1743 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1744
1745 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1746 if (C->getValue().ult(NumElements))
1747 return ScalarizationResult::safe();
1748 return ScalarizationResult::unsafe();
1749 }
1750
1751 // Always unsafe if the index type can't handle all inbound values.
1752 if (!llvm::isUIntN(IntWidth, NumElements))
1753 return ScalarizationResult::unsafe();
1754
1755 APInt Zero(IntWidth, 0);
1756 APInt MaxElts(IntWidth, NumElements);
1757 ConstantRange ValidIndices(Zero, MaxElts);
1758 ConstantRange IdxRange(IntWidth, true);
1759
1760 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1761 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1762 true, &AC, CtxI, &DT)))
1763 return ScalarizationResult::safe();
1764 return ScalarizationResult::unsafe();
1765 }
1766
1767 // If the index may be poison, check if we can insert a freeze before the
1768 // range of the index is restricted.
1769 Value *IdxBase;
1770 ConstantInt *CI;
1771 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1772 IdxRange = IdxRange.binaryAnd(CI->getValue());
1773 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1774 IdxRange = IdxRange.urem(CI->getValue());
1775 }
1776
1777 if (ValidIndices.contains(IdxRange))
1778 return ScalarizationResult::safeWithFreeze(IdxBase);
1779 return ScalarizationResult::unsafe();
1780}
1781
1782/// The memory operation on a vector of \p ScalarType had alignment of
1783/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1784/// alignment that will be valid for the memory operation on a single scalar
1785/// element of the same type with index \p Idx.
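/// Worked example (illustrative): for a <4 x i32> access with alignment 16,
/// a constant index of 2 touches byte offset 8, so the scalar access may use
/// commonAlignment(16, 8) = 8; with a variable index only the element store
/// size can be assumed, giving commonAlignment(16, 4) = 4.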
1786 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1787 Type *ScalarType, Value *Idx,
1788 const DataLayout &DL) {
1789 if (auto *C = dyn_cast<ConstantInt>(Idx))
1790 return commonAlignment(VectorAlignment,
1791 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1792 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1793}
1794
1795// Combine patterns like:
1796// %0 = load <4 x i32>, <4 x i32>* %a
1797// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1798// store <4 x i32> %1, <4 x i32>* %a
1799// to:
1800// %0 = bitcast <4 x i32>* %a to i32*
1801// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1802// store i32 %b, i32* %1
1803 bool VectorCombine::foldSingleElementStore(Instruction &I) {
1804 if (!TTI.allowVectorElementIndexingUsingGEP())
1805 return false;
1806 auto *SI = cast<StoreInst>(&I);
1807 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1808 return false;
1809
1810 // TODO: Combine more complicated patterns (multiple insert) by referencing
1811 // TargetTransformInfo.
1812 Instruction *Source;
1813 Value *NewElement;
1814 Value *Idx;
1815 if (!match(SI->getValueOperand(),
1816 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1817 m_Value(Idx))))
1818 return false;
1819
1820 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1821 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1822 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1823 // Don't optimize for atomic/volatile loads or stores. Ensure memory is not
1824 // modified between them, the scalar type size matches its store size, and the index is inbounds.
1825 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1826 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1827 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1828 return false;
1829
1830 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1831 if (ScalarizableIdx.isUnsafe() ||
1832 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1833 MemoryLocation::get(SI), AA))
1834 return false;
1835
1836 // Ensure we add the load back to the worklist BEFORE its users so they can
1837 // be erased in the correct order.
1838 Worklist.push(Load);
1839
1840 if (ScalarizableIdx.isSafeWithFreeze())
1841 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1842 Value *GEP = Builder.CreateInBoundsGEP(
1843 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1844 {ConstantInt::get(Idx->getType(), 0), Idx});
1845 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1846 NSI->copyMetadata(*SI);
1847 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1848 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1849 *DL);
1850 NSI->setAlignment(ScalarOpAlignment);
1851 replaceValue(I, *NSI);
1852 eraseInstruction(I);
1853 return true;
1854 }
1855
1856 return false;
1857}
1858
1859/// Try to scalarize vector loads feeding extractelement or bitcast
1860/// instructions.
1861bool VectorCombine::scalarizeLoad(Instruction &I) {
1862 Value *Ptr;
1863 if (!match(&I, m_Load(m_Value(Ptr))))
1864 return false;
1865
1866 auto *LI = cast<LoadInst>(&I);
1867 auto *VecTy = cast<VectorType>(LI->getType());
1868 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1869 return false;
1870
1871 bool AllExtracts = true;
1872 bool AllBitcasts = true;
1873 Instruction *LastCheckedInst = LI;
1874 unsigned NumInstChecked = 0;
1875
1876 // Check what type of users we have (they must either all be extracts or
1877 // all be bitcasts) and ensure no memory modifications between the load and
1878 // its users.
1879 for (User *U : LI->users()) {
1880 auto *UI = dyn_cast<Instruction>(U);
1881 if (!UI || UI->getParent() != LI->getParent())
1882 return false;
1883
1884 // If any user is waiting to be erased, then bail out as this will
1885 // distort the cost calculation and possibly lead to infinite loops.
1886 if (UI->use_empty())
1887 return false;
1888
1889 if (!isa<ExtractElementInst>(UI))
1890 AllExtracts = false;
1891 if (!isa<BitCastInst>(UI))
1892 AllBitcasts = false;
1893
1894 // Check if any instruction between the load and the user may modify memory.
1895 if (LastCheckedInst->comesBefore(UI)) {
1896 for (Instruction &I :
1897 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1898 // Bail out if we reached the check limit or the instruction may write
1899 // to memory.
1900 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1901 return false;
1902 NumInstChecked++;
1903 }
1904 LastCheckedInst = UI;
1905 }
1906 }
1907
1908 if (AllExtracts)
1909 return scalarizeLoadExtract(LI, VecTy, Ptr);
1910 if (AllBitcasts)
1911 return scalarizeLoadBitcast(LI, VecTy, Ptr);
1912 return false;
1913}
1914
1915/// Try to scalarize vector loads feeding extractelement instructions.
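/// Illustrative example (hypothetical IR), subject to the cost checks below:
///   %v  = load <4 x i32>, ptr %p
///   %e0 = extractelement <4 x i32> %v, i64 0
///   %e2 = extractelement <4 x i32> %v, i64 2
/// -->
///   %g0 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 0
///   %e0 = load i32, ptr %g0
///   %g2 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
///   %e2 = load i32, ptr %g2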
1916bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
1917 Value *Ptr) {
1918 if (!TTI.allowVectorElementIndexingUsingGEP())
1919 return false;
1920
1921 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1922 auto FailureGuard = make_scope_exit([&]() {
1923 // If the transform is aborted, discard the ScalarizationResults.
1924 for (auto &Pair : NeedFreeze)
1925 Pair.second.discard();
1926 });
1927
1928 InstructionCost OriginalCost =
1929 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1930 LI->getPointerAddressSpace(), CostKind);
1931 InstructionCost ScalarizedCost = 0;
1932
1933 for (User *U : LI->users()) {
1934 auto *UI = cast<ExtractElementInst>(U);
1935
1936 auto ScalarIdx =
1937 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1938 if (ScalarIdx.isUnsafe())
1939 return false;
1940 if (ScalarIdx.isSafeWithFreeze()) {
1941 NeedFreeze.try_emplace(UI, ScalarIdx);
1942 ScalarIdx.discard();
1943 }
1944
1945 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1946 OriginalCost +=
1947 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1948 Index ? Index->getZExtValue() : -1);
1949 ScalarizedCost +=
1950 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1951 Align(1), LI->getPointerAddressSpace(), CostKind);
1952 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1953 nullptr, nullptr, CostKind);
1954 }
1955
1956 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
1957 << "\n LoadExtractCost: " << OriginalCost
1958 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1959
1960 if (ScalarizedCost >= OriginalCost)
1961 return false;
1962
1963 // Ensure we add the load back to the worklist BEFORE its users so they can
1964 // be erased in the correct order.
1965 Worklist.push(LI);
1966
1967 Type *ElemType = VecTy->getElementType();
1968
1969 // Replace extracts with narrow scalar loads.
1970 for (User *U : LI->users()) {
1971 auto *EI = cast<ExtractElementInst>(U);
1972 Value *Idx = EI->getIndexOperand();
1973
1974 // Insert 'freeze' for poison indexes.
1975 auto It = NeedFreeze.find(EI);
1976 if (It != NeedFreeze.end())
1977 It->second.freeze(Builder, *cast<Instruction>(Idx));
1978
1979 Builder.SetInsertPoint(EI);
1980 Value *GEP =
1981 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1982 auto *NewLoad = cast<LoadInst>(
1983 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1984
1985 Align ScalarOpAlignment =
1986 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1987 NewLoad->setAlignment(ScalarOpAlignment);
1988
1989 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1990 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1991 AAMDNodes OldAAMD = LI->getAAMetadata();
1992 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1993 }
1994
1995 replaceValue(*EI, *NewLoad, false);
1996 }
1997
1998 FailureGuard.release();
1999 return true;
2000}
2001
2002/// Try to scalarize vector loads feeding bitcast instructions.
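/// Illustrative example (hypothetical IR), subject to the cost checks below:
///   %v = load <4 x i32>, ptr %p
///   %i = bitcast <4 x i32> %v to i128
/// -->
///   %i = load i128, ptr %p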
2003bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2004 Value *Ptr) {
2005 InstructionCost OriginalCost =
2006 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2007 LI->getPointerAddressSpace(), CostKind);
2008
2009 Type *TargetScalarType = nullptr;
2010 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2011
2012 for (User *U : LI->users()) {
2013 auto *BC = cast<BitCastInst>(U);
2014
2015 Type *DestTy = BC->getDestTy();
2016 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2017 return false;
2018
2019 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2020 if (DestBitWidth != VecBitWidth)
2021 return false;
2022
2023 // All bitcasts must target the same scalar type.
2024 if (!TargetScalarType)
2025 TargetScalarType = DestTy;
2026 else if (TargetScalarType != DestTy)
2027 return false;
2028
2029 OriginalCost +=
2030 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2031 TTI::CastContextHint::None, CostKind);
2032 }
2033
2034 if (!TargetScalarType)
2035 return false;
2036
2037 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2038 InstructionCost ScalarizedCost =
2039 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2040 LI->getPointerAddressSpace(), CostKind);
2041
2042 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2043 << "\n OriginalCost: " << OriginalCost
2044 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2045
2046 if (ScalarizedCost >= OriginalCost)
2047 return false;
2048
2049 // Ensure we add the load back to the worklist BEFORE its users so they can
2050 // be erased in the correct order.
2051 Worklist.push(LI);
2052
2053 Builder.SetInsertPoint(LI);
2054 auto *ScalarLoad =
2055 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2056 ScalarLoad->setAlignment(LI->getAlign());
2057 ScalarLoad->copyMetadata(*LI);
2058
2059 // Replace all bitcast users with the scalar load.
2060 for (User *U : LI->users()) {
2061 auto *BC = cast<BitCastInst>(U);
2062 replaceValue(*BC, *ScalarLoad, false);
2063 }
2064
2065 return true;
2066}
2067
2068 bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2069 if (!TTI.allowVectorElementIndexingUsingGEP())
2070 return false;
2071 auto *Ext = dyn_cast<ZExtInst>(&I);
2072 if (!Ext)
2073 return false;
2074
2075 // Try to convert a vector zext feeding only extracts to a set of scalar
2076 // (Src >> (ExtIdx * EltBitWidth)) & ((1 << EltBitWidth) - 1)
2077 // operations, if profitable.
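// Illustrative example (hypothetical IR, little-endian layout assumed):
//   %ext = zext <4 x i8> %src to <4 x i32>
//   %e1  = extractelement <4 x i32> %ext, i64 1
// -->
//   %s  = bitcast <4 x i8> %src to i32
//   %sh = lshr i32 %s, 8
//   %e1 = and i32 %sh, 255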
2078 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2079 if (!SrcTy)
2080 return false;
2081 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2082
2083 Type *ScalarDstTy = DstTy->getElementType();
2084 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2085 return false;
2086
2087 InstructionCost VectorCost =
2088 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2089 TTI::CastContextHint::None, CostKind);
2090 unsigned ExtCnt = 0;
2091 bool ExtLane0 = false;
2092 for (User *U : Ext->users()) {
2093 uint64_t Idx;
2094 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2095 return false;
2096 if (cast<Instruction>(U)->use_empty())
2097 continue;
2098 ExtCnt += 1;
2099 ExtLane0 |= !Idx;
2100 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2101 CostKind, Idx, U);
2102 }
2103
2104 InstructionCost ScalarCost =
2105 ExtCnt * TTI.getArithmeticInstrCost(
2106 Instruction::And, ScalarDstTy, CostKind,
2107 {TTI::OK_AnyValue, TTI::OP_None},
2108 {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2109 (ExtCnt - ExtLane0) *
2110 TTI.getArithmeticInstrCost(
2111 Instruction::LShr, ScalarDstTy, CostKind,
2112 {TTI::OK_AnyValue, TTI::OP_None},
2113 {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2114 if (ScalarCost > VectorCost)
2115 return false;
2116
2117 Value *ScalarV = Ext->getOperand(0);
2118 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2119 &DT)) {
2120 // Check whether all lanes are extracted, all extracts trigger UB
2121 // on poison, and the last extract (and hence all previous ones)
2122 // are guaranteed to execute if Ext executes. If so, we do not
2123 // need to insert a freeze.
2124 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2125 bool AllExtractsTriggerUB = true;
2126 ExtractElementInst *LastExtract = nullptr;
2127 BasicBlock *ExtBB = Ext->getParent();
2128 for (User *U : Ext->users()) {
2129 auto *Extract = cast<ExtractElementInst>(U);
2130 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2131 AllExtractsTriggerUB = false;
2132 break;
2133 }
2134 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2135 if (!LastExtract || LastExtract->comesBefore(Extract))
2136 LastExtract = Extract;
2137 }
2138 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2139 !AllExtractsTriggerUB ||
2140 !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2141 LastExtract->getIterator()))
2142 ScalarV = Builder.CreateFreeze(ScalarV);
2143 }
2144 ScalarV = Builder.CreateBitCast(
2145 ScalarV,
2146 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2147 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2148 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2149 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2150 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2151 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2152 for (User *U : Ext->users()) {
2153 auto *Extract = cast<ExtractElementInst>(U);
2154 uint64_t Idx =
2155 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2156 uint64_t ShiftAmt =
2157 DL->isBigEndian()
2158 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2159 : (Idx * SrcEltSizeInBits);
2160 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2161 Value *And = Builder.CreateAnd(LShr, Mask);
2162 U->replaceAllUsesWith(And);
2163 }
2164 return true;
2165}
2166
2167/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2168/// to "(bitcast (concat X, Y))"
2169/// where X/Y are bitcasted from i1 mask vectors.
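/// Illustrative example (hypothetical IR):
///   %x  = bitcast <8 x i1> %a to i8
///   %y  = bitcast <8 x i1> %b to i8
///   %zx = zext i8 %x to i16
///   %zy = zext i8 %y to i16
///   %sy = shl i16 %zy, 8
///   %r  = or disjoint i16 %zx, %sy
/// -->
///   %c = shufflevector <8 x i1> %a, <8 x i1> %b,
///                      <16 x i32> <i32 0, i32 1, ..., i32 15>
///   %r = bitcast <16 x i1> %c to i16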
2170bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2171 Type *Ty = I.getType();
2172 if (!Ty->isIntegerTy())
2173 return false;
2174
2175 // TODO: Add big endian test coverage
2176 if (DL->isBigEndian())
2177 return false;
2178
2179 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2180 Instruction *X, *Y;
2181 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2182 return false;
2183
2184 // Allow both sources to contain shl, to handle more generic pattern:
2185 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2186 Value *SrcX;
2187 uint64_t ShAmtX = 0;
2188 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2189 !match(X, m_OneUse(
2190 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2191 m_ConstantInt(ShAmtX)))))
2192 return false;
2193
2194 Value *SrcY;
2195 uint64_t ShAmtY = 0;
2196 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2197 !match(Y, m_OneUse(
2198 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2199 m_ConstantInt(ShAmtY)))))
2200 return false;
2201
2202 // Canonicalize larger shift to the RHS.
2203 if (ShAmtX > ShAmtY) {
2204 std::swap(X, Y);
2205 std::swap(SrcX, SrcY);
2206 std::swap(ShAmtX, ShAmtY);
2207 }
2208
2209 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2210 // difference is the mask width so they can be easily concatenated together.
2211 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2212 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2213 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2214 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2215 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2216 !MaskTy->getElementType()->isIntegerTy(1) ||
2217 MaskTy->getNumElements() != ShAmtDiff ||
2218 MaskTy->getNumElements() > (BitWidth / 2))
2219 return false;
2220
2221 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2222 auto *ConcatIntTy =
2223 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2224 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2225
2226 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2227 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2228
2229 // TODO: Is it worth supporting multi use cases?
2230 InstructionCost OldCost = 0;
2231 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2232 OldCost +=
2233 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2234 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2235 TTI::CastContextHint::None, CostKind);
2236 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2237 TTI::CastContextHint::None, CostKind);
2238
2239 InstructionCost NewCost = 0;
2240 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2241 MaskTy, ConcatMask, CostKind);
2242 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2243 TTI::CastContextHint::None, CostKind);
2244 if (Ty != ConcatIntTy)
2245 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2246 TTI::CastContextHint::None, CostKind);
2247 if (ShAmtX > 0)
2248 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2249
2250 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2251 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2252 << "\n");
2253
2254 if (NewCost > OldCost)
2255 return false;
2256
2257 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2258 // any residual zero-extension or shifting.
2259 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2260 Worklist.pushValue(Concat);
2261
2262 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2263
2264 if (Ty != ConcatIntTy) {
2265 Worklist.pushValue(Result);
2266 Result = Builder.CreateZExt(Result, Ty);
2267 }
2268
2269 if (ShAmtX > 0) {
2270 Worklist.pushValue(Result);
2271 Result = Builder.CreateShl(Result, ShAmtX);
2272 }
2273
2274 replaceValue(I, *Result);
2275 return true;
2276}
2277
2278/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2279/// --> "binop (shuffle), (shuffle)".
2280bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2281 BinaryOperator *BinOp;
2282 ArrayRef<int> OuterMask;
2283 if (!match(&I,
2284 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2285 return false;
2286
2287 // Don't introduce poison into div/rem.
2288 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2289 return false;
2290
2291 Value *Op00, *Op01, *Op10, *Op11;
2292 ArrayRef<int> Mask0, Mask1;
2293 bool Match0 =
2294 match(BinOp->getOperand(0),
2295 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2296 bool Match1 =
2297 match(BinOp->getOperand(1),
2298 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2299 if (!Match0 && !Match1)
2300 return false;
2301
2302 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2303 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2304 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2305 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2306
2307 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2308 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2309 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2310 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2311 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2312 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2313 return false;
2314
2315 unsigned NumSrcElts = BinOpTy->getNumElements();
2316
2317 // Don't accept shuffles that reference the second operand in
2318 // div/rem or if it's an undef arg.
2319 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2320 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2321 return false;
2322
2323 // Merge outer / inner (or identity if no match) shuffles.
2324 SmallVector<int> NewMask0, NewMask1;
2325 for (int M : OuterMask) {
2326 if (M < 0 || M >= (int)NumSrcElts) {
2327 NewMask0.push_back(PoisonMaskElem);
2328 NewMask1.push_back(PoisonMaskElem);
2329 } else {
2330 NewMask0.push_back(Match0 ? Mask0[M] : M);
2331 NewMask1.push_back(Match1 ? Mask1[M] : M);
2332 }
2333 }
2334
2335 unsigned NumOpElts = Op0Ty->getNumElements();
2336 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2337 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2338 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2339 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2340 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2341 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2342
2343 // Try to merge shuffles across the binop if the new shuffles are not costly.
2344 InstructionCost OldCost =
2345 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2346 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2347 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2348 if (Match0)
2349 OldCost += TTI.getShuffleCost(
2350 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2351 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2352 if (Match1)
2353 OldCost += TTI.getShuffleCost(
2354 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2355 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2356
2357 InstructionCost NewCost =
2358 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2359
2360 if (!IsIdentity0)
2361 NewCost +=
2362 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2363 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2364 if (!IsIdentity1)
2365 NewCost +=
2366 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2367 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2368
2369 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2370 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2371 << "\n");
2372
2373 // If costs are equal, still fold as we reduce instruction count.
2374 if (NewCost > OldCost)
2375 return false;
2376
2377 Value *LHS =
2378 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2379 Value *RHS =
2380 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2381 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2382
2383 // Intersect flags from the old binops.
2384 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2385 NewInst->copyIRFlags(BinOp);
2386
2387 Worklist.pushValue(LHS);
2388 Worklist.pushValue(RHS);
2389 replaceValue(I, *NewBO);
2390 return true;
2391}
2392
2393/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2394/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
2395bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2396 ArrayRef<int> OldMask;
2397 Instruction *LHS, *RHS;
2398 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2399 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2400 return false;
2401
2402 // TODO: Add support for addlike etc.
2403 if (LHS->getOpcode() != RHS->getOpcode())
2404 return false;
2405
2406 Value *X, *Y, *Z, *W;
2407 bool IsCommutative = false;
2408 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2409 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2410 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2411 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2412 auto *BO = cast<BinaryOperator>(LHS);
2413 // Don't introduce poison into div/rem.
2414 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2415 return false;
2416 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2417 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2418 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2419 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2420 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2421 } else
2422 return false;
2423
2424 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2425 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2426 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2427 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2428 return false;
2429
2430 unsigned NumSrcElts = BinOpTy->getNumElements();
2431
2432 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2433 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2434 std::swap(X, Y);
2435
2436 auto ConvertToUnary = [NumSrcElts](int &M) {
2437 if (M >= (int)NumSrcElts)
2438 M -= NumSrcElts;
2439 };
2440
2441 SmallVector<int> NewMask0(OldMask);
2442 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2443 if (X == Z) {
2444 llvm::for_each(NewMask0, ConvertToUnary);
2445 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2446 Z = PoisonValue::get(BinOpTy);
2447 }
2448
2449 SmallVector<int> NewMask1(OldMask);
2450 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2451 if (Y == W) {
2452 llvm::for_each(NewMask1, ConvertToUnary);
2453 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2454 W = PoisonValue::get(BinOpTy);
2455 }
2456
2457 // Try to replace a binop with a shuffle if the shuffle is not costly.
2458 InstructionCost OldCost =
2459 TTI.getInstructionCost(LHS, CostKind) +
2460 TTI.getInstructionCost(RHS, CostKind) +
2461 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2462 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2463 &I);
2464
2465 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2466 // where one-use shuffles have gotten split across the binop/cmp. These
2467 // often allow a major reduction in total cost that wouldn't happen as
2468 // individual folds.
2469 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2470 TTI::TargetCostKind CostKind) -> bool {
2471 Value *InnerOp;
2472 ArrayRef<int> InnerMask;
2473 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2474 m_Mask(InnerMask)))) &&
2475 InnerOp->getType() == Op->getType() &&
2476 all_of(InnerMask,
2477 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2478 for (int &M : Mask)
2479 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2480 M = InnerMask[M - Offset];
2481 M = 0 <= M ? M + Offset : M;
2482 }
2483 OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2484 Op = InnerOp;
2485 return true;
2486 }
2487 return false;
2488 };
2489 bool ReducedInstCount = false;
2490 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2491 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2492 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2493 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2494 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2495 ReducedInstCount |= SingleSrcBinOp;
2496
2497 auto *ShuffleCmpTy =
2498 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2499 InstructionCost NewCost = TTI.getShuffleCost(
2500 SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2501 if (!SingleSrcBinOp)
2502 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2503 CostKind, 0, nullptr, {Y, W});
2504
2505 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2506 NewCost +=
2507 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2508 } else {
2509 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2510 ShuffleDstTy, PredLHS, CostKind);
2511 }
2512
2513 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2514 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2515 << "\n");
2516
2517 // If either shuffle will constant fold away, then fold for the same cost as
2518 // we will reduce the instruction count.
2519 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2520 (isa<Constant>(Y) && isa<Constant>(W));
2521 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2522 return false;
2523
2524 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2525 Value *Shuf1 =
2526 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2527 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2528 ? Builder.CreateBinOp(
2529 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2530 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2531
2532 // Intersect flags from the old binops.
2533 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2534 NewInst->copyIRFlags(LHS);
2535 NewInst->andIRFlags(RHS);
2536 }
2537
2538 Worklist.pushValue(Shuf0);
2539 Worklist.pushValue(Shuf1);
2540 replaceValue(I, *NewBO);
2541 return true;
2542}
2543
2544/// Try to convert,
2545/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2546/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
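/// Illustrative example (hypothetical IR, all data operands <4 x i32>):
///   %s0 = select <4 x i1> %c1, <4 x i32> %t1, <4 x i32> %f1
///   %s1 = select <4 x i1> %c2, <4 x i32> %t2, <4 x i32> %f2
///   %r  = shufflevector <4 x i32> %s0, <4 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
/// -->
///   %sc = shufflevector <4 x i1> %c1, <4 x i1> %c2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %st = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %sf = shufflevector <4 x i32> %f1, <4 x i32> %f2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %r  = select <4 x i1> %sc, <4 x i32> %st, <4 x i32> %sf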
2547bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2548 ArrayRef<int> Mask;
2549 Value *C1, *T1, *F1, *C2, *T2, *F2;
2550 if (!match(&I, m_Shuffle(
2551 m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2552 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2553 m_Mask(Mask))))
2554 return false;
2555
2556 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2557 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2558 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2559 return false;
2560
2561 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2562 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2563 // SelectInsts must have the same FMF.
2564 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2565 ((SI0FOp != nullptr) &&
2566 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2567 return false;
2568
2569 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2570 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2571 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2572 auto SelOp = Instruction::Select;
2573 InstructionCost OldCost = TTI.getCmpSelInstrCost(
2574 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2575 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2576 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2577 OldCost +=
2578 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2579 {I.getOperand(0), I.getOperand(1)}, &I);
2580
2581 InstructionCost NewCost = TTI.getShuffleCost(
2582 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2583 Mask, CostKind, 0, nullptr, {C1, C2});
2584 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2585 nullptr, {T1, T2});
2586 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2587 nullptr, {F1, F2});
2588 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2589 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2590 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2591 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2592
2593 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2594 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2595 << "\n");
2596 if (NewCost > OldCost)
2597 return false;
2598
2599 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2600 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2601 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2602 Value *NewSel;
2603 // We presuppose that the SelectInsts have the same FMF.
2604 if (SI0FOp)
2605 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2606 SI0FOp->getFastMathFlags());
2607 else
2608 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2609
2610 Worklist.pushValue(ShuffleCmp);
2611 Worklist.pushValue(ShuffleTrue);
2612 Worklist.pushValue(ShuffleFalse);
2613 replaceValue(I, *NewSel);
2614 return true;
2615}
2616
2617/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2618/// into "castop (shuffle)".
2619bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2620 Value *V0, *V1;
2621 ArrayRef<int> OldMask;
2622 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2623 return false;
2624
2625 // Check whether this is a binary shuffle.
2626 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2627
2628 auto *C0 = dyn_cast<CastInst>(V0);
2629 auto *C1 = dyn_cast<CastInst>(V1);
2630 if (!C0 || (IsBinaryShuffle && !C1))
2631 return false;
2632
2633 Instruction::CastOps Opcode = C0->getOpcode();
2634
2635 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2636 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2637 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2638 return false;
2639
2640 if (IsBinaryShuffle) {
2641 if (C0->getSrcTy() != C1->getSrcTy())
2642 return false;
2643 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2644 if (Opcode != C1->getOpcode()) {
2645 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2646 Opcode = Instruction::SExt;
2647 else
2648 return false;
2649 }
2650 }
2651
2652 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2653 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2654 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2655 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2656 return false;
2657
2658 unsigned NumSrcElts = CastSrcTy->getNumElements();
2659 unsigned NumDstElts = CastDstTy->getNumElements();
2660 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2661 "Only bitcasts expected to alter src/dst element counts");
2662
2663 // Check for bitcasts between vector types whose element counts can't be
2664 // evenly rescaled, e.g. <32 x i40> -> <40 x i32>
2665 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2666 (NumDstElts % NumSrcElts) != 0)
2667 return false;
2668
2669 SmallVector<int, 16> NewMask;
2670 if (NumSrcElts >= NumDstElts) {
2671 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2672 // always be expanded to the equivalent form choosing narrower elements.
2673 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2674 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2675 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2676 } else {
2677 // The bitcast is from narrow elements to wide elements. The shuffle mask
2678 // must choose consecutive elements to allow casting first.
2679 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2680 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2681 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2682 return false;
2683 }
2684
2685 auto *NewShuffleDstTy =
2686 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2687
2688 // Try to replace a castop with a shuffle if the shuffle is not costly.
2689 InstructionCost CostC0 =
2690 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2691 TTI::CastContextHint::None, CostKind);
2692
2693 TargetTransformInfo::ShuffleKind ShuffleKind;
2694 if (IsBinaryShuffle)
2695 ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2696 else
2697 ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2698
2699 InstructionCost OldCost = CostC0;
2700 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2701 CostKind, 0, nullptr, {}, &I);
2702
2703 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2704 CastSrcTy, NewMask, CostKind);
2705 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2706 TTI::CastContextHint::None, CostKind);
2707 if (!C0->hasOneUse())
2708 NewCost += CostC0;
2709 if (IsBinaryShuffle) {
2710 InstructionCost CostC1 =
2711 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2712 TTI::CastContextHint::None, CostKind);
2713 OldCost += CostC1;
2714 if (!C1->hasOneUse())
2715 NewCost += CostC1;
2716 }
2717
2718 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2719 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2720 << "\n");
2721 if (NewCost > OldCost)
2722 return false;
2723
2724 Value *Shuf;
2725 if (IsBinaryShuffle)
2726 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2727 NewMask);
2728 else
2729 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2730
2731 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2732
2733 // Intersect flags from the old casts.
2734 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2735 NewInst->copyIRFlags(C0);
2736 if (IsBinaryShuffle)
2737 NewInst->andIRFlags(C1);
2738 }
2739
2740 Worklist.pushValue(Shuf);
2741 replaceValue(I, *Cast);
2742 return true;
2743}
2744
2745/// Try to convert any of:
2746/// "shuffle (shuffle x, y), (shuffle y, x)"
2747/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2748/// "shuffle (shuffle x, undef), y"
2749/// "shuffle x, (shuffle y, undef)"
2750/// into "shuffle x, y".
2751bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2752 ArrayRef<int> OuterMask;
2753 Value *OuterV0, *OuterV1;
2754 if (!match(&I,
2755 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2756 return false;
2757
2758 ArrayRef<int> InnerMask0, InnerMask1;
2759 Value *X0, *X1, *Y0, *Y1;
2760 bool Match0 =
2761 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2762 bool Match1 =
2763 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2764 if (!Match0 && !Match1)
2765 return false;
2766
2767 // If the outer shuffle is a permute, then create a fake inner all-poison
2768 // shuffle. This is easier than accounting for length-changing shuffles below.
2769 SmallVector<int, 16> PoisonMask1;
2770 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2771 X1 = X0;
2772 Y1 = Y0;
2773 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2774 InnerMask1 = PoisonMask1;
2775 Match1 = true; // fake match
2776 }
2777
2778 X0 = Match0 ? X0 : OuterV0;
2779 Y0 = Match0 ? Y0 : OuterV0;
2780 X1 = Match1 ? X1 : OuterV1;
2781 Y1 = Match1 ? Y1 : OuterV1;
2782 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2783 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2784 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2785 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2786 X0->getType() != X1->getType())
2787 return false;
2788
2789 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2790 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2791
2792 // Attempt to merge shuffles, matching up to 2 source operands.
2793 // Replace index to a poison arg with PoisonMaskElem.
2794 // Bail if either inner masks reference an undef arg.
2795 SmallVector<int, 16> NewMask(OuterMask);
2796 Value *NewX = nullptr, *NewY = nullptr;
2797 for (int &M : NewMask) {
2798 Value *Src = nullptr;
2799 if (0 <= M && M < (int)NumImmElts) {
2800 Src = OuterV0;
2801 if (Match0) {
2802 M = InnerMask0[M];
2803 Src = M >= (int)NumSrcElts ? Y0 : X0;
2804 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2805 }
2806 } else if (M >= (int)NumImmElts) {
2807 Src = OuterV1;
2808 M -= NumImmElts;
2809 if (Match1) {
2810 M = InnerMask1[M];
2811 Src = M >= (int)NumSrcElts ? Y1 : X1;
2812 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2813 }
2814 }
2815 if (Src && M != PoisonMaskElem) {
2816 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2817 if (isa<UndefValue>(Src)) {
2818 // We've referenced an undef element - if it's poison, update the shuffle
2819 // mask, else bail.
2820 if (!isa<PoisonValue>(Src))
2821 return false;
2822 M = PoisonMaskElem;
2823 continue;
2824 }
2825 if (!NewX || NewX == Src) {
2826 NewX = Src;
2827 continue;
2828 }
2829 if (!NewY || NewY == Src) {
2830 M += NumSrcElts;
2831 NewY = Src;
2832 continue;
2833 }
2834 return false;
2835 }
2836 }
2837
2838 if (!NewX)
2839 return PoisonValue::get(ShuffleDstTy);
2840 if (!NewY)
2841 NewY = PoisonValue::get(ShuffleSrcTy);
2842
2843 // Have we folded to an Identity shuffle?
2844 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2845 replaceValue(I, *NewX);
2846 return true;
2847 }
2848
2849 // Try to merge the shuffles if the new shuffle is not costly.
2850 InstructionCost InnerCost0 = 0;
2851 if (Match0)
2852 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2853
2854 InstructionCost InnerCost1 = 0;
2855 if (Match1)
2856 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2857
2858 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
2859
2860 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2861
2862 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2863 TargetTransformInfo::ShuffleKind SK =
2864 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2865 : TargetTransformInfo::SK_PermuteTwoSrc;
2866 InstructionCost NewCost =
2867 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2868 nullptr, {NewX, NewY});
2869 if (!OuterV0->hasOneUse())
2870 NewCost += InnerCost0;
2871 if (!OuterV1->hasOneUse())
2872 NewCost += InnerCost1;
2873
2874 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2875 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2876 << "\n");
2877 if (NewCost > OldCost)
2878 return false;
2879
2880 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2881 replaceValue(I, *Shuf);
2882 return true;
2883}
2884
2885/// Try to convert a chain of length-preserving shuffles that are fed by
2886/// length-changing shuffles from the same source, e.g. a chain of length 3:
2887///
2888/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
2889/// (shuffle y, undef)),
2890// (shuffle y, undef)"
2891///
2892/// into a single shuffle fed by a length-changing shuffle:
2893///
2894/// "shuffle x, (shuffle y, undef)"
2895///
2896/// Such chains arise e.g. from folding extract/insert sequences.
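/// Illustrative example (hypothetical IR, chain of length 2 with
/// %x: <4 x i32> and %y: <2 x i32>):
///   %l0 = shufflevector <2 x i32> %y, <2 x i32> poison,
///                       <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>
///   %t0 = shufflevector <4 x i32> %x, <4 x i32> %l0, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
///   %l1 = shufflevector <2 x i32> %y, <2 x i32> poison,
///                       <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
///   %r  = shufflevector <4 x i32> %t0, <4 x i32> %l1, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
/// -->
///   %leaf = shufflevector <2 x i32> %y, <2 x i32> poison,
///                         <4 x i32> <i32 1, i32 0, i32 poison, i32 poison>
///   %r    = shufflevector <4 x i32> %x, <4 x i32> %leaf, <4 x i32> <i32 0, i32 1, i32 4, i32 5>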
2897bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
2898 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
2899 if (!TrunkType)
2900 return false;
2901
2902 unsigned ChainLength = 0;
2903 SmallVector<int> Mask;
2904 SmallVector<int> YMask;
2905 InstructionCost OldCost = 0;
2906 InstructionCost NewCost = 0;
2907 Value *Trunk = &I;
2908 unsigned NumTrunkElts = TrunkType->getNumElements();
2909 Value *Y = nullptr;
2910
2911 for (;;) {
2912 // Match the current trunk against (commutations of) the pattern
2913 // "shuffle trunk', (shuffle y, undef)"
2914 ArrayRef<int> OuterMask;
2915 Value *OuterV0, *OuterV1;
2916 if (ChainLength != 0 && !Trunk->hasOneUse())
2917 break;
2918 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
2919 m_Mask(OuterMask))))
2920 break;
2921 if (OuterV0->getType() != TrunkType) {
2922 // This shuffle is not length-preserving, so it cannot be part of the
2923 // chain.
2924 break;
2925 }
2926
2927 ArrayRef<int> InnerMask0, InnerMask1;
2928 Value *A0, *A1, *B0, *B1;
2929 bool Match0 =
2930 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
2931 bool Match1 =
2932 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
2933 bool Match0Leaf = Match0 && A0->getType() != I.getType();
2934 bool Match1Leaf = Match1 && A1->getType() != I.getType();
2935 if (Match0Leaf == Match1Leaf) {
2936 // Only handle the case of exactly one leaf in each step. The "two leaves"
2937 // case is handled by foldShuffleOfShuffles.
2938 break;
2939 }
2940
2941 SmallVector<int> CommutedOuterMask;
2942 if (Match0Leaf) {
2943 std::swap(OuterV0, OuterV1);
2944 std::swap(InnerMask0, InnerMask1);
2945 std::swap(A0, A1);
2946 std::swap(B0, B1);
2947 llvm::append_range(CommutedOuterMask, OuterMask);
2948 for (int &M : CommutedOuterMask) {
2949 if (M == PoisonMaskElem)
2950 continue;
2951 if (M < (int)NumTrunkElts)
2952 M += NumTrunkElts;
2953 else
2954 M -= NumTrunkElts;
2955 }
2956 OuterMask = CommutedOuterMask;
2957 }
2958 if (!OuterV1->hasOneUse())
2959 break;
2960
2961 if (!isa<UndefValue>(A1)) {
2962 if (!Y)
2963 Y = A1;
2964 else if (Y != A1)
2965 break;
2966 }
2967 if (!isa<UndefValue>(B1)) {
2968 if (!Y)
2969 Y = B1;
2970 else if (Y != B1)
2971 break;
2972 }
2973
2974 auto *YType = cast<FixedVectorType>(A1->getType());
2975 int NumLeafElts = YType->getNumElements();
2976 SmallVector<int> LocalYMask(InnerMask1);
2977 for (int &M : LocalYMask) {
2978 if (M >= NumLeafElts)
2979 M -= NumLeafElts;
2980 }
2981
2982 InstructionCost LocalOldCost =
2983 TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind) +
2984 TTI.getInstructionCost(cast<Instruction>(Trunk), CostKind);
2985
2986 // Handle the initial (start of chain) case.
2987 if (!ChainLength) {
2988 Mask.assign(OuterMask);
2989 YMask.assign(LocalYMask);
2990 OldCost = NewCost = LocalOldCost;
2991 Trunk = OuterV0;
2992 ChainLength++;
2993 continue;
2994 }
2995
2996 // For the non-root case, first attempt to combine masks.
2997 SmallVector<int> NewYMask(YMask);
2998 bool Valid = true;
2999 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3000 if (LeafM == -1 || CombinedM == LeafM)
3001 continue;
3002 if (CombinedM == -1) {
3003 CombinedM = LeafM;
3004 } else {
3005 Valid = false;
3006 break;
3007 }
3008 }
3009 if (!Valid)
3010 break;
3011
3012 SmallVector<int> NewMask;
3013 NewMask.reserve(NumTrunkElts);
3014 for (int M : Mask) {
3015 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3016 NewMask.push_back(M);
3017 else
3018 NewMask.push_back(OuterMask[M]);
3019 }
3020
3021 // Break the chain if adding this new step complicates the shuffles such
3022 // that it would increase the new cost by more than the old cost of this
3023 // step.
3024 InstructionCost LocalNewCost =
3025 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, TrunkType,
3026 YType, NewYMask, CostKind) +
3027 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, TrunkType,
3028 TrunkType, NewMask, CostKind);
3029
3030 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3031 break;
3032
3033 LLVM_DEBUG({
3034 if (ChainLength == 1) {
3035 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3036 << I << '\n';
3037 }
3038 dbgs() << " next chain link: " << *Trunk << '\n'
3039 << " old cost: " << (OldCost + LocalOldCost)
3040 << " new cost: " << LocalNewCost << '\n';
3041 });
3042
3043 Mask = NewMask;
3044 YMask = NewYMask;
3045 OldCost += LocalOldCost;
3046 NewCost = LocalNewCost;
3047 Trunk = OuterV0;
3048 ChainLength++;
3049 }
3050 if (ChainLength <= 1)
3051 return false;
3052
3053 if (llvm::all_of(Mask, [&](int M) {
3054 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3055 })) {
3056 // Produce a canonical simplified form if all elements are sourced from Y.
3057 for (int &M : Mask) {
3058 if (M >= static_cast<int>(NumTrunkElts))
3059 M = YMask[M - NumTrunkElts];
3060 }
3061 Value *Root =
3062 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3063 replaceValue(I, *Root);
3064 return true;
3065 }
3066
3067 Value *Leaf =
3068 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3069 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3070 replaceValue(I, *Root);
3071 return true;
3072}
3073
3074/// Try to convert
3075/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
3076bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3077 Value *V0, *V1;
3078 ArrayRef<int> OldMask;
3079 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
3080 m_Mask(OldMask))))
3081 return false;
3082
3083 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3084 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3085 if (!II0 || !II1)
3086 return false;
3087
3088 Intrinsic::ID IID = II0->getIntrinsicID();
3089 if (IID != II1->getIntrinsicID())
3090 return false;
3091
3092 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3093 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3094 if (!ShuffleDstTy || !II0Ty)
3095 return false;
3096
3097 if (!isTriviallyVectorizable(IID))
3098 return false;
3099
3100 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3101     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
3102 II0->getArgOperand(I) != II1->getArgOperand(I))
3103 return false;
3104
3105 InstructionCost OldCost =
3106 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
3107 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
3108       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
3109 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3110
3111 SmallVector<Type *> NewArgsTy;
3112 InstructionCost NewCost = 0;
3113 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3114 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3115     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3116 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3117 } else {
3118 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3119 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3120 ShuffleDstTy->getNumElements());
3121 NewArgsTy.push_back(ArgTy);
3122 std::pair<Value *, Value *> OperandPair =
3123 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3124 if (!SeenOperandPairs.insert(OperandPair).second) {
3125 // We've already computed the cost for this operand pair.
3126 continue;
3127 }
3128 NewCost += TTI.getShuffleCost(
3129 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3130 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3131 }
3132 }
3133 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3134 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3135
3136 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3137 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3138 << "\n");
3139
3140 if (NewCost > OldCost)
3141 return false;
3142
3143 SmallVector<Value *> NewArgs;
3144 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3145 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3146     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3147 NewArgs.push_back(II0->getArgOperand(I));
3148 } else {
3149 std::pair<Value *, Value *> OperandPair =
3150 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3151 auto It = ShuffleCache.find(OperandPair);
3152 if (It != ShuffleCache.end()) {
3153 // Reuse previously created shuffle for this operand pair.
3154 NewArgs.push_back(It->second);
3155 continue;
3156 }
3157 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3158 II1->getArgOperand(I), OldMask);
3159 ShuffleCache[OperandPair] = Shuf;
3160 NewArgs.push_back(Shuf);
3161 Worklist.pushValue(Shuf);
3162 }
3163 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3164
3165 // Intersect flags from the old intrinsics.
3166 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3167 NewInst->copyIRFlags(II0);
3168 NewInst->andIRFlags(II1);
3169 }
3170
3171 replaceValue(I, *NewIntrinsic);
3172 return true;
3173}
3174
3175/// Try to convert
3176/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
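///
/// An illustrative sketch (editorial example; scalar operands such as the i1
/// flag of llvm.abs are left untouched, and profitability is cost-model
/// dependent):
///   %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %x, i1 false)
///   %r = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
/// becomes
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %s, i1 false)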
3177bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3178 Value *V0;
3179 ArrayRef<int> Mask;
3180 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Undef(), m_Mask(Mask))))
3181 return false;
3182
3183 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3184 if (!II0)
3185 return false;
3186
3187 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3188 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3189 if (!ShuffleDstTy || !IntrinsicSrcTy)
3190 return false;
3191
3192   // Validate that this is a pure permute; the mask may only reference the first vector.
3193 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3194 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3195 return false;
3196
3197 Intrinsic::ID IID = II0->getIntrinsicID();
3198 if (!isTriviallyVectorizable(IID))
3199 return false;
3200
3201 // Cost analysis
3202 InstructionCost OldCost =
3203 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
3204       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
3205 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3206
3207 SmallVector<Type *> NewArgsTy;
3208 InstructionCost NewCost = 0;
3209 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3210     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3211 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3212 } else {
3213 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3214 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3215 ShuffleDstTy->getNumElements());
3216 NewArgsTy.push_back(ArgTy);
3217       NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3218 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3219 {II0->getArgOperand(I)});
3220 }
3221 }
3222 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3223 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3224
3225 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3226 << OldCost << " vs NewCost: " << NewCost << "\n");
3227
3228 if (NewCost > OldCost)
3229 return false;
3230
3231 // Transform
3232 SmallVector<Value *> NewArgs;
3233 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3234     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3235 NewArgs.push_back(II0->getArgOperand(I));
3236 } else {
3237 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3238 NewArgs.push_back(Shuf);
3239 Worklist.pushValue(Shuf);
3240 }
3241 }
3242
3243 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3244
3245 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3246 NewInst->copyIRFlags(II0);
3247
3248 replaceValue(I, *NewIntrinsic);
3249 return true;
3250}
3251
3252using InstLane = std::pair<Use *, int>;
3253
3254static InstLane lookThroughShuffles(Use *U, int Lane) {
3255 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
3256 unsigned NumElts =
3257 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3258 int M = SV->getMaskValue(Lane);
3259 if (M < 0)
3260 return {nullptr, PoisonMaskElem};
3261 if (static_cast<unsigned>(M) < NumElts) {
3262 U = &SV->getOperandUse(0);
3263 Lane = M;
3264 } else {
3265 U = &SV->getOperandUse(1);
3266 Lane = M - NumElts;
3267 }
3268 }
3269 return InstLane{U, Lane};
3270}
3271
3272 static SmallVector<InstLane>
3273 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
3274   SmallVector<InstLane> NItem;
3275 for (InstLane IL : Item) {
3276 auto [U, Lane] = IL;
3277 InstLane OpLane =
3278 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
3279 Lane)
3280 : InstLane{nullptr, PoisonMaskElem};
3281 NItem.emplace_back(OpLane);
3282 }
3283 return NItem;
3284}
3285
3286/// Detect concat of multiple values into a vector
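///
/// For example (editorial illustration), with two <2 x i32> leaves A and B the
/// per-lane item list
///   [(A,0), (A,1), (B,0), (B,1)]
/// is treated as a concat leaf when the target reports the two-source <4 x i32>
/// concatenation shuffle (mask <0, 1, 2, 3>) as having zero cost.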
3287 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
3288                          const TargetTransformInfo &TTI) {
3289 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3290 unsigned NumElts = Ty->getNumElements();
3291 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3292 return false;
3293
3294 // Check that the concat is free, usually meaning that the type will be split
3295 // during legalization.
3296 SmallVector<int, 16> ConcatMask(NumElts * 2);
3297 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3298 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3299 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3300 Ty, ConcatMask, CostKind) != 0)
3301 return false;
3302
3303 unsigned NumSlices = Item.size() / NumElts;
3304 // Currently we generate a tree of shuffles for the concats, which limits us
3305 // to a power-of-2 number of slices.
3306 if (!isPowerOf2_32(NumSlices))
3307 return false;
3308 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3309 Use *SliceV = Item[Slice * NumElts].first;
3310 if (!SliceV || SliceV->get()->getType() != Ty)
3311 return false;
3312 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3313 auto [V, Lane] = Item[Slice * NumElts + Elt];
3314 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3315 return false;
3316 }
3317 }
3318 return true;
3319}
3320
3321 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
3322 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3323 const SmallPtrSet<Use *, 4> &SplatLeafs,
3324 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3325 IRBuilderBase &Builder,
3326 const TargetTransformInfo *TTI) {
3327 auto [FrontU, FrontLane] = Item.front();
3328
3329 if (IdentityLeafs.contains(FrontU)) {
3330 return FrontU->get();
3331 }
3332 if (SplatLeafs.contains(FrontU)) {
3333 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3334 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3335 }
3336 if (ConcatLeafs.contains(FrontU)) {
3337 unsigned NumElts =
3338 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3339 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3340 for (unsigned S = 0; S < Values.size(); ++S)
3341 Values[S] = Item[S * NumElts].first->get();
3342
3343 while (Values.size() > 1) {
3344 NumElts *= 2;
3345 SmallVector<int, 16> Mask(NumElts, 0);
3346 std::iota(Mask.begin(), Mask.end(), 0);
3347 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3348 for (unsigned S = 0; S < NewValues.size(); ++S)
3349 NewValues[S] =
3350 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3351 Values = NewValues;
3352 }
3353 return Values[0];
3354 }
3355
3356 auto *I = cast<Instruction>(FrontU->get());
3357 auto *II = dyn_cast<IntrinsicInst>(I);
3358 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3359   SmallVector<Value *> Ops(NumOps);
3360 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3361 if (II &&
3362 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3363 Ops[Idx] = II->getOperand(Idx);
3364 continue;
3365 }
3366     Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
3367 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3368 Builder, TTI);
3369 }
3370
3371 SmallVector<Value *, 8> ValueList;
3372 for (const auto &Lane : Item)
3373 if (Lane.first)
3374 ValueList.push_back(Lane.first->get());
3375
3376 Type *DstTy =
3377 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3378 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3379 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3380 Ops[0], Ops[1]);
3381 propagateIRFlags(Value, ValueList);
3382 return Value;
3383 }
3384 if (auto *CI = dyn_cast<CmpInst>(I)) {
3385 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3386 propagateIRFlags(Value, ValueList);
3387 return Value;
3388 }
3389 if (auto *SI = dyn_cast<SelectInst>(I)) {
3390 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3391 propagateIRFlags(Value, ValueList);
3392 return Value;
3393 }
3394 if (auto *CI = dyn_cast<CastInst>(I)) {
3395 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3396 propagateIRFlags(Value, ValueList);
3397 return Value;
3398 }
3399 if (II) {
3400 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3401 propagateIRFlags(Value, ValueList);
3402 return Value;
3403 }
3404 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3405 auto *Value =
3406 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3407 propagateIRFlags(Value, ValueList);
3408 return Value;
3409}
3410
3411// Starting from a shuffle, look up through operands tracking the shuffled index
3412// of each lane. If we can simplify away the shuffles to identities then
3413// do so.
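//
// A representative case this handles (editorial example): splitting a vector,
// operating on the halves, and concatenating the results back together can be
// rewritten as a single full-width operation:
//   %al = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %ah = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
//   %bl = shufflevector <8 x i8> %b, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %bh = shufflevector <8 x i8> %b, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
//   %sl = add <4 x i8> %al, %bl
//   %sh = add <4 x i8> %ah, %bh
//   %r  = shufflevector <4 x i8> %sl, <4 x i8> %sh, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// becomes
//   %r  = add <8 x i8> %a, %b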
3414bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3415 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3416 if (!Ty || I.use_empty())
3417 return false;
3418
3419 SmallVector<InstLane> Start(Ty->getNumElements());
3420 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3421 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3422
3423   SmallVector<SmallVector<InstLane>> Worklist;
3424 Worklist.push_back(Start);
3425 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3426 unsigned NumVisited = 0;
3427
3428 while (!Worklist.empty()) {
3429 if (++NumVisited > MaxInstrsToScan)
3430 return false;
3431
3432 SmallVector<InstLane> Item = Worklist.pop_back_val();
3433 auto [FrontU, FrontLane] = Item.front();
3434
3435 // If we found an undef first lane then bail out to keep things simple.
3436 if (!FrontU)
3437 return false;
3438
3439 // Helper to peek through bitcasts to the same value.
3440 auto IsEquiv = [&](Value *X, Value *Y) {
3441 return X->getType() == Y->getType() &&
3443 };
3444
3445 // Look for an identity value.
3446 if (FrontLane == 0 &&
3447 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3448 Ty->getNumElements() &&
3449 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3450 Value *FrontV = Item.front().first->get();
3451 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3452 E.value().second == (int)E.index());
3453 })) {
3454 IdentityLeafs.insert(FrontU);
3455 continue;
3456 }
3457 // Look for constants, for the moment only supporting constant splats.
3458 if (auto *C = dyn_cast<Constant>(FrontU);
3459 C && C->getSplatValue() &&
3460 all_of(drop_begin(Item), [Item](InstLane &IL) {
3461 Value *FrontV = Item.front().first->get();
3462 Use *U = IL.first;
3463 return !U || (isa<Constant>(U->get()) &&
3464 cast<Constant>(U->get())->getSplatValue() ==
3465 cast<Constant>(FrontV)->getSplatValue());
3466 })) {
3467 SplatLeafs.insert(FrontU);
3468 continue;
3469 }
3470 // Look for a splat value.
3471 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3472 auto [FrontU, FrontLane] = Item.front();
3473 auto [U, Lane] = IL;
3474 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3475 })) {
3476 SplatLeafs.insert(FrontU);
3477 continue;
3478 }
3479
3480 // We need each element to be the same type of value, and check that each
3481 // element has a single use.
3482 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3483 Value *FrontV = Item.front().first->get();
3484 if (!IL.first)
3485 return true;
3486 Value *V = IL.first->get();
3487 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3488 return false;
3489 if (V->getValueID() != FrontV->getValueID())
3490 return false;
3491 if (auto *CI = dyn_cast<CmpInst>(V))
3492 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3493 return false;
3494 if (auto *CI = dyn_cast<CastInst>(V))
3495 if (CI->getSrcTy()->getScalarType() !=
3496 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3497 return false;
3498 if (auto *SI = dyn_cast<SelectInst>(V))
3499 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3500 SI->getOperand(0)->getType() !=
3501 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3502 return false;
3503 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3504 return false;
3505 auto *II = dyn_cast<IntrinsicInst>(V);
3506 return !II || (isa<IntrinsicInst>(FrontV) &&
3507 II->getIntrinsicID() ==
3508 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3509 !II->hasOperandBundles());
3510 };
3511 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3512 // Check the operator is one that we support.
3513 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3514 // We exclude div/rem in case they hit UB from poison lanes.
3515 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3516 BO && BO->isIntDivRem())
3517 return false;
3518         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3519         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3520 continue;
3521 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3522 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3523         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3524 continue;
3525 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3526 // TODO: Handle vector widening/narrowing bitcasts.
3527 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3528 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3529 if (DstTy && SrcTy &&
3530 SrcTy->getNumElements() == DstTy->getNumElements()) {
3531           Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3532 continue;
3533 }
3534 } else if (isa<SelectInst>(FrontU)) {
3535         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3536         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3537         Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3538 continue;
3539 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3540 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3541 !II->hasOperandBundles()) {
3542 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3543 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3544 &TTI)) {
3545 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3546 Value *FrontV = Item.front().first->get();
3547 Use *U = IL.first;
3548 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3549 cast<Instruction>(FrontV)->getOperand(Op));
3550 }))
3551 return false;
3552 continue;
3553 }
3554           Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3555 }
3556 continue;
3557 }
3558 }
3559
3560 if (isFreeConcat(Item, CostKind, TTI)) {
3561 ConcatLeafs.insert(FrontU);
3562 continue;
3563 }
3564
3565 return false;
3566 }
3567
3568 if (NumVisited <= 1)
3569 return false;
3570
3571 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3572
3573 // If we got this far, we know the shuffles are superfluous and can be
3574 // removed. Scan through again and generate the new tree of instructions.
3575 Builder.SetInsertPoint(&I);
3576 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3577 ConcatLeafs, Builder, &TTI);
3578 replaceValue(I, *V);
3579 return true;
3580}
3581
3582/// Given a commutative reduction, the order of the input lanes does not alter
3583/// the results. We can use this to remove certain shuffles feeding the
3584/// reduction, removing the need to shuffle at all.
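///
/// For instance (editorial example, applied only when the cost model agrees):
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// Sorting the mask yields the identity mask <0, 1, 2, 3>, so the reverse
/// shuffle is replaced and the reduction ends up reading %x directly.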
3585bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3586 auto *II = dyn_cast<IntrinsicInst>(&I);
3587 if (!II)
3588 return false;
3589 switch (II->getIntrinsicID()) {
3590 case Intrinsic::vector_reduce_add:
3591 case Intrinsic::vector_reduce_mul:
3592 case Intrinsic::vector_reduce_and:
3593 case Intrinsic::vector_reduce_or:
3594 case Intrinsic::vector_reduce_xor:
3595 case Intrinsic::vector_reduce_smin:
3596 case Intrinsic::vector_reduce_smax:
3597 case Intrinsic::vector_reduce_umin:
3598 case Intrinsic::vector_reduce_umax:
3599 break;
3600 default:
3601 return false;
3602 }
3603
3604 // Find all the inputs when looking through operations that do not alter the
3605 // lane order (binops, for example). Currently we look for a single shuffle,
3606 // and can ignore splat values.
3607 std::queue<Value *> Worklist;
3608 SmallPtrSet<Value *, 4> Visited;
3609 ShuffleVectorInst *Shuffle = nullptr;
3610 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3611 Worklist.push(Op);
3612
3613 while (!Worklist.empty()) {
3614 Value *CV = Worklist.front();
3615 Worklist.pop();
3616 if (Visited.contains(CV))
3617 continue;
3618
3619 // Splats don't change the order, so can be safely ignored.
3620 if (isSplatValue(CV))
3621 continue;
3622
3623 Visited.insert(CV);
3624
3625 if (auto *CI = dyn_cast<Instruction>(CV)) {
3626 if (CI->isBinaryOp()) {
3627 for (auto *Op : CI->operand_values())
3628 Worklist.push(Op);
3629 continue;
3630 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3631 if (Shuffle && Shuffle != SV)
3632 return false;
3633 Shuffle = SV;
3634 continue;
3635 }
3636 }
3637
3638 // Anything else is currently an unknown node.
3639 return false;
3640 }
3641
3642 if (!Shuffle)
3643 return false;
3644
3645 // Check all uses of the binary ops and shuffles are also included in the
3646 // lane-invariant operations (Visited should be the list of lanewise
3647 // instructions, including the shuffle that we found).
3648 for (auto *V : Visited)
3649 for (auto *U : V->users())
3650 if (!Visited.contains(U) && U != &I)
3651 return false;
3652
3653 FixedVectorType *VecType =
3654 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3655 if (!VecType)
3656 return false;
3657 FixedVectorType *ShuffleInputType =
3658       dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3659 if (!ShuffleInputType)
3660 return false;
3661 unsigned NumInputElts = ShuffleInputType->getNumElements();
3662
3663 // Find the mask from sorting the lanes into order. This is most likely to
3664   // become an identity or concat mask. Undef elements are pushed to the end.
3665 SmallVector<int> ConcatMask;
3666 Shuffle->getShuffleMask(ConcatMask);
3667 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3668 bool UsesSecondVec =
3669 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3670
3671   InstructionCost OldCost = TTI.getShuffleCost(
3672 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3673 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3674   InstructionCost NewCost = TTI.getShuffleCost(
3675 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3676 ShuffleInputType, ConcatMask, CostKind);
3677
3678 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3679 << "\n");
3680 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3681 << "\n");
3682 bool MadeChanges = false;
3683 if (NewCost < OldCost) {
3684 Builder.SetInsertPoint(Shuffle);
3685 Value *NewShuffle = Builder.CreateShuffleVector(
3686 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3687 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3688 replaceValue(*Shuffle, *NewShuffle);
3689 return true;
3690 }
3691
3692   // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3693   // the shuffle or put it into a nicer order, as the reduction ignores lane order.
3694 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3695 return MadeChanges;
3696}
3697
3698/// For a given chain of patterns of the following form:
3699///
3700/// ```
3701/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3702///
3703/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3704/// ty1> %1)
3705/// OR
3706/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3707///
3708/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3709/// ...
3710/// ...
3711/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3712/// 3), <n x ty1> %(i - 2)
3713/// OR
3714/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3715///
3716/// %(i) = extractelement <n x ty1> %(i - 1), 0
3717/// ```
3718///
3719/// Where:
3720/// `mask` follows a partition pattern:
3721///
3722/// Ex:
3723/// [n = 8, p = poison]
3724///
3725/// 4 5 6 7 | p p p p
3726/// 2 3 | p p p p p p
3727/// 1 | p p p p p p p
3728///
3729/// For powers of 2, there's a consistent pattern, but for other cases
3730/// the parity of the current half value at each step decides the
3731 /// next partition half (see `ExpectedParityMask` below for how this parity is
3732 /// tracked and generalised).
3733///
3734/// Ex:
3735/// [n = 6]
3736///
3737/// 3 4 5 | p p p
3738/// 1 2 | p p p p
3739/// 1 | p p p p p
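///
/// As a concrete sketch for n = 4 (editorial example; the rewrite is only
/// applied when the reduction intrinsic is cheaper per the cost model):
///   %s1 = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
///   %m1 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %x, <4 x i16> %s1)
///   %s2 = shufflevector <4 x i16> %m1, <4 x i16> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
///   %m2 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %m1, <4 x i16> %s2)
///   %r  = extractelement <4 x i16> %m2, i64 0
/// becomes
///   %r  = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %x)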
3740bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3741 // Going bottom-up for the pattern.
3742 std::queue<Value *> InstWorklist;
3743 InstructionCost OrigCost = 0;
3744
3745 // Common instruction operation after each shuffle op.
3746 std::optional<unsigned int> CommonCallOp = std::nullopt;
3747 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3748
3749 bool IsFirstCallOrBinInst = true;
3750 bool ShouldBeCallOrBinInst = true;
3751
3752 // This stores the last used instructions for shuffle/common op.
3753 //
3754 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3755 // instructions from either shuffle/common op.
3756 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3757
3758 Value *VecOpEE;
3759 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3760 return false;
3761
3762 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3763 if (!FVT)
3764 return false;
3765
3766 int64_t VecSize = FVT->getNumElements();
3767 if (VecSize < 2)
3768 return false;
3769
3770 // Number of levels would be ~log2(n), considering we always partition
3771 // by half for this fold pattern.
3772 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3773 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3774
3775   // This is how we generalise to any vector length, not just powers of 2.
3776 // At each step, if vector size is odd, we need non-poison
3777 // values to cover the dominant half so we don't miss out on any element.
3778 //
3779 // This mask will help us retrieve this as we go from bottom to top:
3780 //
3781 // Mask Set -> N = N * 2 - 1
3782 // Mask Unset -> N = N * 2
3783 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3784 Cur = (Cur + 1) / 2, --Mask) {
3785 if (Cur & 1)
3786 ExpectedParityMask |= (1ll << Mask);
3787 }
3788
3789 InstWorklist.push(VecOpEE);
3790
3791 while (!InstWorklist.empty()) {
3792 Value *CI = InstWorklist.front();
3793 InstWorklist.pop();
3794
3795 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3796 if (!ShouldBeCallOrBinInst)
3797 return false;
3798
3799 if (!IsFirstCallOrBinInst &&
3800 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3801 return false;
3802
3803 // For the first found call/bin op, the vector has to come from the
3804 // extract element op.
3805 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3806 return false;
3807 IsFirstCallOrBinInst = false;
3808
3809 if (!CommonCallOp)
3810 CommonCallOp = II->getIntrinsicID();
3811 if (II->getIntrinsicID() != *CommonCallOp)
3812 return false;
3813
3814 switch (II->getIntrinsicID()) {
3815 case Intrinsic::umin:
3816 case Intrinsic::umax:
3817 case Intrinsic::smin:
3818 case Intrinsic::smax: {
3819 auto *Op0 = II->getOperand(0);
3820 auto *Op1 = II->getOperand(1);
3821 PrevVecV[0] = Op0;
3822 PrevVecV[1] = Op1;
3823 break;
3824 }
3825 default:
3826 return false;
3827 }
3828 ShouldBeCallOrBinInst ^= 1;
3829
3830 IntrinsicCostAttributes ICA(
3831 *CommonCallOp, II->getType(),
3832 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3833 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3834
3835       // The operands may come in either order, (a, b) or (b, a); swap if needed
3836       // so that PrevVecV[1] is the shuffle operand as we walk up the chain.
3837 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3838 std::swap(PrevVecV[0], PrevVecV[1]);
3839 InstWorklist.push(PrevVecV[1]);
3840 InstWorklist.push(PrevVecV[0]);
3841 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3842 // Similar logic for bin ops.
3843
3844 if (!ShouldBeCallOrBinInst)
3845 return false;
3846
3847 if (!IsFirstCallOrBinInst &&
3848 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3849 return false;
3850
3851 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3852 return false;
3853 IsFirstCallOrBinInst = false;
3854
3855 if (!CommonBinOp)
3856 CommonBinOp = BinOp->getOpcode();
3857
3858 if (BinOp->getOpcode() != *CommonBinOp)
3859 return false;
3860
3861 switch (*CommonBinOp) {
3862 case BinaryOperator::Add:
3863 case BinaryOperator::Mul:
3864 case BinaryOperator::Or:
3865 case BinaryOperator::And:
3866 case BinaryOperator::Xor: {
3867 auto *Op0 = BinOp->getOperand(0);
3868 auto *Op1 = BinOp->getOperand(1);
3869 PrevVecV[0] = Op0;
3870 PrevVecV[1] = Op1;
3871 break;
3872 }
3873 default:
3874 return false;
3875 }
3876 ShouldBeCallOrBinInst ^= 1;
3877
3878 OrigCost +=
3879 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3880
3881 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3882 std::swap(PrevVecV[0], PrevVecV[1]);
3883 InstWorklist.push(PrevVecV[1]);
3884 InstWorklist.push(PrevVecV[0]);
3885 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3886 // We shouldn't have any null values in the previous vectors,
3887       // if so, there was a mismatch in the pattern.
3888 if (ShouldBeCallOrBinInst ||
3889 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3890 return false;
3891
3892 if (SVInst != PrevVecV[1])
3893 return false;
3894
3895 ArrayRef<int> CurMask;
3896 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3897 m_Mask(CurMask))))
3898 return false;
3899
3900 // Subtract the parity mask when checking the condition.
3901 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3902 if (Mask < ShuffleMaskHalf &&
3903 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3904 return false;
3905 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3906 return false;
3907 }
3908
3909 // Update mask values.
3910 ShuffleMaskHalf *= 2;
3911 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3912 ExpectedParityMask >>= 1;
3913
3914       OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3915 SVInst->getType(), SVInst->getType(),
3916 CurMask, CostKind);
3917
3918 VisitedCnt += 1;
3919 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3920 break;
3921
3922 ShouldBeCallOrBinInst ^= 1;
3923 } else {
3924 return false;
3925 }
3926 }
3927
3928 // Pattern should end with a shuffle op.
3929 if (ShouldBeCallOrBinInst)
3930 return false;
3931
3932 assert(VecSize != -1 && "Expected Match for Vector Size");
3933
3934 Value *FinalVecV = PrevVecV[0];
3935 if (!FinalVecV)
3936 return false;
3937
3938 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3939
3940 Intrinsic::ID ReducedOp =
3941 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3942 : getReductionForBinop(*CommonBinOp));
3943 if (!ReducedOp)
3944 return false;
3945
3946 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3947   InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3948
3949 if (NewCost >= OrigCost)
3950 return false;
3951
3952 auto *ReducedResult =
3953 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3954 replaceValue(I, *ReducedResult);
3955
3956 return true;
3957}
3958
3959 /// Determine if it's more efficient to fold:
3960/// reduce(trunc(x)) -> trunc(reduce(x)).
3961/// reduce(sext(x)) -> sext(reduce(x)).
3962/// reduce(zext(x)) -> zext(reduce(x)).
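///
/// For example (editorial sketch; legality holds because truncation commutes
/// with the modular add, and the fold is applied only when it is cheaper):
///   %t = trunc <8 x i16> %x to <8 x i8>
///   %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t)
/// becomes
///   %a = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
///   %r = trunc i16 %a to i8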
3963bool VectorCombine::foldCastFromReductions(Instruction &I) {
3964 auto *II = dyn_cast<IntrinsicInst>(&I);
3965 if (!II)
3966 return false;
3967
3968 bool TruncOnly = false;
3969 Intrinsic::ID IID = II->getIntrinsicID();
3970 switch (IID) {
3971 case Intrinsic::vector_reduce_add:
3972 case Intrinsic::vector_reduce_mul:
3973 TruncOnly = true;
3974 break;
3975 case Intrinsic::vector_reduce_and:
3976 case Intrinsic::vector_reduce_or:
3977 case Intrinsic::vector_reduce_xor:
3978 break;
3979 default:
3980 return false;
3981 }
3982
3983 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3984 Value *ReductionSrc = I.getOperand(0);
3985
3986 Value *Src;
3987 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3988 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3989 return false;
3990
3991 auto CastOpc =
3992 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3993
3994 auto *SrcTy = cast<VectorType>(Src->getType());
3995 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3996 Type *ResultTy = I.getType();
3997
3998   InstructionCost OldCost = TTI.getArithmeticReductionCost(
3999 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4000 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
4001                                   TTI::CastContextHint::None, CostKind,
4002 cast<CastInst>(ReductionSrc));
4003 InstructionCost NewCost =
4004 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4005 CostKind) +
4006 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4007                        TTI::CastContextHint::None, CostKind);
4008
4009 if (OldCost <= NewCost || !NewCost.isValid())
4010 return false;
4011
4012 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4013 II->getIntrinsicID(), {Src});
4014 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4015 replaceValue(I, *NewCast);
4016 return true;
4017}
4018
4019/// Returns true if this ShuffleVectorInst eventually feeds into a
4020/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4021/// chains of shuffles and binary operators (in any combination/order).
4022 /// The search is capped at a fixed number of visited instructions (MaxVisited).
4023 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
4024 constexpr unsigned MaxVisited = 32;
4027 bool FoundReduction = false;
4028
4029 WorkList.push_back(SVI);
4030 while (!WorkList.empty()) {
4031 Instruction *I = WorkList.pop_back_val();
4032 for (User *U : I->users()) {
4033 auto *UI = cast<Instruction>(U);
4034 if (!UI || !Visited.insert(UI).second)
4035 continue;
4036 if (Visited.size() > MaxVisited)
4037 return false;
4038 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4039 // More than one reduction reached
4040 if (FoundReduction)
4041 return false;
4042 switch (II->getIntrinsicID()) {
4043 case Intrinsic::vector_reduce_add:
4044 case Intrinsic::vector_reduce_mul:
4045 case Intrinsic::vector_reduce_and:
4046 case Intrinsic::vector_reduce_or:
4047 case Intrinsic::vector_reduce_xor:
4048 case Intrinsic::vector_reduce_smin:
4049 case Intrinsic::vector_reduce_smax:
4050 case Intrinsic::vector_reduce_umin:
4051 case Intrinsic::vector_reduce_umax:
4052 FoundReduction = true;
4053 continue;
4054 default:
4055 return false;
4056 }
4057 }
4058
4059       if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
4060 return false;
4061
4062 WorkList.emplace_back(UI);
4063 }
4064 }
4065 return FoundReduction;
4066}
4067
4068/// This method looks for groups of shuffles acting on binops, of the form:
4069/// %x = shuffle ...
4070/// %y = shuffle ...
4071/// %a = binop %x, %y
4072/// %b = binop %x, %y
4073/// shuffle %a, %b, selectmask
4074/// We may, especially if the shuffle is wider than legal, be able to convert
4075/// the shuffle to a form where only parts of a and b need to be computed. On
4076/// architectures with no obvious "select" shuffle, this can reduce the total
4077/// number of operations if the target reports them as cheaper.
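///
/// Schematic example (editorial; the profitability decision is entirely
/// cost-model driven):
///   %x = shufflevector <4 x i32> %s0, <4 x i32> %s1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %y = shufflevector <4 x i32> %s0, <4 x i32> %s1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
///   %a = add <4 x i32> %x, %y
///   %b = sub <4 x i32> %x, %y
///   %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
/// Only lanes 0 and 2 of %a and lanes 1 and 3 of %b reach %r, so the inputs can
/// be repacked so that each binop only computes lanes that are actually used.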
4078bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
4079 auto *SVI = cast<ShuffleVectorInst>(&I);
4080 auto *VT = cast<FixedVectorType>(I.getType());
4081 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4082 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
4083 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4084 VT != Op0->getType())
4085 return false;
4086
4087 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4088 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4089 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4090 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4091 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
4092 auto checkSVNonOpUses = [&](Instruction *I) {
4093 if (!I || I->getOperand(0)->getType() != VT)
4094 return true;
4095 return any_of(I->users(), [&](User *U) {
4096 return U != Op0 && U != Op1 &&
4097 !(isa<ShuffleVectorInst>(U) &&
4098 (InputShuffles.contains(cast<Instruction>(U)) ||
4099 isInstructionTriviallyDead(cast<Instruction>(U))));
4100 });
4101 };
4102 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4103 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4104 return false;
4105
4106 // Collect all the uses that are shuffles that we can transform together. We
4107 // may not have a single shuffle, but a group that can all be transformed
4108 // together profitably.
4109   SmallVector<ShuffleVectorInst *> Shuffles;
4110 auto collectShuffles = [&](Instruction *I) {
4111 for (auto *U : I->users()) {
4112 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4113 if (!SV || SV->getType() != VT)
4114 return false;
4115 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4116 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4117 return false;
4118 if (!llvm::is_contained(Shuffles, SV))
4119 Shuffles.push_back(SV);
4120 }
4121 return true;
4122 };
4123 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4124 return false;
4125 // From a reduction, we need to be processing a single shuffle, otherwise the
4126 // other uses will not be lane-invariant.
4127 if (FromReduction && Shuffles.size() > 1)
4128 return false;
4129
4130 // Add any shuffle uses for the shuffles we have found, to include them in our
4131 // cost calculations.
4132 if (!FromReduction) {
4133 for (ShuffleVectorInst *SV : Shuffles) {
4134 for (auto *U : SV->users()) {
4135 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4136 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4137 Shuffles.push_back(SSV);
4138 }
4139 }
4140 }
4141
4142 // For each of the output shuffles, we try to sort all the first vector
4143   // elements to the beginning, followed by the second vector's elements at the
4144 // end. If the binops are legalized to smaller vectors, this may reduce total
4145 // number of binops. We compute the ReconstructMask mask needed to convert
4146 // back to the original lane order.
4147   SmallVector<std::pair<int, int>> V1, V2;
4148 SmallVector<SmallVector<int>> OrigReconstructMasks;
4149 int MaxV1Elt = 0, MaxV2Elt = 0;
4150 unsigned NumElts = VT->getNumElements();
4151 for (ShuffleVectorInst *SVN : Shuffles) {
4152 SmallVector<int> Mask;
4153 SVN->getShuffleMask(Mask);
4154
4155 // Check the operands are the same as the original, or reversed (in which
4156 // case we need to commute the mask).
4157 Value *SVOp0 = SVN->getOperand(0);
4158 Value *SVOp1 = SVN->getOperand(1);
4159 if (isa<UndefValue>(SVOp1)) {
4160 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
4161 SVOp0 = SSV->getOperand(0);
4162 SVOp1 = SSV->getOperand(1);
4163 for (int &Elem : Mask) {
4164 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
4165 return false;
4166 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
4167 }
4168 }
4169 if (SVOp0 == Op1 && SVOp1 == Op0) {
4170 std::swap(SVOp0, SVOp1);
4171       ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
4172 }
4173 if (SVOp0 != Op0 || SVOp1 != Op1)
4174 return false;
4175
4176 // Calculate the reconstruction mask for this shuffle, as the mask needed to
4177     // take the packed values from Op0/Op1 and reconstruct the original
4178 // order.
4179 SmallVector<int> ReconstructMask;
4180 for (unsigned I = 0; I < Mask.size(); I++) {
4181 if (Mask[I] < 0) {
4182 ReconstructMask.push_back(-1);
4183 } else if (Mask[I] < static_cast<int>(NumElts)) {
4184 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
4185 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
4186 return Mask[I] == A.first;
4187 });
4188 if (It != V1.end())
4189 ReconstructMask.push_back(It - V1.begin());
4190 else {
4191 ReconstructMask.push_back(V1.size());
4192 V1.emplace_back(Mask[I], V1.size());
4193 }
4194 } else {
4195 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
4196 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
4197 return Mask[I] - static_cast<int>(NumElts) == A.first;
4198 });
4199 if (It != V2.end())
4200 ReconstructMask.push_back(NumElts + It - V2.begin());
4201 else {
4202 ReconstructMask.push_back(NumElts + V2.size());
4203 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
4204 }
4205 }
4206 }
4207
4208 // For reductions, we know that the lane ordering out doesn't alter the
4209 // result. In-order can help simplify the shuffle away.
4210 if (FromReduction)
4211 sort(ReconstructMask);
4212 OrigReconstructMasks.push_back(std::move(ReconstructMask));
4213 }
4214
4215   // If the maximum elements used from V1 and V2 are not larger than the new
4216   // vectors, the vectors are already packed and performing the optimization
4217 // again will likely not help any further. This also prevents us from getting
4218 // stuck in a cycle in case the costs do not also rule it out.
4219 if (V1.empty() || V2.empty() ||
4220 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
4221 MaxV2Elt == static_cast<int>(V2.size()) - 1))
4222 return false;
4223
4224 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
4225   // shuffle of another shuffle, or not a shuffle (that is treated like an
4226 // identity shuffle).
4227 auto GetBaseMaskValue = [&](Instruction *I, int M) {
4228 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4229 if (!SV)
4230 return M;
4231 if (isa<UndefValue>(SV->getOperand(1)))
4232 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4233 if (InputShuffles.contains(SSV))
4234 return SSV->getMaskValue(SV->getMaskValue(M));
4235 return SV->getMaskValue(M);
4236 };
4237
4238   // Attempt to sort the inputs by ascending mask values to make simpler input
4239 // shuffles and push complex shuffles down to the uses. We sort on the first
4240 // of the two input shuffle orders, to try and get at least one input into a
4241 // nice order.
4242 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
4243 std::pair<int, int> Y) {
4244 int MXA = GetBaseMaskValue(A, X.first);
4245 int MYA = GetBaseMaskValue(A, Y.first);
4246 return MXA < MYA;
4247 };
4248 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
4249 return SortBase(SVI0A, A, B);
4250 });
4251 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
4252 return SortBase(SVI1A, A, B);
4253 });
4254 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
4255 // modified order of the input shuffles.
4256 SmallVector<SmallVector<int>> ReconstructMasks;
4257 for (const auto &Mask : OrigReconstructMasks) {
4258 SmallVector<int> ReconstructMask;
4259 for (int M : Mask) {
4260 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
4261 auto It = find_if(V, [M](auto A) { return A.second == M; });
4262 assert(It != V.end() && "Expected all entries in Mask");
4263 return std::distance(V.begin(), It);
4264 };
4265 if (M < 0)
4266 ReconstructMask.push_back(-1);
4267 else if (M < static_cast<int>(NumElts)) {
4268 ReconstructMask.push_back(FindIndex(V1, M));
4269 } else {
4270 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
4271 }
4272 }
4273 ReconstructMasks.push_back(std::move(ReconstructMask));
4274 }
4275
4276 // Calculate the masks needed for the new input shuffles, which get padded
4277 // with undef
4278 SmallVector<int> V1A, V1B, V2A, V2B;
4279 for (unsigned I = 0; I < V1.size(); I++) {
4280 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
4281 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
4282 }
4283 for (unsigned I = 0; I < V2.size(); I++) {
4284 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
4285 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
4286 }
4287 while (V1A.size() < NumElts) {
4288     V1A.push_back(PoisonMaskElem);
4289     V1B.push_back(PoisonMaskElem);
4290 }
4291 while (V2A.size() < NumElts) {
4292     V2A.push_back(PoisonMaskElem);
4293     V2B.push_back(PoisonMaskElem);
4294 }
4295
4296 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
4297 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4298 if (!SV)
4299 return C;
4300 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
4301                                       ? TTI::SK_PermuteSingleSrc
4302                                       : TTI::SK_PermuteTwoSrc,
4303 VT, VT, SV->getShuffleMask(), CostKind);
4304 };
4305 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4306 return C +
4307            TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4308 };
4309
4310 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
4311 unsigned MaxVectorSize =
4313 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
4314 if (MaxElementsInVector == 0)
4315 return false;
4316 // When there are multiple shufflevector operations on the same input,
4317 // especially when the vector length is larger than the register size,
4318 // identical shuffle patterns may occur across different groups of elements.
4319 // To avoid overestimating the cost by counting these repeated shuffles more
4320 // than once, we only account for unique shuffle patterns. This adjustment
4321 // prevents inflated costs in the cost model for wide vectors split into
4322 // several register-sized groups.
4323 std::set<SmallVector<int, 4>> UniqueShuffles;
4324 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4325 // Compute the cost for performing the shuffle over the full vector.
4326 auto ShuffleCost =
4328 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
4329 if (NumFullVectors < 2)
4330 return C + ShuffleCost;
4331 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
4332 unsigned NumUniqueGroups = 0;
4333 unsigned NumGroups = Mask.size() / MaxElementsInVector;
4334 // For each group of MaxElementsInVector contiguous elements,
4335 // collect their shuffle pattern and insert into the set of unique patterns.
4336 for (unsigned I = 0; I < NumFullVectors; ++I) {
4337 for (unsigned J = 0; J < MaxElementsInVector; ++J)
4338 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
4339 if (UniqueShuffles.insert(SubShuffle).second)
4340 NumUniqueGroups += 1;
4341 }
4342 return C + ShuffleCost * NumUniqueGroups / NumGroups;
4343 };
4344 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
4345 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4346 if (!SV)
4347 return C;
4348 SmallVector<int, 16> Mask;
4349 SV->getShuffleMask(Mask);
4350 return AddShuffleMaskAdjustedCost(C, Mask);
4351 };
4352   // Check that the input consists of ShuffleVectors applied to the same inputs.
4353 auto AllShufflesHaveSameOperands =
4354 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
4355 if (InputShuffles.size() < 2)
4356 return false;
4357 ShuffleVectorInst *FirstSV =
4358 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
4359 if (!FirstSV)
4360 return false;
4361
4362 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
4363 return std::all_of(
4364 std::next(InputShuffles.begin()), InputShuffles.end(),
4365 [&](Instruction *I) {
4366 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
4367 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
4368 });
4369 };
4370
4371 // Get the costs of the shuffles + binops before and after with the new
4372 // shuffle masks.
4373 InstructionCost CostBefore =
4374 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
4375 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
4376 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
4377 InstructionCost(0), AddShuffleCost);
4378 if (AllShufflesHaveSameOperands(InputShuffles)) {
4379 UniqueShuffles.clear();
4380 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4381 InstructionCost(0), AddShuffleAdjustedCost);
4382 } else {
4383 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4384 InstructionCost(0), AddShuffleCost);
4385 }
4386
4387 // The new binops will be unused for lanes past the used shuffle lengths.
4388 // These types attempt to get the correct cost for that from the target.
4389 FixedVectorType *Op0SmallVT =
4390 FixedVectorType::get(VT->getScalarType(), V1.size());
4391 FixedVectorType *Op1SmallVT =
4392 FixedVectorType::get(VT->getScalarType(), V2.size());
4393 InstructionCost CostAfter =
4394 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4395 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4396 UniqueShuffles.clear();
4397 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4398 InstructionCost(0), AddShuffleMaskAdjustedCost);
4399 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4400 CostAfter +=
4401 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4402 InstructionCost(0), AddShuffleMaskCost);
4403
4404 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4405 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4406 << " vs CostAfter: " << CostAfter << "\n");
4407 if (CostBefore < CostAfter ||
4408 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4409 return false;
4410
4411 // The cost model has passed, create the new instructions.
4412 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4413 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4414 if (!SV)
4415 return I;
4416 if (isa<UndefValue>(SV->getOperand(1)))
4417 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4418 if (InputShuffles.contains(SSV))
4419 return SSV->getOperand(Op);
4420 return SV->getOperand(Op);
4421 };
4422 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4423 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4424 GetShuffleOperand(SVI0A, 1), V1A);
4425 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4426 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4427 GetShuffleOperand(SVI0B, 1), V1B);
4428 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4429 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4430 GetShuffleOperand(SVI1A, 1), V2A);
4431 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4432 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4433 GetShuffleOperand(SVI1B, 1), V2B);
4434 Builder.SetInsertPoint(Op0);
4435 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4436 NSV0A, NSV0B);
4437 if (auto *I = dyn_cast<Instruction>(NOp0))
4438 I->copyIRFlags(Op0, true);
4439 Builder.SetInsertPoint(Op1);
4440 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4441 NSV1A, NSV1B);
4442 if (auto *I = dyn_cast<Instruction>(NOp1))
4443 I->copyIRFlags(Op1, true);
4444
4445 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4446 Builder.SetInsertPoint(Shuffles[S]);
4447 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4448 replaceValue(*Shuffles[S], *NSV, false);
4449 }
4450
4451 Worklist.pushValue(NSV0A);
4452 Worklist.pushValue(NSV0B);
4453 Worklist.pushValue(NSV1A);
4454 Worklist.pushValue(NSV1B);
4455 return true;
4456}
4457
4458/// Check if instruction depends on ZExt and this ZExt can be moved after the
4459/// instruction. Move ZExt if it is profitable. For example:
4460/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4461/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4462 /// Cost model calculations take into account whether zext(x) has other users and
4463/// whether it can be propagated through them too.
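///
/// For instance (editorial sketch; valid for the bitwise-logic case because
/// the high bits of the zext are known zero):
///   %z = zext <8 x i8> %x to <8 x i32>
///   %r = and <8 x i32> %z, %y
/// becomes
///   %t = trunc <8 x i32> %y to <8 x i8>
///   %n = and <8 x i8> %x, %t
///   %r = zext <8 x i8> %n to <8 x i32>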
4464bool VectorCombine::shrinkType(Instruction &I) {
4465 Value *ZExted, *OtherOperand;
4466 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4467 m_Value(OtherOperand))) &&
4468 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4469 return false;
4470
4471 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4472
4473 auto *BigTy = cast<FixedVectorType>(I.getType());
4474 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4475 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4476
4477 if (I.getOpcode() == Instruction::LShr) {
4478 // Check that the shift amount is less than the number of bits in the
4479 // smaller type. Otherwise, the smaller lshr will return a poison value.
4480 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4481 if (ShAmtKB.getMaxValue().uge(BW))
4482 return false;
4483 } else {
4484 // Check that the expression overall uses at most the same number of bits as
4485 // ZExted
4486 KnownBits KB = computeKnownBits(&I, *DL);
4487 if (KB.countMaxActiveBits() > BW)
4488 return false;
4489 }
4490
4491 // Calculate costs of leaving current IR as it is and moving ZExt operation
4492 // later, along with adding truncates if needed
4493   InstructionCost ZExtCost = TTI.getCastInstrCost(
4494 Instruction::ZExt, BigTy, SmallTy,
4495 TargetTransformInfo::CastContextHint::None, CostKind);
4496 InstructionCost CurrentCost = ZExtCost;
4497 InstructionCost ShrinkCost = 0;
4498
4499 // Calculate total cost and check that we can propagate through all ZExt users
4500 for (User *U : ZExtOperand->users()) {
4501 auto *UI = cast<Instruction>(U);
4502 if (UI == &I) {
4503 CurrentCost +=
4504 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4505 ShrinkCost +=
4506 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4507 ShrinkCost += ZExtCost;
4508 continue;
4509 }
4510
4511 if (!Instruction::isBinaryOp(UI->getOpcode()))
4512 return false;
4513
4514 // Check if we can propagate ZExt through its other users
4515 KnownBits KB = computeKnownBits(UI, *DL);
4516 if (KB.countMaxActiveBits() > BW)
4517 return false;
4518
4519 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4520 ShrinkCost +=
4521 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4522 ShrinkCost += ZExtCost;
4523 }
4524
4525 // If the other instruction operand is not a constant, we'll need to
4526 // generate a truncate instruction. So we have to adjust cost
4527 if (!isa<Constant>(OtherOperand))
4528 ShrinkCost += TTI.getCastInstrCost(
4529 Instruction::Trunc, SmallTy, BigTy,
4530 TargetTransformInfo::CastContextHint::None, CostKind);
4531
4532 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4533 // towards modifying the IR because shrinking opens opportunities for other
4534 // shrinking optimisations.
4535 if (ShrinkCost > CurrentCost)
4536 return false;
4537
4538 Builder.SetInsertPoint(&I);
4539 Value *Op0 = ZExted;
4540 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4541 // Keep the order of operands the same
4542 if (I.getOperand(0) == OtherOperand)
4543 std::swap(Op0, Op1);
4544 Value *NewBinOp =
4545 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4546 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4547 cast<Instruction>(NewBinOp)->copyMetadata(I);
4548 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4549 replaceValue(I, *NewZExtr);
4550 return true;
4551}
4552
4553/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4554/// shuffle (DstVec, SrcVec, Mask)
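///
/// For example (editorial sketch; same element type and equal vector widths):
///   %e = extractelement <4 x i32> %src, i64 2
///   %r = insertelement <4 x i32> %dst, i32 %e, i64 0
/// becomes
///   %r = shufflevector <4 x i32> %dst, <4 x i32> %src, <4 x i32> <i32 6, i32 1, i32 2, i32 3>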
4555bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4556 Value *DstVec, *SrcVec;
4557 uint64_t ExtIdx, InsIdx;
4558 if (!match(&I,
4559 m_InsertElt(m_Value(DstVec),
4560 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4561 m_ConstantInt(InsIdx))))
4562 return false;
4563
4564 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4565 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4566   // We can try combining vectors with different element counts (same element type).
4567 if (!DstVecTy || !SrcVecTy ||
4568 SrcVecTy->getElementType() != DstVecTy->getElementType())
4569 return false;
4570
4571 unsigned NumDstElts = DstVecTy->getNumElements();
4572 unsigned NumSrcElts = SrcVecTy->getNumElements();
4573 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4574 return false;
4575
4576 // Insertion into poison is a cheaper single operand shuffle.
4577   TargetTransformInfo::ShuffleKind SK = TargetTransformInfo::SK_PermuteTwoSrc;
4578 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4579
4580 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4581 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4582 if (NeedDstSrcSwap) {
4583     SK = TargetTransformInfo::SK_PermuteSingleSrc;
4584 Mask[InsIdx] = ExtIdx % NumDstElts;
4585 std::swap(DstVec, SrcVec);
4586 } else {
4588 std::iota(Mask.begin(), Mask.end(), 0);
4589 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
4590 }
4591
4592 // Cost
4593 auto *Ins = cast<InsertElementInst>(&I);
4594 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4595 InstructionCost InsCost =
4596 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4597 InstructionCost ExtCost =
4598 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4599 InstructionCost OldCost = ExtCost + InsCost;
4600
4601 InstructionCost NewCost = 0;
4602 SmallVector<int> ExtToVecMask;
4603 if (!NeedExpOrNarrow) {
4604 // Ignore 'free' identity insertion shuffle.
4605 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4606 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4607 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4608 nullptr, {DstVec, SrcVec});
4609 } else {
4610 // When creating a length-changing-vector, always try to keep the relevant
4611 // element in an equivalent position, so that bulk shuffles are more likely
4612 // to be useful.
4613 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4614 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
4615 // Add cost for expanding or narrowing
4616     NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4617 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4618 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4619 }
4620
4621 if (!Ext->hasOneUse())
4622 NewCost += ExtCost;
4623
4624   LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
4625 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4626 << "\n");
4627
4628 if (OldCost < NewCost)
4629 return false;
4630
4631 if (NeedExpOrNarrow) {
4632 if (!NeedDstSrcSwap)
4633 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4634 else
4635 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4636 }
4637
4638 // Canonicalize undef param to RHS to help further folds.
4639 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4640 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4641 std::swap(DstVec, SrcVec);
4642 }
4643
4644 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4645 replaceValue(I, *Shuf);
4646
4647 return true;
4648}
4649
4650/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4651/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4652/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4653/// before casting it back into `<vscale x 16 x i32>`.
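///
/// In IR terms (editorial sketch; 3337189589658 is (777 << 32) | 666):
///   %r = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(
///            <vscale x 8 x i32> splat (i32 666), <vscale x 8 x i32> splat (i32 777))
/// becomes
///   %r = bitcast <vscale x 8 x i64> splat (i64 3337189589658) to <vscale x 16 x i32>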
4654bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4655 const APInt *SplatVal0, *SplatVal1;
4656   if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4657 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4658 return false;
4659
4660 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4661 << "\n");
4662
4663 auto *VTy =
4664 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4665 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4666 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4667
4668   // We use <= rather than < here so that we also bail out when the costs of
4669   // the interleave2 intrinsic and the bitcast are both invalid. Even if they
4670   // both have valid and equal costs, it's probably
4671 // not a good idea to emit a high-cost constant splat.
4673 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4675 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4676 << *I.getType() << " is too high.\n");
4677 return false;
4678 }
4679
4680 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4681 NewSplatVal <<= Width;
4682 NewSplatVal |= SplatVal0->zext(Width * 2);
4683 auto *NewSplat = ConstantVector::getSplat(
4684 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4685
4686 IRBuilder<> Builder(&I);
4687 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4688 return true;
4689}
4690
4691// Attempt to shrink loads that are only used by shufflevector instructions.
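// Illustrative sketch (hypothetical pointer, types and masks): if every user
// only reads the low elements of the loaded vector, the load can be narrowed
// when the cost model agrees, e.g.
//   %ld = load <8 x i32>, ptr %p
//   %s  = shufflevector <8 x i32> %ld, <8 x i32> poison,
//                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// -->
//   %ld.narrow = load <4 x i32>, ptr %p
//   %s  = shufflevector <4 x i32> %ld.narrow, <4 x i32> poison,
//                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>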
4692bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4693 auto *OldLoad = dyn_cast<LoadInst>(&I);
4694 if (!OldLoad || !OldLoad->isSimple())
4695 return false;
4696
4697 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4698 if (!OldLoadTy)
4699 return false;
4700
4701 unsigned const OldNumElements = OldLoadTy->getNumElements();
4702
4703 // Search all uses of the load. If all uses are shufflevector instructions,
4704 // and the second operands are all poison values, find the minimum and
4705 // maximum indices of the vector elements referenced by all shuffle masks.
4706 // Otherwise return `std::nullopt`.
4707 using IndexRange = std::pair<int, int>;
4708 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4709 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4710 for (llvm::Use &Use : I.uses()) {
4711 // Ensure all uses match the required pattern.
4712 User *Shuffle = Use.getUser();
4713 ArrayRef<int> Mask;
4714
4715 if (!match(Shuffle,
4716 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4717 return std::nullopt;
4718
4719 // Ignore shufflevector instructions that have no uses.
4720 if (Shuffle->use_empty())
4721 continue;
4722
4723 // Find the min and max indices used by the shufflevector instruction.
4724 for (int Index : Mask) {
4725 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4726 OutputRange.first = std::min(Index, OutputRange.first);
4727 OutputRange.second = std::max(Index, OutputRange.second);
4728 }
4729 }
4730 }
4731
4732 if (OutputRange.second < OutputRange.first)
4733 return std::nullopt;
4734
4735 return OutputRange;
4736 };
4737
4738 // Get the range of vector elements used by shufflevector instructions.
4739 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4740 unsigned const NewNumElements = Indices->second + 1u;
4741
4742 // If the range of vector elements is smaller than the full load, attempt
4743 // to create a smaller load.
4744 if (NewNumElements < OldNumElements) {
4745 IRBuilder Builder(&I);
4746 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4747
4748 // Calculate costs of old and new ops.
4749 Type *ElemTy = OldLoadTy->getElementType();
4750 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4751 Value *PtrOp = OldLoad->getPointerOperand();
4752
4753 InstructionCost OldCost = TTI.getMemoryOpCost(
4754 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4755 OldLoad->getPointerAddressSpace(), CostKind);
4756 InstructionCost NewCost =
4757 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4758 OldLoad->getPointerAddressSpace(), CostKind);
4759
4760 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4761 SmallVector<UseEntry, 4u> NewUses;
4762 unsigned const MaxIndex = NewNumElements * 2u;
4763
4764 for (llvm::Use &Use : I.uses()) {
4765 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4766 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4767
4768 // Create entry for new use.
4769 NewUses.push_back({Shuffle, OldMask});
4770
4771 // Validate mask indices.
4772 for (int Index : OldMask) {
4773 if (Index >= static_cast<int>(MaxIndex))
4774 return false;
4775 }
4776
4777 // Update costs.
4778 OldCost +=
4779 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4780 OldLoadTy, OldMask, CostKind);
4781 NewCost +=
4782 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4783 NewLoadTy, OldMask, CostKind);
4784 }
4785
4786 LLVM_DEBUG(
4787 dbgs() << "Found a load used only by shufflevector instructions: "
4788 << I << "\n OldCost: " << OldCost
4789 << " vs NewCost: " << NewCost << "\n");
4790
4791 if (OldCost < NewCost || !NewCost.isValid())
4792 return false;
4793
4794 // Create new load of smaller vector.
4795 auto *NewLoad = cast<LoadInst>(
4796 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4797 NewLoad->copyMetadata(I);
4798
4799 // Replace all uses.
4800 for (UseEntry &Use : NewUses) {
4801 ShuffleVectorInst *Shuffle = Use.first;
4802 std::vector<int> &NewMask = Use.second;
4803
4804 Builder.SetInsertPoint(Shuffle);
4805 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4806 Value *NewShuffle = Builder.CreateShuffleVector(
4807 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4808
4809 replaceValue(*Shuffle, *NewShuffle, false);
4810 }
4811
4812 return true;
4813 }
4814 }
4815 return false;
4816}
4817
4818 // Attempt to narrow a phi of shufflevector instructions where the two incoming
4819 // values have the same operands but different masks. If the two shuffle masks
4820 // are offsets of one another, we can use one branch to rotate the incoming
4821 // vector and perform one larger shuffle after the phi.
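// Illustrative sketch (hypothetical <4 x i32> input widened to <8 x i32>; the
// two incoming masks differ by a constant offset of 1):
//   bb0: %s0 = shufflevector <4 x i32> %v, <4 x i32> poison,
//              <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0>
//   bb1: %s1 = shufflevector <4 x i32> %v, <4 x i32> poison,
//              <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
//   %phi = phi <8 x i32> [ %s0, %bb0 ], [ %s1, %bb1 ]
// -->
//   bb0: %rot = shufflevector <4 x i32> %v, <4 x i32> poison,
//               <4 x i32> <i32 1, i32 2, i32 3, i32 0>
//   %phi = phi <4 x i32> [ %rot, %bb0 ], [ %v, %bb1 ]
//   %res = shufflevector <4 x i32> %phi, <4 x i32> poison,
//          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>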
4822bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4823 auto *Phi = dyn_cast<PHINode>(&I);
4824 if (!Phi || Phi->getNumIncomingValues() != 2u)
4825 return false;
4826
4827 Value *Op = nullptr;
4828 ArrayRef<int> Mask0;
4829 ArrayRef<int> Mask1;
4830
4831 if (!match(Phi->getOperand(0u),
4832 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4833 !match(Phi->getOperand(1u),
4834 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4835 return false;
4836
4837 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4838
4839 // Ensure result vectors are wider than the argument vector.
4840 auto *InputVT = cast<FixedVectorType>(Op->getType());
4841 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4842 auto const InputNumElements = InputVT->getNumElements();
4843
4844 if (InputNumElements >= ResultVT->getNumElements())
4845 return false;
4846
4847 // Take the difference of the two shuffle masks at each index. Ignore poison
4848 // values at the same index in both masks.
4849 SmallVector<int, 16> NewMask;
4850 NewMask.reserve(Mask0.size());
4851
4852 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4853 if (M0 >= 0 && M1 >= 0)
4854 NewMask.push_back(M0 - M1);
4855 else if (M0 == -1 && M1 == -1)
4856 continue;
4857 else
4858 return false;
4859 }
4860
4861 // Ensure all elements of the new mask are equal. If the difference between
4862 // the incoming mask elements is the same, the two must be constant offsets
4863 // of one another.
4864 if (NewMask.empty() || !all_equal(NewMask))
4865 return false;
4866
4867 // Create new mask using difference of the two incoming masks.
4868 int MaskOffset = NewMask[0u];
4869 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
4870 NewMask.clear();
4871
4872 for (unsigned I = 0u; I < InputNumElements; ++I) {
4873 NewMask.push_back(Index);
4874 Index = (Index + 1u) % InputNumElements;
4875 }
4876
4877 // Calculate costs for worst cases and compare.
4878 auto const Kind = TTI::SK_PermuteSingleSrc;
4879 auto OldCost =
4880 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4881 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4882 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4883 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4884
4885 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4886 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4887 << "\n");
4888
4889 if (NewCost > OldCost)
4890 return false;
4891
4892 // Create new shuffles and narrowed phi.
4893 auto Builder = IRBuilder(Shuf);
4894 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4895 auto *PoisonVal = PoisonValue::get(InputVT);
4896 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4897 Worklist.push(cast<Instruction>(NewShuf0));
4898
4899 Builder.SetInsertPoint(Phi);
4900 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4901 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4902 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4903 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4904
4905 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4906 PoisonVal = PoisonValue::get(NewPhi->getType());
4907 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4908
4909 replaceValue(*Phi, *NewShuf1);
4910 return true;
4911}
4912
4913/// This is the entry point for all transforms. Pass manager differences are
4914/// handled in the callers of this function.
4915bool VectorCombine::run() {
4916 if (DisableVectorCombine)
4917 return false;
4918
4919 // Don't attempt vectorization if the target does not support vectors.
4920 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4921 return false;
4922
4923 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4924
4925 auto FoldInst = [this](Instruction &I) {
4926 Builder.SetInsertPoint(&I);
4927 bool IsVectorType = isa<VectorType>(I.getType());
4928 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4929 auto Opcode = I.getOpcode();
4930
4931 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4932
4933 // These folds should be beneficial regardless of when this pass is run
4934 // in the optimization pipeline.
4935 // The type checking is for run-time efficiency. We can avoid wasting time
4936 // dispatching to folding functions if there's no chance of matching.
4937 if (IsFixedVectorType) {
4938 switch (Opcode) {
4939 case Instruction::InsertElement:
4940 if (vectorizeLoadInsert(I))
4941 return true;
4942 break;
4943 case Instruction::ShuffleVector:
4944 if (widenSubvectorLoad(I))
4945 return true;
4946 break;
4947 default:
4948 break;
4949 }
4950 }
4951
4952 // This transform works with scalable and fixed vectors
4953 // TODO: Identify and allow other scalable transforms
4954 if (IsVectorType) {
4955 if (scalarizeOpOrCmp(I))
4956 return true;
4957 if (scalarizeLoad(I))
4958 return true;
4959 if (scalarizeExtExtract(I))
4960 return true;
4961 if (scalarizeVPIntrinsic(I))
4962 return true;
4963 if (foldInterleaveIntrinsics(I))
4964 return true;
4965 }
4966
4967 if (Opcode == Instruction::Store)
4968 if (foldSingleElementStore(I))
4969 return true;
4970
4971 // If this is an early pipeline invocation of this pass, we are done.
4972 if (TryEarlyFoldsOnly)
4973 return false;
4974
4975 // Otherwise, try folds that improve codegen but may interfere with
4976 // early IR canonicalizations.
4977 // The type checking is for run-time efficiency. We can avoid wasting time
4978 // dispatching to folding functions if there's no chance of matching.
4979 if (IsFixedVectorType) {
4980 switch (Opcode) {
4981 case Instruction::InsertElement:
4982 if (foldInsExtFNeg(I))
4983 return true;
4984 if (foldInsExtBinop(I))
4985 return true;
4986 if (foldInsExtVectorToShuffle(I))
4987 return true;
4988 break;
4989 case Instruction::ShuffleVector:
4990 if (foldPermuteOfBinops(I))
4991 return true;
4992 if (foldShuffleOfBinops(I))
4993 return true;
4994 if (foldShuffleOfSelects(I))
4995 return true;
4996 if (foldShuffleOfCastops(I))
4997 return true;
4998 if (foldShuffleOfShuffles(I))
4999 return true;
5000 if (foldPermuteOfIntrinsic(I))
5001 return true;
5002 if (foldShufflesOfLengthChangingShuffles(I))
5003 return true;
5004 if (foldShuffleOfIntrinsics(I))
5005 return true;
5006 if (foldSelectShuffle(I))
5007 return true;
5008 if (foldShuffleToIdentity(I))
5009 return true;
5010 break;
5011 case Instruction::Load:
5012 if (shrinkLoadForShuffles(I))
5013 return true;
5014 break;
5015 case Instruction::BitCast:
5016 if (foldBitcastShuffle(I))
5017 return true;
5018 break;
5019 case Instruction::And:
5020 case Instruction::Or:
5021 case Instruction::Xor:
5022 if (foldBitOpOfCastops(I))
5023 return true;
5024 if (foldBitOpOfCastConstant(I))
5025 return true;
5026 break;
5027 case Instruction::PHI:
5028 if (shrinkPhiOfShuffles(I))
5029 return true;
5030 break;
5031 default:
5032 if (shrinkType(I))
5033 return true;
5034 break;
5035 }
5036 } else {
5037 switch (Opcode) {
5038 case Instruction::Call:
5039 if (foldShuffleFromReductions(I))
5040 return true;
5041 if (foldCastFromReductions(I))
5042 return true;
5043 break;
5044 case Instruction::ExtractElement:
5045 if (foldShuffleChainsToReduce(I))
5046 return true;
5047 break;
5048 case Instruction::ICmp:
5049 case Instruction::FCmp:
5050 if (foldExtractExtract(I))
5051 return true;
5052 break;
5053 case Instruction::Or:
5054 if (foldConcatOfBoolMasks(I))
5055 return true;
5056 [[fallthrough]];
5057 default:
5058 if (Instruction::isBinaryOp(Opcode)) {
5059 if (foldExtractExtract(I))
5060 return true;
5061 if (foldExtractedCmps(I))
5062 return true;
5063 if (foldBinopOfReductions(I))
5064 return true;
5065 }
5066 break;
5067 }
5068 }
5069 return false;
5070 };
5071
5072 bool MadeChange = false;
5073 for (BasicBlock &BB : F) {
5074 // Ignore unreachable basic blocks.
5075 if (!DT.isReachableFromEntry(&BB))
5076 continue;
5077 // We need early-increment-style iteration so that we can erase instructions
5077 // inside the loop.
5078 // make_early_inc_range is not applicable here, as the next iterator may
5079 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5080 // We manually maintain the next instruction and update it when it is about
5081 // to be deleted.
5082 Instruction *I = &BB.front();
5083 while (I) {
5084 NextInst = I->getNextNode();
5085 if (!I->isDebugOrPseudoInst())
5086 MadeChange |= FoldInst(*I);
5087 I = NextInst;
5088 }
5089 }
5090
5091 NextInst = nullptr;
5092
5093 while (!Worklist.isEmpty()) {
5094 Instruction *I = Worklist.removeOne();
5095 if (!I)
5096 continue;
5097
5098 if (isInstructionTriviallyDead(I)) {
5099 eraseInstruction(*I);
5100 continue;
5101 }
5102
5103 MadeChange |= FoldInst(*I);
5104 }
5105
5106 return MadeChange;
5107}
5108
5109 PreservedAnalyses VectorCombinePass::run(Function &F,
5110 FunctionAnalysisManager &FAM) {
5111 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
5112 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
5113 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5114 AAResults &AA = FAM.getResult<AAManager>(F);
5115 const DataLayout *DL = &F.getDataLayout();
5116 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5117 TryEarlyFoldsOnly);
5118 if (!Combiner.run())
5119 return PreservedAnalyses::all();
5120 PreservedAnalyses PA;
5121 PA.preserveSet<CFGAnalyses>();
5122 return PA;
5123}
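// The pass is registered under the name "vector-combine"; for a quick
// standalone check it can be run with, e.g. (illustrative):
//   opt -passes=vector-combine -S in.ll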