1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoad(Instruction &I);
133 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
134 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
135 bool scalarizeExtExtract(Instruction &I);
136 bool foldConcatOfBoolMasks(Instruction &I);
137 bool foldPermuteOfBinops(Instruction &I);
138 bool foldShuffleOfBinops(Instruction &I);
139 bool foldShuffleOfSelects(Instruction &I);
140 bool foldShuffleOfCastops(Instruction &I);
141 bool foldShuffleOfShuffles(Instruction &I);
142 bool foldPermuteOfIntrinsic(Instruction &I);
143 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
144 bool foldShuffleOfIntrinsics(Instruction &I);
145 bool foldShuffleToIdentity(Instruction &I);
146 bool foldShuffleFromReductions(Instruction &I);
147 bool foldShuffleChainsToReduce(Instruction &I);
148 bool foldCastFromReductions(Instruction &I);
149 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
150 bool foldInterleaveIntrinsics(Instruction &I);
151 bool shrinkType(Instruction &I);
152 bool shrinkLoadForShuffles(Instruction &I);
153 bool shrinkPhiOfShuffles(Instruction &I);
154
155 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
156 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
157 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
158 Old.replaceAllUsesWith(&New);
159 if (auto *NewI = dyn_cast<Instruction>(&New)) {
160 New.takeName(&Old);
161 Worklist.pushUsersToWorkList(*NewI);
162 Worklist.pushValue(NewI);
163 }
164 if (Erase && isInstructionTriviallyDead(&Old)) {
165 eraseInstruction(Old);
166 } else {
167 Worklist.push(&Old);
168 }
169 }
170
171 void eraseInstruction(Instruction &I) {
172 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
173 SmallVector<Value *> Ops(I.operands());
174 Worklist.remove(&I);
175 I.eraseFromParent();
176
177 // Push remaining users of the operands and then the operand itself - allows
178 // further folds that were hindered by OneUse limits.
179 SmallPtrSet<Value *, 4> Visited;
180 for (Value *Op : Ops) {
181 if (!Visited.contains(Op)) {
182 if (auto *OpI = dyn_cast<Instruction>(Op)) {
183 if (RecursivelyDeleteTriviallyDeadInstructions(
184 OpI, nullptr, nullptr, [&](Value *V) {
185 if (auto *I = dyn_cast<Instruction>(V)) {
186 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
187 Worklist.remove(I);
188 if (I == NextInst)
189 NextInst = NextInst->getNextNode();
190 Visited.insert(I);
191 }
192 }))
193 continue;
194 Worklist.pushUsersToWorkList(*OpI);
195 Worklist.pushValue(OpI);
196 }
197 }
198 }
199 }
200};
201} // namespace
202
203/// Return the source operand of a potentially bitcasted value. If there is no
204/// bitcast, return the input value itself.
205static Value *peekThroughBitcasts(Value *V) {
206 while (auto *BitCast = dyn_cast<BitCastInst>(V))
207 V = BitCast->getOperand(0);
208 return V;
209}
210
211static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
212 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
213 // The widened load may load data from dirty regions or create data races
214 // non-existent in the source.
215 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
216 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
217 mustSuppressSpeculation(*Load))
218 return false;
219
220 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
221 // sure we have all of our type-based constraints in place for this target.
222 Type *ScalarTy = Load->getType()->getScalarType();
223 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
224 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
225 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
226 ScalarSize % 8 != 0)
227 return false;
228
229 return true;
230}
231
232bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
233 // Match insert into fixed vector of scalar value.
234 // TODO: Handle non-zero insert index.
235 Value *Scalar;
236 if (!match(&I,
237 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
238 return false;
239
240 // Optionally match an extract from another vector.
241 Value *X;
242 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
243 if (!HasExtract)
244 X = Scalar;
245
246 auto *Load = dyn_cast<LoadInst>(X);
247 if (!canWidenLoad(Load, TTI))
248 return false;
249
250 Type *ScalarTy = Scalar->getType();
251 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
252 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
253
254 // Check safety of replacing the scalar load with a larger vector load.
255 // We use minimal alignment (maximum flexibility) because we only care about
256 // the dereferenceable region. When calculating cost and creating a new op,
257 // we may use a larger value based on alignment attributes.
258 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
259 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
260
261 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
262 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
263 unsigned OffsetEltIndex = 0;
264 Align Alignment = Load->getAlign();
265 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
266 &DT)) {
267 // It is not safe to load directly from the pointer, but we can still peek
268 // through gep offsets and check if it is safe to load from a base address
269 // with updated alignment. If it is, we can shuffle the element(s) into place
270 // after loading.
271 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
272 APInt Offset(OffsetBitWidth, 0);
273 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
274
275 // We want to shuffle the result down from a high element of a vector, so
276 // the offset must be positive.
277 if (Offset.isNegative())
278 return false;
279
280 // The offset must be a multiple of the scalar element to shuffle cleanly
281 // in the element's size.
282 uint64_t ScalarSizeInBytes = ScalarSize / 8;
283 if (Offset.urem(ScalarSizeInBytes) != 0)
284 return false;
285
286 // If we load MinVecNumElts, will our target element still be loaded?
287 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
288 if (OffsetEltIndex >= MinVecNumElts)
289 return false;
290
291 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
292 &DT))
293 return false;
294
295 // Update alignment with offset value. Note that the offset could be negated
296 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
297 // negation does not change the result of the alignment calculation.
298 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
299 }
300
301 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
302 // Use the greater of the alignment on the load or its source pointer.
303 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
304 Type *LoadTy = Load->getType();
305 unsigned AS = Load->getPointerAddressSpace();
306 InstructionCost OldCost =
307 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
308 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
309 OldCost +=
310 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
311 /* Insert */ true, HasExtract, CostKind);
312
313 // New pattern: load VecPtr
314 InstructionCost NewCost =
315 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
316 // Optionally, we are shuffling the loaded vector element(s) into place.
317 // For the mask set everything but element 0 to undef to prevent poison from
318 // propagating from the extra loaded memory. This will also optionally
319 // shrink/grow the vector from the loaded size to the output size.
320 // We assume this operation has no cost in codegen if there was no offset.
321 // Note that we could use freeze to avoid poison problems, but then we might
322 // still need a shuffle to change the vector size.
323 auto *Ty = cast<FixedVectorType>(I.getType());
324 unsigned OutputNumElts = Ty->getNumElements();
325 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
326 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
327 Mask[0] = OffsetEltIndex;
328 if (OffsetEltIndex)
329 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
330 CostKind);
331
332 // We can aggressively convert to the vector form because the backend can
333 // invert this transform if it does not result in a performance win.
334 if (OldCost < NewCost || !NewCost.isValid())
335 return false;
336
337 // It is safe and potentially profitable to load a vector directly:
338 // inselt undef, load Scalar, 0 --> load VecPtr
339 IRBuilder<> Builder(Load);
340 Value *CastedPtr =
341 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
342 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
343 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
344
345 replaceValue(I, *VecLd);
346 ++NumVecLoad;
347 return true;
348}
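// Illustrative IR for vectorizeLoadInsert (example only, not from the source;
// assumes the wider load is known dereferenceable and cheap per TTI):
//   %s = load float, ptr %p, align 4
//   %i = insertelement <4 x float> poison, float %s, i64 0
// -->
//   %wide = load <4 x float>, ptr %p, align 4
//   %i = shufflevector <4 x float> %wide, <4 x float> poison,
//        <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>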
349
350/// If we are loading a vector and then inserting it into a larger vector with
351/// undefined elements, try to load the larger vector and eliminate the insert.
352/// This removes a shuffle in IR and may allow combining of other loaded values.
353bool VectorCombine::widenSubvectorLoad(Instruction &I) {
354 // Match subvector insert of fixed vector.
355 auto *Shuf = cast<ShuffleVectorInst>(&I);
356 if (!Shuf->isIdentityWithPadding())
357 return false;
358
359 // Allow a non-canonical shuffle mask that is choosing elements from op1.
360 unsigned NumOpElts =
361 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
362 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
363 return M >= (int)(NumOpElts);
364 });
365
366 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
367 if (!canWidenLoad(Load, TTI))
368 return false;
369
370 // We use minimal alignment (maximum flexibility) because we only care about
371 // the dereferenceable region. When calculating cost and creating a new op,
372 // we may use a larger value based on alignment attributes.
373 auto *Ty = cast<FixedVectorType>(I.getType());
374 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
375 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
376 Align Alignment = Load->getAlign();
377 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
378 return false;
379
380 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
381 Type *LoadTy = Load->getType();
382 unsigned AS = Load->getPointerAddressSpace();
383
384 // Original pattern: insert_subvector (load PtrOp)
385 // This conservatively assumes that the cost of a subvector insert into an
386 // undef value is 0. We could add that cost if the cost model accurately
387 // reflects the real cost of that operation.
388 InstructionCost OldCost =
389 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
390
391 // New pattern: load PtrOp
392 InstructionCost NewCost =
393 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
394
395 // We can aggressively convert to the vector form because the backend can
396 // invert this transform if it does not result in a performance win.
397 if (OldCost < NewCost || !NewCost.isValid())
398 return false;
399
400 IRBuilder<> Builder(Load);
401 Value *CastedPtr =
402 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
403 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
404 replaceValue(I, *VecLd);
405 ++NumVecLoad;
406 return true;
407}
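// Illustrative IR for widenSubvectorLoad (example only, not from the source;
// assumes the wider load is safe and not more expensive per TTI):
//   %small = load <2 x float>, ptr %p, align 8
//   %wide  = shufflevector <2 x float> %small, <2 x float> poison,
//            <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
// -->
//   %wide = load <4 x float>, ptr %p, align 8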
408
409/// Determine which, if any, of the inputs should be replaced by a shuffle
410/// followed by extract from a different index.
411ExtractElementInst *VectorCombine::getShuffleExtract(
412 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
413 unsigned PreferredExtractIndex = InvalidIndex) const {
414 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
415 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
416 assert(Index0C && Index1C && "Expected constant extract indexes");
417
418 unsigned Index0 = Index0C->getZExtValue();
419 unsigned Index1 = Index1C->getZExtValue();
420
421 // If the extract indexes are identical, no shuffle is needed.
422 if (Index0 == Index1)
423 return nullptr;
424
425 Type *VecTy = Ext0->getVectorOperand()->getType();
426 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
427 InstructionCost Cost0 =
428 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
429 InstructionCost Cost1 =
430 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
431
432 // If both costs are invalid no shuffle is needed
433 if (!Cost0.isValid() && !Cost1.isValid())
434 return nullptr;
435
436 // We are extracting from 2 different indexes, so one operand must be shuffled
437 // before performing a vector operation and/or extract. The more expensive
438 // extract will be replaced by a shuffle.
439 if (Cost0 > Cost1)
440 return Ext0;
441 if (Cost1 > Cost0)
442 return Ext1;
443
444 // If the costs are equal and there is a preferred extract index, shuffle the
445 // opposite operand.
446 if (PreferredExtractIndex == Index0)
447 return Ext1;
448 if (PreferredExtractIndex == Index1)
449 return Ext0;
450
451 // Otherwise, replace the extract with the higher index.
452 return Index0 > Index1 ? Ext0 : Ext1;
453}
454
455/// Compare the relative costs of 2 extracts followed by scalar operation vs.
456/// vector operation(s) followed by extract. Return true if the existing
457/// instructions are cheaper than a vector alternative. Otherwise, return false
458/// and if one of the extracts should be transformed to a shufflevector, set
459/// \p ConvertToShuffle to that extract instruction.
460bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
461 ExtractElementInst *Ext1,
462 const Instruction &I,
463 ExtractElementInst *&ConvertToShuffle,
464 unsigned PreferredExtractIndex) {
465 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
466 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
467 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
468
469 unsigned Opcode = I.getOpcode();
470 Value *Ext0Src = Ext0->getVectorOperand();
471 Value *Ext1Src = Ext1->getVectorOperand();
472 Type *ScalarTy = Ext0->getType();
473 auto *VecTy = cast<VectorType>(Ext0Src->getType());
474 InstructionCost ScalarOpCost, VectorOpCost;
475
476 // Get cost estimates for scalar and vector versions of the operation.
477 bool IsBinOp = Instruction::isBinaryOp(Opcode);
478 if (IsBinOp) {
479 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
480 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
481 } else {
482 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
483 "Expected a compare");
484 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
485 ScalarOpCost = TTI.getCmpSelInstrCost(
486 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
487 VectorOpCost = TTI.getCmpSelInstrCost(
488 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
489 }
490
491 // Get cost estimates for the extract elements. These costs will factor into
492 // both sequences.
493 unsigned Ext0Index = Ext0IndexC->getZExtValue();
494 unsigned Ext1Index = Ext1IndexC->getZExtValue();
495
496 InstructionCost Extract0Cost =
497 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
498 InstructionCost Extract1Cost =
499 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
500
501 // A more expensive extract will always be replaced by a splat shuffle.
502 // For example, if Ext0 is more expensive:
503 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
504 // extelt (opcode (splat V0, Ext0), V1), Ext1
505 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
506 // check the cost of creating a broadcast shuffle and shuffling both
507 // operands to element 0.
508 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
509 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
510 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
511
512 // Extra uses of the extracts mean that we include those costs in the
513 // vector total because those instructions will not be eliminated.
514 InstructionCost OldCost, NewCost;
515 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
516 // Handle a special case. If the 2 extracts are identical, adjust the
517 // formulas to account for that. The extra use charge allows for either the
518 // CSE'd pattern or an unoptimized form with identical values:
519 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
520 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
521 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
522 OldCost = CheapExtractCost + ScalarOpCost;
523 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
524 } else {
525 // Handle the general case. Each extract is actually a different value:
526 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
527 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
528 NewCost = VectorOpCost + CheapExtractCost +
529 !Ext0->hasOneUse() * Extract0Cost +
530 !Ext1->hasOneUse() * Extract1Cost;
531 }
532
533 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
534 if (ConvertToShuffle) {
535 if (IsBinOp && DisableBinopExtractShuffle)
536 return true;
537
538 // If we are extracting from 2 different indexes, then one operand must be
539 // shuffled before performing the vector operation. The shuffle mask is
540 // poison except for 1 lane that is being translated to the remaining
541 // extraction lane. Therefore, it is a splat shuffle. Ex:
542 // ShufMask = { poison, poison, 0, poison }
543 // TODO: The cost model has an option for a "broadcast" shuffle
544 // (splat-from-element-0), but no option for a more general splat.
545 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
546 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
547 PoisonMaskElem);
548 ShuffleMask[BestInsIndex] = BestExtIndex;
549 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
550 VecTy, VecTy, ShuffleMask, CostKind, 0,
551 nullptr, {ConvertToShuffle});
552 } else {
553 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
554 VecTy, VecTy, {}, CostKind, 0, nullptr,
555 {ConvertToShuffle});
556 }
557 }
558
559 // Aggressively form a vector op if the cost is equal because the transform
560 // may enable further optimization.
561 // Codegen can reverse this transform (scalarize) if it was not profitable.
562 return OldCost < NewCost;
563}
564
565/// Create a shuffle that translates (shifts) 1 element from the input vector
566/// to a new element location.
567static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
568 unsigned NewIndex, IRBuilderBase &Builder) {
569 // The shuffle mask is poison except for 1 lane that is being translated
570 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
571 // ShufMask = { 2, poison, poison, poison }
572 auto *VecTy = cast<FixedVectorType>(Vec->getType());
573 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
574 ShufMask[NewIndex] = OldIndex;
575 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
576}
577
578/// Given an extract element instruction with constant index operand, shuffle
579/// the source vector (shift the scalar element) to a NewIndex for extraction.
580/// Return null if the input can be constant folded, so that we are not creating
581/// unnecessary instructions.
582static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
583 IRBuilderBase &Builder) {
584 // Shufflevectors can only be created for fixed-width vectors.
585 Value *X = ExtElt->getVectorOperand();
586 if (!isa<FixedVectorType>(X->getType()))
587 return nullptr;
588
589 // If the extract can be constant-folded, this code is unsimplified. Defer
590 // to other passes to handle that.
591 Value *C = ExtElt->getIndexOperand();
592 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
593 if (isa<Constant>(X))
594 return nullptr;
595
596 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
597 NewIndex, Builder);
598 return Shuf;
599}
600
601/// Try to reduce extract element costs by converting scalar compares to vector
602/// compares followed by extract.
603/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
604Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
605 Instruction &I) {
606 assert(isa<CmpInst>(&I) && "Expected a compare");
607
608 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
609 // --> extelt (cmp Pred V0, V1), ExtIndex
610 ++NumVecCmp;
611 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
612 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
613 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
614}
615
616/// Try to reduce extract element costs by converting scalar binops to vector
617/// binops followed by extract.
618/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
619Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
620 Instruction &I) {
621 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
622
623 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
624 // --> extelt (bo V0, V1), ExtIndex
625 ++NumVecBO;
626 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
627 V1, "foldExtExtBinop");
628
629 // All IR flags are safe to back-propagate because any potential poison
630 // created in unused vector elements is discarded by the extract.
631 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
632 VecBOInst->copyIRFlags(&I);
633
634 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
635}
636
637/// Match an instruction with extracted vector operands.
638bool VectorCombine::foldExtractExtract(Instruction &I) {
639 // It is not safe to transform things like div, urem, etc. because we may
640 // create undefined behavior when executing those on unknown vector elements.
641 if (!isSafeToSpeculativelyExecute(&I))
642 return false;
643
644 Instruction *I0, *I1;
645 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
646 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
647 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
648 return false;
649
650 Value *V0, *V1;
651 uint64_t C0, C1;
652 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
653 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
654 V0->getType() != V1->getType())
655 return false;
656
657 // If the scalar value 'I' is going to be re-inserted into a vector, then try
658 // to create an extract to that same element. The extract/insert can be
659 // reduced to a "select shuffle".
660 // TODO: If we add a larger pattern match that starts from an insert, this
661 // probably becomes unnecessary.
662 auto *Ext0 = cast<ExtractElementInst>(I0);
663 auto *Ext1 = cast<ExtractElementInst>(I1);
664 uint64_t InsertIndex = InvalidIndex;
665 if (I.hasOneUse())
666 match(I.user_back(),
667 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
668
669 ExtractElementInst *ExtractToChange;
670 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
671 return false;
672
673 Value *ExtOp0 = Ext0->getVectorOperand();
674 Value *ExtOp1 = Ext1->getVectorOperand();
675
676 if (ExtractToChange) {
677 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
678 Value *NewExtOp =
679 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
680 if (!NewExtOp)
681 return false;
682 if (ExtractToChange == Ext0)
683 ExtOp0 = NewExtOp;
684 else
685 ExtOp1 = NewExtOp;
686 }
687
688 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
689 : Ext0->getIndexOperand();
690 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
691 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
692 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
693 Worklist.push(Ext0);
694 Worklist.push(Ext1);
695 replaceValue(I, *NewExt);
696 return true;
697}
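// Illustrative IR for foldExtractExtract (example only, not from the source;
// both extracts use the same index, so no shift shuffle is required):
//   %e0 = extractelement <4 x i32> %x, i64 1
//   %e1 = extractelement <4 x i32> %y, i64 1
//   %r  = add i32 %e0, %e1
// -->
//   %v = add <4 x i32> %x, %y
//   %r = extractelement <4 x i32> %v, i64 1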
698
699/// Try to replace an extract + scalar fneg + insert with a vector fneg +
700/// shuffle.
701bool VectorCombine::foldInsExtFNeg(Instruction &I) {
702 // Match an insert (op (extract)) pattern.
703 Value *DstVec;
704 uint64_t ExtIdx, InsIdx;
705 Instruction *FNeg;
706 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
707 m_ConstantInt(InsIdx))))
708 return false;
709
710 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
711 Value *SrcVec;
712 Instruction *Extract;
713 if (!match(FNeg, m_FNeg(m_CombineAnd(
714 m_Instruction(Extract),
715 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
716 return false;
717
718 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
719 auto *DstVecScalarTy = DstVecTy->getScalarType();
720 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
721 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
722 return false;
723
724 // Ignore if insert/extract index is out of bounds or destination vector has
725 // one element
726 unsigned NumDstElts = DstVecTy->getNumElements();
727 unsigned NumSrcElts = SrcVecTy->getNumElements();
728 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
729 return false;
730
731 // We are inserting the negated element into the same lane that we extracted
732 // from. This is equivalent to a select-shuffle that chooses all but the
733 // negated element from the destination vector.
734 SmallVector<int> Mask(NumDstElts);
735 std::iota(Mask.begin(), Mask.end(), 0);
736 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
737 InstructionCost OldCost =
738 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
739 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
740
741 // If the extract has one use, it will be eliminated, so count it in the
742 // original cost. If it has more than one use, ignore the cost because it will
743 // be the same before/after.
744 if (Extract->hasOneUse())
745 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
746
747 InstructionCost NewCost =
748 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
749 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
750 DstVecTy, Mask, CostKind);
751
752 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
753 // If the lengths of the two vectors are not equal,
754 // we need to add a length-change vector. Add this cost.
755 SmallVector<int> SrcMask;
756 if (NeedLenChg) {
757 SrcMask.assign(NumDstElts, PoisonMaskElem);
758 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
759 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
760 DstVecTy, SrcVecTy, SrcMask, CostKind);
761 }
762
763 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
764 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
765 << "\n");
766 if (NewCost > OldCost)
767 return false;
768
769 Value *NewShuf, *LenChgShuf = nullptr;
770 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
771 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
772 if (NeedLenChg) {
773 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
774 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
775 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
776 Worklist.pushValue(LenChgShuf);
777 } else {
778 // shuffle DstVec, (fneg SrcVec), Mask
779 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
780 }
781
782 Worklist.pushValue(VecFNeg);
783 replaceValue(I, *NewShuf);
784 return true;
785}
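// Illustrative IR for foldInsExtFNeg (example only, not from the source;
// same-length source/destination vectors, so no length-change shuffle):
//   %e = extractelement <4 x float> %src, i64 2
//   %n = fneg float %e
//   %r = insertelement <4 x float> %dst, float %n, i64 2
// -->
//   %vneg = fneg <4 x float> %src
//   %r = shufflevector <4 x float> %dst, <4 x float> %vneg,
//        <4 x i32> <i32 0, i32 1, i32 6, i32 3>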
786
787/// Try to fold insert(binop(x,y),binop(a,b),idx)
788/// --> binop(insert(x,a,idx),insert(y,b,idx))
789bool VectorCombine::foldInsExtBinop(Instruction &I) {
790 BinaryOperator *VecBinOp, *SclBinOp;
791 uint64_t Index;
792 if (!match(&I,
793 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
794 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
795 return false;
796
797 // TODO: Add support for addlike etc.
798 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
799 if (BinOpcode != SclBinOp->getOpcode())
800 return false;
801
802 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
803 if (!ResultTy)
804 return false;
805
806 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
807 // shuffle?
808
809 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
810 TTI.getInstructionCost(VecBinOp, CostKind) +
811 TTI.getInstructionCost(SclBinOp, CostKind);
812 InstructionCost NewCost =
813 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
814 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
815 Index, VecBinOp->getOperand(0),
816 SclBinOp->getOperand(0)) +
817 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
818 Index, VecBinOp->getOperand(1),
819 SclBinOp->getOperand(1));
820
821 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
822 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
823 << "\n");
824 if (NewCost > OldCost)
825 return false;
826
827 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
828 SclBinOp->getOperand(0), Index);
829 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
830 SclBinOp->getOperand(1), Index);
831 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
832
833 // Intersect flags from the old binops.
834 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
835 NewInst->copyIRFlags(VecBinOp);
836 NewInst->andIRFlags(SclBinOp);
837 }
838
839 Worklist.pushValue(NewIns0);
840 Worklist.pushValue(NewIns1);
841 replaceValue(I, *NewBO);
842 return true;
843}
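// Illustrative IR for foldInsExtBinop (example only, not from the source):
//   %vb = add <4 x i32> %x, %y
//   %sb = add i32 %a, %b
//   %r  = insertelement <4 x i32> %vb, i32 %sb, i64 2
// -->
//   %i0 = insertelement <4 x i32> %x, i32 %a, i64 2
//   %i1 = insertelement <4 x i32> %y, i32 %b, i64 2
//   %r  = add <4 x i32> %i0, %i1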
844
845/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
846/// Supports: bitcast, trunc, sext, zext
847bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
848 // Check if this is a bitwise logic operation
849 auto *BinOp = dyn_cast<BinaryOperator>(&I);
850 if (!BinOp || !BinOp->isBitwiseLogicOp())
851 return false;
852
853 // Get the cast instructions
854 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
855 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
856 if (!LHSCast || !RHSCast) {
857 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
858 return false;
859 }
860
861 // Both casts must be the same type
862 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
863 if (CastOpcode != RHSCast->getOpcode())
864 return false;
865
866 // Only handle supported cast operations
867 switch (CastOpcode) {
868 case Instruction::BitCast:
869 case Instruction::Trunc:
870 case Instruction::SExt:
871 case Instruction::ZExt:
872 break;
873 default:
874 return false;
875 }
876
877 Value *LHSSrc = LHSCast->getOperand(0);
878 Value *RHSSrc = RHSCast->getOperand(0);
879
880 // Source types must match
881 if (LHSSrc->getType() != RHSSrc->getType())
882 return false;
883
884 auto *SrcTy = LHSSrc->getType();
885 auto *DstTy = I.getType();
886 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
887 // Other casts only handle vector types with integer elements.
888 if (CastOpcode != Instruction::BitCast &&
889 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
890 return false;
891
892 // Only integer scalar/vector values are legal for bitwise logic operations.
893 if (!SrcTy->getScalarType()->isIntegerTy() ||
894 !DstTy->getScalarType()->isIntegerTy())
895 return false;
896
897 // Cost Check :
898 // OldCost = bitlogic + 2*casts
899 // NewCost = bitlogic + cast
900
901 // Calculate specific costs for each cast with instruction context
902 InstructionCost LHSCastCost = TTI.getCastInstrCost(
903 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
904 InstructionCost RHSCastCost = TTI.getCastInstrCost(
905 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
906
907 InstructionCost OldCost =
908 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
909 LHSCastCost + RHSCastCost;
910
911 // For new cost, we can't provide an instruction (it doesn't exist yet)
912 InstructionCost GenericCastCost = TTI.getCastInstrCost(
913 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
914
915 InstructionCost NewCost =
916 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
917 GenericCastCost;
918
919 // Account for multi-use casts using specific costs
920 if (!LHSCast->hasOneUse())
921 NewCost += LHSCastCost;
922 if (!RHSCast->hasOneUse())
923 NewCost += RHSCastCost;
924
925 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
926 << " NewCost=" << NewCost << "\n");
927
928 if (NewCost > OldCost)
929 return false;
930
931 // Create the operation on the source type
932 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
933 BinOp->getName() + ".inner");
934 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
935 NewBinOp->copyIRFlags(BinOp);
936
937 Worklist.pushValue(NewOp);
938
939 // Create the cast operation directly to ensure we get a new instruction
940 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
941
942 // Preserve cast instruction flags
943 NewCast->copyIRFlags(LHSCast);
944 NewCast->andIRFlags(RHSCast);
945
946 // Insert the new instruction
947 Value *Result = Builder.Insert(NewCast);
948
949 replaceValue(I, *Result);
950 return true;
951}
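// Illustrative IR for foldBitOpOfCastops (example only, not from the source):
//   %zx = zext <4 x i16> %x to <4 x i32>
//   %zy = zext <4 x i16> %y to <4 x i32>
//   %r  = and <4 x i32> %zx, %zy
// -->
//   %a = and <4 x i16> %x, %y
//   %r = zext <4 x i16> %a to <4 x i32>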
952
953/// Match:
954// bitop(castop(x), C) ->
955// bitop(castop(x), castop(InvC)) ->
956// castop(bitop(x, InvC))
957// Supports: bitcast
958bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
959 Value *LHS;
960 Constant *C;
961
962 // Check if this is a bitwise logic operation
963 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
964 return false;
965
966 // Get the cast instructions
967 auto *LHSCast = dyn_cast<CastInst>(LHS);
968 if (!LHSCast)
969 return false;
970
971 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
972
973 // Only handle supported cast operations
974 switch (CastOpcode) {
975 case Instruction::BitCast:
976 case Instruction::ZExt:
977 case Instruction::SExt:
978 case Instruction::Trunc:
979 break;
980 default:
981 return false;
982 }
983
984 Value *LHSSrc = LHSCast->getOperand(0);
985
986 auto *SrcTy = LHSSrc->getType();
987 auto *DstTy = I.getType();
988 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
989 // Other casts only handle vector types with integer elements.
990 if (CastOpcode != Instruction::BitCast &&
991 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
992 return false;
993
994 // Only integer scalar/vector values are legal for bitwise logic operations.
995 if (!SrcTy->getScalarType()->isIntegerTy() ||
996 !DstTy->getScalarType()->isIntegerTy())
997 return false;
998
999 // Find the constant InvC, such that castop(InvC) equals to C.
1000 PreservedCastFlags RHSFlags;
1001 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1002 if (!InvC)
1003 return false;
1004
1005 // Cost Check :
1006 // OldCost = bitlogic + cast
1007 // NewCost = bitlogic + cast
1008
1009 // Calculate specific costs for each cast with instruction context
1010 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1011 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1012
1013 InstructionCost OldCost =
1014 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1015
1016 // For new cost, we can't provide an instruction (it doesn't exist yet)
1017 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1018 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1019
1020 InstructionCost NewCost =
1021 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1022 GenericCastCost;
1023
1024 // Account for multi-use casts using specific costs
1025 if (!LHSCast->hasOneUse())
1026 NewCost += LHSCastCost;
1027
1028 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1029 << " NewCost=" << NewCost << "\n");
1030
1031 if (NewCost > OldCost)
1032 return false;
1033
1034 // Create the operation on the source type
1035 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1036 LHSSrc, InvC, I.getName() + ".inner");
1037 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1038 NewBinOp->copyIRFlags(&I);
1039
1040 Worklist.pushValue(NewOp);
1041
1042 // Create the cast operation directly to ensure we get a new instruction
1043 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1044
1045 // Preserve cast instruction flags
1046 if (RHSFlags.NNeg)
1047 NewCast->setNonNeg();
1048 if (RHSFlags.NUW)
1049 NewCast->setHasNoUnsignedWrap();
1050 if (RHSFlags.NSW)
1051 NewCast->setHasNoSignedWrap();
1052
1053 NewCast->andIRFlags(LHSCast);
1054
1055 // Insert the new instruction
1056 Value *Result = Builder.Insert(NewCast);
1057
1058 replaceValue(I, *Result);
1059 return true;
1060}
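// Illustrative IR for foldBitOpOfCastConstant (example only, not from the
// source; InvC = <i16 255, ...> because its zext reproduces the constant):
//   %zx = zext <4 x i16> %x to <4 x i32>
//   %r  = and <4 x i32> %zx, <i32 255, i32 255, i32 255, i32 255>
// -->
//   %a = and <4 x i16> %x, <i16 255, i16 255, i16 255, i16 255>
//   %r = zext <4 x i16> %a to <4 x i32>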
1061
1062/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1063/// destination type followed by shuffle. This can enable further transforms by
1064/// moving bitcasts or shuffles together.
1065bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1066 Value *V0, *V1;
1067 ArrayRef<int> Mask;
1068 if (!match(&I, m_BitCast(m_OneUse(
1069 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1070 return false;
1071
1072 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1073 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1074 // mask for scalable type is a splat or not.
1075 // 2) Disallow non-vector casts.
1076 // TODO: We could allow any shuffle.
1077 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1078 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1079 if (!DestTy || !SrcTy)
1080 return false;
1081
1082 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1083 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1084 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1085 return false;
1086
1087 bool IsUnary = isa<UndefValue>(V1);
1088
1089 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1090 // if it won't increase the number of bitcasts.
1091 if (!IsUnary) {
1092 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1093 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1094 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1095 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1096 return false;
1097 }
1098
1099 SmallVector<int, 16> NewMask;
1100 if (DestEltSize <= SrcEltSize) {
1101 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1102 // always be expanded to the equivalent form choosing narrower elements.
1103 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1104 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1105 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1106 } else {
1107 // The bitcast is from narrow elements to wide elements. The shuffle mask
1108 // must choose consecutive elements to allow casting first.
1109 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1110 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1111 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1112 return false;
1113 }
1114
1115 // Bitcast the shuffle src - keep its original width but using the destination
1116 // scalar type.
1117 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1118 auto *NewShuffleTy =
1119 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1120 auto *OldShuffleTy =
1121 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1122 unsigned NumOps = IsUnary ? 1 : 2;
1123
1124 // The new shuffle must not cost more than the old shuffle.
1125 TargetTransformInfo::ShuffleKind SK =
1126 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1127 : TargetTransformInfo::SK_PermuteTwoSrc;
1128
1129 InstructionCost NewCost =
1130 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1131 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1132 TargetTransformInfo::CastContextHint::None,
1133 CostKind));
1134 InstructionCost OldCost =
1135 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1136 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1137 TargetTransformInfo::CastContextHint::None,
1138 CostKind);
1139
1140 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1141 << OldCost << " vs NewCost: " << NewCost << "\n");
1142
1143 if (NewCost > OldCost || !NewCost.isValid())
1144 return false;
1145
1146 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1147 ++NumShufOfBitcast;
1148 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1149 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1150 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1151 replaceValue(I, *Shuf);
1152 return true;
1153}
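// Illustrative IR for foldBitcastShuffle (example only, not from the source;
// equal element sizes, so the shuffle mask is reused unchanged):
//   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
//        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %r = bitcast <4 x i32> %s to <4 x float>
// -->
//   %c = bitcast <4 x i32> %v to <4 x float>
//   %r = shufflevector <4 x float> %c, <4 x float> poison,
//        <4 x i32> <i32 3, i32 2, i32 1, i32 0>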
1154
1155/// VP Intrinsics whose vector operands are both splat values may be simplified
1156/// into the scalar version of the operation and the result splatted. This
1157/// can lead to scalarization down the line.
1158bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1159 if (!isa<VPIntrinsic>(I))
1160 return false;
1161 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1162 Value *Op0 = VPI.getArgOperand(0);
1163 Value *Op1 = VPI.getArgOperand(1);
1164
1165 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1166 return false;
1167
1168 // Check getSplatValue early in this function, to avoid doing unnecessary
1169 // work.
1170 Value *ScalarOp0 = getSplatValue(Op0);
1171 Value *ScalarOp1 = getSplatValue(Op1);
1172 if (!ScalarOp0 || !ScalarOp1)
1173 return false;
1174
1175 // For the binary VP intrinsics supported here, the result on disabled lanes
1176 // is a poison value. For now, only do this simplification if all lanes
1177 // are active.
1178 // TODO: Relax the condition that all lanes are active by using insertelement
1179 // on inactive lanes.
1180 auto IsAllTrueMask = [](Value *MaskVal) {
1181 if (Value *SplattedVal = getSplatValue(MaskVal))
1182 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1183 return ConstValue->isAllOnesValue();
1184 return false;
1185 };
1186 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1187 return false;
1188
1189 // Check to make sure we support scalarization of the intrinsic
1190 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1191 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1192 return false;
1193
1194 // Calculate cost of splatting both operands into vectors and the vector
1195 // intrinsic
1196 VectorType *VecTy = cast<VectorType>(VPI.getType());
1197 SmallVector<int> Mask;
1198 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1199 Mask.resize(FVTy->getNumElements(), 0);
1200 InstructionCost SplatCost =
1201 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1202 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1203 CostKind);
1204
1205 // Calculate the cost of the VP Intrinsic
1206 SmallVector<Type *> Args;
1207 for (Value *V : VPI.args())
1208 Args.push_back(V->getType());
1209 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1210 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1211 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1212
1213 // Determine scalar opcode
1214 std::optional<unsigned> FunctionalOpcode =
1215 VPI.getFunctionalOpcode();
1216 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1217 if (!FunctionalOpcode) {
1218 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1219 if (!ScalarIntrID)
1220 return false;
1221 }
1222
1223 // Calculate cost of scalarizing
1224 InstructionCost ScalarOpCost = 0;
1225 if (ScalarIntrID) {
1226 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1227 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1228 } else {
1229 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1230 VecTy->getScalarType(), CostKind);
1231 }
1232
1233 // The existing splats may be kept around if other instructions use them.
1234 InstructionCost CostToKeepSplats =
1235 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1236 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1237
1238 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1239 << "\n");
1240 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1241 << ", Cost of scalarizing:" << NewCost << "\n");
1242
1243 // We want to scalarize unless the vector variant actually has lower cost.
1244 if (OldCost < NewCost || !NewCost.isValid())
1245 return false;
1246
1247 // Scalarize the intrinsic
1248 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1249 Value *EVL = VPI.getArgOperand(3);
1250
1251 // If the VP op might introduce UB or poison, we can scalarize it provided
1252 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1253 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1254 // scalarizing it.
1255 bool SafeToSpeculate;
1256 if (ScalarIntrID)
1257 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1258 .hasAttribute(Attribute::AttrKind::Speculatable);
1259 else
1260 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1261 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1262 if (!SafeToSpeculate &&
1263 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1264 return false;
1265
1266 Value *ScalarVal =
1267 ScalarIntrID
1268 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1269 {ScalarOp0, ScalarOp1})
1270 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1271 ScalarOp0, ScalarOp1);
1272
1273 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1274 return true;
1275}
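// Illustrative IR for scalarizeVPIntrinsic (example only, not from the source;
// %xs/%ys are splats of %x/%y, the mask is all-true, and %evl is non-zero):
//   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xs, <4 x i32> %ys,
//        <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %evl)
// -->
//   %s   = add i32 %x, %y
//   %ins = insertelement <4 x i32> poison, i32 %s, i64 0
//   %r   = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer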
1276
1277/// Match a vector op/compare/intrinsic with at least one
1278/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1279/// by insertelement.
1280bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1281 auto *UO = dyn_cast<UnaryOperator>(&I);
1282 auto *BO = dyn_cast<BinaryOperator>(&I);
1283 auto *CI = dyn_cast<CmpInst>(&I);
1284 auto *II = dyn_cast<IntrinsicInst>(&I);
1285 if (!UO && !BO && !CI && !II)
1286 return false;
1287
1288 // TODO: Allow intrinsics with different argument types
1289 if (II) {
1290 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1291 return false;
1292 for (auto [Idx, Arg] : enumerate(II->args()))
1293 if (Arg->getType() != II->getType() &&
1294 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1295 return false;
1296 }
1297
1298 // Do not convert the vector condition of a vector select into a scalar
1299 // condition. That may cause problems for codegen because of differences in
1300 // boolean formats and register-file transfers.
1301 // TODO: Can we account for that in the cost model?
1302 if (CI)
1303 for (User *U : I.users())
1304 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1305 return false;
1306
1307 // Match constant vectors or scalars being inserted into constant vectors:
1308 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1309 SmallVector<Value *> VecCs, ScalarOps;
1310 std::optional<uint64_t> Index;
1311
1312 auto Ops = II ? II->args() : I.operands();
1313 for (auto [OpNum, Op] : enumerate(Ops)) {
1314 Constant *VecC;
1315 Value *V;
1316 uint64_t InsIdx = 0;
1317 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1318 m_ConstantInt(InsIdx)))) {
1319 // Bail if any inserts are out of bounds.
1320 VectorType *OpTy = cast<VectorType>(Op->getType());
1321 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1322 return false;
1323 // All inserts must have the same index.
1324 // TODO: Deal with mismatched index constants and variable indexes?
1325 if (!Index)
1326 Index = InsIdx;
1327 else if (InsIdx != *Index)
1328 return false;
1329 VecCs.push_back(VecC);
1330 ScalarOps.push_back(V);
1331 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1332 OpNum, &TTI)) {
1333 VecCs.push_back(Op.get());
1334 ScalarOps.push_back(Op.get());
1335 } else if (match(Op.get(), m_Constant(VecC))) {
1336 VecCs.push_back(VecC);
1337 ScalarOps.push_back(nullptr);
1338 } else {
1339 return false;
1340 }
1341 }
1342
1343 // Bail if all operands are constant.
1344 if (!Index.has_value())
1345 return false;
1346
1347 VectorType *VecTy = cast<VectorType>(I.getType());
1348 Type *ScalarTy = VecTy->getScalarType();
1349 assert(VecTy->isVectorTy() &&
1350 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1351 ScalarTy->isPointerTy()) &&
1352 "Unexpected types for insert element into binop or cmp");
1353
1354 unsigned Opcode = I.getOpcode();
1355 InstructionCost ScalarOpCost, VectorOpCost;
1356 if (CI) {
1357 CmpInst::Predicate Pred = CI->getPredicate();
1358 ScalarOpCost = TTI.getCmpSelInstrCost(
1359 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1360 VectorOpCost = TTI.getCmpSelInstrCost(
1361 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1362 } else if (UO || BO) {
1363 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1364 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1365 } else {
1366 IntrinsicCostAttributes ScalarICA(
1367 II->getIntrinsicID(), ScalarTy,
1368 SmallVector<Type *>(II->arg_size(), ScalarTy));
1369 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1370 IntrinsicCostAttributes VectorICA(
1371 II->getIntrinsicID(), VecTy,
1372 SmallVector<Type *>(II->arg_size(), VecTy));
1373 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1374 }
1375
1376 // Fold the vector constants in the original vectors into a new base vector to
1377 // get more accurate cost modelling.
1378 Value *NewVecC = nullptr;
1379 if (CI)
1380 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1381 else if (UO)
1382 NewVecC =
1383 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1384 else if (BO)
1385 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1386 else if (II)
1387 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1388
1389 if (!NewVecC)
1390 return false;
1391
1392 // Get cost estimate for the insert element. This cost will factor into
1393 // both sequences.
1394 InstructionCost OldCost = VectorOpCost;
1395 InstructionCost NewCost =
1396 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1397 CostKind, *Index, NewVecC);
1398
1399 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1400 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1401 II->getIntrinsicID(), Idx, &TTI)))
1402 continue;
1403 InstructionCost InsertCost = TTI.getVectorInstrCost(
1404 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1405 OldCost += InsertCost;
1406 NewCost += !Op->hasOneUse() * InsertCost;
1407 }
1408
1409 // We want to scalarize unless the vector variant actually has lower cost.
1410 if (OldCost < NewCost || !NewCost.isValid())
1411 return false;
1412
1413 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1414 // inselt NewVecC, (scalar_op V0, V1), Index
1415 if (CI)
1416 ++NumScalarCmp;
1417 else if (UO || BO)
1418 ++NumScalarOps;
1419 else
1420 ++NumScalarIntrinsic;
1421
1422 // For constant cases, extract the scalar element, this should constant fold.
1423 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1424 if (!Scalar)
1426 cast<Constant>(VecC), Builder.getInt64(*Index));
1427
1428 Value *Scalar;
1429 if (CI)
1430 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1431 else if (UO || BO)
1432 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1433 else
1434 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1435
1436 Scalar->setName(I.getName() + ".scalar");
1437
1438 // All IR flags are safe to back-propagate. There is no potential for extra
1439 // poison to be created by the scalar instruction.
1440 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1441 ScalarInst->copyIRFlags(&I);
1442
1443 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1444 replaceValue(I, *Insert);
1445 return true;
1446}
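// Illustrative IR for scalarizeOpOrCmp (example only, not from the source;
// the base vector constants fold into the new base vector):
//   %ins = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %x, i64 0
//   %r   = add <4 x i32> %ins, <i32 7, i32 7, i32 7, i32 7>
// -->
//   %s = add i32 %x, 7
//   %r = insertelement <4 x i32> <i32 7, i32 8, i32 9, i32 10>, i32 %s, i64 0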
1447
1448/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1449/// a vector into vector operations followed by extract. Note: The SLP pass
1450/// may miss this pattern because of implementation problems.
1451bool VectorCombine::foldExtractedCmps(Instruction &I) {
1452 auto *BI = dyn_cast<BinaryOperator>(&I);
1453
1454 // We are looking for a scalar binop of booleans.
1455 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1456 if (!BI || !I.getType()->isIntegerTy(1))
1457 return false;
1458
1459 // The compare predicates should match, and each compare should have a
1460 // constant operand.
1461 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1462 Instruction *I0, *I1;
1463 Constant *C0, *C1;
1464 CmpPredicate P0, P1;
1465 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1466 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1467 return false;
1468
1469 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1470 if (!MatchingPred)
1471 return false;
1472
1473 // The compare operands must be extracts of the same vector with constant
1474 // extract indexes.
1475 Value *X;
1476 uint64_t Index0, Index1;
1477 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1478 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1479 return false;
1480
1481 auto *Ext0 = cast<ExtractElementInst>(I0);
1482 auto *Ext1 = cast<ExtractElementInst>(I1);
1483 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1484 if (!ConvertToShuf)
1485 return false;
1486 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1487 "Unknown ExtractElementInst");
1488
1489 // The original scalar pattern is:
1490 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1491 CmpInst::Predicate Pred = *MatchingPred;
1492 unsigned CmpOpcode =
1493 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1494 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1495 if (!VecTy)
1496 return false;
1497
1498 InstructionCost Ext0Cost =
1499 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1500 InstructionCost Ext1Cost =
1501 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1502  InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1503      CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1504 CostKind);
1505
1506 InstructionCost OldCost =
1507 Ext0Cost + Ext1Cost + CmpCost * 2 +
1508 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1509
1510 // The proposed vector pattern is:
1511 // vcmp = cmp Pred X, VecC
1512 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1513 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1514 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1515  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1516  InstructionCost NewCost = TTI.getCmpSelInstrCost(
1517      CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1518 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1519 ShufMask[CheapIndex] = ExpensiveIndex;
1520  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1521                                CmpTy, ShufMask, CostKind);
1522 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1523 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1524 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1525 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1526
1527 // Aggressively form vector ops if the cost is equal because the transform
1528 // may enable further optimization.
1529 // Codegen can reverse this transform (scalarize) if it was not profitable.
1530 if (OldCost < NewCost || !NewCost.isValid())
1531 return false;
1532
1533 // Create a vector constant from the 2 scalar constants.
1534 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1535 PoisonValue::get(VecTy->getElementType()));
1536 CmpC[Index0] = C0;
1537 CmpC[Index1] = C1;
1538 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1539 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1540 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1541 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1542 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1543 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1544 replaceValue(I, *NewExt);
1545 ++NumVecCmpBO;
1546 return true;
1547}
1548
1549static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1550                                      TTI::TargetCostKind CostKind,
1551                                      const TargetTransformInfo &TTI,
1552 InstructionCost &CostBeforeReduction,
1553 InstructionCost &CostAfterReduction) {
1554 Instruction *Op0, *Op1;
1555 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1556 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1557 unsigned ReductionOpc =
1558 getArithmeticReductionInstruction(II.getIntrinsicID());
1559 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1560 bool IsUnsigned = isa<ZExtInst>(RedOp);
1561 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1562
1563 CostBeforeReduction =
1564        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1565                             TTI::CastContextHint::None, CostKind, RedOp);
1566 CostAfterReduction =
1567 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1568 ExtType, FastMathFlags(), CostKind);
1569 return;
1570 }
1571 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1572      match(RedOp,
1573            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1574      match(Op0, m_ZExtOrSExt(m_Value())) &&
1575 Op0->getOpcode() == Op1->getOpcode() &&
1576 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1577 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1578    // Matched reduce.add(ext(mul(ext(A), ext(B))))
1579 bool IsUnsigned = isa<ZExtInst>(Op0);
1580 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1581 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1582
1583 InstructionCost ExtCost =
1584        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1585                             TTI::CastContextHint::None, CostKind, Op0);
1586 InstructionCost MulCost =
1587 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1588 InstructionCost Ext2Cost =
1589        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1590                             TTI::CastContextHint::None, CostKind, RedOp);
1591
1592 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1593 CostAfterReduction = TTI.getMulAccReductionCost(
1594 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1595 return;
1596 }
1597 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1598 std::nullopt, CostKind);
1599}
1600
1601bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1602 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1603 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1604 if (BinOpOpc == Instruction::Sub)
1605 ReductionIID = Intrinsic::vector_reduce_add;
1606 if (ReductionIID == Intrinsic::not_intrinsic)
1607 return false;
1608
1609 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1610 Intrinsic::ID IID) -> Value * {
1611 auto *II = dyn_cast<IntrinsicInst>(V);
1612 if (!II)
1613 return nullptr;
1614 if (II->getIntrinsicID() == IID && II->hasOneUse())
1615 return II->getArgOperand(0);
1616 return nullptr;
1617 };
1618
1619 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1620 if (!V0)
1621 return false;
1622 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1623 if (!V1)
1624 return false;
1625
1626 auto *VTy = cast<VectorType>(V0->getType());
1627 if (V1->getType() != VTy)
1628 return false;
1629 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1630 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1631 unsigned ReductionOpc =
1632 getArithmeticReductionInstruction(II0.getIntrinsicID());
1633
1634 InstructionCost OldCost = 0;
1635 InstructionCost NewCost = 0;
1636 InstructionCost CostOfRedOperand0 = 0;
1637 InstructionCost CostOfRed0 = 0;
1638 InstructionCost CostOfRedOperand1 = 0;
1639 InstructionCost CostOfRed1 = 0;
1640 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1641 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1642 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1643 NewCost =
1644 CostOfRedOperand0 + CostOfRedOperand1 +
1645 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1646 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1647 if (NewCost >= OldCost || !NewCost.isValid())
1648 return false;
1649
1650 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1651 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1652 << "\n");
1653 Value *VectorBO;
1654 if (BinOpOpc == Instruction::Or)
1655 VectorBO = Builder.CreateOr(V0, V1, "",
1656 cast<PossiblyDisjointInst>(I).isDisjoint());
1657 else
1658 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1659
1660 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1661 replaceValue(I, *Rdx);
1662 return true;
1663}
1664
1665// Check if memory loc modified between two instrs in the same BB
1666static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1667                                 BasicBlock::iterator End,
1668                                 const MemoryLocation &Loc, AAResults &AA) {
1669 unsigned NumScanned = 0;
1670 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1671 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1672 ++NumScanned > MaxInstrsToScan;
1673 });
1674}
1675
1676namespace {
1677/// Helper class to indicate whether a vector index can be safely scalarized and
1678/// if a freeze needs to be inserted.
1679class ScalarizationResult {
1680 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1681
1682 StatusTy Status;
1683 Value *ToFreeze;
1684
1685 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1686 : Status(Status), ToFreeze(ToFreeze) {}
1687
1688public:
1689 ScalarizationResult(const ScalarizationResult &Other) = default;
1690 ~ScalarizationResult() {
1691 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1692 }
1693
1694 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1695 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1696 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1697 return {StatusTy::SafeWithFreeze, ToFreeze};
1698 }
1699
1700  /// Returns true if the index can be scalarized without requiring a freeze.
1701 bool isSafe() const { return Status == StatusTy::Safe; }
1702 /// Returns true if the index cannot be scalarized.
1703 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1704  /// Returns true if the index can be scalarized, but requires inserting a
1705  /// freeze.
1706 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1707
1708  /// Reset the state to Unsafe and clear ToFreeze if set.
1709 void discard() {
1710 ToFreeze = nullptr;
1711 Status = StatusTy::Unsafe;
1712 }
1713
1714  /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1715 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1716 assert(isSafeWithFreeze() &&
1717 "should only be used when freezing is required");
1718 assert(is_contained(ToFreeze->users(), &UserI) &&
1719 "UserI must be a user of ToFreeze");
1720 IRBuilder<>::InsertPointGuard Guard(Builder);
1721 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1722 Value *Frozen =
1723 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1724 for (Use &U : make_early_inc_range((UserI.operands())))
1725 if (U.get() == ToFreeze)
1726 U.set(Frozen);
1727
1728 ToFreeze = nullptr;
1729 }
1730};
1731} // namespace
1732
1733/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1734/// Idx. \p Idx must access a valid vector element.
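/// For example (a sketch over a <4 x i32> vector): a constant index 2 is safe,
/// a variable index proven non-poison and in [0, 4) is safe, and an index of
/// the form "and i64 %i, 3" is safe-with-freeze, because the mask clamps the
/// range but %i itself may still be poison.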
1735static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1736 Instruction *CtxI,
1737 AssumptionCache &AC,
1738 const DominatorTree &DT) {
1739 // We do checks for both fixed vector types and scalable vector types.
1740 // This is the number of elements of fixed vector types,
1741 // or the minimum number of elements of scalable vector types.
1742 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1743 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1744
1745 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1746 if (C->getValue().ult(NumElements))
1747 return ScalarizationResult::safe();
1748 return ScalarizationResult::unsafe();
1749 }
1750
1751 // Always unsafe if the index type can't handle all inbound values.
1752 if (!llvm::isUIntN(IntWidth, NumElements))
1753 return ScalarizationResult::unsafe();
1754
1755 APInt Zero(IntWidth, 0);
1756 APInt MaxElts(IntWidth, NumElements);
1757 ConstantRange ValidIndices(Zero, MaxElts);
1758 ConstantRange IdxRange(IntWidth, true);
1759
1760 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1761 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1762 true, &AC, CtxI, &DT)))
1763 return ScalarizationResult::safe();
1764 return ScalarizationResult::unsafe();
1765 }
1766
1767 // If the index may be poison, check if we can insert a freeze before the
1768 // range of the index is restricted.
1769 Value *IdxBase;
1770 ConstantInt *CI;
1771 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1772 IdxRange = IdxRange.binaryAnd(CI->getValue());
1773 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1774 IdxRange = IdxRange.urem(CI->getValue());
1775 }
1776
1777 if (ValidIndices.contains(IdxRange))
1778 return ScalarizationResult::safeWithFreeze(IdxBase);
1779 return ScalarizationResult::unsafe();
1780}
1781
1782/// The memory operation on a vector of \p ScalarType had alignment of
1783/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1784/// alignment that will be valid for the memory operation on a single scalar
1785/// element of the same type with index \p Idx.
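/// For example (a sketch): for a <4 x i32> access with alignment 16, a
/// constant index 2 gives byte offset 8, so commonAlignment(16, 8) = 8; with a
/// variable index only the 4-byte element size can be assumed, giving
/// commonAlignment(16, 4) = 4.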
1786static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1787                                                Type *ScalarType, Value *Idx,
1788 const DataLayout &DL) {
1789 if (auto *C = dyn_cast<ConstantInt>(Idx))
1790 return commonAlignment(VectorAlignment,
1791 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1792 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1793}
1794
1795// Combine patterns like:
1796// %0 = load <4 x i32>, <4 x i32>* %a
1797// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1798// store <4 x i32> %1, <4 x i32>* %a
1799// to:
1800// %0 = bitcast <4 x i32>* %a to i32*
1801// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1802// store i32 %b, i32* %1
1803bool VectorCombine::foldSingleElementStore(Instruction &I) {
1804  if (!TTI.allowVectorElementIndexingUsingGEP())
1805    return false;
1806 auto *SI = cast<StoreInst>(&I);
1807 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1808 return false;
1809
1810 // TODO: Combine more complicated patterns (multiple insert) by referencing
1811 // TargetTransformInfo.
1812  Instruction *Source;
1813  Value *NewElement;
1814 Value *Idx;
1815 if (!match(SI->getValueOperand(),
1816 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1817 m_Value(Idx))))
1818 return false;
1819
1820 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1821 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1822 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1823    // Don't optimize for atomic/volatile load or store. Ensure memory is not
1824    // modified in between, the element type's size matches its store size, and the index is in bounds.
1825 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1826 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1827 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1828 return false;
1829
1830 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1831 if (ScalarizableIdx.isUnsafe() ||
1832 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1833 MemoryLocation::get(SI), AA))
1834 return false;
1835
1836    // Ensure we add the load back to the worklist BEFORE its users so they can
1837    // be erased in the correct order.
1838 Worklist.push(Load);
1839
1840 if (ScalarizableIdx.isSafeWithFreeze())
1841 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1842 Value *GEP = Builder.CreateInBoundsGEP(
1843 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1844 {ConstantInt::get(Idx->getType(), 0), Idx});
1845 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1846 NSI->copyMetadata(*SI);
1847 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1848 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1849 *DL);
1850 NSI->setAlignment(ScalarOpAlignment);
1851    replaceValue(I, *NSI);
1852    eraseInstruction(I);
1853 return true;
1854 }
1855
1856 return false;
1857}
1858
1859/// Try to scalarize vector loads feeding extractelement or bitcast
1860/// instructions.
1861bool VectorCombine::scalarizeLoad(Instruction &I) {
1862 Value *Ptr;
1863 if (!match(&I, m_Load(m_Value(Ptr))))
1864 return false;
1865
1866 auto *LI = cast<LoadInst>(&I);
1867 auto *VecTy = cast<VectorType>(LI->getType());
1868 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1869 return false;
1870
1871 bool AllExtracts = true;
1872 bool AllBitcasts = true;
1873 Instruction *LastCheckedInst = LI;
1874 unsigned NumInstChecked = 0;
1875
1876  // Check what type of users we have (they must all be either extracts or
1877 // bitcasts) and ensure no memory modifications between the load and
1878 // its users.
1879 for (User *U : LI->users()) {
1880 auto *UI = dyn_cast<Instruction>(U);
1881 if (!UI || UI->getParent() != LI->getParent())
1882 return false;
1883
1884 // If any user is waiting to be erased, then bail out as this will
1885 // distort the cost calculation and possibly lead to infinite loops.
1886 if (UI->use_empty())
1887 return false;
1888
1889 if (!isa<ExtractElementInst>(UI))
1890 AllExtracts = false;
1891 if (!isa<BitCastInst>(UI))
1892 AllBitcasts = false;
1893
1894 // Check if any instruction between the load and the user may modify memory.
1895 if (LastCheckedInst->comesBefore(UI)) {
1896 for (Instruction &I :
1897 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1898 // Bail out if we reached the check limit or the instruction may write
1899 // to memory.
1900 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1901 return false;
1902 NumInstChecked++;
1903 }
1904 LastCheckedInst = UI;
1905 }
1906 }
1907
1908 if (AllExtracts)
1909 return scalarizeLoadExtract(LI, VecTy, Ptr);
1910 if (AllBitcasts)
1911 return scalarizeLoadBitcast(LI, VecTy, Ptr);
1912 return false;
1913}
1914
1915/// Try to scalarize vector loads feeding extractelement instructions.
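/// A hypothetical sketch (profitability still depends on the cost model):
///   %v = load <4 x i32>, ptr %p, align 16
///   %e = extractelement <4 x i32> %v, i64 2
/// -->
///   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
///   %e   = load i32, ptr %gep, align 8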
1916bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
1917 Value *Ptr) {
1918  if (!TTI.allowVectorElementIndexingUsingGEP())
1919    return false;
1920
1921 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1922 auto FailureGuard = make_scope_exit([&]() {
1923 // If the transform is aborted, discard the ScalarizationResults.
1924 for (auto &Pair : NeedFreeze)
1925 Pair.second.discard();
1926 });
1927
1928 InstructionCost OriginalCost =
1929      TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1930                          LI->getPointerAddressSpace(), CostKind);
1931  InstructionCost ScalarizedCost = 0;
1932
1933 for (User *U : LI->users()) {
1934 auto *UI = cast<ExtractElementInst>(U);
1935
1936 auto ScalarIdx =
1937 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1938 if (ScalarIdx.isUnsafe())
1939 return false;
1940 if (ScalarIdx.isSafeWithFreeze()) {
1941 NeedFreeze.try_emplace(UI, ScalarIdx);
1942 ScalarIdx.discard();
1943 }
1944
1945 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1946 OriginalCost +=
1947 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1948 Index ? Index->getZExtValue() : -1);
1949 ScalarizedCost +=
1950        TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1951                            Align(1), LI->getPointerAddressSpace(), CostKind);
1952 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1953 nullptr, nullptr, CostKind);
1954 }
1955
1956 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
1957 << "\n LoadExtractCost: " << OriginalCost
1958 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1959
1960 if (ScalarizedCost >= OriginalCost)
1961 return false;
1962
1963  // Ensure we add the load back to the worklist BEFORE its users so they can
1964  // be erased in the correct order.
1965 Worklist.push(LI);
1966
1967 Type *ElemType = VecTy->getElementType();
1968
1969 // Replace extracts with narrow scalar loads.
1970 for (User *U : LI->users()) {
1971 auto *EI = cast<ExtractElementInst>(U);
1972 Value *Idx = EI->getIndexOperand();
1973
1974 // Insert 'freeze' for poison indexes.
1975 auto It = NeedFreeze.find(EI);
1976 if (It != NeedFreeze.end())
1977 It->second.freeze(Builder, *cast<Instruction>(Idx));
1978
1979 Builder.SetInsertPoint(EI);
1980 Value *GEP =
1981 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1982 auto *NewLoad = cast<LoadInst>(
1983 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1984
1985 Align ScalarOpAlignment =
1986 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1987 NewLoad->setAlignment(ScalarOpAlignment);
1988
1989 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1990 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1991 AAMDNodes OldAAMD = LI->getAAMetadata();
1992 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1993 }
1994
1995 replaceValue(*EI, *NewLoad, false);
1996 }
1997
1998 FailureGuard.release();
1999 return true;
2000}
2001
2002/// Try to scalarize vector loads feeding bitcast instructions.
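/// A hypothetical sketch (only when every user is such a bitcast):
///   %v = load <4 x i32>, ptr %p, align 16
///   %i = bitcast <4 x i32> %v to i128
/// -->
///   %i = load i128, ptr %p, align 16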
2003bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2004 Value *Ptr) {
2005 InstructionCost OriginalCost =
2006 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2008
2009 Type *TargetScalarType = nullptr;
2010 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2011
2012 for (User *U : LI->users()) {
2013 auto *BC = cast<BitCastInst>(U);
2014
2015 Type *DestTy = BC->getDestTy();
2016 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2017 return false;
2018
2019 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2020 if (DestBitWidth != VecBitWidth)
2021 return false;
2022
2023 // All bitcasts must target the same scalar type.
2024 if (!TargetScalarType)
2025 TargetScalarType = DestTy;
2026 else if (TargetScalarType != DestTy)
2027 return false;
2028
2029 OriginalCost +=
2030        TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2031                             TTI::CastContextHint::None, CostKind, BC);
2032  }
2033
2034 if (!TargetScalarType)
2035 return false;
2036
2037 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2038 InstructionCost ScalarizedCost =
2039      TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2040                          LI->getPointerAddressSpace(), CostKind);
2041
2042  LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2042 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2043 << "\n OriginalCost: " << OriginalCost
2044 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2045
2046 if (ScalarizedCost >= OriginalCost)
2047 return false;
2048
2049  // Ensure we add the load back to the worklist BEFORE its users so they can
2050  // be erased in the correct order.
2051 Worklist.push(LI);
2052
2053 Builder.SetInsertPoint(LI);
2054 auto *ScalarLoad =
2055 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2056 ScalarLoad->setAlignment(LI->getAlign());
2057 ScalarLoad->copyMetadata(*LI);
2058
2059 // Replace all bitcast users with the scalar load.
2060 for (User *U : LI->users()) {
2061 auto *BC = cast<BitCastInst>(U);
2062 replaceValue(*BC, *ScalarLoad, false);
2063 }
2064
2065 return true;
2066}
2067
2068bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2070 return false;
2071 auto *Ext = dyn_cast<ZExtInst>(&I);
2072 if (!Ext)
2073 return false;
2074
2075  // Try to convert a vector zext feeding only extracts into a set of scalar
2076  //   (Src >> (ExtIdx * EltBits)) & EltMask
2077  // operations, if profitable.
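  // E.g. a little-endian sketch with made-up types (assuming the cost model
  // favors the scalar form):
  //   %z  = zext <4 x i8> %src to <4 x i32>
  //   %e1 = extractelement <4 x i32> %z, i64 1
  // -->
  //   %s  = bitcast <4 x i8> %src to i32
  //   %sh = lshr i32 %s, 8
  //   %e1 = and i32 %sh, 255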
2078 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2079 if (!SrcTy)
2080 return false;
2081 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2082
2083 Type *ScalarDstTy = DstTy->getElementType();
2084 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2085 return false;
2086
2087 InstructionCost VectorCost =
2088      TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2089                           TTI::CastContextHint::None, CostKind);
2090  unsigned ExtCnt = 0;
2091 bool ExtLane0 = false;
2092 for (User *U : Ext->users()) {
2093 uint64_t Idx;
2094 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2095 return false;
2096 if (cast<Instruction>(U)->use_empty())
2097 continue;
2098 ExtCnt += 1;
2099 ExtLane0 |= !Idx;
2100 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2101 CostKind, Idx, U);
2102 }
2103
2104  InstructionCost ScalarCost =
2105      ExtCnt * TTI.getArithmeticInstrCost(
2106                   Instruction::And, ScalarDstTy, CostKind,
2107                   {TTI::OK_AnyValue, TTI::OP_None},
2108                   {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2109      (ExtCnt - ExtLane0) *
2110          TTI.getArithmeticInstrCost(
2111              Instruction::LShr, ScalarDstTy, CostKind,
2112              {TTI::OK_AnyValue, TTI::OP_None},
2113              {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2114  if (ScalarCost > VectorCost)
2115 return false;
2116
2117 Value *ScalarV = Ext->getOperand(0);
2118 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2119 &DT)) {
2120    // Check whether all lanes are extracted, all extracts trigger UB
2121 // on poison, and the last extract (and hence all previous ones)
2122 // are guaranteed to execute if Ext executes. If so, we do not
2123 // need to insert a freeze.
2124 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2125 bool AllExtractsTriggerUB = true;
2126 ExtractElementInst *LastExtract = nullptr;
2127 BasicBlock *ExtBB = Ext->getParent();
2128 for (User *U : Ext->users()) {
2129 auto *Extract = cast<ExtractElementInst>(U);
2130 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2131 AllExtractsTriggerUB = false;
2132 break;
2133 }
2134 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2135 if (!LastExtract || LastExtract->comesBefore(Extract))
2136 LastExtract = Extract;
2137 }
2138 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2139 !AllExtractsTriggerUB ||
2140        !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2141                                                    LastExtract->getIterator()))
2142 ScalarV = Builder.CreateFreeze(ScalarV);
2143 }
2144 ScalarV = Builder.CreateBitCast(
2145 ScalarV,
2146 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2147 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2148 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2149 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2150 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2151 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2152 for (User *U : Ext->users()) {
2153 auto *Extract = cast<ExtractElementInst>(U);
2154 uint64_t Idx =
2155 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2156 uint64_t ShiftAmt =
2157 DL->isBigEndian()
2158 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2159 : (Idx * SrcEltSizeInBits);
2160 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2161 Value *And = Builder.CreateAnd(LShr, Mask);
2162 U->replaceAllUsesWith(And);
2163 }
2164 return true;
2165}
2166
2167/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2168/// to "(bitcast (concat X, Y))"
2169/// where X/Y are bitcasted from i1 mask vectors.
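/// A hypothetical sketch (types chosen for illustration):
///   %bx = bitcast <8 x i1> %m0 to i8
///   %by = bitcast <8 x i1> %m1 to i8
///   %zx = zext i8 %bx to i16
///   %zy = zext i8 %by to i16
///   %sh = shl i16 %zy, 8
///   %r  = or disjoint i16 %zx, %sh
/// -->
///   %c  = shufflevector <8 x i1> %m0, <8 x i1> %m1,
///                       <16 x i32> <i32 0, i32 1, ..., i32 15>
///   %r  = bitcast <16 x i1> %c to i16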
2170bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2171 Type *Ty = I.getType();
2172 if (!Ty->isIntegerTy())
2173 return false;
2174
2175 // TODO: Add big endian test coverage
2176 if (DL->isBigEndian())
2177 return false;
2178
2179 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2180 Instruction *X, *Y;
2181  if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2182    return false;
2183
2184 // Allow both sources to contain shl, to handle more generic pattern:
2185 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2186 Value *SrcX;
2187 uint64_t ShAmtX = 0;
2188 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2189      !match(X, m_OneUse(
2190                    m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2191                          m_ConstantInt(ShAmtX)))))
2192 return false;
2193
2194 Value *SrcY;
2195 uint64_t ShAmtY = 0;
2196 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2197      !match(Y, m_OneUse(
2198                    m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2199                          m_ConstantInt(ShAmtY)))))
2200 return false;
2201
2202 // Canonicalize larger shift to the RHS.
2203 if (ShAmtX > ShAmtY) {
2204 std::swap(X, Y);
2205 std::swap(SrcX, SrcY);
2206 std::swap(ShAmtX, ShAmtY);
2207 }
2208
2209 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2210 // difference is the mask width so they can be easily concatenated together.
2211 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2212 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2213 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2214 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2215 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2216 !MaskTy->getElementType()->isIntegerTy(1) ||
2217 MaskTy->getNumElements() != ShAmtDiff ||
2218 MaskTy->getNumElements() > (BitWidth / 2))
2219 return false;
2220
2221 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2222 auto *ConcatIntTy =
2223 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2224 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2225
2226 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2227 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2228
2229 // TODO: Is it worth supporting multi use cases?
2230 InstructionCost OldCost = 0;
2231 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2232 OldCost +=
2233 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2234  OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2235                                      TTI::CastContextHint::None, CostKind);
2236  OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2237                                      TTI::CastContextHint::None, CostKind);
2238
2239  InstructionCost NewCost = 0;
2240  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2241                                MaskTy, ConcatMask, CostKind);
2242  NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2243                                  TTI::CastContextHint::None, CostKind);
2244 if (Ty != ConcatIntTy)
2245    NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2246                                    TTI::CastContextHint::None, CostKind);
2247  if (ShAmtX > 0)
2248 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2249
2250 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2251 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2252 << "\n");
2253
2254 if (NewCost > OldCost)
2255 return false;
2256
2257 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2258 // any residual zero-extension or shifting.
2259 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2260 Worklist.pushValue(Concat);
2261
2262 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2263
2264 if (Ty != ConcatIntTy) {
2265 Worklist.pushValue(Result);
2266 Result = Builder.CreateZExt(Result, Ty);
2267 }
2268
2269 if (ShAmtX > 0) {
2270 Worklist.pushValue(Result);
2271 Result = Builder.CreateShl(Result, ShAmtX);
2272 }
2273
2274 replaceValue(I, *Result);
2275 return true;
2276}
2277
2278/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2279/// --> "binop (shuffle), (shuffle)".
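/// A hypothetical sketch (masks are made up; subject to the cost model):
///   %s = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
///   %t = add <4 x i32> %s, %c
///   %p = shufflevector <4 x i32> %t, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
/// -->
///   %s0 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
///   %s1 = shufflevector <4 x i32> %c, <4 x i32> %c, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %p  = add <4 x i32> %s0, %s1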
2280bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2281 BinaryOperator *BinOp;
2282 ArrayRef<int> OuterMask;
2283 if (!match(&I,
2284 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2285 return false;
2286
2287 // Don't introduce poison into div/rem.
2288 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2289 return false;
2290
2291 Value *Op00, *Op01, *Op10, *Op11;
2292 ArrayRef<int> Mask0, Mask1;
2293 bool Match0 =
2294 match(BinOp->getOperand(0),
2295 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2296 bool Match1 =
2297 match(BinOp->getOperand(1),
2298 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2299 if (!Match0 && !Match1)
2300 return false;
2301
2302 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2303 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2304 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2305 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2306
2307 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2308 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2309 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2310 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2311 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2312 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2313 return false;
2314
2315 unsigned NumSrcElts = BinOpTy->getNumElements();
2316
2317  // Don't accept shuffles that reference the second operand in
2318  // div/rem or if it's an undef arg.
2319 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2320 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2321 return false;
2322
2323 // Merge outer / inner (or identity if no match) shuffles.
2324 SmallVector<int> NewMask0, NewMask1;
2325 for (int M : OuterMask) {
2326 if (M < 0 || M >= (int)NumSrcElts) {
2327 NewMask0.push_back(PoisonMaskElem);
2328 NewMask1.push_back(PoisonMaskElem);
2329 } else {
2330 NewMask0.push_back(Match0 ? Mask0[M] : M);
2331 NewMask1.push_back(Match1 ? Mask1[M] : M);
2332 }
2333 }
2334
2335 unsigned NumOpElts = Op0Ty->getNumElements();
2336 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2337 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2338 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2339 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2340 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2341 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2342
2343 // Try to merge shuffles across the binop if the new shuffles are not costly.
2344 InstructionCost OldCost =
2345      TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2346      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2347                         BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2348 if (Match0)
2349 OldCost += TTI.getShuffleCost(
2350 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2351 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2352 if (Match1)
2353 OldCost += TTI.getShuffleCost(
2354 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2355 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2356
2357 InstructionCost NewCost =
2358 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2359
2360 if (!IsIdentity0)
2361    NewCost +=
2362        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2363                           Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2364 if (!IsIdentity1)
2365    NewCost +=
2366        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2367                           Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2368
2369 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2370 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2371 << "\n");
2372
2373 // If costs are equal, still fold as we reduce instruction count.
2374 if (NewCost > OldCost)
2375 return false;
2376
2377 Value *LHS =
2378 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2379 Value *RHS =
2380 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2381 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2382
2383 // Intersect flags from the old binops.
2384 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2385 NewInst->copyIRFlags(BinOp);
2386
2387 Worklist.pushValue(LHS);
2388 Worklist.pushValue(RHS);
2389 replaceValue(I, *NewBO);
2390 return true;
2391}
2392
2393/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2394/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
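/// A hypothetical sketch (subject to the cost model):
///   %a = add <4 x i32> %x, %y
///   %b = add <4 x i32> %z, %w
///   %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
/// -->
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %r  = add <4 x i32> %s0, %s1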
2395bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2396 ArrayRef<int> OldMask;
2397 Instruction *LHS, *RHS;
2398  if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2399                           m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2400 return false;
2401
2402 // TODO: Add support for addlike etc.
2403 if (LHS->getOpcode() != RHS->getOpcode())
2404 return false;
2405
2406 Value *X, *Y, *Z, *W;
2407 bool IsCommutative = false;
2408 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2409 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2410 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2411 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2412 auto *BO = cast<BinaryOperator>(LHS);
2413 // Don't introduce poison into div/rem.
2414 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2415 return false;
2416 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2417 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2418 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2419 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2420 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2421 } else
2422 return false;
2423
2424 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2425 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2426 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2427 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2428 return false;
2429
2430 unsigned NumSrcElts = BinOpTy->getNumElements();
2431
2432 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2433 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2434 std::swap(X, Y);
2435
2436 auto ConvertToUnary = [NumSrcElts](int &M) {
2437 if (M >= (int)NumSrcElts)
2438 M -= NumSrcElts;
2439 };
2440
2441  SmallVector<int> NewMask0(OldMask);
2442  TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2443  if (X == Z) {
2444    llvm::for_each(NewMask0, ConvertToUnary);
2445    SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2446    Z = PoisonValue::get(BinOpTy);
2447  }
2448
2449  SmallVector<int> NewMask1(OldMask);
2450  TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2451  if (Y == W) {
2452    llvm::for_each(NewMask1, ConvertToUnary);
2453    SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2454    W = PoisonValue::get(BinOpTy);
2455  }
2456
2457 // Try to replace a binop with a shuffle if the shuffle is not costly.
2458  InstructionCost OldCost =
2459      TTI.getInstructionCost(LHS, CostKind) +
2460      TTI.getInstructionCost(RHS, CostKind) +
2461      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2462                         BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2463                         &I);
2464
2465 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2466 // where one use shuffles have gotten split across the binop/cmp. These
2467 // often allow a major reduction in total cost that wouldn't happen as
2468 // individual folds.
2469 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2470 TTI::TargetCostKind CostKind) -> bool {
2471 Value *InnerOp;
2472 ArrayRef<int> InnerMask;
2473 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2474 m_Mask(InnerMask)))) &&
2475 InnerOp->getType() == Op->getType() &&
2476 all_of(InnerMask,
2477 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2478 for (int &M : Mask)
2479 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2480 M = InnerMask[M - Offset];
2481 M = 0 <= M ? M + Offset : M;
2482 }
2484 Op = InnerOp;
2485 return true;
2486 }
2487 return false;
2488 };
2489 bool ReducedInstCount = false;
2490 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2491 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2492 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2493 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2494
2495 auto *ShuffleCmpTy =
2496 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2497 InstructionCost NewCost =
2498 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2499 nullptr, {X, Z}) +
2500 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2501 nullptr, {Y, W});
2502
2503 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2504 NewCost +=
2505 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2506 } else {
2507 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2508 ShuffleDstTy, PredLHS, CostKind);
2509 }
2510
2511 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2512 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2513 << "\n");
2514
2515 // If either shuffle will constant fold away, then fold for the same cost as
2516 // we will reduce the instruction count.
2517 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2518 (isa<Constant>(Y) && isa<Constant>(W));
2519 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2520 return false;
2521
2522 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2523 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2524 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2525 ? Builder.CreateBinOp(
2526 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2527 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2528
2529 // Intersect flags from the old binops.
2530 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2531 NewInst->copyIRFlags(LHS);
2532 NewInst->andIRFlags(RHS);
2533 }
2534
2535 Worklist.pushValue(Shuf0);
2536 Worklist.pushValue(Shuf1);
2537 replaceValue(I, *NewBO);
2538 return true;
2539}
2540
2541/// Try to convert,
2542/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2543/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2544bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2545 ArrayRef<int> Mask;
2546 Value *C1, *T1, *F1, *C2, *T2, *F2;
2547  if (!match(&I, m_Shuffle(
2548                     m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2549                     m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2550 m_Mask(Mask))))
2551 return false;
2552
2553 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2554 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2555 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2556 return false;
2557
2558 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2559 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2560 // SelectInsts must have the same FMF.
2561 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2562 ((SI0FOp != nullptr) &&
2563 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2564 return false;
2565
2566 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2567  auto *DstVecTy = cast<FixedVectorType>(I.getType());
2568  auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2569  auto SelOp = Instruction::Select;
2570  InstructionCost OldCost = TTI.getCmpSelInstrCost(
2571      SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2572  OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2573                                    CmpInst::BAD_ICMP_PREDICATE, CostKind);
2574 OldCost +=
2575 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2576 {I.getOperand(0), I.getOperand(1)}, &I);
2577
2578  InstructionCost NewCost = TTI.getShuffleCost(
2579      SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2580 Mask, CostKind, 0, nullptr, {C1, C2});
2581 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2582 nullptr, {T1, T2});
2583 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2584 nullptr, {F1, F2});
2585 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2586 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2587  NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2588                                    CmpInst::BAD_ICMP_PREDICATE, CostKind);
2589
2590 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2591 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2592 << "\n");
2593 if (NewCost > OldCost)
2594 return false;
2595
2596 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2597 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2598 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2599 Value *NewSel;
2600 // We presuppose that the SelectInsts have the same FMF.
2601 if (SI0FOp)
2602 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2603 SI0FOp->getFastMathFlags());
2604 else
2605 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2606
2607 Worklist.pushValue(ShuffleCmp);
2608 Worklist.pushValue(ShuffleTrue);
2609 Worklist.pushValue(ShuffleFalse);
2610 replaceValue(I, *NewSel);
2611 return true;
2612}
2613
2614/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2615/// into "castop (shuffle)".
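/// A hypothetical sketch (subject to the cost model):
///   %c0 = sext <4 x i16> %x to <4 x i32>
///   %c1 = sext <4 x i16> %y to <4 x i32>
///   %r  = shufflevector <4 x i32> %c0, <4 x i32> %c1,
///                       <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
/// -->
///   %s = shufflevector <4 x i16> %x, <4 x i16> %y,
///                      <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
///   %r = sext <8 x i16> %s to <8 x i32>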
2616bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2617 Value *V0, *V1;
2618 ArrayRef<int> OldMask;
2619 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2620 return false;
2621
2622 // Check whether this is a binary shuffle.
2623 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2624
2625 auto *C0 = dyn_cast<CastInst>(V0);
2626 auto *C1 = dyn_cast<CastInst>(V1);
2627 if (!C0 || (IsBinaryShuffle && !C1))
2628 return false;
2629
2630 Instruction::CastOps Opcode = C0->getOpcode();
2631
2632 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2633 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2634 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2635 return false;
2636
2637 if (IsBinaryShuffle) {
2638 if (C0->getSrcTy() != C1->getSrcTy())
2639 return false;
2640 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2641 if (Opcode != C1->getOpcode()) {
2642 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2643 Opcode = Instruction::SExt;
2644 else
2645 return false;
2646 }
2647 }
2648
2649 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2650 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2651 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2652 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2653 return false;
2654
2655 unsigned NumSrcElts = CastSrcTy->getNumElements();
2656 unsigned NumDstElts = CastDstTy->getNumElements();
2657 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2658 "Only bitcasts expected to alter src/dst element counts");
2659
2660  // Check for bitcasts between vector types whose element counts do not
2661  // evenly divide each other, e.g. <32 x i40> -> <40 x i32>.
2662 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2663 (NumDstElts % NumSrcElts) != 0)
2664 return false;
2665
2666 SmallVector<int, 16> NewMask;
2667 if (NumSrcElts >= NumDstElts) {
2668 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2669 // always be expanded to the equivalent form choosing narrower elements.
2670 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2671 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2672 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2673 } else {
2674 // The bitcast is from narrow elements to wide elements. The shuffle mask
2675 // must choose consecutive elements to allow casting first.
2676 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2677 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2678 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2679 return false;
2680 }
2681
2682 auto *NewShuffleDstTy =
2683 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2684
2685 // Try to replace a castop with a shuffle if the shuffle is not costly.
2686 InstructionCost CostC0 =
2687      TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2688                           TTI::CastContextHint::None, CostKind);
2689
2690  TargetTransformInfo::ShuffleKind ShuffleKind;
2691  if (IsBinaryShuffle)
2692    ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2693  else
2694    ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2695
2696 InstructionCost OldCost = CostC0;
2697 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2698 CostKind, 0, nullptr, {}, &I);
2699
2700 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2701 CastSrcTy, NewMask, CostKind);
2702  NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2703                                  TTI::CastContextHint::None, CostKind);
2704  if (!C0->hasOneUse())
2705 NewCost += CostC0;
2706 if (IsBinaryShuffle) {
2707 InstructionCost CostC1 =
2708        TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2709                             TTI::CastContextHint::None, CostKind);
2710    OldCost += CostC1;
2711 if (!C1->hasOneUse())
2712 NewCost += CostC1;
2713 }
2714
2715 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2716 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2717 << "\n");
2718 if (NewCost > OldCost)
2719 return false;
2720
2721 Value *Shuf;
2722 if (IsBinaryShuffle)
2723 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2724 NewMask);
2725 else
2726 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2727
2728 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2729
2730 // Intersect flags from the old casts.
2731 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2732 NewInst->copyIRFlags(C0);
2733 if (IsBinaryShuffle)
2734 NewInst->andIRFlags(C1);
2735 }
2736
2737 Worklist.pushValue(Shuf);
2738 replaceValue(I, *Cast);
2739 return true;
2740}
2741
2742/// Try to convert any of:
2743/// "shuffle (shuffle x, y), (shuffle y, x)"
2744/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2745/// "shuffle (shuffle x, undef), y"
2746/// "shuffle x, (shuffle y, undef)"
2747/// into "shuffle x, y".
2748bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2749 ArrayRef<int> OuterMask;
2750 Value *OuterV0, *OuterV1;
2751 if (!match(&I,
2752 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2753 return false;
2754
2755 ArrayRef<int> InnerMask0, InnerMask1;
2756 Value *X0, *X1, *Y0, *Y1;
2757 bool Match0 =
2758 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2759 bool Match1 =
2760 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2761 if (!Match0 && !Match1)
2762 return false;
2763
2764 // If the outer shuffle is a permute, then create a fake inner all-poison
2765 // shuffle. This is easier than accounting for length-changing shuffles below.
2766 SmallVector<int, 16> PoisonMask1;
2767 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2768 X1 = X0;
2769 Y1 = Y0;
2770 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2771 InnerMask1 = PoisonMask1;
2772 Match1 = true; // fake match
2773 }
2774
2775 X0 = Match0 ? X0 : OuterV0;
2776 Y0 = Match0 ? Y0 : OuterV0;
2777 X1 = Match1 ? X1 : OuterV1;
2778 Y1 = Match1 ? Y1 : OuterV1;
2779 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2780 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2781 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2782 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2783 X0->getType() != X1->getType())
2784 return false;
2785
2786 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2787 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2788
2789  // Attempt to merge shuffles, matching up to 2 source operands.
2790 // Replace index to a poison arg with PoisonMaskElem.
2791 // Bail if either inner masks reference an undef arg.
2792 SmallVector<int, 16> NewMask(OuterMask);
2793 Value *NewX = nullptr, *NewY = nullptr;
2794 for (int &M : NewMask) {
2795 Value *Src = nullptr;
2796 if (0 <= M && M < (int)NumImmElts) {
2797 Src = OuterV0;
2798 if (Match0) {
2799 M = InnerMask0[M];
2800 Src = M >= (int)NumSrcElts ? Y0 : X0;
2801 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2802 }
2803 } else if (M >= (int)NumImmElts) {
2804 Src = OuterV1;
2805 M -= NumImmElts;
2806 if (Match1) {
2807 M = InnerMask1[M];
2808 Src = M >= (int)NumSrcElts ? Y1 : X1;
2809 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2810 }
2811 }
2812 if (Src && M != PoisonMaskElem) {
2813 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2814 if (isa<UndefValue>(Src)) {
2815        // We've referenced an undef element - if it's poison, update the
2816        // shuffle mask, else bail.
2817 if (!isa<PoisonValue>(Src))
2818 return false;
2819 M = PoisonMaskElem;
2820 continue;
2821 }
2822 if (!NewX || NewX == Src) {
2823 NewX = Src;
2824 continue;
2825 }
2826 if (!NewY || NewY == Src) {
2827 M += NumSrcElts;
2828 NewY = Src;
2829 continue;
2830 }
2831 return false;
2832 }
2833 }
2834
2835 if (!NewX)
2836 return PoisonValue::get(ShuffleDstTy);
2837 if (!NewY)
2838 NewY = PoisonValue::get(ShuffleSrcTy);
2839
2840 // Have we folded to an Identity shuffle?
2841 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2842 replaceValue(I, *NewX);
2843 return true;
2844 }
2845
2846 // Try to merge the shuffles if the new shuffle is not costly.
2847 InstructionCost InnerCost0 = 0;
2848 if (Match0)
2849 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2850
2851 InstructionCost InnerCost1 = 0;
2852 if (Match1)
2853 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2854
2854  InstructionCost OuterCost =
2855      TTI.getInstructionCost(cast<User>(&I), CostKind);
2856
2857 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2858
2859 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2860  TargetTransformInfo::ShuffleKind SK =
2861      IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2862              : TargetTransformInfo::SK_PermuteTwoSrc;
2863  InstructionCost NewCost =
2864 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2865 nullptr, {NewX, NewY});
2866 if (!OuterV0->hasOneUse())
2867 NewCost += InnerCost0;
2868 if (!OuterV1->hasOneUse())
2869 NewCost += InnerCost1;
2870
2871 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2872 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2873 << "\n");
2874 if (NewCost > OldCost)
2875 return false;
2876
2877 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2878 replaceValue(I, *Shuf);
2879 return true;
2880}
2881
2882/// Try to convert a chain of length-preserving shuffles that are fed by
2883/// length-changing shuffles from the same source, e.g. a chain of length 3:
2884///
2885/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
2886/// (shuffle y, undef)),
2887///            (shuffle y, undef)"
2888///
2889/// into a single shuffle fed by a length-changing shuffle:
2890///
2891/// "shuffle x, (shuffle y, undef)"
2892///
2893/// Such chains arise e.g. from folding extract/insert sequences.
2894bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
2895 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
2896 if (!TrunkType)
2897 return false;
2898
2899 unsigned ChainLength = 0;
2900 SmallVector<int> Mask;
2901 SmallVector<int> YMask;
2902 InstructionCost OldCost = 0;
2903 InstructionCost NewCost = 0;
2904 Value *Trunk = &I;
2905 unsigned NumTrunkElts = TrunkType->getNumElements();
2906 Value *Y = nullptr;
2907
2908 for (;;) {
2909 // Match the current trunk against (commutations of) the pattern
2910 // "shuffle trunk', (shuffle y, undef)"
2911 ArrayRef<int> OuterMask;
2912 Value *OuterV0, *OuterV1;
2913 if (ChainLength != 0 && !Trunk->hasOneUse())
2914 break;
2915 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
2916 m_Mask(OuterMask))))
2917 break;
2918 if (OuterV0->getType() != TrunkType) {
2919 // This shuffle is not length-preserving, so it cannot be part of the
2920 // chain.
2921 break;
2922 }
2923
2924 ArrayRef<int> InnerMask0, InnerMask1;
2925 Value *A0, *A1, *B0, *B1;
2926 bool Match0 =
2927 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
2928 bool Match1 =
2929 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
2930 bool Match0Leaf = Match0 && A0->getType() != I.getType();
2931 bool Match1Leaf = Match1 && A1->getType() != I.getType();
2932 if (Match0Leaf == Match1Leaf) {
2933 // Only handle the case of exactly one leaf in each step. The "two leaves"
2934 // case is handled by foldShuffleOfShuffles.
2935 break;
2936 }
2937
2938 SmallVector<int> CommutedOuterMask;
2939 if (Match0Leaf) {
2940 std::swap(OuterV0, OuterV1);
2941 std::swap(InnerMask0, InnerMask1);
2942 std::swap(A0, A1);
2943 std::swap(B0, B1);
2944 llvm::append_range(CommutedOuterMask, OuterMask);
2945 for (int &M : CommutedOuterMask) {
2946 if (M == PoisonMaskElem)
2947 continue;
2948 if (M < (int)NumTrunkElts)
2949 M += NumTrunkElts;
2950 else
2951 M -= NumTrunkElts;
2952 }
2953 OuterMask = CommutedOuterMask;
2954 }
2955 if (!OuterV1->hasOneUse())
2956 break;
2957
2958 if (!isa<UndefValue>(A1)) {
2959 if (!Y)
2960 Y = A1;
2961 else if (Y != A1)
2962 break;
2963 }
2964 if (!isa<UndefValue>(B1)) {
2965 if (!Y)
2966 Y = B1;
2967 else if (Y != B1)
2968 break;
2969 }
2970
2971 auto *YType = cast<FixedVectorType>(A1->getType());
2972 int NumLeafElts = YType->getNumElements();
2973 SmallVector<int> LocalYMask(InnerMask1);
2974 for (int &M : LocalYMask) {
2975 if (M >= NumLeafElts)
2976 M -= NumLeafElts;
2977 }
2978
2979 InstructionCost LocalOldCost =
2982
2983 // Handle the initial (start of chain) case.
2984 if (!ChainLength) {
2985 Mask.assign(OuterMask);
2986 YMask.assign(LocalYMask);
2987 OldCost = NewCost = LocalOldCost;
2988 Trunk = OuterV0;
2989 ChainLength++;
2990 continue;
2991 }
2992
2993 // For the non-root case, first attempt to combine masks.
2994 SmallVector<int> NewYMask(YMask);
2995 bool Valid = true;
2996 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
2997 if (LeafM == -1 || CombinedM == LeafM)
2998 continue;
2999 if (CombinedM == -1) {
3000 CombinedM = LeafM;
3001 } else {
3002 Valid = false;
3003 break;
3004 }
3005 }
3006 if (!Valid)
3007 break;
3008
3009 SmallVector<int> NewMask;
3010 NewMask.reserve(NumTrunkElts);
3011 for (int M : Mask) {
3012 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3013 NewMask.push_back(M);
3014 else
3015 NewMask.push_back(OuterMask[M]);
3016 }
3017
3018 // Break the chain if adding this new step complicates the shuffles such
3019 // that it would increase the new cost by more than the old cost of this
3020 // step.
3021 InstructionCost LocalNewCost =
3023 YType, NewYMask, CostKind) +
3025 TrunkType, NewMask, CostKind);
3026
3027 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3028 break;
3029
3030 LLVM_DEBUG({
3031 if (ChainLength == 1) {
3032 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3033 << I << '\n';
3034 }
3035 dbgs() << " next chain link: " << *Trunk << '\n'
3036 << " old cost: " << (OldCost + LocalOldCost)
3037 << " new cost: " << LocalNewCost << '\n';
3038 });
3039
3040 Mask = NewMask;
3041 YMask = NewYMask;
3042 OldCost += LocalOldCost;
3043 NewCost = LocalNewCost;
3044 Trunk = OuterV0;
3045 ChainLength++;
3046 }
3047 if (ChainLength <= 1)
3048 return false;
3049
3050 if (llvm::all_of(Mask, [&](int M) {
3051 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3052 })) {
3053 // Produce a canonical simplified form if all elements are sourced from Y.
3054 for (int &M : Mask) {
3055 if (M >= static_cast<int>(NumTrunkElts))
3056 M = YMask[M - NumTrunkElts];
3057 }
3058 Value *Root =
3059 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3060 replaceValue(I, *Root);
3061 return true;
3062 }
3063
3064 Value *Leaf =
3065 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3066 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3067 replaceValue(I, *Root);
3068 return true;
3069}
3070
3071/// Try to convert
3072/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
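/// A hypothetical sketch (subject to the cost model):
///   %a = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> %y)
///   %b = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %z, <4 x i32> %w)
///   %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, ..., i32 7>
/// -->
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z, <8 x i32> <i32 0, ..., i32 7>
///   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w, <8 x i32> <i32 0, ..., i32 7>
///   %r  = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %s0, <8 x i32> %s1)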
3073bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3074 Value *V0, *V1;
3075 ArrayRef<int> OldMask;
3076 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
3077 m_Mask(OldMask))))
3078 return false;
3079
3080 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3081 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3082 if (!II0 || !II1)
3083 return false;
3084
3085 Intrinsic::ID IID = II0->getIntrinsicID();
3086 if (IID != II1->getIntrinsicID())
3087 return false;
3088
3089 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3090 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3091 if (!ShuffleDstTy || !II0Ty)
3092 return false;
3093
3094 if (!isTriviallyVectorizable(IID))
3095 return false;
3096
3097 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3098 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
3099 II0->getArgOperand(I) != II1->getArgOperand(I))
3100 return false;
3101
3102 InstructionCost OldCost =
3103 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
3104 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
3105 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
3106 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3107
3108 SmallVector<Type *> NewArgsTy;
3109 InstructionCost NewCost = 0;
3110 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3111 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3112 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3113 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3114 } else {
3115 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3116 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3117 ShuffleDstTy->getNumElements());
3118 NewArgsTy.push_back(ArgTy);
3119 std::pair<Value *, Value *> OperandPair =
3120 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3121 if (!SeenOperandPairs.insert(OperandPair).second) {
3122 // We've already computed the cost for this operand pair.
3123 continue;
3124 }
3125 NewCost += TTI.getShuffleCost(
3126 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3127 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3128 }
3129 }
3130 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3131 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3132
3133 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3134 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3135 << "\n");
3136
3137 if (NewCost > OldCost)
3138 return false;
3139
3140 SmallVector<Value *> NewArgs;
3141 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3142 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3143 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3144 NewArgs.push_back(II0->getArgOperand(I));
3145 } else {
3146 std::pair<Value *, Value *> OperandPair =
3147 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3148 auto It = ShuffleCache.find(OperandPair);
3149 if (It != ShuffleCache.end()) {
3150 // Reuse previously created shuffle for this operand pair.
3151 NewArgs.push_back(It->second);
3152 continue;
3153 }
3154 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3155 II1->getArgOperand(I), OldMask);
3156 ShuffleCache[OperandPair] = Shuf;
3157 NewArgs.push_back(Shuf);
3158 Worklist.pushValue(Shuf);
3159 }
3160 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3161
3162 // Intersect flags from the old intrinsics.
3163 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3164 NewInst->copyIRFlags(II0);
3165 NewInst->andIRFlags(II1);
3166 }
3167
3168 replaceValue(I, *NewIntrinsic);
3169 return true;
3170}
3171
3172/// Try to convert
3173/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
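/// Illustrative IR sketch (hypothetical names; requires the mask to reference
/// only the first operand and the cost model to agree):
///   %v = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
///   %r = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///  -->
///   %sa = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %sb = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r  = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %sa, <4 x i32> %sb)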
3174bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3175 Value *V0;
3176 ArrayRef<int> Mask;
3177 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Undef(), m_Mask(Mask))))
3178 return false;
3179
3180 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3181 if (!II0)
3182 return false;
3183
3184 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3185 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3186 if (!ShuffleDstTy || !IntrinsicSrcTy)
3187 return false;
3188
3189 // Validate it's a pure permute, mask should only reference the first vector
3190 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3191 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3192 return false;
3193
3194 Intrinsic::ID IID = II0->getIntrinsicID();
3195 if (!isTriviallyVectorizable(IID))
3196 return false;
3197
3198 // Cost analysis
3199 InstructionCost OldCost =
3200 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
3201 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
3202 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3203
3204 SmallVector<Type *> NewArgsTy;
3205 InstructionCost NewCost = 0;
3206 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3207 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3208 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3209 } else {
3210 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3211 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3212 ShuffleDstTy->getNumElements());
3213 NewArgsTy.push_back(ArgTy);
3214 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3215 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3216 {II0->getArgOperand(I)});
3217 }
3218 }
3219 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3220 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3221
3222 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3223 << OldCost << " vs NewCost: " << NewCost << "\n");
3224
3225 if (NewCost > OldCost)
3226 return false;
3227
3228 // Transform
3229 SmallVector<Value *> NewArgs;
3230 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3231 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3232 NewArgs.push_back(II0->getArgOperand(I));
3233 } else {
3234 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3235 NewArgs.push_back(Shuf);
3236 Worklist.pushValue(Shuf);
3237 }
3238 }
3239
3240 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3241
3242 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3243 NewInst->copyIRFlags(II0);
3244
3245 replaceValue(I, *NewIntrinsic);
3246 return true;
3247}
3248
3249using InstLane = std::pair<Use *, int>;
3250
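// Trace the origin of a single vector lane through a chain of shuffles:
// starting from use U at lane Lane, follow the shuffle masks upwards until a
// non-shuffle value is reached, returning the final (use, lane) pair, or
// {nullptr, PoisonMaskElem} if the lane comes from a poison mask element.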
3251static InstLane lookThroughShuffles(Use *U, int Lane) {
3252 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
3253 unsigned NumElts =
3254 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3255 int M = SV->getMaskValue(Lane);
3256 if (M < 0)
3257 return {nullptr, PoisonMaskElem};
3258 if (static_cast<unsigned>(M) < NumElts) {
3259 U = &SV->getOperandUse(0);
3260 Lane = M;
3261 } else {
3262 U = &SV->getOperandUse(1);
3263 Lane = M - NumElts;
3264 }
3265 }
3266 return InstLane{U, Lane};
3267}
3268
3269 static SmallVector<InstLane>
3270 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
3271 SmallVector<InstLane> NItem;
3272 for (InstLane IL : Item) {
3273 auto [U, Lane] = IL;
3274 InstLane OpLane =
3275 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
3276 Lane)
3277 : InstLane{nullptr, PoisonMaskElem};
3278 NItem.emplace_back(OpLane);
3279 }
3280 return NItem;
3281}
3282
3283/// Detect concat of multiple values into a vector
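/// For example (illustrative), an 8-lane item made of two whole <4 x i32>
/// values laid end to end counts as a free concat when the target reports the
/// widening shuffle as zero cost, typically because the wide type is split
/// during legalization anyway.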
3284 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
3285 const TargetTransformInfo &TTI) {
3286 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3287 unsigned NumElts = Ty->getNumElements();
3288 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3289 return false;
3290
3291 // Check that the concat is free, usually meaning that the type will be split
3292 // during legalization.
3293 SmallVector<int, 16> ConcatMask(NumElts * 2);
3294 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3295 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3296 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3297 Ty, ConcatMask, CostKind) != 0)
3298 return false;
3299
3300 unsigned NumSlices = Item.size() / NumElts;
3301 // Currently we generate a tree of shuffles for the concats, which limits us
3302 // to a power of 2.
3303 if (!isPowerOf2_32(NumSlices))
3304 return false;
3305 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3306 Use *SliceV = Item[Slice * NumElts].first;
3307 if (!SliceV || SliceV->get()->getType() != Ty)
3308 return false;
3309 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3310 auto [V, Lane] = Item[Slice * NumElts + Elt];
3311 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3312 return false;
3313 }
3314 }
3315 return true;
3316}
3317
3318 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
3319 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3320 const SmallPtrSet<Use *, 4> &SplatLeafs,
3321 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3322 IRBuilderBase &Builder,
3323 const TargetTransformInfo *TTI) {
3324 auto [FrontU, FrontLane] = Item.front();
3325
3326 if (IdentityLeafs.contains(FrontU)) {
3327 return FrontU->get();
3328 }
3329 if (SplatLeafs.contains(FrontU)) {
3330 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3331 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3332 }
3333 if (ConcatLeafs.contains(FrontU)) {
3334 unsigned NumElts =
3335 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3336 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3337 for (unsigned S = 0; S < Values.size(); ++S)
3338 Values[S] = Item[S * NumElts].first->get();
3339
3340 while (Values.size() > 1) {
3341 NumElts *= 2;
3342 SmallVector<int, 16> Mask(NumElts, 0);
3343 std::iota(Mask.begin(), Mask.end(), 0);
3344 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3345 for (unsigned S = 0; S < NewValues.size(); ++S)
3346 NewValues[S] =
3347 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3348 Values = NewValues;
3349 }
3350 return Values[0];
3351 }
3352
3353 auto *I = cast<Instruction>(FrontU->get());
3354 auto *II = dyn_cast<IntrinsicInst>(I);
3355 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3356 SmallVector<Value *> Ops(NumOps);
3357 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3358 if (II &&
3359 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3360 Ops[Idx] = II->getOperand(Idx);
3361 continue;
3362 }
3363 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
3364 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3365 Builder, TTI);
3366 }
3367
3368 SmallVector<Value *, 8> ValueList;
3369 for (const auto &Lane : Item)
3370 if (Lane.first)
3371 ValueList.push_back(Lane.first->get());
3372
3373 Type *DstTy =
3374 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3375 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3376 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3377 Ops[0], Ops[1]);
3378 propagateIRFlags(Value, ValueList);
3379 return Value;
3380 }
3381 if (auto *CI = dyn_cast<CmpInst>(I)) {
3382 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3383 propagateIRFlags(Value, ValueList);
3384 return Value;
3385 }
3386 if (auto *SI = dyn_cast<SelectInst>(I)) {
3387 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3388 propagateIRFlags(Value, ValueList);
3389 return Value;
3390 }
3391 if (auto *CI = dyn_cast<CastInst>(I)) {
3392 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3393 propagateIRFlags(Value, ValueList);
3394 return Value;
3395 }
3396 if (II) {
3397 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3398 propagateIRFlags(Value, ValueList);
3399 return Value;
3400 }
3401 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3402 auto *Value =
3403 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3404 propagateIRFlags(Value, ValueList);
3405 return Value;
3406}
3407
3408// Starting from a shuffle, look up through operands tracking the shuffled index
3409// of each lane. If we can simplify away the shuffles to identities then
3410// do so.
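// Illustrative example (hypothetical values; only applied when every
// intermediate value has a single user):
//   %ra = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %rb = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %s  = add <4 x i32> %ra, %rb
//   %r  = shufflevector <4 x i32> %s, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// simplifies to:
//   %r  = add <4 x i32> %a, %b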
3411bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3412 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3413 if (!Ty || I.use_empty())
3414 return false;
3415
3416 SmallVector<InstLane> Start(Ty->getNumElements());
3417 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3418 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3419
3419
3420 SmallVector<SmallVector<InstLane>> Worklist;
3421 Worklist.push_back(Start);
3422 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3423 unsigned NumVisited = 0;
3424
3425 while (!Worklist.empty()) {
3426 if (++NumVisited > MaxInstrsToScan)
3427 return false;
3428
3429 SmallVector<InstLane> Item = Worklist.pop_back_val();
3430 auto [FrontU, FrontLane] = Item.front();
3431
3432 // If we found an undef first lane then bail out to keep things simple.
3433 if (!FrontU)
3434 return false;
3435
3436 // Helper to peek through bitcasts to the same value.
3437 auto IsEquiv = [&](Value *X, Value *Y) {
3438 return X->getType() == Y->getType() &&
3440 };
3441
3442 // Look for an identity value.
3443 if (FrontLane == 0 &&
3444 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3445 Ty->getNumElements() &&
3446 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3447 Value *FrontV = Item.front().first->get();
3448 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3449 E.value().second == (int)E.index());
3450 })) {
3451 IdentityLeafs.insert(FrontU);
3452 continue;
3453 }
3454 // Look for constants, for the moment only supporting constant splats.
3455 if (auto *C = dyn_cast<Constant>(FrontU);
3456 C && C->getSplatValue() &&
3457 all_of(drop_begin(Item), [Item](InstLane &IL) {
3458 Value *FrontV = Item.front().first->get();
3459 Use *U = IL.first;
3460 return !U || (isa<Constant>(U->get()) &&
3461 cast<Constant>(U->get())->getSplatValue() ==
3462 cast<Constant>(FrontV)->getSplatValue());
3463 })) {
3464 SplatLeafs.insert(FrontU);
3465 continue;
3466 }
3467 // Look for a splat value.
3468 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3469 auto [FrontU, FrontLane] = Item.front();
3470 auto [U, Lane] = IL;
3471 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3472 })) {
3473 SplatLeafs.insert(FrontU);
3474 continue;
3475 }
3476
3477 // We need each element to be the same type of value, and check that each
3478 // element has a single use.
3479 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3480 Value *FrontV = Item.front().first->get();
3481 if (!IL.first)
3482 return true;
3483 Value *V = IL.first->get();
3484 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3485 return false;
3486 if (V->getValueID() != FrontV->getValueID())
3487 return false;
3488 if (auto *CI = dyn_cast<CmpInst>(V))
3489 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3490 return false;
3491 if (auto *CI = dyn_cast<CastInst>(V))
3492 if (CI->getSrcTy()->getScalarType() !=
3493 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3494 return false;
3495 if (auto *SI = dyn_cast<SelectInst>(V))
3496 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3497 SI->getOperand(0)->getType() !=
3498 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3499 return false;
3500 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3501 return false;
3502 auto *II = dyn_cast<IntrinsicInst>(V);
3503 return !II || (isa<IntrinsicInst>(FrontV) &&
3504 II->getIntrinsicID() ==
3505 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3506 !II->hasOperandBundles());
3507 };
3508 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3509 // Check the operator is one that we support.
3510 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3511 // We exclude div/rem in case they hit UB from poison lanes.
3512 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3513 BO && BO->isIntDivRem())
3514 return false;
3515 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3516 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3517 continue;
3518 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3519 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3520 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3521 continue;
3522 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3523 // TODO: Handle vector widening/narrowing bitcasts.
3524 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3525 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3526 if (DstTy && SrcTy &&
3527 SrcTy->getNumElements() == DstTy->getNumElements()) {
3528 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3529 continue;
3530 }
3531 } else if (isa<SelectInst>(FrontU)) {
3532 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3533 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3534 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3535 continue;
3536 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3537 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3538 !II->hasOperandBundles()) {
3539 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3540 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3541 &TTI)) {
3542 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3543 Value *FrontV = Item.front().first->get();
3544 Use *U = IL.first;
3545 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3546 cast<Instruction>(FrontV)->getOperand(Op));
3547 }))
3548 return false;
3549 continue;
3550 }
3551 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3552 }
3553 continue;
3554 }
3555 }
3556
3557 if (isFreeConcat(Item, CostKind, TTI)) {
3558 ConcatLeafs.insert(FrontU);
3559 continue;
3560 }
3561
3562 return false;
3563 }
3564
3565 if (NumVisited <= 1)
3566 return false;
3567
3568 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3569
3570 // If we got this far, we know the shuffles are superfluous and can be
3571 // removed. Scan through again and generate the new tree of instructions.
3572 Builder.SetInsertPoint(&I);
3573 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3574 ConcatLeafs, Builder, &TTI);
3575 replaceValue(I, *V);
3576 return true;
3577}
3578
3579/// Given a commutative reduction, the order of the input lanes does not alter
3580/// the results. We can use this to remove certain shuffles feeding the
3581/// reduction, removing the need to shuffle at all.
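/// Illustrative example (hypothetical values; applied when the cost model
/// agrees), since integer add reduction is insensitive to lane order:
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// the shuffle can be dropped (or replaced with a simpler, sorted mask):
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)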
3582bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3583 auto *II = dyn_cast<IntrinsicInst>(&I);
3584 if (!II)
3585 return false;
3586 switch (II->getIntrinsicID()) {
3587 case Intrinsic::vector_reduce_add:
3588 case Intrinsic::vector_reduce_mul:
3589 case Intrinsic::vector_reduce_and:
3590 case Intrinsic::vector_reduce_or:
3591 case Intrinsic::vector_reduce_xor:
3592 case Intrinsic::vector_reduce_smin:
3593 case Intrinsic::vector_reduce_smax:
3594 case Intrinsic::vector_reduce_umin:
3595 case Intrinsic::vector_reduce_umax:
3596 break;
3597 default:
3598 return false;
3599 }
3600
3601 // Find all the inputs when looking through operations that do not alter the
3602 // lane order (binops, for example). Currently we look for a single shuffle,
3603 // and can ignore splat values.
3604 std::queue<Value *> Worklist;
3605 SmallPtrSet<Value *, 4> Visited;
3606 ShuffleVectorInst *Shuffle = nullptr;
3607 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3608 Worklist.push(Op);
3609
3610 while (!Worklist.empty()) {
3611 Value *CV = Worklist.front();
3612 Worklist.pop();
3613 if (Visited.contains(CV))
3614 continue;
3615
3616 // Splats don't change the order, so can be safely ignored.
3617 if (isSplatValue(CV))
3618 continue;
3619
3620 Visited.insert(CV);
3621
3622 if (auto *CI = dyn_cast<Instruction>(CV)) {
3623 if (CI->isBinaryOp()) {
3624 for (auto *Op : CI->operand_values())
3625 Worklist.push(Op);
3626 continue;
3627 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3628 if (Shuffle && Shuffle != SV)
3629 return false;
3630 Shuffle = SV;
3631 continue;
3632 }
3633 }
3634
3635 // Anything else is currently an unknown node.
3636 return false;
3637 }
3638
3639 if (!Shuffle)
3640 return false;
3641
3642 // Check all uses of the binary ops and shuffles are also included in the
3643 // lane-invariant operations (Visited should be the list of lanewise
3644 // instructions, including the shuffle that we found).
3645 for (auto *V : Visited)
3646 for (auto *U : V->users())
3647 if (!Visited.contains(U) && U != &I)
3648 return false;
3649
3650 FixedVectorType *VecType =
3651 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3652 if (!VecType)
3653 return false;
3654 FixedVectorType *ShuffleInputType =
3655 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3656 if (!ShuffleInputType)
3657 return false;
3658 unsigned NumInputElts = ShuffleInputType->getNumElements();
3659
3660 // Find the mask from sorting the lanes into order. This is most likely to
3661 // become an identity or concat mask. Undef elements are pushed to the end.
3662 SmallVector<int> ConcatMask;
3663 Shuffle->getShuffleMask(ConcatMask);
3664 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3665 bool UsesSecondVec =
3666 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3667
3668 InstructionCost OldCost = TTI.getShuffleCost(
3669 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3670 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3671 InstructionCost NewCost = TTI.getShuffleCost(
3672 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3673 ShuffleInputType, ConcatMask, CostKind);
3674
3675 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3676 << "\n");
3677 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3678 << "\n");
3679 bool MadeChanges = false;
3680 if (NewCost < OldCost) {
3681 Builder.SetInsertPoint(Shuffle);
3682 Value *NewShuffle = Builder.CreateShuffleVector(
3683 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3684 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3685 replaceValue(*Shuffle, *NewShuffle);
3686 return true;
3687 }
3688
3689 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3690 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3691 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3692 return MadeChanges;
3693}
3694
3695/// For a given chain of patterns of the following form:
3696///
3697/// ```
3698/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3699///
3700/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3701/// ty1> %1)
3702/// OR
3703/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3704///
3705/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3706/// ...
3707/// ...
3708/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3709/// 3), <n x ty1> %(i - 2)
3710/// OR
3711/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3712///
3713/// %(i) = extractelement <n x ty1> %(i - 1), 0
3714/// ```
3715///
3716/// Where:
3717/// `mask` follows a partition pattern:
3718///
3719/// Ex:
3720/// [n = 8, p = poison]
3721///
3722/// 4 5 6 7 | p p p p
3723/// 2 3 | p p p p p p
3724/// 1 | p p p p p p p
3725///
3726/// For powers of 2, there's a consistent pattern, but for other cases
3727/// the parity of the current half value at each step decides the
3728/// next partition half (see `ExpectedParityMask` for more logical details
3729/// in generalising this).
3730///
3731/// Ex:
3732/// [n = 6]
3733///
3734/// 3 4 5 | p p p
3735/// 1 2 | p p p p
3736/// 1 | p p p p p
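///
/// When the whole chain is matched and the cost model agrees, the sequence is
/// replaced with a single reduction intrinsic over the original vector, e.g.
/// llvm.vector.reduce.umin for a umin chain or llvm.vector.reduce.add for an
/// add chain.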
3737bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3738 // Going bottom-up for the pattern.
3739 std::queue<Value *> InstWorklist;
3740 InstructionCost OrigCost = 0;
3741
3742 // Common instruction operation after each shuffle op.
3743 std::optional<unsigned int> CommonCallOp = std::nullopt;
3744 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3745
3746 bool IsFirstCallOrBinInst = true;
3747 bool ShouldBeCallOrBinInst = true;
3748
3749 // This stores the last used instructions for shuffle/common op.
3750 //
3751 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3752 // instructions from either shuffle/common op.
3753 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3754
3755 Value *VecOpEE;
3756 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3757 return false;
3758
3759 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3760 if (!FVT)
3761 return false;
3762
3763 int64_t VecSize = FVT->getNumElements();
3764 if (VecSize < 2)
3765 return false;
3766
3767 // Number of levels would be ~log2(n), considering we always partition
3768 // by half for this fold pattern.
3769 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3770 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3771
3772 // This is how we generalise for all element sizes.
3773 // At each step, if vector size is odd, we need non-poison
3774 // values to cover the dominant half so we don't miss out on any element.
3775 //
3776 // This mask will help us retrieve this as we go from bottom to top:
3777 //
3778 // Mask Set -> N = N * 2 - 1
3779 // Mask Unset -> N = N * 2
3780 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3781 Cur = (Cur + 1) / 2, --Mask) {
3782 if (Cur & 1)
3783 ExpectedParityMask |= (1ll << Mask);
3784 }
3785
3786 InstWorklist.push(VecOpEE);
3787
3788 while (!InstWorklist.empty()) {
3789 Value *CI = InstWorklist.front();
3790 InstWorklist.pop();
3791
3792 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3793 if (!ShouldBeCallOrBinInst)
3794 return false;
3795
3796 if (!IsFirstCallOrBinInst &&
3797 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3798 return false;
3799
3800 // For the first found call/bin op, the vector has to come from the
3801 // extract element op.
3802 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3803 return false;
3804 IsFirstCallOrBinInst = false;
3805
3806 if (!CommonCallOp)
3807 CommonCallOp = II->getIntrinsicID();
3808 if (II->getIntrinsicID() != *CommonCallOp)
3809 return false;
3810
3811 switch (II->getIntrinsicID()) {
3812 case Intrinsic::umin:
3813 case Intrinsic::umax:
3814 case Intrinsic::smin:
3815 case Intrinsic::smax: {
3816 auto *Op0 = II->getOperand(0);
3817 auto *Op1 = II->getOperand(1);
3818 PrevVecV[0] = Op0;
3819 PrevVecV[1] = Op1;
3820 break;
3821 }
3822 default:
3823 return false;
3824 }
3825 ShouldBeCallOrBinInst ^= 1;
3826
3827 IntrinsicCostAttributes ICA(
3828 *CommonCallOp, II->getType(),
3829 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3830 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3831
3832 // We may need a swap here since it can be (a, b) or (b, a)
3833 // and accordingly change as we go up.
3834 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3835 std::swap(PrevVecV[0], PrevVecV[1]);
3836 InstWorklist.push(PrevVecV[1]);
3837 InstWorklist.push(PrevVecV[0]);
3838 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3839 // Similar logic for bin ops.
3840
3841 if (!ShouldBeCallOrBinInst)
3842 return false;
3843
3844 if (!IsFirstCallOrBinInst &&
3845 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3846 return false;
3847
3848 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3849 return false;
3850 IsFirstCallOrBinInst = false;
3851
3852 if (!CommonBinOp)
3853 CommonBinOp = BinOp->getOpcode();
3854
3855 if (BinOp->getOpcode() != *CommonBinOp)
3856 return false;
3857
3858 switch (*CommonBinOp) {
3859 case BinaryOperator::Add:
3860 case BinaryOperator::Mul:
3861 case BinaryOperator::Or:
3862 case BinaryOperator::And:
3863 case BinaryOperator::Xor: {
3864 auto *Op0 = BinOp->getOperand(0);
3865 auto *Op1 = BinOp->getOperand(1);
3866 PrevVecV[0] = Op0;
3867 PrevVecV[1] = Op1;
3868 break;
3869 }
3870 default:
3871 return false;
3872 }
3873 ShouldBeCallOrBinInst ^= 1;
3874
3875 OrigCost +=
3876 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3877
3878 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3879 std::swap(PrevVecV[0], PrevVecV[1]);
3880 InstWorklist.push(PrevVecV[1]);
3881 InstWorklist.push(PrevVecV[0]);
3882 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3883 // We shouldn't have any null values in the previous vectors,
3884 // if so, there was a mismatch in the pattern.
3885 if (ShouldBeCallOrBinInst ||
3886 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3887 return false;
3888
3889 if (SVInst != PrevVecV[1])
3890 return false;
3891
3892 ArrayRef<int> CurMask;
3893 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3894 m_Mask(CurMask))))
3895 return false;
3896
3897 // Subtract the parity mask when checking the condition.
3898 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3899 if (Mask < ShuffleMaskHalf &&
3900 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3901 return false;
3902 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3903 return false;
3904 }
3905
3906 // Update mask values.
3907 ShuffleMaskHalf *= 2;
3908 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3909 ExpectedParityMask >>= 1;
3910
3911 OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3912 SVInst->getType(), SVInst->getType(),
3913 CurMask, CostKind);
3914
3915 VisitedCnt += 1;
3916 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3917 break;
3918
3919 ShouldBeCallOrBinInst ^= 1;
3920 } else {
3921 return false;
3922 }
3923 }
3924
3925 // Pattern should end with a shuffle op.
3926 if (ShouldBeCallOrBinInst)
3927 return false;
3928
3929 assert(VecSize != -1 && "Expected Match for Vector Size");
3930
3931 Value *FinalVecV = PrevVecV[0];
3932 if (!FinalVecV)
3933 return false;
3934
3935 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3936
3937 Intrinsic::ID ReducedOp =
3938 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3939 : getReductionForBinop(*CommonBinOp));
3940 if (!ReducedOp)
3941 return false;
3942
3943 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3944 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3945
3946 if (NewCost >= OrigCost)
3947 return false;
3948
3949 auto *ReducedResult =
3950 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3951 replaceValue(I, *ReducedResult);
3952
3953 return true;
3954}
3955
3956 /// Determine if it's more efficient to fold:
3957/// reduce(trunc(x)) -> trunc(reduce(x)).
3958/// reduce(sext(x)) -> sext(reduce(x)).
3959/// reduce(zext(x)) -> zext(reduce(x)).
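/// Illustrative example (hypothetical values; only done when the narrower
/// reduction is cheaper per the cost model):
///   %t = trunc <8 x i32> %x to <8 x i16>
///   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
///  -->
///   %w = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %w to i16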
3960bool VectorCombine::foldCastFromReductions(Instruction &I) {
3961 auto *II = dyn_cast<IntrinsicInst>(&I);
3962 if (!II)
3963 return false;
3964
3965 bool TruncOnly = false;
3966 Intrinsic::ID IID = II->getIntrinsicID();
3967 switch (IID) {
3968 case Intrinsic::vector_reduce_add:
3969 case Intrinsic::vector_reduce_mul:
3970 TruncOnly = true;
3971 break;
3972 case Intrinsic::vector_reduce_and:
3973 case Intrinsic::vector_reduce_or:
3974 case Intrinsic::vector_reduce_xor:
3975 break;
3976 default:
3977 return false;
3978 }
3979
3980 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3981 Value *ReductionSrc = I.getOperand(0);
3982
3983 Value *Src;
3984 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3985 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3986 return false;
3987
3988 auto CastOpc =
3989 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3990
3991 auto *SrcTy = cast<VectorType>(Src->getType());
3992 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3993 Type *ResultTy = I.getType();
3994
3995 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3996 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3997 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3998 TTI::CastContextHint::None, CostKind,
3999 cast<CastInst>(ReductionSrc));
4000 InstructionCost NewCost =
4001 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4002 CostKind) +
4003 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4004 TTI::CastContextHint::None, CostKind);
4005
4006 if (OldCost <= NewCost || !NewCost.isValid())
4007 return false;
4008
4009 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4010 II->getIntrinsicID(), {Src});
4011 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4012 replaceValue(I, *NewCast);
4013 return true;
4014}
4015
4016/// Returns true if this ShuffleVectorInst eventually feeds into a
4017/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4018/// chains of shuffles and binary operators (in any combination/order).
4019 /// The search gives up after visiting MaxVisited instructions.
4020 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
4021 constexpr unsigned MaxVisited = 32;
4022 SmallPtrSet<Instruction *, 8> Visited;
4023 SmallVector<Instruction *> WorkList;
4024 bool FoundReduction = false;
4025
4026 WorkList.push_back(SVI);
4027 while (!WorkList.empty()) {
4028 Instruction *I = WorkList.pop_back_val();
4029 for (User *U : I->users()) {
4030 auto *UI = cast<Instruction>(U);
4031 if (!UI || !Visited.insert(UI).second)
4032 continue;
4033 if (Visited.size() > MaxVisited)
4034 return false;
4035 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4036 // More than one reduction reached
4037 if (FoundReduction)
4038 return false;
4039 switch (II->getIntrinsicID()) {
4040 case Intrinsic::vector_reduce_add:
4041 case Intrinsic::vector_reduce_mul:
4042 case Intrinsic::vector_reduce_and:
4043 case Intrinsic::vector_reduce_or:
4044 case Intrinsic::vector_reduce_xor:
4045 case Intrinsic::vector_reduce_smin:
4046 case Intrinsic::vector_reduce_smax:
4047 case Intrinsic::vector_reduce_umin:
4048 case Intrinsic::vector_reduce_umax:
4049 FoundReduction = true;
4050 continue;
4051 default:
4052 return false;
4053 }
4054 }
4055
4056 if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
4057 return false;
4058
4059 WorkList.emplace_back(UI);
4060 }
4061 }
4062 return FoundReduction;
4063}
4064
4065/// This method looks for groups of shuffles acting on binops, of the form:
4066/// %x = shuffle ...
4067/// %y = shuffle ...
4068/// %a = binop %x, %y
4069/// %b = binop %x, %y
4070/// shuffle %a, %b, selectmask
4071/// We may, especially if the shuffle is wider than legal, be able to convert
4072/// the shuffle to a form where only parts of a and b need to be computed. On
4073/// architectures with no obvious "select" shuffle, this can reduce the total
4074/// number of operations if the target reports them as cheaper.
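/// Here "selectmask" denotes a mask that takes each result lane from the same
/// lane of either %a or %b, e.g. (illustrative) <4 x i32> <i32 0, i32 5, i32 2, i32 7>
/// for 4-element inputs.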
4075bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
4076 auto *SVI = cast<ShuffleVectorInst>(&I);
4077 auto *VT = cast<FixedVectorType>(I.getType());
4078 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4079 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
4080 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4081 VT != Op0->getType())
4082 return false;
4083
4084 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4085 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4086 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4087 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4088 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
4089 auto checkSVNonOpUses = [&](Instruction *I) {
4090 if (!I || I->getOperand(0)->getType() != VT)
4091 return true;
4092 return any_of(I->users(), [&](User *U) {
4093 return U != Op0 && U != Op1 &&
4094 !(isa<ShuffleVectorInst>(U) &&
4095 (InputShuffles.contains(cast<Instruction>(U)) ||
4096 isInstructionTriviallyDead(cast<Instruction>(U))));
4097 });
4098 };
4099 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4100 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4101 return false;
4102
4103 // Collect all the uses that are shuffles that we can transform together. We
4104 // may not have a single shuffle, but a group that can all be transformed
4105 // together profitably.
4107 auto collectShuffles = [&](Instruction *I) {
4108 for (auto *U : I->users()) {
4109 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4110 if (!SV || SV->getType() != VT)
4111 return false;
4112 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4113 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4114 return false;
4115 if (!llvm::is_contained(Shuffles, SV))
4116 Shuffles.push_back(SV);
4117 }
4118 return true;
4119 };
4120 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4121 return false;
4122 // From a reduction, we need to be processing a single shuffle, otherwise the
4123 // other uses will not be lane-invariant.
4124 if (FromReduction && Shuffles.size() > 1)
4125 return false;
4126
4127 // Add any shuffle uses for the shuffles we have found, to include them in our
4128 // cost calculations.
4129 if (!FromReduction) {
4130 for (ShuffleVectorInst *SV : Shuffles) {
4131 for (auto *U : SV->users()) {
4132 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4133 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4134 Shuffles.push_back(SSV);
4135 }
4136 }
4137 }
4138
4139 // For each of the output shuffles, we try to sort all the first vector
4140 // elements to the beginning, followed by the second vector's elements at the
4141 // end. If the binops are legalized to smaller vectors, this may reduce total
4142 // number of binops. We compute the ReconstructMask mask needed to convert
4143 // back to the original lane order.
4144 SmallVector<std::pair<int, int>> V1, V2;
4145 SmallVector<SmallVector<int>> OrigReconstructMasks;
4146 int MaxV1Elt = 0, MaxV2Elt = 0;
4147 unsigned NumElts = VT->getNumElements();
4148 for (ShuffleVectorInst *SVN : Shuffles) {
4149 SmallVector<int> Mask;
4150 SVN->getShuffleMask(Mask);
4151
4152 // Check the operands are the same as the original, or reversed (in which
4153 // case we need to commute the mask).
4154 Value *SVOp0 = SVN->getOperand(0);
4155 Value *SVOp1 = SVN->getOperand(1);
4156 if (isa<UndefValue>(SVOp1)) {
4157 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
4158 SVOp0 = SSV->getOperand(0);
4159 SVOp1 = SSV->getOperand(1);
4160 for (int &Elem : Mask) {
4161 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
4162 return false;
4163 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
4164 }
4165 }
4166 if (SVOp0 == Op1 && SVOp1 == Op0) {
4167 std::swap(SVOp0, SVOp1);
4168 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
4169 }
4170 if (SVOp0 != Op0 || SVOp1 != Op1)
4171 return false;
4172
4173 // Calculate the reconstruction mask for this shuffle, as the mask needed to
4174 // take the packed values from Op0/Op1 and reconstruct the original
4175 // order.
4176 SmallVector<int> ReconstructMask;
4177 for (unsigned I = 0; I < Mask.size(); I++) {
4178 if (Mask[I] < 0) {
4179 ReconstructMask.push_back(-1);
4180 } else if (Mask[I] < static_cast<int>(NumElts)) {
4181 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
4182 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
4183 return Mask[I] == A.first;
4184 });
4185 if (It != V1.end())
4186 ReconstructMask.push_back(It - V1.begin());
4187 else {
4188 ReconstructMask.push_back(V1.size());
4189 V1.emplace_back(Mask[I], V1.size());
4190 }
4191 } else {
4192 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
4193 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
4194 return Mask[I] - static_cast<int>(NumElts) == A.first;
4195 });
4196 if (It != V2.end())
4197 ReconstructMask.push_back(NumElts + It - V2.begin());
4198 else {
4199 ReconstructMask.push_back(NumElts + V2.size());
4200 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
4201 }
4202 }
4203 }
4204
4205 // For reductions, we know that the lane ordering out doesn't alter the
4206 // result. In-order can help simplify the shuffle away.
4207 if (FromReduction)
4208 sort(ReconstructMask);
4209 OrigReconstructMasks.push_back(std::move(ReconstructMask));
4210 }
4211
4212 // If the maximum elements used from V1 and V2 are not larger than the new
4213 // vectors, the vectors are already packed and performing the optimization
4214 // again will likely not help any further. This also prevents us from getting
4215 // stuck in a cycle in case the costs do not also rule it out.
4216 if (V1.empty() || V2.empty() ||
4217 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
4218 MaxV2Elt == static_cast<int>(V2.size()) - 1))
4219 return false;
4220
4221 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
4222 // shuffle of another shuffle, or not a shuffle (that is treated like an
4223 // identity shuffle).
4224 auto GetBaseMaskValue = [&](Instruction *I, int M) {
4225 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4226 if (!SV)
4227 return M;
4228 if (isa<UndefValue>(SV->getOperand(1)))
4229 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4230 if (InputShuffles.contains(SSV))
4231 return SSV->getMaskValue(SV->getMaskValue(M));
4232 return SV->getMaskValue(M);
4233 };
4234
4235 // Attempt to sort the inputs by ascending mask values to make simpler input
4236 // shuffles and push complex shuffles down to the uses. We sort on the first
4237 // of the two input shuffle orders, to try and get at least one input into a
4238 // nice order.
4239 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
4240 std::pair<int, int> Y) {
4241 int MXA = GetBaseMaskValue(A, X.first);
4242 int MYA = GetBaseMaskValue(A, Y.first);
4243 return MXA < MYA;
4244 };
4245 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
4246 return SortBase(SVI0A, A, B);
4247 });
4248 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
4249 return SortBase(SVI1A, A, B);
4250 });
4251 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
4252 // modified order of the input shuffles.
4253 SmallVector<SmallVector<int>> ReconstructMasks;
4254 for (const auto &Mask : OrigReconstructMasks) {
4255 SmallVector<int> ReconstructMask;
4256 for (int M : Mask) {
4257 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
4258 auto It = find_if(V, [M](auto A) { return A.second == M; });
4259 assert(It != V.end() && "Expected all entries in Mask");
4260 return std::distance(V.begin(), It);
4261 };
4262 if (M < 0)
4263 ReconstructMask.push_back(-1);
4264 else if (M < static_cast<int>(NumElts)) {
4265 ReconstructMask.push_back(FindIndex(V1, M));
4266 } else {
4267 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
4268 }
4269 }
4270 ReconstructMasks.push_back(std::move(ReconstructMask));
4271 }
4272
4273 // Calculate the masks needed for the new input shuffles, which get padded
4274 // with undef
4275 SmallVector<int> V1A, V1B, V2A, V2B;
4276 for (unsigned I = 0; I < V1.size(); I++) {
4277 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
4278 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
4279 }
4280 for (unsigned I = 0; I < V2.size(); I++) {
4281 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
4282 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
4283 }
4284 while (V1A.size() < NumElts) {
4285 V1A.push_back(PoisonMaskElem);
4286 V1B.push_back(PoisonMaskElem);
4287 }
4288 while (V2A.size() < NumElts) {
4289 V2A.push_back(PoisonMaskElem);
4290 V2B.push_back(PoisonMaskElem);
4291 }
4292
4293 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
4294 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4295 if (!SV)
4296 return C;
4297 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
4298 ? TTI::SK_PermuteSingleSrc
4299 : TTI::SK_PermuteTwoSrc,
4300 VT, VT, SV->getShuffleMask(), CostKind);
4301 };
4302 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4303 return C +
4304 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4305 };
4306
4307 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
4308 unsigned MaxVectorSize =
4309 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector).getFixedValue();
4310 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
4311 if (MaxElementsInVector == 0)
4312 return false;
4313 // When there are multiple shufflevector operations on the same input,
4314 // especially when the vector length is larger than the register size,
4315 // identical shuffle patterns may occur across different groups of elements.
4316 // To avoid overestimating the cost by counting these repeated shuffles more
4317 // than once, we only account for unique shuffle patterns. This adjustment
4318 // prevents inflated costs in the cost model for wide vectors split into
4319 // several register-sized groups.
4320 std::set<SmallVector<int, 4>> UniqueShuffles;
4321 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4322 // Compute the cost for performing the shuffle over the full vector.
4323 auto ShuffleCost =
4324 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4325 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
4326 if (NumFullVectors < 2)
4327 return C + ShuffleCost;
4328 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
4329 unsigned NumUniqueGroups = 0;
4330 unsigned NumGroups = Mask.size() / MaxElementsInVector;
4331 // For each group of MaxElementsInVector contiguous elements,
4332 // collect their shuffle pattern and insert into the set of unique patterns.
4333 for (unsigned I = 0; I < NumFullVectors; ++I) {
4334 for (unsigned J = 0; J < MaxElementsInVector; ++J)
4335 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
4336 if (UniqueShuffles.insert(SubShuffle).second)
4337 NumUniqueGroups += 1;
4338 }
4339 return C + ShuffleCost * NumUniqueGroups / NumGroups;
4340 };
4341 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
4342 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4343 if (!SV)
4344 return C;
4345 SmallVector<int, 16> Mask;
4346 SV->getShuffleMask(Mask);
4347 return AddShuffleMaskAdjustedCost(C, Mask);
4348 };
4349 // Check that input consists of ShuffleVectors applied to the same input
4350 auto AllShufflesHaveSameOperands =
4351 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
4352 if (InputShuffles.size() < 2)
4353 return false;
4354 ShuffleVectorInst *FirstSV =
4355 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
4356 if (!FirstSV)
4357 return false;
4358
4359 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
4360 return std::all_of(
4361 std::next(InputShuffles.begin()), InputShuffles.end(),
4362 [&](Instruction *I) {
4363 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
4364 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
4365 });
4366 };
4367
4368 // Get the costs of the shuffles + binops before and after with the new
4369 // shuffle masks.
4370 InstructionCost CostBefore =
4371 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
4372 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
4373 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
4374 InstructionCost(0), AddShuffleCost);
4375 if (AllShufflesHaveSameOperands(InputShuffles)) {
4376 UniqueShuffles.clear();
4377 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4378 InstructionCost(0), AddShuffleAdjustedCost);
4379 } else {
4380 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4381 InstructionCost(0), AddShuffleCost);
4382 }
4383
4384 // The new binops will be unused for lanes past the used shuffle lengths.
4385 // These types attempt to get the correct cost for that from the target.
4386 FixedVectorType *Op0SmallVT =
4387 FixedVectorType::get(VT->getScalarType(), V1.size());
4388 FixedVectorType *Op1SmallVT =
4389 FixedVectorType::get(VT->getScalarType(), V2.size());
4390 InstructionCost CostAfter =
4391 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4392 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4393 UniqueShuffles.clear();
4394 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4395 InstructionCost(0), AddShuffleMaskAdjustedCost);
4396 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4397 CostAfter +=
4398 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4399 InstructionCost(0), AddShuffleMaskCost);
4400
4401 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4402 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4403 << " vs CostAfter: " << CostAfter << "\n");
4404 if (CostBefore < CostAfter ||
4405 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4406 return false;
4407
4408 // The cost model has passed, create the new instructions.
4409 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4410 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4411 if (!SV)
4412 return I;
4413 if (isa<UndefValue>(SV->getOperand(1)))
4414 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4415 if (InputShuffles.contains(SSV))
4416 return SSV->getOperand(Op);
4417 return SV->getOperand(Op);
4418 };
4419 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4420 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4421 GetShuffleOperand(SVI0A, 1), V1A);
4422 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4423 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4424 GetShuffleOperand(SVI0B, 1), V1B);
4425 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4426 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4427 GetShuffleOperand(SVI1A, 1), V2A);
4428 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4429 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4430 GetShuffleOperand(SVI1B, 1), V2B);
4431 Builder.SetInsertPoint(Op0);
4432 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4433 NSV0A, NSV0B);
4434 if (auto *I = dyn_cast<Instruction>(NOp0))
4435 I->copyIRFlags(Op0, true);
4436 Builder.SetInsertPoint(Op1);
4437 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4438 NSV1A, NSV1B);
4439 if (auto *I = dyn_cast<Instruction>(NOp1))
4440 I->copyIRFlags(Op1, true);
4441
4442 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4443 Builder.SetInsertPoint(Shuffles[S]);
4444 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4445 replaceValue(*Shuffles[S], *NSV, false);
4446 }
4447
4448 Worklist.pushValue(NSV0A);
4449 Worklist.pushValue(NSV0B);
4450 Worklist.pushValue(NSV1A);
4451 Worklist.pushValue(NSV1B);
4452 return true;
4453}
4454
4455/// Check if instruction depends on ZExt and this ZExt can be moved after the
4456/// instruction. Move ZExt if it is profitable. For example:
4457/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4458 /// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
4459 /// Cost model calculations take into account whether zext(x) has other users and
4460/// whether it can be propagated through them too.
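/// Illustrative example (hypothetical values; subject to the cost model):
///   %zx = zext <8 x i8> %x to <8 x i32>
///   %r  = and <8 x i32> %zx, %y
///  -->
///   %ty = trunc <8 x i32> %y to <8 x i8>
///   %na = and <8 x i8> %x, %ty
///   %r  = zext <8 x i8> %na to <8 x i32>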
4461bool VectorCombine::shrinkType(Instruction &I) {
4462 Value *ZExted, *OtherOperand;
4463 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4464 m_Value(OtherOperand))) &&
4465 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4466 return false;
4467
4468 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4469
4470 auto *BigTy = cast<FixedVectorType>(I.getType());
4471 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4472 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4473
4474 if (I.getOpcode() == Instruction::LShr) {
4475 // Check that the shift amount is less than the number of bits in the
4476 // smaller type. Otherwise, the smaller lshr will return a poison value.
4477 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4478 if (ShAmtKB.getMaxValue().uge(BW))
4479 return false;
4480 } else {
4481 // Check that the expression overall uses at most the same number of bits as
4482 // ZExted
4483 KnownBits KB = computeKnownBits(&I, *DL);
4484 if (KB.countMaxActiveBits() > BW)
4485 return false;
4486 }
4487
4488 // Calculate costs of leaving current IR as it is and moving ZExt operation
4489 // later, along with adding truncates if needed
4490 InstructionCost ZExtCost = TTI.getCastInstrCost(
4491 Instruction::ZExt, BigTy, SmallTy,
4492 TargetTransformInfo::CastContextHint::None, CostKind);
4493 InstructionCost CurrentCost = ZExtCost;
4494 InstructionCost ShrinkCost = 0;
4495
4496 // Calculate total cost and check that we can propagate through all ZExt users
4497 for (User *U : ZExtOperand->users()) {
4498 auto *UI = cast<Instruction>(U);
4499 if (UI == &I) {
4500 CurrentCost +=
4501 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4502 ShrinkCost +=
4503 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4504 ShrinkCost += ZExtCost;
4505 continue;
4506 }
4507
4508 if (!Instruction::isBinaryOp(UI->getOpcode()))
4509 return false;
4510
4511 // Check if we can propagate ZExt through its other users
4512 KnownBits KB = computeKnownBits(UI, *DL);
4513 if (KB.countMaxActiveBits() > BW)
4514 return false;
4515
4516 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4517 ShrinkCost +=
4518 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4519 ShrinkCost += ZExtCost;
4520 }
4521
4522 // If the other instruction operand is not a constant, we'll need to
4523 // generate a truncate instruction, so we have to adjust the cost.
4524 if (!isa<Constant>(OtherOperand))
4525 ShrinkCost += TTI.getCastInstrCost(
4526 Instruction::Trunc, SmallTy, BigTy,
4527 TargetTransformInfo::CastContextHint::None, CostKind);
4528
4529 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4530 // towards modifying the IR because shrinking opens opportunities for other
4531 // shrinking optimisations.
4532 if (ShrinkCost > CurrentCost)
4533 return false;
4534
4535 Builder.SetInsertPoint(&I);
4536 Value *Op0 = ZExted;
4537 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4538 // Keep the order of operands the same
4539 if (I.getOperand(0) == OtherOperand)
4540 std::swap(Op0, Op1);
4541 Value *NewBinOp =
4542 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4543 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4544 cast<Instruction>(NewBinOp)->copyMetadata(I);
4545 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4546 replaceValue(I, *NewZExtr);
4547 return true;
4548}
4549
4550/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4551/// shuffle (DstVec, SrcVec, Mask)
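/// Illustrative example (hypothetical values; subject to the cost model):
///   %e = extractelement <4 x float> %src, i64 2
///   %r = insertelement <4 x float> %dst, float %e, i64 0
///  -->
///   %r = shufflevector <4 x float> %dst, <4 x float> %src, <4 x i32> <i32 6, i32 1, i32 2, i32 3>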
4552bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4553 Value *DstVec, *SrcVec;
4554 uint64_t ExtIdx, InsIdx;
4555 if (!match(&I,
4556 m_InsertElt(m_Value(DstVec),
4557 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4558 m_ConstantInt(InsIdx))))
4559 return false;
4560
4561 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4562 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4563 // We can try combining vectors with different lengths, but the element types must match.
4564 if (!DstVecTy || !SrcVecTy ||
4565 SrcVecTy->getElementType() != DstVecTy->getElementType())
4566 return false;
4567
4568 unsigned NumDstElts = DstVecTy->getNumElements();
4569 unsigned NumSrcElts = SrcVecTy->getNumElements();
4570 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4571 return false;
4572
4573 // Insertion into poison is a cheaper single operand shuffle.
4574 TargetTransformInfo::ShuffleKind SK = TargetTransformInfo::SK_PermuteTwoSrc;
4575 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4576
4577 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4578 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4579 if (NeedDstSrcSwap) {
4580 SK = TargetTransformInfo::SK_PermuteSingleSrc;
4581 Mask[InsIdx] = ExtIdx % NumDstElts;
4582 std::swap(DstVec, SrcVec);
4583 } else {
4585 std::iota(Mask.begin(), Mask.end(), 0);
4586 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
4587 }
4588
4589 // Cost
4590 auto *Ins = cast<InsertElementInst>(&I);
4591 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4592 InstructionCost InsCost =
4593 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4594 InstructionCost ExtCost =
4595 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4596 InstructionCost OldCost = ExtCost + InsCost;
4597
4598 InstructionCost NewCost = 0;
4599 SmallVector<int> ExtToVecMask;
4600 if (!NeedExpOrNarrow) {
4601 // Ignore 'free' identity insertion shuffle.
4602 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4603 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4604 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4605 nullptr, {DstVec, SrcVec});
4606 } else {
4607 // When creating a length-changing vector, always try to keep the relevant
4608 // element in an equivalent position, so that bulk shuffles are more likely
4609 // to be useful.
4610 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4611 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
4612 // Add cost for expanding or narrowing
4613 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4614 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4615 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4616 }
4617
4618 if (!Ext->hasOneUse())
4619 NewCost += ExtCost;
4620
4621 LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
4622 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4623 << "\n");
4624
4625 if (OldCost < NewCost)
4626 return false;
4627
4628 if (NeedExpOrNarrow) {
4629 if (!NeedDstSrcSwap)
4630 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4631 else
4632 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4633 }
4634
4635 // Canonicalize undef param to RHS to help further folds.
4636 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4637 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4638 std::swap(DstVec, SrcVec);
4639 }
4640
4641 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4642 replaceValue(I, *Shuf);
4643
4644 return true;
4645}
4646
4647/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4648/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4649/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4650/// before casting it back into `<vscale x 16 x i32>`.
4651bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4652 const APInt *SplatVal0, *SplatVal1;
4653 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4654 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4655 return false;
4656
4657 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4658 << "\n");
4659
4660 auto *VTy =
4661 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4662 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4663 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4664
4665 // Just in case the cost of interleave2 intrinsic and bitcast are both
4666 // invalid, in which case we want to bail out, we use <= rather
4667 // than < here. Even if they both have valid and equal costs, it's probably
4668 // not a good idea to emit a high-cost constant splat.
4669 if (TTI.getInstructionCost(&I, CostKind) <=
4670 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4671 TTI::CastContextHint::None, CostKind)) {
4672 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4673 << *I.getType() << " is too high.\n");
4674 return false;
4675 }
4676
4677 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4678 NewSplatVal <<= Width;
4679 NewSplatVal |= SplatVal0->zext(Width * 2);
4680 auto *NewSplat = ConstantVector::getSplat(
4681 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4682
4683 IRBuilder<> Builder(&I);
4684 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4685 return true;
4686}
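
The constant arithmetic performed above is easy to check in isolation: a minimal standalone sketch in plain C++, with uint64_t standing in for the APInt math (combineSplats is an illustrative name):

#include <cstdint>
#include <cstdio>

// Merge two 32-bit splat lane values into one 64-bit splat value:
// zero-extend both to twice the width, shift the second into the high
// half, and OR the halves together.
static uint64_t combineSplats(uint32_t SplatVal0, uint32_t SplatVal1) {
  uint64_t New = SplatVal1; // zext to 2 * Width
  New <<= 32;               // move into the high half
  New |= SplatVal0;         // low half
  return New;
}

int main() {
  // interleave2(splat(666), splat(777)) becomes a bitcast of splat(N)
  // where N == (777 << 32) | 666.
  std::printf("%#llx\n",
              static_cast<unsigned long long>(combineSplats(666, 777)));
  return 0;
}

For the 666/777 example from the comment, this prints 0x3090000029a.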
4687
4688// Attempt to shrink loads that are only used by shufflevector instructions.
4689bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4690 auto *OldLoad = dyn_cast<LoadInst>(&I);
4691 if (!OldLoad || !OldLoad->isSimple())
4692 return false;
4693
4694 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4695 if (!OldLoadTy)
4696 return false;
4697
4698 unsigned const OldNumElements = OldLoadTy->getNumElements();
4699
4700 // Search all uses of the load. If every use is a shufflevector instruction
4701 // whose second operand is a poison value, find the minimum and maximum
4702 // indices of the vector elements referenced by all shuffle masks.
4703 // Otherwise return `std::nullopt`.
4704 using IndexRange = std::pair<int, int>;
4705 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4706 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4707 for (llvm::Use &Use : I.uses()) {
4708 // Ensure all uses match the required pattern.
4709 User *Shuffle = Use.getUser();
4710 ArrayRef<int> Mask;
4711
4712 if (!match(Shuffle,
4713 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4714 return std::nullopt;
4715
4716 // Ignore shufflevector instructions that have no uses.
4717 if (Shuffle->use_empty())
4718 continue;
4719
4720 // Find the min and max indices used by the shufflevector instruction.
4721 for (int Index : Mask) {
4722 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4723 OutputRange.first = std::min(Index, OutputRange.first);
4724 OutputRange.second = std::max(Index, OutputRange.second);
4725 }
4726 }
4727 }
4728
4729 if (OutputRange.second < OutputRange.first)
4730 return std::nullopt;
4731
4732 return OutputRange;
4733 };
4734
4735 // Get the range of vector elements used by shufflevector instructions.
4736 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4737 unsigned const NewNumElements = Indices->second + 1u;
4738
4739 // If the range of vector elements is smaller than the full load, attempt
4740 // to create a smaller load.
4741 if (NewNumElements < OldNumElements) {
4742 IRBuilder Builder(&I);
4743 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4744
4745 // Calculate costs of old and new ops.
4746 Type *ElemTy = OldLoadTy->getElementType();
4747 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4748 Value *PtrOp = OldLoad->getPointerOperand();
4749
4750 InstructionCost OldCost = TTI.getMemoryOpCost(
4751 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4752 OldLoad->getPointerAddressSpace(), CostKind);
4753 InstructionCost NewCost =
4754 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4755 OldLoad->getPointerAddressSpace(), CostKind);
4756
4757 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4758 SmallVector<UseEntry, 4u> NewUses;
4759 unsigned const MaxIndex = NewNumElements * 2u;
4760
4761 for (llvm::Use &Use : I.uses()) {
4762 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4763 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4764
4765 // Create entry for new use.
4766 NewUses.push_back({Shuffle, OldMask});
4767
4768 // Validate mask indices.
4769 for (int Index : OldMask) {
4770 if (Index >= static_cast<int>(MaxIndex))
4771 return false;
4772 }
4773
4774 // Update costs.
4775 OldCost +=
4776 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4777 OldLoadTy, OldMask, CostKind);
4778 NewCost +=
4779 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4780 NewLoadTy, OldMask, CostKind);
4781 }
4782
4783 LLVM_DEBUG(
4784 dbgs() << "Found a load used only by shufflevector instructions: "
4785 << I << "\n OldCost: " << OldCost
4786 << " vs NewCost: " << NewCost << "\n");
4787
4788 if (OldCost < NewCost || !NewCost.isValid())
4789 return false;
4790
4791 // Create new load of smaller vector.
4792 auto *NewLoad = cast<LoadInst>(
4793 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4794 NewLoad->copyMetadata(I);
4795
4796 // Replace all uses.
4797 for (UseEntry &Use : NewUses) {
4798 ShuffleVectorInst *Shuffle = Use.first;
4799 std::vector<int> &NewMask = Use.second;
4800
4801 Builder.SetInsertPoint(Shuffle);
4802 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4803 Value *NewShuffle = Builder.CreateShuffleVector(
4804 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4805
4806 replaceValue(*Shuffle, *NewShuffle, false);
4807 }
4808
4809 return true;
4810 }
4811 }
4812 return false;
4813}
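
The shrinking decision hinges on the highest lane the shuffle users actually read. A minimal standalone sketch of that index scan in plain C++ (shrunkenNumElements is an illustrative name; it mirrors only the range computation, not the use-pattern checks or the cost comparison):

#include <algorithm>
#include <cstdio>
#include <vector>

// Given the masks of all shufflevector users (poison lanes as -1), the
// narrowed element count is the highest in-bounds lane referenced plus one.
static unsigned shrunkenNumElements(unsigned OldNumElements,
                                    const std::vector<std::vector<int>> &Masks) {
  int MaxIndex = -1;
  for (const std::vector<int> &Mask : Masks)
    for (int Index : Mask)
      if (Index >= 0 && Index < static_cast<int>(OldNumElements))
        MaxIndex = std::max(Index, MaxIndex);
  // No usable lane found: keep the original width.
  return MaxIndex < 0 ? OldNumElements : static_cast<unsigned>(MaxIndex) + 1u;
}

int main() {
  // A <8 x float> load whose only users shuffle lanes {0,1} and {2,poison}
  // can be replaced by a <3 x float> load.
  std::printf("%u\n", shrunkenNumElements(8, {{0, 1}, {2, -1}}));
  return 0;
}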
4814
4815// Attempt to narrow a phi of shufflevector instructions where the two incoming
4816// values have the same operands but different masks. If the two shuffle masks
4817// are offsets of one another we can use one branch to rotate the incoming
4818// vector and perform one larger shuffle after the phi.
4819bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4820 auto *Phi = dyn_cast<PHINode>(&I);
4821 if (!Phi || Phi->getNumIncomingValues() != 2u)
4822 return false;
4823
4824 Value *Op = nullptr;
4825 ArrayRef<int> Mask0;
4826 ArrayRef<int> Mask1;
4827
4828 if (!match(Phi->getOperand(0u),
4829 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4830 !match(Phi->getOperand(1u),
4831 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4832 return false;
4833
4834 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4835
4836 // Ensure result vectors are wider than the argument vector.
4837 auto *InputVT = cast<FixedVectorType>(Op->getType());
4838 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4839 auto const InputNumElements = InputVT->getNumElements();
4840
4841 if (InputNumElements >= ResultVT->getNumElements())
4842 return false;
4843
4844 // Take the difference of the two shuffle masks at each index. Ignore poison
4845 // values at the same index in both masks.
4846 SmallVector<int, 16> NewMask;
4847 NewMask.reserve(Mask0.size());
4848
4849 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4850 if (M0 >= 0 && M1 >= 0)
4851 NewMask.push_back(M0 - M1);
4852 else if (M0 == -1 && M1 == -1)
4853 continue;
4854 else
4855 return false;
4856 }
4857
4858 // Ensure all elements of the new mask are equal. If the difference between
4859 // the incoming mask elements is the same at every index, the two masks must
4860 // be constant offsets of one another.
4861 if (NewMask.empty() || !all_equal(NewMask))
4862 return false;
4863
4864 // Create new mask using difference of the two incoming masks.
4865 int MaskOffset = NewMask[0u];
4866 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
4867 NewMask.clear();
4868
4869 for (unsigned I = 0u; I < InputNumElements; ++I) {
4870 NewMask.push_back(Index);
4871 Index = (Index + 1u) % InputNumElements;
4872 }
4873
4874 // Calculate costs for worst cases and compare.
4875 auto const Kind = TTI::SK_PermuteSingleSrc;
4876 auto OldCost =
4877 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4878 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4879 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4880 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4881
4882 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4883 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4884 << "\n");
4885
4886 if (NewCost > OldCost)
4887 return false;
4888
4889 // Create new shuffles and narrowed phi.
4890 auto Builder = IRBuilder(Shuf);
4891 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4892 auto *PoisonVal = PoisonValue::get(InputVT);
4893 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4894 Worklist.push(cast<Instruction>(NewShuf0));
4895
4896 Builder.SetInsertPoint(Phi);
4897 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4898 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4899 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4900 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4901
4902 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4903 PoisonVal = PoisonValue::get(NewPhi->getType());
4904 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4905
4906 replaceValue(*Phi, *NewShuf1);
4907 return true;
4908}
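
The key step above is turning the constant offset between the two incoming masks into a rotation of the narrow input vector. A minimal standalone sketch of the mask construction in plain C++ (makeRotationMask is an illustrative name; the cost model is not modelled):

#include <cstdio>
#include <vector>

// Build the single-source mask that pre-rotates the narrow input so one
// wide shuffle (Mask1) suffices after the narrowed phi.
static std::vector<int> makeRotationMask(unsigned InputNumElements,
                                         int MaskOffset) {
  // Same arithmetic as above; unsigned wraparound handles negative offsets
  // as long as |MaskOffset| < InputNumElements.
  unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
  std::vector<int> NewMask;
  for (unsigned I = 0; I != InputNumElements; ++I) {
    NewMask.push_back(static_cast<int>(Index));
    Index = (Index + 1u) % InputNumElements;
  }
  return NewMask;
}

int main() {
  // With 4 input elements and Mask0 - Mask1 == 1 at every lane, the
  // pre-rotation mask is <1, 2, 3, 0>.
  for (int M : makeRotationMask(4, 1))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}

For 4 input elements and an offset of 1 this yields <1, 2, 3, 0>, i.e. a rotation of the incoming vector by one lane.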
4909
4910/// This is the entry point for all transforms. Pass manager differences are
4911/// handled in the callers of this function.
4912bool VectorCombine::run() {
4913 if (DisableVectorCombine)
4914 return false;
4915
4916 // Don't attempt vectorization if the target does not support vectors.
4917 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4918 return false;
4919
4920 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4921
4922 auto FoldInst = [this](Instruction &I) {
4923 Builder.SetInsertPoint(&I);
4924 bool IsVectorType = isa<VectorType>(I.getType());
4925 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4926 auto Opcode = I.getOpcode();
4927
4928 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4929
4930 // These folds should be beneficial regardless of when this pass is run
4931 // in the optimization pipeline.
4932 // The type checking is for run-time efficiency. We can avoid wasting time
4933 // dispatching to folding functions if there's no chance of matching.
4934 if (IsFixedVectorType) {
4935 switch (Opcode) {
4936 case Instruction::InsertElement:
4937 if (vectorizeLoadInsert(I))
4938 return true;
4939 break;
4940 case Instruction::ShuffleVector:
4941 if (widenSubvectorLoad(I))
4942 return true;
4943 break;
4944 default:
4945 break;
4946 }
4947 }
4948
4949 // These transforms work with scalable and fixed vectors.
4950 // TODO: Identify and allow other scalable transforms.
4951 if (IsVectorType) {
4952 if (scalarizeOpOrCmp(I))
4953 return true;
4954 if (scalarizeLoad(I))
4955 return true;
4956 if (scalarizeExtExtract(I))
4957 return true;
4958 if (scalarizeVPIntrinsic(I))
4959 return true;
4960 if (foldInterleaveIntrinsics(I))
4961 return true;
4962 }
4963
4964 if (Opcode == Instruction::Store)
4965 if (foldSingleElementStore(I))
4966 return true;
4967
4968 // If this is an early pipeline invocation of this pass, we are done.
4969 if (TryEarlyFoldsOnly)
4970 return false;
4971
4972 // Otherwise, try folds that improve codegen but may interfere with
4973 // early IR canonicalizations.
4974 // The type checking is for run-time efficiency. We can avoid wasting time
4975 // dispatching to folding functions if there's no chance of matching.
4976 if (IsFixedVectorType) {
4977 switch (Opcode) {
4978 case Instruction::InsertElement:
4979 if (foldInsExtFNeg(I))
4980 return true;
4981 if (foldInsExtBinop(I))
4982 return true;
4983 if (foldInsExtVectorToShuffle(I))
4984 return true;
4985 break;
4986 case Instruction::ShuffleVector:
4987 if (foldPermuteOfBinops(I))
4988 return true;
4989 if (foldShuffleOfBinops(I))
4990 return true;
4991 if (foldShuffleOfSelects(I))
4992 return true;
4993 if (foldShuffleOfCastops(I))
4994 return true;
4995 if (foldShuffleOfShuffles(I))
4996 return true;
4997 if (foldPermuteOfIntrinsic(I))
4998 return true;
4999 if (foldShufflesOfLengthChangingShuffles(I))
5000 return true;
5001 if (foldShuffleOfIntrinsics(I))
5002 return true;
5003 if (foldSelectShuffle(I))
5004 return true;
5005 if (foldShuffleToIdentity(I))
5006 return true;
5007 break;
5008 case Instruction::Load:
5009 if (shrinkLoadForShuffles(I))
5010 return true;
5011 break;
5012 case Instruction::BitCast:
5013 if (foldBitcastShuffle(I))
5014 return true;
5015 break;
5016 case Instruction::And:
5017 case Instruction::Or:
5018 case Instruction::Xor:
5019 if (foldBitOpOfCastops(I))
5020 return true;
5021 if (foldBitOpOfCastConstant(I))
5022 return true;
5023 break;
5024 case Instruction::PHI:
5025 if (shrinkPhiOfShuffles(I))
5026 return true;
5027 break;
5028 default:
5029 if (shrinkType(I))
5030 return true;
5031 break;
5032 }
5033 } else {
5034 switch (Opcode) {
5035 case Instruction::Call:
5036 if (foldShuffleFromReductions(I))
5037 return true;
5038 if (foldCastFromReductions(I))
5039 return true;
5040 break;
5041 case Instruction::ExtractElement:
5042 if (foldShuffleChainsToReduce(I))
5043 return true;
5044 break;
5045 case Instruction::ICmp:
5046 case Instruction::FCmp:
5047 if (foldExtractExtract(I))
5048 return true;
5049 break;
5050 case Instruction::Or:
5051 if (foldConcatOfBoolMasks(I))
5052 return true;
5053 [[fallthrough]];
5054 default:
5055 if (Instruction::isBinaryOp(Opcode)) {
5056 if (foldExtractExtract(I))
5057 return true;
5058 if (foldExtractedCmps(I))
5059 return true;
5060 if (foldBinopOfReductions(I))
5061 return true;
5062 }
5063 break;
5064 }
5065 }
5066 return false;
5067 };
5068
5069 bool MadeChange = false;
5070 for (BasicBlock &BB : F) {
5071 // Ignore unreachable basic blocks.
5072 if (!DT.isReachableFromEntry(&BB))
5073 continue;
5074 // We want to erase instructions inside this loop, but
5075 // make_early_inc_range is not applicable here, as the next iterator may
5076 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5077 // Instead, we manually maintain the next instruction and update it when
5078 // it is about to be deleted.
5079 Instruction *I = &BB.front();
5080 while (I) {
5081 NextInst = I->getNextNode();
5082 if (!I->isDebugOrPseudoInst())
5083 MadeChange |= FoldInst(*I);
5084 I = NextInst;
5085 }
5086 }
5087
5088 NextInst = nullptr;
5089
5090 while (!Worklist.isEmpty()) {
5091 Instruction *I = Worklist.removeOne();
5092 if (!I)
5093 continue;
5094
5095 if (isInstructionTriviallyDead(I)) {
5096 eraseInstruction(*I);
5097 continue;
5098 }
5099
5100 MadeChange |= FoldInst(*I);
5101 }
5102
5103 return MadeChange;
5104}
5105
5106 PreservedAnalyses VectorCombinePass::run(Function &F,
5107 FunctionAnalysisManager &FAM) {
5108 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
5109 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
5110 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5111 AAResults &AA = FAM.getResult<AAManager>(F);
5112 const DataLayout *DL = &F.getDataLayout();
5113 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5114 TryEarlyFoldsOnly);
5115 if (!Combiner.run())
5116 return PreservedAnalyses::all();
5117 PreservedAnalyses PA;
5118 PA.preserveSet<CFGAnalyses>();
5119 return PA;
5120}
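
A minimal sketch of how this pass is typically scheduled with the new pass manager, assuming the public llvm/Transforms/Vectorize/VectorCombine.h interface and the standard PassBuilder analysis wiring; VectorCombinePass(/*TryEarlyFoldsOnly=*/true) would select the restricted early-pipeline variant. The equivalent command line is opt -passes=vector-combine in.ll -S.

#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include <utility>

using namespace llvm;

// Run VectorCombine over every function in M using the new pass manager.
void runVectorCombineOn(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(VectorCombinePass()); // late-pipeline folds enabled

  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}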