//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
#include <optional>
#include <queue>
#include <set>

#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"

46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));

static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
  VectorCombine(Function &F, const TargetTransformInfo &TTI,
                const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
                const DataLayout *DL, TTI::TargetCostKind CostKind,
                bool TryEarlyFoldsOnly)
      : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
        DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
        TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
  Function &F;
  IRBuilder<InstSimplifyFolder> Builder;
  const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoad(Instruction &I);
133 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
134 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
135 bool scalarizeExtExtract(Instruction &I);
136 bool foldConcatOfBoolMasks(Instruction &I);
137 bool foldPermuteOfBinops(Instruction &I);
138 bool foldShuffleOfBinops(Instruction &I);
139 bool foldShuffleOfSelects(Instruction &I);
140 bool foldShuffleOfCastops(Instruction &I);
141 bool foldShuffleOfShuffles(Instruction &I);
142 bool foldPermuteOfIntrinsic(Instruction &I);
143 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
144 bool foldShuffleOfIntrinsics(Instruction &I);
145 bool foldShuffleToIdentity(Instruction &I);
146 bool foldShuffleFromReductions(Instruction &I);
147 bool foldShuffleChainsToReduce(Instruction &I);
148 bool foldCastFromReductions(Instruction &I);
149 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
150 bool foldInterleaveIntrinsics(Instruction &I);
151 bool shrinkType(Instruction &I);
152 bool shrinkLoadForShuffles(Instruction &I);
153 bool shrinkPhiOfShuffles(Instruction &I);
154
155 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
156 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
157 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
158 Old.replaceAllUsesWith(&New);
159 if (auto *NewI = dyn_cast<Instruction>(&New)) {
160 New.takeName(&Old);
161 Worklist.pushUsersToWorkList(*NewI);
162 Worklist.pushValue(NewI);
163 }
164 if (Erase && isInstructionTriviallyDead(&Old)) {
165 eraseInstruction(Old);
166 } else {
167 Worklist.push(&Old);
168 }
169 }
170
171 void eraseInstruction(Instruction &I) {
172 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
173 SmallVector<Value *> Ops(I.operands());
174 Worklist.remove(&I);
175 I.eraseFromParent();
176
177 // Push remaining users of the operands and then the operand itself - allows
178 // further folds that were hindered by OneUse limits.
179 SmallPtrSet<Value *, 4> Visited;
180 for (Value *Op : Ops) {
181 if (!Visited.contains(Op)) {
        if (auto *OpI = dyn_cast<Instruction>(Op)) {
          if (RecursivelyDeleteTriviallyDeadInstructions(
                  OpI, nullptr, nullptr, [&](Value *V) {
185 if (auto *I = dyn_cast<Instruction>(V)) {
186 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
187 Worklist.remove(I);
188 if (I == NextInst)
189 NextInst = NextInst->getNextNode();
190 Visited.insert(I);
191 }
192 }))
193 continue;
194 Worklist.pushUsersToWorkList(*OpI);
195 Worklist.pushValue(OpI);
196 }
197 }
198 }
199 }
200};
201} // namespace
202
/// Return the source operand of a potentially bitcasted value. If there is no
/// bitcast, return the input value itself.
static Value *peekThroughBitcasts(Value *V) {
  while (auto *BitCast = dyn_cast<BitCastInst>(V))
    V = BitCast->getOperand(0);
  return V;
}
210
211static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
212 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
213 // The widened load may load data from dirty regions or create data races
214 // non-existent in the source.
215 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
      mustSuppressSpeculation(*Load))
    return false;
219
220 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
221 // sure we have all of our type-based constraints in place for this target.
222 Type *ScalarTy = Load->getType()->getScalarType();
223 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
224 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
225 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
226 ScalarSize % 8 != 0)
227 return false;
228
229 return true;
230}
231
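/// Try to widen "insertelement undef/poison, (load scalar), 0" into a vector
/// load when the wider load is known to be safe and the cost model agrees.
/// Illustrative IR (assuming a 128-bit minimum vector register width):
///   %s = load float, ptr %p
///   %i = insertelement <4 x float> poison, float %s, i64 0
/// may become:
///   %ld = load <4 x float>, ptr %p
///   %i  = shufflevector <4 x float> %ld, <4 x float> poison,
///                       <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>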
232bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
233 // Match insert into fixed vector of scalar value.
234 // TODO: Handle non-zero insert index.
235 Value *Scalar;
  if (!match(&I,
             m_InsertElt(m_Undef(), m_OneUse(m_Value(Scalar)), m_ZeroInt())))
    return false;
239
240 // Optionally match an extract from another vector.
241 Value *X;
242 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
243 if (!HasExtract)
244 X = Scalar;
245
246 auto *Load = dyn_cast<LoadInst>(X);
247 if (!canWidenLoad(Load, TTI))
248 return false;
249
250 Type *ScalarTy = Scalar->getType();
251 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
252 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
253
254 // Check safety of replacing the scalar load with a larger vector load.
255 // We use minimal alignment (maximum flexibility) because we only care about
256 // the dereferenceable region. When calculating cost and creating a new op,
257 // we may use a larger value based on alignment attributes.
258 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
259 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
260
261 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
262 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
263 unsigned OffsetEltIndex = 0;
264 Align Alignment = Load->getAlign();
265 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
266 &DT)) {
267 // It is not safe to load directly from the pointer, but we can still peek
    // through gep offsets and check if it is safe to load from a base address
    // with updated alignment. If it is, we can shuffle the element(s) into place
270 // after loading.
271 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
    APInt Offset(OffsetBitWidth, 0);
    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);

275 // We want to shuffle the result down from a high element of a vector, so
276 // the offset must be positive.
277 if (Offset.isNegative())
278 return false;
279
280 // The offset must be a multiple of the scalar element to shuffle cleanly
281 // in the element's size.
282 uint64_t ScalarSizeInBytes = ScalarSize / 8;
283 if (Offset.urem(ScalarSizeInBytes) != 0)
284 return false;
285
286 // If we load MinVecNumElts, will our target element still be loaded?
287 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
288 if (OffsetEltIndex >= MinVecNumElts)
289 return false;
290
291 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
292 &DT))
293 return false;
294
295 // Update alignment with offset value. Note that the offset could be negated
296 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
297 // negation does not change the result of the alignment calculation.
298 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
299 }
300
301 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
302 // Use the greater of the alignment on the load or its source pointer.
303 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
304 Type *LoadTy = Load->getType();
305 unsigned AS = Load->getPointerAddressSpace();
306 InstructionCost OldCost =
307 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
308 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
309 OldCost +=
310 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
311 /* Insert */ true, HasExtract, CostKind);
312
313 // New pattern: load VecPtr
314 InstructionCost NewCost =
315 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
316 // Optionally, we are shuffling the loaded vector element(s) into place.
317 // For the mask set everything but element 0 to undef to prevent poison from
318 // propagating from the extra loaded memory. This will also optionally
319 // shrink/grow the vector from the loaded size to the output size.
320 // We assume this operation has no cost in codegen if there was no offset.
321 // Note that we could use freeze to avoid poison problems, but then we might
322 // still need a shuffle to change the vector size.
323 auto *Ty = cast<FixedVectorType>(I.getType());
324 unsigned OutputNumElts = Ty->getNumElements();
325 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
326 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
327 Mask[0] = OffsetEltIndex;
328 if (OffsetEltIndex)
329 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
330 CostKind);
331
332 // We can aggressively convert to the vector form because the backend can
333 // invert this transform if it does not result in a performance win.
334 if (OldCost < NewCost || !NewCost.isValid())
335 return false;
336
337 // It is safe and potentially profitable to load a vector directly:
338 // inselt undef, load Scalar, 0 --> load VecPtr
339 IRBuilder<> Builder(Load);
340 Value *CastedPtr =
341 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
342 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
343 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
344
345 replaceValue(I, *VecLd);
346 ++NumVecLoad;
347 return true;
348}
349
350/// If we are loading a vector and then inserting it into a larger vector with
351/// undefined elements, try to load the larger vector and eliminate the insert.
352/// This removes a shuffle in IR and may allow combining of other loaded values.
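/// Illustrative IR (types are arbitrary):
///   %ld = load <2 x float>, ptr %p
///   %i  = shufflevector <2 x float> %ld, <2 x float> poison,
///                       <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// may become (when the wider load is dereferenceable and not more expensive):
///   %i = load <4 x float>, ptr %p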
353bool VectorCombine::widenSubvectorLoad(Instruction &I) {
354 // Match subvector insert of fixed vector.
355 auto *Shuf = cast<ShuffleVectorInst>(&I);
356 if (!Shuf->isIdentityWithPadding())
357 return false;
358
359 // Allow a non-canonical shuffle mask that is choosing elements from op1.
360 unsigned NumOpElts =
361 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
362 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
363 return M >= (int)(NumOpElts);
364 });
365
366 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
367 if (!canWidenLoad(Load, TTI))
368 return false;
369
370 // We use minimal alignment (maximum flexibility) because we only care about
371 // the dereferenceable region. When calculating cost and creating a new op,
372 // we may use a larger value based on alignment attributes.
373 auto *Ty = cast<FixedVectorType>(I.getType());
374 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
375 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
376 Align Alignment = Load->getAlign();
377 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
378 return false;
379
380 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
381 Type *LoadTy = Load->getType();
382 unsigned AS = Load->getPointerAddressSpace();
383
384 // Original pattern: insert_subvector (load PtrOp)
385 // This conservatively assumes that the cost of a subvector insert into an
386 // undef value is 0. We could add that cost if the cost model accurately
387 // reflects the real cost of that operation.
388 InstructionCost OldCost =
389 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
390
391 // New pattern: load PtrOp
392 InstructionCost NewCost =
393 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
394
395 // We can aggressively convert to the vector form because the backend can
396 // invert this transform if it does not result in a performance win.
397 if (OldCost < NewCost || !NewCost.isValid())
398 return false;
399
400 IRBuilder<> Builder(Load);
401 Value *CastedPtr =
402 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
403 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
404 replaceValue(I, *VecLd);
405 ++NumVecLoad;
406 return true;
407}
408
409/// Determine which, if any, of the inputs should be replaced by a shuffle
410/// followed by extract from a different index.
411ExtractElementInst *VectorCombine::getShuffleExtract(
412 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
413 unsigned PreferredExtractIndex = InvalidIndex) const {
414 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
415 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
416 assert(Index0C && Index1C && "Expected constant extract indexes");
417
418 unsigned Index0 = Index0C->getZExtValue();
419 unsigned Index1 = Index1C->getZExtValue();
420
421 // If the extract indexes are identical, no shuffle is needed.
422 if (Index0 == Index1)
423 return nullptr;
424
425 Type *VecTy = Ext0->getVectorOperand()->getType();
426 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
427 InstructionCost Cost0 =
428 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
429 InstructionCost Cost1 =
430 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
431
  // If both costs are invalid, no shuffle is needed.
433 if (!Cost0.isValid() && !Cost1.isValid())
434 return nullptr;
435
436 // We are extracting from 2 different indexes, so one operand must be shuffled
437 // before performing a vector operation and/or extract. The more expensive
438 // extract will be replaced by a shuffle.
439 if (Cost0 > Cost1)
440 return Ext0;
441 if (Cost1 > Cost0)
442 return Ext1;
443
444 // If the costs are equal and there is a preferred extract index, shuffle the
445 // opposite operand.
446 if (PreferredExtractIndex == Index0)
447 return Ext1;
448 if (PreferredExtractIndex == Index1)
449 return Ext0;
450
451 // Otherwise, replace the extract with the higher index.
452 return Index0 > Index1 ? Ext0 : Ext1;
453}
454
455/// Compare the relative costs of 2 extracts followed by scalar operation vs.
456/// vector operation(s) followed by extract. Return true if the existing
457/// instructions are cheaper than a vector alternative. Otherwise, return false
458/// and if one of the extracts should be transformed to a shufflevector, set
459/// \p ConvertToShuffle to that extract instruction.
460bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
461 ExtractElementInst *Ext1,
462 const Instruction &I,
463 ExtractElementInst *&ConvertToShuffle,
464 unsigned PreferredExtractIndex) {
465 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
466 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
467 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
468
469 unsigned Opcode = I.getOpcode();
470 Value *Ext0Src = Ext0->getVectorOperand();
471 Value *Ext1Src = Ext1->getVectorOperand();
472 Type *ScalarTy = Ext0->getType();
473 auto *VecTy = cast<VectorType>(Ext0Src->getType());
474 InstructionCost ScalarOpCost, VectorOpCost;
475
476 // Get cost estimates for scalar and vector versions of the operation.
477 bool IsBinOp = Instruction::isBinaryOp(Opcode);
478 if (IsBinOp) {
479 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
480 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
481 } else {
482 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
483 "Expected a compare");
484 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
485 ScalarOpCost = TTI.getCmpSelInstrCost(
486 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
487 VectorOpCost = TTI.getCmpSelInstrCost(
488 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
489 }
490
491 // Get cost estimates for the extract elements. These costs will factor into
492 // both sequences.
493 unsigned Ext0Index = Ext0IndexC->getZExtValue();
494 unsigned Ext1Index = Ext1IndexC->getZExtValue();
495
496 InstructionCost Extract0Cost =
497 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
498 InstructionCost Extract1Cost =
499 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
500
501 // A more expensive extract will always be replaced by a splat shuffle.
502 // For example, if Ext0 is more expensive:
503 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
504 // extelt (opcode (splat V0, Ext0), V1), Ext1
505 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
506 // check the cost of creating a broadcast shuffle and shuffling both
507 // operands to element 0.
508 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
509 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
510 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
511
512 // Extra uses of the extracts mean that we include those costs in the
513 // vector total because those instructions will not be eliminated.
514 InstructionCost OldCost, NewCost;
515 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
516 // Handle a special case. If the 2 extracts are identical, adjust the
517 // formulas to account for that. The extra use charge allows for either the
518 // CSE'd pattern or an unoptimized form with identical values:
519 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
520 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
521 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
522 OldCost = CheapExtractCost + ScalarOpCost;
523 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
524 } else {
525 // Handle the general case. Each extract is actually a different value:
526 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
527 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
528 NewCost = VectorOpCost + CheapExtractCost +
529 !Ext0->hasOneUse() * Extract0Cost +
530 !Ext1->hasOneUse() * Extract1Cost;
531 }
532
533 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
534 if (ConvertToShuffle) {
535 if (IsBinOp && DisableBinopExtractShuffle)
536 return true;
537
538 // If we are extracting from 2 different indexes, then one operand must be
539 // shuffled before performing the vector operation. The shuffle mask is
540 // poison except for 1 lane that is being translated to the remaining
541 // extraction lane. Therefore, it is a splat shuffle. Ex:
542 // ShufMask = { poison, poison, 0, poison }
543 // TODO: The cost model has an option for a "broadcast" shuffle
544 // (splat-from-element-0), but no option for a more general splat.
    if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
      SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
                                   PoisonMaskElem);
      ShuffleMask[BestInsIndex] = BestExtIndex;
      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                    VecTy, VecTy, ShuffleMask, CostKind, 0,
                                    nullptr, {ConvertToShuffle});
    } else {
      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                    VecTy, VecTy, {}, CostKind, 0, nullptr,
                                    {ConvertToShuffle});
    }
557 }
558
559 // Aggressively form a vector op if the cost is equal because the transform
560 // may enable further optimization.
561 // Codegen can reverse this transform (scalarize) if it was not profitable.
562 return OldCost < NewCost;
563}
564
565/// Create a shuffle that translates (shifts) 1 element from the input vector
566/// to a new element location.
567static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
568 unsigned NewIndex, IRBuilderBase &Builder) {
569 // The shuffle mask is poison except for 1 lane that is being translated
570 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
571 // ShufMask = { 2, poison, poison, poison }
572 auto *VecTy = cast<FixedVectorType>(Vec->getType());
573 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
574 ShufMask[NewIndex] = OldIndex;
575 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
576}
577
578/// Given an extract element instruction with constant index operand, shuffle
579/// the source vector (shift the scalar element) to a NewIndex for extraction.
580/// Return null if the input can be constant folded, so that we are not creating
581/// unnecessary instructions.
582static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
583 IRBuilderBase &Builder) {
584 // Shufflevectors can only be created for fixed-width vectors.
585 Value *X = ExtElt->getVectorOperand();
586 if (!isa<FixedVectorType>(X->getType()))
587 return nullptr;
588
589 // If the extract can be constant-folded, this code is unsimplified. Defer
590 // to other passes to handle that.
591 Value *C = ExtElt->getIndexOperand();
592 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
593 if (isa<Constant>(X))
594 return nullptr;
595
596 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
597 NewIndex, Builder);
598 return Shuf;
599}
600
601/// Try to reduce extract element costs by converting scalar compares to vector
602/// compares followed by extract.
603/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
604Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
605 Instruction &I) {
606 assert(isa<CmpInst>(&I) && "Expected a compare");
607
608 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
609 // --> extelt (cmp Pred V0, V1), ExtIndex
610 ++NumVecCmp;
611 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
612 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
613 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
614}
615
616/// Try to reduce extract element costs by converting scalar binops to vector
617/// binops followed by extract.
618/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
619Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
620 Instruction &I) {
621 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
622
623 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
624 // --> extelt (bo V0, V1), ExtIndex
625 ++NumVecBO;
626 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
627 V1, "foldExtExtBinop");
628
629 // All IR flags are safe to back-propagate because any potential poison
630 // created in unused vector elements is discarded by the extract.
631 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
632 VecBOInst->copyIRFlags(&I);
633
634 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
635}
636
637/// Match an instruction with extracted vector operands.
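/// Illustrative IR (any cheap binop or compare qualifies):
///   %e0 = extractelement <4 x i32> %x, i64 0
///   %e1 = extractelement <4 x i32> %y, i64 0
///   %r  = add i32 %e0, %e1
/// may become (if one vector op plus one extract is cheaper overall):
///   %v = add <4 x i32> %x, %y
///   %r = extractelement <4 x i32> %v, i64 0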
638bool VectorCombine::foldExtractExtract(Instruction &I) {
639 // It is not safe to transform things like div, urem, etc. because we may
640 // create undefined behavior when executing those on unknown vector elements.
  if (!isSafeToSpeculativelyExecute(&I))
    return false;
643
644 Instruction *I0, *I1;
645 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
      !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
    return false;
649
650 Value *V0, *V1;
651 uint64_t C0, C1;
652 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
653 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
654 V0->getType() != V1->getType())
655 return false;
656
657 // If the scalar value 'I' is going to be re-inserted into a vector, then try
658 // to create an extract to that same element. The extract/insert can be
659 // reduced to a "select shuffle".
660 // TODO: If we add a larger pattern match that starts from an insert, this
661 // probably becomes unnecessary.
662 auto *Ext0 = cast<ExtractElementInst>(I0);
663 auto *Ext1 = cast<ExtractElementInst>(I1);
664 uint64_t InsertIndex = InvalidIndex;
665 if (I.hasOneUse())
666 match(I.user_back(),
667 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
668
669 ExtractElementInst *ExtractToChange;
670 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
671 return false;
672
673 Value *ExtOp0 = Ext0->getVectorOperand();
674 Value *ExtOp1 = Ext1->getVectorOperand();
675
676 if (ExtractToChange) {
677 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
678 Value *NewExtOp =
679 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
680 if (!NewExtOp)
681 return false;
682 if (ExtractToChange == Ext0)
683 ExtOp0 = NewExtOp;
684 else
685 ExtOp1 = NewExtOp;
686 }
687
688 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
689 : Ext0->getIndexOperand();
690 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
691 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
692 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
693 Worklist.push(Ext0);
694 Worklist.push(Ext1);
695 replaceValue(I, *NewExt);
696 return true;
697}
698
699/// Try to replace an extract + scalar fneg + insert with a vector fneg +
700/// shuffle.
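/// Illustrative IR (same source and destination length shown):
///   %e = extractelement <4 x float> %src, i64 1
///   %n = fneg float %e
///   %i = insertelement <4 x float> %dst, float %n, i64 1
/// may become:
///   %vn = fneg <4 x float> %src
///   %i  = shufflevector <4 x float> %dst, <4 x float> %vn,
///                       <4 x i32> <i32 0, i32 5, i32 2, i32 3>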
701bool VectorCombine::foldInsExtFNeg(Instruction &I) {
702 // Match an insert (op (extract)) pattern.
703 Value *DstVec;
704 uint64_t ExtIdx, InsIdx;
705 Instruction *FNeg;
706 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
707 m_ConstantInt(InsIdx))))
708 return false;
709
710 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
711 Value *SrcVec;
712 Instruction *Extract;
713 if (!match(FNeg, m_FNeg(m_CombineAnd(
714 m_Instruction(Extract),
715 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
716 return false;
717
718 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
719 auto *DstVecScalarTy = DstVecTy->getScalarType();
720 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
721 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
722 return false;
723
724 // Ignore if insert/extract index is out of bounds or destination vector has
725 // one element
726 unsigned NumDstElts = DstVecTy->getNumElements();
727 unsigned NumSrcElts = SrcVecTy->getNumElements();
728 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
729 return false;
730
731 // We are inserting the negated element into the same lane that we extracted
732 // from. This is equivalent to a select-shuffle that chooses all but the
733 // negated element from the destination vector.
734 SmallVector<int> Mask(NumDstElts);
735 std::iota(Mask.begin(), Mask.end(), 0);
736 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
737 InstructionCost OldCost =
738 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
739 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
740
741 // If the extract has one use, it will be eliminated, so count it in the
742 // original cost. If it has more than one use, ignore the cost because it will
743 // be the same before/after.
744 if (Extract->hasOneUse())
745 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
746
  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
                         DstVecTy, Mask, CostKind);
751
752 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
  // If the two vector lengths differ, we also need a length-changing shuffle
  // of the source vector. Add this cost.
755 SmallVector<int> SrcMask;
756 if (NeedLenChg) {
757 SrcMask.assign(NumDstElts, PoisonMaskElem);
    SrcMask[ExtIdx % NumDstElts] = ExtIdx;
    NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                  DstVecTy, SrcVecTy, SrcMask, CostKind);
761 }
762
763 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
764 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
765 << "\n");
766 if (NewCost > OldCost)
767 return false;
768
769 Value *NewShuf, *LenChgShuf = nullptr;
770 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
771 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
772 if (NeedLenChg) {
773 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
774 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
775 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
776 Worklist.pushValue(LenChgShuf);
777 } else {
778 // shuffle DstVec, (fneg SrcVec), Mask
779 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
780 }
781
782 Worklist.pushValue(VecFNeg);
783 replaceValue(I, *NewShuf);
784 return true;
785}
786
787/// Try to fold insert(binop(x,y),binop(a,b),idx)
788/// --> binop(insert(x,a,idx),insert(y,b,idx))
789bool VectorCombine::foldInsExtBinop(Instruction &I) {
790 BinaryOperator *VecBinOp, *SclBinOp;
791 uint64_t Index;
792 if (!match(&I,
793 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
794 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
795 return false;
796
797 // TODO: Add support for addlike etc.
798 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
799 if (BinOpcode != SclBinOp->getOpcode())
800 return false;
801
802 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
803 if (!ResultTy)
804 return false;
805
806 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
807 // shuffle?

  InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
                            TTI.getInstructionCost(VecBinOp, CostKind) +
                            TTI.getInstructionCost(SclBinOp, CostKind);
  InstructionCost NewCost =
813 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
814 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
815 Index, VecBinOp->getOperand(0),
816 SclBinOp->getOperand(0)) +
817 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
818 Index, VecBinOp->getOperand(1),
819 SclBinOp->getOperand(1));
820
821 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
822 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
823 << "\n");
824 if (NewCost > OldCost)
825 return false;
826
827 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
828 SclBinOp->getOperand(0), Index);
829 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
830 SclBinOp->getOperand(1), Index);
831 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
832
833 // Intersect flags from the old binops.
834 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
835 NewInst->copyIRFlags(VecBinOp);
836 NewInst->andIRFlags(SclBinOp);
837 }
838
839 Worklist.pushValue(NewIns0);
840 Worklist.pushValue(NewIns1);
841 replaceValue(I, *NewBO);
842 return true;
843}
844
845/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
846/// Supports: bitcast, trunc, sext, zext
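/// Illustrative IR:
///   %zx = zext <4 x i16> %x to <4 x i32>
///   %zy = zext <4 x i16> %y to <4 x i32>
///   %r  = and <4 x i32> %zx, %zy
/// may become (if the narrow bitwise op plus one cast is cheaper):
///   %a = and <4 x i16> %x, %y
///   %r = zext <4 x i16> %a to <4 x i32>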
847bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
848 // Check if this is a bitwise logic operation
849 auto *BinOp = dyn_cast<BinaryOperator>(&I);
850 if (!BinOp || !BinOp->isBitwiseLogicOp())
851 return false;
852
853 // Get the cast instructions
854 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
855 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
856 if (!LHSCast || !RHSCast) {
857 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
858 return false;
859 }
860
861 // Both casts must be the same type
862 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
863 if (CastOpcode != RHSCast->getOpcode())
864 return false;
865
866 // Only handle supported cast operations
867 switch (CastOpcode) {
868 case Instruction::BitCast:
869 case Instruction::Trunc:
870 case Instruction::SExt:
871 case Instruction::ZExt:
872 break;
873 default:
874 return false;
875 }
876
877 Value *LHSSrc = LHSCast->getOperand(0);
878 Value *RHSSrc = RHSCast->getOperand(0);
879
880 // Source types must match
881 if (LHSSrc->getType() != RHSSrc->getType())
882 return false;
883
884 auto *SrcTy = LHSSrc->getType();
885 auto *DstTy = I.getType();
886 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
887 // Other casts only handle vector types with integer elements.
888 if (CastOpcode != Instruction::BitCast &&
889 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
890 return false;
891
892 // Only integer scalar/vector values are legal for bitwise logic operations.
893 if (!SrcTy->getScalarType()->isIntegerTy() ||
894 !DstTy->getScalarType()->isIntegerTy())
895 return false;
896
897 // Cost Check :
898 // OldCost = bitlogic + 2*casts
899 // NewCost = bitlogic + cast
900
  // Calculate specific costs for each cast with instruction context
  InstructionCost LHSCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
  InstructionCost RHSCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);

907 InstructionCost OldCost =
908 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
909 LHSCastCost + RHSCastCost;
910
911 // For new cost, we can't provide an instruction (it doesn't exist yet)
912 InstructionCost GenericCastCost = TTI.getCastInstrCost(
913 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
914
915 InstructionCost NewCost =
916 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
917 GenericCastCost;
918
919 // Account for multi-use casts using specific costs
920 if (!LHSCast->hasOneUse())
921 NewCost += LHSCastCost;
922 if (!RHSCast->hasOneUse())
923 NewCost += RHSCastCost;
924
925 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
926 << " NewCost=" << NewCost << "\n");
927
928 if (NewCost > OldCost)
929 return false;
930
931 // Create the operation on the source type
932 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
933 BinOp->getName() + ".inner");
934 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
935 NewBinOp->copyIRFlags(BinOp);
936
937 Worklist.pushValue(NewOp);
938
939 // Create the cast operation directly to ensure we get a new instruction
940 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
941
942 // Preserve cast instruction flags
943 NewCast->copyIRFlags(LHSCast);
944 NewCast->andIRFlags(RHSCast);
945
946 // Insert the new instruction
947 Value *Result = Builder.Insert(NewCast);
948
949 replaceValue(I, *Result);
950 return true;
951}
952
953/// Match:
954// bitop(castop(x), C) ->
955// bitop(castop(x), castop(InvC)) ->
956// castop(bitop(x, InvC))
957// Supports: bitcast
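// Illustrative IR (the splat constant keeps the example endian-independent):
//   %b = bitcast <2 x i32> %x to <4 x i16>
//   %r = and <4 x i16> %b, splat (i16 255)
// may become (InvC = splat (i32 16711935), since bitcast(InvC) == C):
//   %a = and <2 x i32> %x, splat (i32 16711935)
//   %r = bitcast <2 x i32> %a to <4 x i16>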
bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
  Value *LHS;
  Constant *C;

  // Check if this is a bitwise logic operation
  if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
    return false;
965
966 // Get the cast instructions
967 auto *LHSCast = dyn_cast<CastInst>(LHS);
968 if (!LHSCast)
969 return false;
970
971 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
972
973 // Only handle supported cast operations
974 switch (CastOpcode) {
975 case Instruction::BitCast:
976 case Instruction::ZExt:
977 case Instruction::SExt:
978 case Instruction::Trunc:
979 break;
980 default:
981 return false;
982 }
983
984 Value *LHSSrc = LHSCast->getOperand(0);
985
986 auto *SrcTy = LHSSrc->getType();
987 auto *DstTy = I.getType();
988 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
989 // Other casts only handle vector types with integer elements.
990 if (CastOpcode != Instruction::BitCast &&
991 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
992 return false;
993
994 // Only integer scalar/vector values are legal for bitwise logic operations.
995 if (!SrcTy->getScalarType()->isIntegerTy() ||
996 !DstTy->getScalarType()->isIntegerTy())
997 return false;
998
999 // Find the constant InvC, such that castop(InvC) equals to C.
1000 PreservedCastFlags RHSFlags;
1001 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1002 if (!InvC)
1003 return false;
1004
1005 // Cost Check :
1006 // OldCost = bitlogic + cast
1007 // NewCost = bitlogic + cast
1008
1009 // Calculate specific costs for each cast with instruction context
1010 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1011 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1012
1013 InstructionCost OldCost =
1014 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1015
1016 // For new cost, we can't provide an instruction (it doesn't exist yet)
1017 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1018 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1019
1020 InstructionCost NewCost =
1021 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1022 GenericCastCost;
1023
1024 // Account for multi-use casts using specific costs
1025 if (!LHSCast->hasOneUse())
1026 NewCost += LHSCastCost;
1027
1028 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1029 << " NewCost=" << NewCost << "\n");
1030
1031 if (NewCost > OldCost)
1032 return false;
1033
1034 // Create the operation on the source type
1035 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1036 LHSSrc, InvC, I.getName() + ".inner");
1037 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1038 NewBinOp->copyIRFlags(&I);
1039
1040 Worklist.pushValue(NewOp);
1041
1042 // Create the cast operation directly to ensure we get a new instruction
1043 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1044
1045 // Preserve cast instruction flags
1046 if (RHSFlags.NNeg)
1047 NewCast->setNonNeg();
1048 if (RHSFlags.NUW)
1049 NewCast->setHasNoUnsignedWrap();
1050 if (RHSFlags.NSW)
1051 NewCast->setHasNoSignedWrap();
1052
1053 NewCast->andIRFlags(LHSCast);
1054
1055 // Insert the new instruction
1056 Value *Result = Builder.Insert(NewCast);
1057
1058 replaceValue(I, *Result);
1059 return true;
1060}
1061
1062/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1063/// destination type followed by shuffle. This can enable further transforms by
1064/// moving bitcasts or shuffles together.
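/// Illustrative IR (same element size on both sides of the bitcast):
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison,
///                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %b = bitcast <4 x i32> %s to <4 x float>
/// may become:
///   %bx = bitcast <4 x i32> %x to <4 x float>
///   %b  = shufflevector <4 x float> %bx, <4 x float> poison,
///                       <4 x i32> <i32 3, i32 2, i32 1, i32 0>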
1065bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1066 Value *V0, *V1;
1067 ArrayRef<int> Mask;
1068 if (!match(&I, m_BitCast(m_OneUse(
1069 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1070 return false;
1071
1072 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1073 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1074 // mask for scalable type is a splat or not.
1075 // 2) Disallow non-vector casts.
1076 // TODO: We could allow any shuffle.
1077 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1078 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1079 if (!DestTy || !SrcTy)
1080 return false;
1081
1082 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1083 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1084 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1085 return false;
1086
1087 bool IsUnary = isa<UndefValue>(V1);
1088
1089 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1090 // if it won't increase the number of bitcasts.
  if (!IsUnary) {
    auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
    auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
    if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
        !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
      return false;
  }
1098
1099 SmallVector<int, 16> NewMask;
1100 if (DestEltSize <= SrcEltSize) {
1101 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1102 // always be expanded to the equivalent form choosing narrower elements.
1103 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1104 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1105 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1106 } else {
1107 // The bitcast is from narrow elements to wide elements. The shuffle mask
1108 // must choose consecutive elements to allow casting first.
1109 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1110 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1111 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1112 return false;
1113 }
1114
1115 // Bitcast the shuffle src - keep its original width but using the destination
1116 // scalar type.
1117 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1118 auto *NewShuffleTy =
1119 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1120 auto *OldShuffleTy =
1121 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1122 unsigned NumOps = IsUnary ? 1 : 2;
1123
  // The new shuffle must not cost more than the old shuffle.
  TargetTransformInfo::ShuffleKind SK =
      IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
              : TargetTransformInfo::SK_PermuteTwoSrc;

1129 InstructionCost NewCost =
1130 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1131 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1132 TargetTransformInfo::CastContextHint::None,
1133 CostKind));
1134 InstructionCost OldCost =
1135 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1136 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1137 TargetTransformInfo::CastContextHint::None,
1138 CostKind);
1139
1140 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1141 << OldCost << " vs NewCost: " << NewCost << "\n");
1142
1143 if (NewCost > OldCost || !NewCost.isValid())
1144 return false;
1145
1146 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1147 ++NumShufOfBitcast;
1148 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1149 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1150 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1151 replaceValue(I, *Shuf);
1152 return true;
1153}
1154
1155/// VP Intrinsics whose vector operands are both splat values may be simplified
1156/// into the scalar version of the operation and the result splatted. This
1157/// can lead to scalarization down the line.
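/// Illustrative IR, where %xs and %ys are splats of %x and %y and the mask is
/// all-true:
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xs, <4 x i32> %ys,
///                                          <4 x i1> splat (i1 true), i32 %evl)
/// may become:
///   %s  = add i32 %x, %y
///   %si = insertelement <4 x i32> poison, i32 %s, i64 0
///   %r  = shufflevector <4 x i32> %si, <4 x i32> poison,
///                       <4 x i32> zeroinitializer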
1158bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1159 if (!isa<VPIntrinsic>(I))
1160 return false;
1161 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1162 Value *Op0 = VPI.getArgOperand(0);
1163 Value *Op1 = VPI.getArgOperand(1);
1164
1165 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1166 return false;
1167
1168 // Check getSplatValue early in this function, to avoid doing unnecessary
1169 // work.
1170 Value *ScalarOp0 = getSplatValue(Op0);
1171 Value *ScalarOp1 = getSplatValue(Op1);
1172 if (!ScalarOp0 || !ScalarOp1)
1173 return false;
1174
1175 // For the binary VP intrinsics supported here, the result on disabled lanes
1176 // is a poison value. For now, only do this simplification if all lanes
1177 // are active.
1178 // TODO: Relax the condition that all lanes are active by using insertelement
1179 // on inactive lanes.
1180 auto IsAllTrueMask = [](Value *MaskVal) {
1181 if (Value *SplattedVal = getSplatValue(MaskVal))
1182 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1183 return ConstValue->isAllOnesValue();
1184 return false;
1185 };
1186 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1187 return false;
1188
1189 // Check to make sure we support scalarization of the intrinsic
1190 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1191 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1192 return false;
1193
1194 // Calculate cost of splatting both operands into vectors and the vector
1195 // intrinsic
1196 VectorType *VecTy = cast<VectorType>(VPI.getType());
1197 SmallVector<int> Mask;
1198 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1199 Mask.resize(FVTy->getNumElements(), 0);
1200 InstructionCost SplatCost =
      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, {},
                         CostKind);
1204
  // Calculate the cost of the VP Intrinsic
  SmallVector<Type *> Args;
  for (Value *V : VPI.args())
1208 Args.push_back(V->getType());
1209 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1210 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1211 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1212
1213 // Determine scalar opcode
1214 std::optional<unsigned> FunctionalOpcode =
1215 VPI.getFunctionalOpcode();
1216 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1217 if (!FunctionalOpcode) {
1218 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1219 if (!ScalarIntrID)
1220 return false;
1221 }
1222
1223 // Calculate cost of scalarizing
1224 InstructionCost ScalarOpCost = 0;
1225 if (ScalarIntrID) {
1226 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1227 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1228 } else {
1229 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1230 VecTy->getScalarType(), CostKind);
1231 }
1232
1233 // The existing splats may be kept around if other instructions use them.
1234 InstructionCost CostToKeepSplats =
1235 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1236 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1237
1238 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1239 << "\n");
1240 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1241 << ", Cost of scalarizing:" << NewCost << "\n");
1242
1243 // We want to scalarize unless the vector variant actually has lower cost.
1244 if (OldCost < NewCost || !NewCost.isValid())
1245 return false;
1246
1247 // Scalarize the intrinsic
1248 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1249 Value *EVL = VPI.getArgOperand(3);
1250
1251 // If the VP op might introduce UB or poison, we can scalarize it provided
1252 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1253 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1254 // scalarizing it.
1255 bool SafeToSpeculate;
1256 if (ScalarIntrID)
1257 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1258 .hasAttribute(Attribute::AttrKind::Speculatable);
  else
    SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
        *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1262 if (!SafeToSpeculate &&
1263 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1264 return false;
1265
1266 Value *ScalarVal =
1267 ScalarIntrID
1268 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1269 {ScalarOp0, ScalarOp1})
1270 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1271 ScalarOp0, ScalarOp1);
1272
1273 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1274 return true;
1275}
1276
1277/// Match a vector op/compare/intrinsic with at least one
1278/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1279/// by insertelement.
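/// Illustrative IR:
///   %x = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %a, i64 0
///   %r = add <4 x i32> %x, splat (i32 7)
/// may become (the untouched lanes constant-fold into the new base vector):
///   %s = add i32 %a, 7
///   %r = insertelement <4 x i32> <i32 7, i32 8, i32 9, i32 10>, i32 %s, i64 0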
1280bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1281 auto *UO = dyn_cast<UnaryOperator>(&I);
1282 auto *BO = dyn_cast<BinaryOperator>(&I);
1283 auto *CI = dyn_cast<CmpInst>(&I);
1284 auto *II = dyn_cast<IntrinsicInst>(&I);
1285 if (!UO && !BO && !CI && !II)
1286 return false;
1287
1288 // TODO: Allow intrinsics with different argument types
1289 if (II) {
1290 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1291 return false;
1292 for (auto [Idx, Arg] : enumerate(II->args()))
1293 if (Arg->getType() != II->getType() &&
1294 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1295 return false;
1296 }
1297
1298 // Do not convert the vector condition of a vector select into a scalar
1299 // condition. That may cause problems for codegen because of differences in
1300 // boolean formats and register-file transfers.
1301 // TODO: Can we account for that in the cost model?
1302 if (CI)
1303 for (User *U : I.users())
1304 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1305 return false;
1306
1307 // Match constant vectors or scalars being inserted into constant vectors:
1308 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1309 SmallVector<Value *> VecCs, ScalarOps;
1310 std::optional<uint64_t> Index;
1311
1312 auto Ops = II ? II->args() : I.operands();
1313 for (auto [OpNum, Op] : enumerate(Ops)) {
1314 Constant *VecC;
1315 Value *V;
1316 uint64_t InsIdx = 0;
1317 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1318 m_ConstantInt(InsIdx)))) {
1319 // Bail if any inserts are out of bounds.
1320 VectorType *OpTy = cast<VectorType>(Op->getType());
1321 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1322 return false;
1323 // All inserts must have the same index.
1324 // TODO: Deal with mismatched index constants and variable indexes?
1325 if (!Index)
1326 Index = InsIdx;
1327 else if (InsIdx != *Index)
1328 return false;
1329 VecCs.push_back(VecC);
1330 ScalarOps.push_back(V);
1331 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1332 OpNum, &TTI)) {
1333 VecCs.push_back(Op.get());
1334 ScalarOps.push_back(Op.get());
1335 } else if (match(Op.get(), m_Constant(VecC))) {
1336 VecCs.push_back(VecC);
1337 ScalarOps.push_back(nullptr);
1338 } else {
1339 return false;
1340 }
1341 }
1342
1343 // Bail if all operands are constant.
1344 if (!Index.has_value())
1345 return false;
1346
1347 VectorType *VecTy = cast<VectorType>(I.getType());
1348 Type *ScalarTy = VecTy->getScalarType();
1349 assert(VecTy->isVectorTy() &&
1350 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1351 ScalarTy->isPointerTy()) &&
1352 "Unexpected types for insert element into binop or cmp");
1353
1354 unsigned Opcode = I.getOpcode();
1355 InstructionCost ScalarOpCost, VectorOpCost;
1356 if (CI) {
1357 CmpInst::Predicate Pred = CI->getPredicate();
1358 ScalarOpCost = TTI.getCmpSelInstrCost(
1359 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1360 VectorOpCost = TTI.getCmpSelInstrCost(
1361 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1362 } else if (UO || BO) {
1363 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1364 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1365 } else {
1366 IntrinsicCostAttributes ScalarICA(
1367 II->getIntrinsicID(), ScalarTy,
1368 SmallVector<Type *>(II->arg_size(), ScalarTy));
1369 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1370 IntrinsicCostAttributes VectorICA(
1371 II->getIntrinsicID(), VecTy,
1372 SmallVector<Type *>(II->arg_size(), VecTy));
1373 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1374 }
1375
1376 // Fold the vector constants in the original vectors into a new base vector to
1377 // get more accurate cost modelling.
1378 Value *NewVecC = nullptr;
1379 if (CI)
1380 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1381 else if (UO)
1382 NewVecC =
1383 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1384 else if (BO)
1385 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1386 else if (II)
1387 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1388
1389 if (!NewVecC)
1390 return false;
1391
1392 // Get cost estimate for the insert element. This cost will factor into
1393 // both sequences.
1394 InstructionCost OldCost = VectorOpCost;
1395 InstructionCost NewCost =
1396 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1397 CostKind, *Index, NewVecC);
1398
1399 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1400 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1401 II->getIntrinsicID(), Idx, &TTI)))
      continue;
    InstructionCost InsertCost = TTI.getVectorInstrCost(
        Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1405 OldCost += InsertCost;
1406 NewCost += !Op->hasOneUse() * InsertCost;
1407 }
1408
1409 // We want to scalarize unless the vector variant actually has lower cost.
1410 if (OldCost < NewCost || !NewCost.isValid())
1411 return false;
1412
1413 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1414 // inselt NewVecC, (scalar_op V0, V1), Index
1415 if (CI)
1416 ++NumScalarCmp;
1417 else if (UO || BO)
1418 ++NumScalarOps;
1419 else
1420 ++NumScalarIntrinsic;
1421
1422 // For constant cases, extract the scalar element, this should constant fold.
1423 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1424 if (!Scalar)
1426 cast<Constant>(VecC), Builder.getInt64(*Index));
1427
1428 Value *Scalar;
1429 if (CI)
1430 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1431 else if (UO || BO)
1432 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1433 else
1434 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1435
1436 Scalar->setName(I.getName() + ".scalar");
1437
1438 // All IR flags are safe to back-propagate. There is no potential for extra
1439 // poison to be created by the scalar instruction.
1440 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1441 ScalarInst->copyIRFlags(&I);
1442
1443 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1444 replaceValue(I, *Insert);
1445 return true;
1446}
1447
1448/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1449/// a vector into vector operations followed by extract. Note: The SLP pass
1450/// may miss this pattern because of implementation problems.
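/// Illustrative IR (which extract becomes the shuffle depends on the target):
///   %e0 = extractelement <4 x i32> %x, i64 0
///   %c0 = icmp sgt i32 %e0, 42
///   %e1 = extractelement <4 x i32> %x, i64 1
///   %c1 = icmp sgt i32 %e1, 99
///   %r  = and i1 %c0, %c1
/// may become:
///   %vc = icmp sgt <4 x i32> %x, <i32 42, i32 99, i32 poison, i32 poison>
///   %sh = shufflevector <4 x i1> %vc, <4 x i1> poison,
///                       <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
///   %va = and <4 x i1> %vc, %sh
///   %r  = extractelement <4 x i1> %va, i64 0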
1451bool VectorCombine::foldExtractedCmps(Instruction &I) {
1452 auto *BI = dyn_cast<BinaryOperator>(&I);
1453
1454 // We are looking for a scalar binop of booleans.
1455 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1456 if (!BI || !I.getType()->isIntegerTy(1))
1457 return false;
1458
1459 // The compare predicates should match, and each compare should have a
1460 // constant operand.
1461 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1462 Instruction *I0, *I1;
1463 Constant *C0, *C1;
1464 CmpPredicate P0, P1;
1465 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1466 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1467 return false;
1468
1469 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1470 if (!MatchingPred)
1471 return false;
1472
1473 // The compare operands must be extracts of the same vector with constant
1474 // extract indexes.
1475 Value *X;
1476 uint64_t Index0, Index1;
1477 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1478 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1479 return false;
1480
1481 auto *Ext0 = cast<ExtractElementInst>(I0);
1482 auto *Ext1 = cast<ExtractElementInst>(I1);
1483 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1484 if (!ConvertToShuf)
1485 return false;
1486 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1487 "Unknown ExtractElementInst");
1488
1489 // The original scalar pattern is:
1490 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1491 CmpInst::Predicate Pred = *MatchingPred;
1492 unsigned CmpOpcode =
1493 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1494 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1495 if (!VecTy)
1496 return false;
1497
1498 InstructionCost Ext0Cost =
1499 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1500 InstructionCost Ext1Cost =
1501 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1502 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1503 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1504 CostKind);
1505
1506 InstructionCost OldCost =
1507 Ext0Cost + Ext1Cost + CmpCost * 2 +
1508 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1509
1510 // The proposed vector pattern is:
1511 // vcmp = cmp Pred X, VecC
1512 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1513 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1514 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1515 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1516 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1517 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1518 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1519 ShufMask[CheapIndex] = ExpensiveIndex;
1520 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1521 CmpTy, ShufMask, CostKind);
1522 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1523 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1524 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1525 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1526
1527 // Aggressively form vector ops if the cost is equal because the transform
1528 // may enable further optimization.
1529 // Codegen can reverse this transform (scalarize) if it was not profitable.
1530 if (OldCost < NewCost || !NewCost.isValid())
1531 return false;
1532
1533 // Create a vector constant from the 2 scalar constants.
1534 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1535 PoisonValue::get(VecTy->getElementType()));
1536 CmpC[Index0] = C0;
1537 CmpC[Index1] = C1;
1538 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1539 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1540 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1541 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1542 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1543 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1544 replaceValue(I, *NewExt);
1545 ++NumVecCmpBO;
1546 return true;
1547}
1548
1549static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1550 TTI::TargetCostKind CostKind,
1551 const TargetTransformInfo &TTI,
1552 InstructionCost &CostBeforeReduction,
1553 InstructionCost &CostAfterReduction) {
1554 Instruction *Op0, *Op1;
1555 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1556 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1557 unsigned ReductionOpc =
1558 getArithmeticReductionInstruction(II.getIntrinsicID());
1559 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1560 bool IsUnsigned = isa<ZExtInst>(RedOp);
1561 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1562
1563 CostBeforeReduction =
1564 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1565 TTI::CastContextHint::None, CostKind);
1566 CostAfterReduction =
1567 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1568 ExtType, FastMathFlags(), CostKind);
1569 return;
1570 }
1571 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1572 match(RedOp,
1573 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1574 match(Op0, m_ZExtOrSExt(m_Value())) &&
1575 Op0->getOpcode() == Op1->getOpcode() &&
1576 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1577 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1578 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1579 bool IsUnsigned = isa<ZExtInst>(Op0);
1580 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1581 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1582
1583 InstructionCost ExtCost =
1584 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1585 TTI::CastContextHint::None, CostKind);
1586 InstructionCost MulCost =
1587 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1588 InstructionCost Ext2Cost =
1589 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1590 TTI::CastContextHint::None, CostKind);
1591
1592 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1593 CostAfterReduction = TTI.getMulAccReductionCost(
1594 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1595 return;
1596 }
1597 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1598 std::nullopt, CostKind);
1599}
1600
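// An illustrative sketch of the fold performed below (value names are made
// up):
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
//   %r  = add i32 %r0, %r1
// may become
//   %v = add <4 x i32> %a, %b
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
// For sub, the vector operands are subtracted and the result is still reduced
// with reduce.add.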
1601bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1602 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1603 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1604 if (BinOpOpc == Instruction::Sub)
1605 ReductionIID = Intrinsic::vector_reduce_add;
1606 if (ReductionIID == Intrinsic::not_intrinsic)
1607 return false;
1608
1609 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1610 Intrinsic::ID IID) -> Value * {
1611 auto *II = dyn_cast<IntrinsicInst>(V);
1612 if (!II)
1613 return nullptr;
1614 if (II->getIntrinsicID() == IID && II->hasOneUse())
1615 return II->getArgOperand(0);
1616 return nullptr;
1617 };
1618
1619 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1620 if (!V0)
1621 return false;
1622 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1623 if (!V1)
1624 return false;
1625
1626 auto *VTy = cast<VectorType>(V0->getType());
1627 if (V1->getType() != VTy)
1628 return false;
1629 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1630 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1631 unsigned ReductionOpc =
1632 getArithmeticReductionInstruction(II0.getIntrinsicID());
1633
1634 InstructionCost OldCost = 0;
1635 InstructionCost NewCost = 0;
1636 InstructionCost CostOfRedOperand0 = 0;
1637 InstructionCost CostOfRed0 = 0;
1638 InstructionCost CostOfRedOperand1 = 0;
1639 InstructionCost CostOfRed1 = 0;
1640 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1641 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1642 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1643 NewCost =
1644 CostOfRedOperand0 + CostOfRedOperand1 +
1645 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1646 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1647 if (NewCost >= OldCost || !NewCost.isValid())
1648 return false;
1649
1650 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1651 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1652 << "\n");
1653 Value *VectorBO;
1654 if (BinOpOpc == Instruction::Or)
1655 VectorBO = Builder.CreateOr(V0, V1, "",
1656 cast<PossiblyDisjointInst>(I).isDisjoint());
1657 else
1658 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1659
1660 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1661 replaceValue(I, *Rdx);
1662 return true;
1663}
1664
1665// Check if the memory location is modified between two instructions in the same BB.
1666static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1667 BasicBlock::iterator End,
1668 const MemoryLocation &Loc, AAResults &AA) {
1669 unsigned NumScanned = 0;
1670 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1671 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1672 ++NumScanned > MaxInstrsToScan;
1673 });
1674}
1675
1676namespace {
1677/// Helper class to indicate whether a vector index can be safely scalarized and
1678/// if a freeze needs to be inserted.
1679class ScalarizationResult {
1680 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1681
1682 StatusTy Status;
1683 Value *ToFreeze;
1684
1685 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1686 : Status(Status), ToFreeze(ToFreeze) {}
1687
1688public:
1689 ScalarizationResult(const ScalarizationResult &Other) = default;
1690 ~ScalarizationResult() {
1691 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1692 }
1693
1694 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1695 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1696 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1697 return {StatusTy::SafeWithFreeze, ToFreeze};
1698 }
1699
1700 /// Returns true if the index can be scalarized without requiring a freeze.
1701 bool isSafe() const { return Status == StatusTy::Safe; }
1702 /// Returns true if the index cannot be scalarized.
1703 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1704 /// Returns true if the index can be scalarized, but requires inserting a
1705 /// freeze.
1706 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1707
1708 /// Reset the state to Unsafe and clear ToFreeze if set.
1709 void discard() {
1710 ToFreeze = nullptr;
1711 Status = StatusTy::Unsafe;
1712 }
1713
1714 /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1715 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1716 assert(isSafeWithFreeze() &&
1717 "should only be used when freezing is required");
1718 assert(is_contained(ToFreeze->users(), &UserI) &&
1719 "UserI must be a user of ToFreeze");
1720 IRBuilder<>::InsertPointGuard Guard(Builder);
1721 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1722 Value *Frozen =
1723 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1724 for (Use &U : make_early_inc_range((UserI.operands())))
1725 if (U.get() == ToFreeze)
1726 U.set(Frozen);
1727
1728 ToFreeze = nullptr;
1729 }
1730};
1731} // namespace
1732
1733/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1734/// Idx. \p Idx must access a valid vector element.
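///
/// For example (illustrative): with a <4 x i32> vector, a constant index 2 is
/// safe; a non-poison variable index with a known range inside [0,3] is safe;
/// an index of the form "and i64 %i, 3" fits the range but %i may be poison,
/// so it is safe only after freezing %i; anything else is unsafe.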
1735static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1736 Instruction *CtxI,
1737 AssumptionCache &AC,
1738 const DominatorTree &DT) {
1739 // We do checks for both fixed vector types and scalable vector types.
1740 // This is the number of elements of fixed vector types,
1741 // or the minimum number of elements of scalable vector types.
1742 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1743 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1744
1745 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1746 if (C->getValue().ult(NumElements))
1747 return ScalarizationResult::safe();
1748 return ScalarizationResult::unsafe();
1749 }
1750
1751 // Always unsafe if the index type can't handle all inbound values.
1752 if (!llvm::isUIntN(IntWidth, NumElements))
1753 return ScalarizationResult::unsafe();
1754
1755 APInt Zero(IntWidth, 0);
1756 APInt MaxElts(IntWidth, NumElements);
1757 ConstantRange ValidIndices(Zero, MaxElts);
1758 ConstantRange IdxRange(IntWidth, true);
1759
1760 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1761 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1762 true, &AC, CtxI, &DT)))
1763 return ScalarizationResult::safe();
1764 return ScalarizationResult::unsafe();
1765 }
1766
1767 // If the index may be poison, check if we can insert a freeze before the
1768 // range of the index is restricted.
1769 Value *IdxBase;
1770 ConstantInt *CI;
1771 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1772 IdxRange = IdxRange.binaryAnd(CI->getValue());
1773 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1774 IdxRange = IdxRange.urem(CI->getValue());
1775 }
1776
1777 if (ValidIndices.contains(IdxRange))
1778 return ScalarizationResult::safeWithFreeze(IdxBase);
1779 return ScalarizationResult::unsafe();
1780}
1781
1782/// The memory operation on a vector of \p ScalarType had alignment of
1783/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1784/// alignment that will be valid for the memory operation on a single scalar
1785/// element of the same type with index \p Idx.
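///
/// For example (illustrative): a <4 x i32> access with alignment 16 at
/// constant index 1 is 4 bytes past the base, so the scalar access gets
/// commonAlignment(16, 4) = 4. With a non-constant index, only the element
/// store size can be assumed, which also yields alignment 4 here.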
1786static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1787 Type *ScalarType, Value *Idx,
1788 const DataLayout &DL) {
1789 if (auto *C = dyn_cast<ConstantInt>(Idx))
1790 return commonAlignment(VectorAlignment,
1791 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1792 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1793}
1794
1795// Combine patterns like:
1796// %0 = load <4 x i32>, <4 x i32>* %a
1797// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1798// store <4 x i32> %1, <4 x i32>* %a
1799// to:
1800// %0 = bitcast <4 x i32>* %a to i32*
1801// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1802// store i32 %b, i32* %1
1803bool VectorCombine::foldSingleElementStore(Instruction &I) {
1805 return false;
1806 auto *SI = cast<StoreInst>(&I);
1807 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1808 return false;
1809
1810 // TODO: Combine more complicated patterns (multiple insert) by referencing
1811 // TargetTransformInfo.
1812 Instruction *Source;
1813 Value *NewElement;
1814 Value *Idx;
1815 if (!match(SI->getValueOperand(),
1816 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1817 m_Value(Idx))))
1818 return false;
1819
1820 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1821 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1822 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1823 // Don't optimize for atomic/volatile loads or stores. Ensure memory is not
1824 // modified in between, the vector type matches the store size, and the index is inbounds.
1825 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1826 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1827 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1828 return false;
1829
1830 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1831 if (ScalarizableIdx.isUnsafe() ||
1832 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1833 MemoryLocation::get(SI), AA))
1834 return false;
1835
1836 // Ensure we add the load back to the worklist BEFORE its users so they can
1837 // be erased in the correct order.
1838 Worklist.push(Load);
1839
1840 if (ScalarizableIdx.isSafeWithFreeze())
1841 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1842 Value *GEP = Builder.CreateInBoundsGEP(
1843 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1844 {ConstantInt::get(Idx->getType(), 0), Idx});
1845 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1846 NSI->copyMetadata(*SI);
1847 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1848 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1849 *DL);
1850 NSI->setAlignment(ScalarOpAlignment);
1851 replaceValue(I, *NSI);
1853 return true;
1854 }
1855
1856 return false;
1857}
1858
1859/// Try to scalarize vector loads feeding extractelement or bitcast
1860/// instructions.
1861bool VectorCombine::scalarizeLoad(Instruction &I) {
1862 Value *Ptr;
1863 if (!match(&I, m_Load(m_Value(Ptr))))
1864 return false;
1865
1866 auto *LI = cast<LoadInst>(&I);
1867 auto *VecTy = cast<VectorType>(LI->getType());
1868 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1869 return false;
1870
1871 bool AllExtracts = true;
1872 bool AllBitcasts = true;
1873 Instruction *LastCheckedInst = LI;
1874 unsigned NumInstChecked = 0;
1875
1876 // Check what type of users we have (must either all be extracts or
1877 // bitcasts) and ensure no memory modifications between the load and
1878 // its users.
1879 for (User *U : LI->users()) {
1880 auto *UI = dyn_cast<Instruction>(U);
1881 if (!UI || UI->getParent() != LI->getParent())
1882 return false;
1883
1884 // If any user is waiting to be erased, then bail out as this will
1885 // distort the cost calculation and possibly lead to infinite loops.
1886 if (UI->use_empty())
1887 return false;
1888
1889 if (!isa<ExtractElementInst>(UI))
1890 AllExtracts = false;
1891 if (!isa<BitCastInst>(UI))
1892 AllBitcasts = false;
1893
1894 // Check if any instruction between the load and the user may modify memory.
1895 if (LastCheckedInst->comesBefore(UI)) {
1896 for (Instruction &I :
1897 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1898 // Bail out if we reached the check limit or the instruction may write
1899 // to memory.
1900 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1901 return false;
1902 NumInstChecked++;
1903 }
1904 LastCheckedInst = UI;
1905 }
1906 }
1907
1908 if (AllExtracts)
1909 return scalarizeLoadExtract(LI, VecTy, Ptr);
1910 if (AllBitcasts)
1911 return scalarizeLoadBitcast(LI, VecTy, Ptr);
1912 return false;
1913}
1914
1915/// Try to scalarize vector loads feeding extractelement instructions.
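///
/// An illustrative sketch (value names are made up):
///   %v = load <4 x i32>, ptr %p
///   %e = extractelement <4 x i32> %v, i64 2
/// may become
///   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
///   %e   = load i32, ptr %gep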
1916bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
1917 Value *Ptr) {
1919 return false;
1920
1921 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1922 auto FailureGuard = make_scope_exit([&]() {
1923 // If the transform is aborted, discard the ScalarizationResults.
1924 for (auto &Pair : NeedFreeze)
1925 Pair.second.discard();
1926 });
1927
1928 InstructionCost OriginalCost =
1929 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1930 LI->getPointerAddressSpace(), CostKind);
1931 InstructionCost ScalarizedCost = 0;
1932
1933 for (User *U : LI->users()) {
1934 auto *UI = cast<ExtractElementInst>(U);
1935
1936 auto ScalarIdx =
1937 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1938 if (ScalarIdx.isUnsafe())
1939 return false;
1940 if (ScalarIdx.isSafeWithFreeze()) {
1941 NeedFreeze.try_emplace(UI, ScalarIdx);
1942 ScalarIdx.discard();
1943 }
1944
1945 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1946 OriginalCost +=
1947 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1948 Index ? Index->getZExtValue() : -1);
1949 ScalarizedCost +=
1950 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1952 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1953 nullptr, nullptr, CostKind);
1954 }
1955
1956 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
1957 << "\n LoadExtractCost: " << OriginalCost
1958 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1959
1960 if (ScalarizedCost >= OriginalCost)
1961 return false;
1962
1963 // Ensure we add the load back to the worklist BEFORE its users so they can
1964 // be erased in the correct order.
1965 Worklist.push(LI);
1966
1967 Type *ElemType = VecTy->getElementType();
1968
1969 // Replace extracts with narrow scalar loads.
1970 for (User *U : LI->users()) {
1971 auto *EI = cast<ExtractElementInst>(U);
1972 Value *Idx = EI->getIndexOperand();
1973
1974 // Insert 'freeze' for poison indexes.
1975 auto It = NeedFreeze.find(EI);
1976 if (It != NeedFreeze.end())
1977 It->second.freeze(Builder, *cast<Instruction>(Idx));
1978
1979 Builder.SetInsertPoint(EI);
1980 Value *GEP =
1981 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1982 auto *NewLoad = cast<LoadInst>(
1983 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1984
1985 Align ScalarOpAlignment =
1986 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1987 NewLoad->setAlignment(ScalarOpAlignment);
1988
1989 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1990 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1991 AAMDNodes OldAAMD = LI->getAAMetadata();
1992 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1993 }
1994
1995 replaceValue(*EI, *NewLoad, false);
1996 }
1997
1998 FailureGuard.release();
1999 return true;
2000}
2001
2002/// Try to scalarize vector loads feeding bitcast instructions.
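///
/// An illustrative sketch (value names are made up):
///   %v = load <4 x i32>, ptr %p
///   %i = bitcast <4 x i32> %v to i128
/// may become a single scalar load:
///   %i = load i128, ptr %p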
2003bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2004 Value *Ptr) {
2005 InstructionCost OriginalCost =
2006 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2007 LI->getPointerAddressSpace(), CostKind);
2008
2009 Type *TargetScalarType = nullptr;
2010 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2011
2012 for (User *U : LI->users()) {
2013 auto *BC = cast<BitCastInst>(U);
2014
2015 Type *DestTy = BC->getDestTy();
2016 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2017 return false;
2018
2019 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2020 if (DestBitWidth != VecBitWidth)
2021 return false;
2022
2023 // All bitcasts must target the same scalar type.
2024 if (!TargetScalarType)
2025 TargetScalarType = DestTy;
2026 else if (TargetScalarType != DestTy)
2027 return false;
2028
2029 OriginalCost +=
2030 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2031 TTI::CastContextHint::None, CostKind);
2032 }
2033
2034 if (!TargetScalarType)
2035 return false;
2036
2037 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2038 InstructionCost ScalarizedCost =
2039 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2040 LI->getPointerAddressSpace(), CostKind);
2041
2042 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2043 << "\n OriginalCost: " << OriginalCost
2044 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2045
2046 if (ScalarizedCost >= OriginalCost)
2047 return false;
2048
2049 // Ensure we add the load back to the worklist BEFORE its users so they can
2050 // be erased in the correct order.
2051 Worklist.push(LI);
2052
2053 Builder.SetInsertPoint(LI);
2054 auto *ScalarLoad =
2055 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2056 ScalarLoad->setAlignment(LI->getAlign());
2057 ScalarLoad->copyMetadata(*LI);
2058
2059 // Replace all bitcast users with the scalar load.
2060 for (User *U : LI->users()) {
2061 auto *BC = cast<BitCastInst>(U);
2062 replaceValue(*BC, *ScalarLoad, false);
2063 }
2064
2065 return true;
2066}
2067
2068bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2070 return false;
2071 auto *Ext = dyn_cast<ZExtInst>(&I);
2072 if (!Ext)
2073 return false;
2074
2075 // Try to convert a vector zext feeding only extracts to a set of scalar
2076 //   (Src >> (ExtIdx * EltBitWidth)) & ((1 << EltBitWidth) - 1)
2077 // operations, if profitable.
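  // An illustrative sketch, assuming a little-endian layout (value names are
  // made up):
  //   %z = zext <4 x i8> %src to <4 x i32>
  //   %e = extractelement <4 x i32> %z, i64 1
  // may become
  //   %s  = bitcast <4 x i8> %src to i32
  //   %sh = lshr i32 %s, 8
  //   %e  = and i32 %sh, 255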
2078 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2079 if (!SrcTy)
2080 return false;
2081 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2082
2083 Type *ScalarDstTy = DstTy->getElementType();
2084 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2085 return false;
2086
2087 InstructionCost VectorCost =
2088 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2089 TTI::CastContextHint::None, CostKind);
2090 unsigned ExtCnt = 0;
2091 bool ExtLane0 = false;
2092 for (User *U : Ext->users()) {
2093 uint64_t Idx;
2094 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2095 return false;
2096 if (cast<Instruction>(U)->use_empty())
2097 continue;
2098 ExtCnt += 1;
2099 ExtLane0 |= !Idx;
2100 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2101 CostKind, Idx, U);
2102 }
2103
2104 InstructionCost ScalarCost =
2105 ExtCnt * TTI.getArithmeticInstrCost(
2106 Instruction::And, ScalarDstTy, CostKind,
2109 (ExtCnt - ExtLane0) *
2111 Instruction::LShr, ScalarDstTy, CostKind,
2114 if (ScalarCost > VectorCost)
2115 return false;
2116
2117 Value *ScalarV = Ext->getOperand(0);
2118 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2119 &DT)) {
2120 // Check whether all lanes are extracted, all extracts trigger UB
2121 // on poison, and the last extract (and hence all previous ones)
2122 // are guaranteed to execute if Ext executes. If so, we do not
2123 // need to insert a freeze.
2124 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2125 bool AllExtractsTriggerUB = true;
2126 ExtractElementInst *LastExtract = nullptr;
2127 BasicBlock *ExtBB = Ext->getParent();
2128 for (User *U : Ext->users()) {
2129 auto *Extract = cast<ExtractElementInst>(U);
2130 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2131 AllExtractsTriggerUB = false;
2132 break;
2133 }
2134 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2135 if (!LastExtract || LastExtract->comesBefore(Extract))
2136 LastExtract = Extract;
2137 }
2138 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2139 !AllExtractsTriggerUB ||
2141 LastExtract->getIterator()))
2142 ScalarV = Builder.CreateFreeze(ScalarV);
2143 }
2144 ScalarV = Builder.CreateBitCast(
2145 ScalarV,
2146 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2147 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2148 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2149 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2150 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2151 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2152 for (User *U : Ext->users()) {
2153 auto *Extract = cast<ExtractElementInst>(U);
2154 uint64_t Idx =
2155 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2156 uint64_t ShiftAmt =
2157 DL->isBigEndian()
2158 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2159 : (Idx * SrcEltSizeInBits);
2160 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2161 Value *And = Builder.CreateAnd(LShr, Mask);
2162 U->replaceAllUsesWith(And);
2163 }
2164 return true;
2165}
2166
2167/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2168/// to "(bitcast (concat X, Y))"
2169/// where X/Y are bitcasted from i1 mask vectors.
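///
/// An illustrative sketch (value names are made up):
///   %ix = bitcast <8 x i1> %x to i8
///   %iy = bitcast <8 x i1> %y to i8
///   %zx = zext i8 %ix to i16
///   %zy = zext i8 %iy to i16
///   %sy = shl i16 %zy, 8
///   %r  = or disjoint i16 %zx, %sy
/// may become
///   %c = shufflevector <8 x i1> %x, <8 x i1> %y, <16 x i32> <i32 0, ..., i32 15>
///   %r = bitcast <16 x i1> %c to i16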
2170bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2171 Type *Ty = I.getType();
2172 if (!Ty->isIntegerTy())
2173 return false;
2174
2175 // TODO: Add big endian test coverage
2176 if (DL->isBigEndian())
2177 return false;
2178
2179 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2180 Instruction *X, *Y;
2182 return false;
2183
2184 // Allow both sources to contain shl, to handle more generic pattern:
2185 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2186 Value *SrcX;
2187 uint64_t ShAmtX = 0;
2188 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2189 !match(X, m_OneUse(
2191 m_ConstantInt(ShAmtX)))))
2192 return false;
2193
2194 Value *SrcY;
2195 uint64_t ShAmtY = 0;
2196 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2197 !match(Y, m_OneUse(
2199 m_ConstantInt(ShAmtY)))))
2200 return false;
2201
2202 // Canonicalize larger shift to the RHS.
2203 if (ShAmtX > ShAmtY) {
2204 std::swap(X, Y);
2205 std::swap(SrcX, SrcY);
2206 std::swap(ShAmtX, ShAmtY);
2207 }
2208
2209 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2210 // difference is the mask width so they can be easily concatenated together.
2211 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2212 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2213 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2214 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2215 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2216 !MaskTy->getElementType()->isIntegerTy(1) ||
2217 MaskTy->getNumElements() != ShAmtDiff ||
2218 MaskTy->getNumElements() > (BitWidth / 2))
2219 return false;
2220
2221 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2222 auto *ConcatIntTy =
2223 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2224 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2225
2226 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2227 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2228
2229 // TODO: Is it worth supporting multi use cases?
2230 InstructionCost OldCost = 0;
2231 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2232 OldCost +=
2233 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2234 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2235 TTI::CastContextHint::None, CostKind);
2236 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2237 TTI::CastContextHint::None, CostKind);
2238
2239 InstructionCost NewCost = 0;
2240 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2241 MaskTy, ConcatMask, CostKind);
2242 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2243 TTI::CastContextHint::None, CostKind);
2244 if (Ty != ConcatIntTy)
2245 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2246 TTI::CastContextHint::None, CostKind);
2247 if (ShAmtX > 0)
2248 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2249
2250 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2251 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2252 << "\n");
2253
2254 if (NewCost > OldCost)
2255 return false;
2256
2257 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2258 // any residual zero-extension or shifting.
2259 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2260 Worklist.pushValue(Concat);
2261
2262 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2263
2264 if (Ty != ConcatIntTy) {
2265 Worklist.pushValue(Result);
2266 Result = Builder.CreateZExt(Result, Ty);
2267 }
2268
2269 if (ShAmtX > 0) {
2270 Worklist.pushValue(Result);
2271 Result = Builder.CreateShl(Result, ShAmtX);
2272 }
2273
2274 replaceValue(I, *Result);
2275 return true;
2276}
2277
2278/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2279/// --> "binop (shuffle), (shuffle)".
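///
/// An illustrative sketch (M, M0, M1 denote shuffle masks; names are made up):
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> %y, M0
///   %s1 = shufflevector <4 x i32> %z, <4 x i32> %w, M1
///   %b  = add <4 x i32> %s0, %s1
///   %r  = shufflevector <4 x i32> %b, <4 x i32> poison, M
/// may become
///   %t0 = shufflevector <4 x i32> %x, <4 x i32> %y, M0'
///   %t1 = shufflevector <4 x i32> %z, <4 x i32> %w, M1'
///   %r  = add <4 x i32> %t0, %t1
/// where M0' and M1' compose the outer permute M with the inner masks.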
2280bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2281 BinaryOperator *BinOp;
2282 ArrayRef<int> OuterMask;
2283 if (!match(&I,
2284 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2285 return false;
2286
2287 // Don't introduce poison into div/rem.
2288 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2289 return false;
2290
2291 Value *Op00, *Op01, *Op10, *Op11;
2292 ArrayRef<int> Mask0, Mask1;
2293 bool Match0 =
2294 match(BinOp->getOperand(0),
2295 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2296 bool Match1 =
2297 match(BinOp->getOperand(1),
2298 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2299 if (!Match0 && !Match1)
2300 return false;
2301
2302 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2303 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2304 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2305 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2306
2307 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2308 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2309 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2310 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2311 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2312 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2313 return false;
2314
2315 unsigned NumSrcElts = BinOpTy->getNumElements();
2316
2317 // Don't accept shuffles that reference the second operand in
2318 // div/rem or if it's an undef arg.
2319 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2320 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2321 return false;
2322
2323 // Merge outer / inner (or identity if no match) shuffles.
2324 SmallVector<int> NewMask0, NewMask1;
2325 for (int M : OuterMask) {
2326 if (M < 0 || M >= (int)NumSrcElts) {
2327 NewMask0.push_back(PoisonMaskElem);
2328 NewMask1.push_back(PoisonMaskElem);
2329 } else {
2330 NewMask0.push_back(Match0 ? Mask0[M] : M);
2331 NewMask1.push_back(Match1 ? Mask1[M] : M);
2332 }
2333 }
2334
2335 unsigned NumOpElts = Op0Ty->getNumElements();
2336 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2337 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2338 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2339 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2340 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2341 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2342
2343 // Try to merge shuffles across the binop if the new shuffles are not costly.
2344 InstructionCost OldCost =
2345 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2346 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2347 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2348 if (Match0)
2349 OldCost += TTI.getShuffleCost(
2350 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2351 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2352 if (Match1)
2353 OldCost += TTI.getShuffleCost(
2354 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2355 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2356
2357 InstructionCost NewCost =
2358 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2359
2360 if (!IsIdentity0)
2361 NewCost +=
2362 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2363 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2364 if (!IsIdentity1)
2365 NewCost +=
2366 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2367 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2368
2369 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2370 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2371 << "\n");
2372
2373 // If costs are equal, still fold as we reduce instruction count.
2374 if (NewCost > OldCost)
2375 return false;
2376
2377 Value *LHS =
2378 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2379 Value *RHS =
2380 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2381 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2382
2383 // Intersect flags from the old binops.
2384 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2385 NewInst->copyIRFlags(BinOp);
2386
2387 Worklist.pushValue(LHS);
2388 Worklist.pushValue(RHS);
2389 replaceValue(I, *NewBO);
2390 return true;
2391}
2392
2393/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2394/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
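///
/// An illustrative sketch (M denotes the shuffle mask; names are made up):
///   %b0 = add <4 x i32> %x, %y
///   %b1 = add <4 x i32> %z, %w
///   %r  = shufflevector <4 x i32> %b0, <4 x i32> %b1, M
/// may become
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z, M
///   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w, M
///   %r  = add <4 x i32> %s0, %s1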
2395bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2396 ArrayRef<int> OldMask;
2397 Instruction *LHS, *RHS;
2398 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2399 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2400 return false;
2401
2402 // TODO: Add support for addlike etc.
2403 if (LHS->getOpcode() != RHS->getOpcode())
2404 return false;
2405
2406 Value *X, *Y, *Z, *W;
2407 bool IsCommutative = false;
2408 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2409 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2410 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2411 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2412 auto *BO = cast<BinaryOperator>(LHS);
2413 // Don't introduce poison into div/rem.
2414 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2415 return false;
2416 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2417 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2418 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2419 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2420 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2421 } else
2422 return false;
2423
2424 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2425 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2426 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2427 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2428 return false;
2429
2430 unsigned NumSrcElts = BinOpTy->getNumElements();
2431
2432 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2433 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2434 std::swap(X, Y);
2435
2436 auto ConvertToUnary = [NumSrcElts](int &M) {
2437 if (M >= (int)NumSrcElts)
2438 M -= NumSrcElts;
2439 };
2440
2441 SmallVector<int> NewMask0(OldMask);
2442 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2443 if (X == Z) {
2444 llvm::for_each(NewMask0, ConvertToUnary);
2445 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2446 Z = PoisonValue::get(BinOpTy);
2447 }
2448
2449 SmallVector<int> NewMask1(OldMask);
2450 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2451 if (Y == W) {
2452 llvm::for_each(NewMask1, ConvertToUnary);
2453 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2454 W = PoisonValue::get(BinOpTy);
2455 }
2456
2457 // Try to replace a binop with a shuffle if the shuffle is not costly.
2458 InstructionCost OldCost =
2462 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2463 &I);
2464
2465 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2466 // where one-use shuffles have gotten split across the binop/cmp. These
2467 // often allow a major reduction in total cost that wouldn't happen as
2468 // individual folds.
2469 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2470 TTI::TargetCostKind CostKind) -> bool {
2471 Value *InnerOp;
2472 ArrayRef<int> InnerMask;
2473 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2474 m_Mask(InnerMask)))) &&
2475 InnerOp->getType() == Op->getType() &&
2476 all_of(InnerMask,
2477 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2478 for (int &M : Mask)
2479 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2480 M = InnerMask[M - Offset];
2481 M = 0 <= M ? M + Offset : M;
2482 }
2484 Op = InnerOp;
2485 return true;
2486 }
2487 return false;
2488 };
2489 bool ReducedInstCount = false;
2490 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2491 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2492 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2493 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2494
2495 auto *ShuffleCmpTy =
2496 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2497 InstructionCost NewCost =
2498 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2499 nullptr, {X, Z}) +
2500 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2501 nullptr, {Y, W});
2502
2503 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2504 NewCost +=
2505 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2506 } else {
2507 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2508 ShuffleDstTy, PredLHS, CostKind);
2509 }
2510
2511 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2512 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2513 << "\n");
2514
2515 // If either shuffle will constant fold away, then fold for the same cost as
2516 // we will reduce the instruction count.
2517 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2518 (isa<Constant>(Y) && isa<Constant>(W));
2519 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2520 return false;
2521
2522 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2523 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2524 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2525 ? Builder.CreateBinOp(
2526 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2527 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2528
2529 // Intersect flags from the old binops.
2530 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2531 NewInst->copyIRFlags(LHS);
2532 NewInst->andIRFlags(RHS);
2533 }
2534
2535 Worklist.pushValue(Shuf0);
2536 Worklist.pushValue(Shuf1);
2537 replaceValue(I, *NewBO);
2538 return true;
2539}
2540
2541/// Try to convert,
2542/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2543/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2544bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2545 ArrayRef<int> Mask;
2546 Value *C1, *T1, *F1, *C2, *T2, *F2;
2547 if (!match(&I, m_Shuffle(
2548 m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2549 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2550 m_Mask(Mask))))
2551 return false;
2552
2553 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2554 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2555 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2556 return false;
2557
2558 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2559 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2560 // SelectInsts must have the same FMF.
2561 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2562 ((SI0FOp != nullptr) &&
2563 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2564 return false;
2565
2566 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2567 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2568 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2569 auto SelOp = Instruction::Select;
2570 InstructionCost OldCost = TTI.getCmpSelInstrCost(
2571 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2572 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2573 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2574 OldCost +=
2575 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2576 {I.getOperand(0), I.getOperand(1)}, &I);
2577
2578 InstructionCost NewCost = TTI.getShuffleCost(
2579 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2580 Mask, CostKind, 0, nullptr, {C1, C2});
2581 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2582 nullptr, {T1, T2});
2583 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2584 nullptr, {F1, F2});
2585 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2586 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2587 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2588 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2589
2590 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2591 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2592 << "\n");
2593 if (NewCost > OldCost)
2594 return false;
2595
2596 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2597 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2598 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2599 Value *NewSel;
2600 // We presuppose that the SelectInsts have the same FMF.
2601 if (SI0FOp)
2602 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2603 SI0FOp->getFastMathFlags());
2604 else
2605 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2606
2607 Worklist.pushValue(ShuffleCmp);
2608 Worklist.pushValue(ShuffleTrue);
2609 Worklist.pushValue(ShuffleFalse);
2610 replaceValue(I, *NewSel);
2611 return true;
2612}
2613
2614/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2615/// into "castop (shuffle)".
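///
/// An illustrative sketch (M denotes an 8-element shuffle mask; names are made
/// up):
///   %c0 = zext <4 x i16> %x to <4 x i32>
///   %c1 = zext <4 x i16> %y to <4 x i32>
///   %r  = shufflevector <4 x i32> %c0, <4 x i32> %c1, M
/// may become
///   %s = shufflevector <4 x i16> %x, <4 x i16> %y, M
///   %r = zext <8 x i16> %s to <8 x i32>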
2616bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2617 Value *V0, *V1;
2618 ArrayRef<int> OldMask;
2619 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2620 return false;
2621
2622 // Check whether this is a binary shuffle.
2623 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2624
2625 auto *C0 = dyn_cast<CastInst>(V0);
2626 auto *C1 = dyn_cast<CastInst>(V1);
2627 if (!C0 || (IsBinaryShuffle && !C1))
2628 return false;
2629
2630 Instruction::CastOps Opcode = C0->getOpcode();
2631
2632 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2633 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2634 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2635 return false;
2636
2637 if (IsBinaryShuffle) {
2638 if (C0->getSrcTy() != C1->getSrcTy())
2639 return false;
2640 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2641 if (Opcode != C1->getOpcode()) {
2642 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2643 Opcode = Instruction::SExt;
2644 else
2645 return false;
2646 }
2647 }
2648
2649 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2650 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2651 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2652 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2653 return false;
2654
2655 unsigned NumSrcElts = CastSrcTy->getNumElements();
2656 unsigned NumDstElts = CastDstTy->getNumElements();
2657 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2658 "Only bitcasts expected to alter src/dst element counts");
2659
2660 // Bail on bitcasts between vector types whose element counts do not
2661 // evenly divide each other, e.g. <32 x i40> -> <40 x i32>.
2662 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2663 (NumDstElts % NumSrcElts) != 0)
2664 return false;
2665
2666 SmallVector<int, 16> NewMask;
2667 if (NumSrcElts >= NumDstElts) {
2668 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2669 // always be expanded to the equivalent form choosing narrower elements.
2670 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2671 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2672 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2673 } else {
2674 // The bitcast is from narrow elements to wide elements. The shuffle mask
2675 // must choose consecutive elements to allow casting first.
2676 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2677 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2678 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2679 return false;
2680 }
2681
2682 auto *NewShuffleDstTy =
2683 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2684
2685 // Try to replace a castop with a shuffle if the shuffle is not costly.
2686 InstructionCost CostC0 =
2687 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2688 TTI::CastContextHint::None, CostKind);
2689
2690 TargetTransformInfo::ShuffleKind ShuffleKind;
2691 if (IsBinaryShuffle)
2692 ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2693 else
2694 ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2695
2696 InstructionCost OldCost = CostC0;
2697 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2698 CostKind, 0, nullptr, {}, &I);
2699
2700 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2701 CastSrcTy, NewMask, CostKind);
2702 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2703 TTI::CastContextHint::None, CostKind);
2704 if (!C0->hasOneUse())
2705 NewCost += CostC0;
2706 if (IsBinaryShuffle) {
2707 InstructionCost CostC1 =
2708 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2709 TTI::CastContextHint::None, CostKind);
2710 OldCost += CostC1;
2711 if (!C1->hasOneUse())
2712 NewCost += CostC1;
2713 }
2714
2715 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2716 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2717 << "\n");
2718 if (NewCost > OldCost)
2719 return false;
2720
2721 Value *Shuf;
2722 if (IsBinaryShuffle)
2723 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2724 NewMask);
2725 else
2726 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2727
2728 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2729
2730 // Intersect flags from the old casts.
2731 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2732 NewInst->copyIRFlags(C0);
2733 if (IsBinaryShuffle)
2734 NewInst->andIRFlags(C1);
2735 }
2736
2737 Worklist.pushValue(Shuf);
2738 replaceValue(I, *Cast);
2739 return true;
2740}
2741
2742/// Try to convert any of:
2743/// "shuffle (shuffle x, y), (shuffle y, x)"
2744/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2745/// "shuffle (shuffle x, undef), y"
2746/// "shuffle x, (shuffle y, undef)"
2747/// into "shuffle x, y".
2748bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2749 ArrayRef<int> OuterMask;
2750 Value *OuterV0, *OuterV1;
2751 if (!match(&I,
2752 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2753 return false;
2754
2755 ArrayRef<int> InnerMask0, InnerMask1;
2756 Value *X0, *X1, *Y0, *Y1;
2757 bool Match0 =
2758 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2759 bool Match1 =
2760 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2761 if (!Match0 && !Match1)
2762 return false;
2763
2764 // If the outer shuffle is a permute, then create a fake inner all-poison
2765 // shuffle. This is easier than accounting for length-changing shuffles below.
2766 SmallVector<int, 16> PoisonMask1;
2767 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2768 X1 = X0;
2769 Y1 = Y0;
2770 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2771 InnerMask1 = PoisonMask1;
2772 Match1 = true; // fake match
2773 }
2774
2775 X0 = Match0 ? X0 : OuterV0;
2776 Y0 = Match0 ? Y0 : OuterV0;
2777 X1 = Match1 ? X1 : OuterV1;
2778 Y1 = Match1 ? Y1 : OuterV1;
2779 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2780 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2781 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2782 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2783 X0->getType() != X1->getType())
2784 return false;
2785
2786 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2787 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2788
2789 // Attempt to merge shuffles, matching up to 2 source operands.
2790 // Replace an index into a poison arg with PoisonMaskElem.
2791 // Bail if either inner mask references an undef arg.
2792 SmallVector<int, 16> NewMask(OuterMask);
2793 Value *NewX = nullptr, *NewY = nullptr;
2794 for (int &M : NewMask) {
2795 Value *Src = nullptr;
2796 if (0 <= M && M < (int)NumImmElts) {
2797 Src = OuterV0;
2798 if (Match0) {
2799 M = InnerMask0[M];
2800 Src = M >= (int)NumSrcElts ? Y0 : X0;
2801 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2802 }
2803 } else if (M >= (int)NumImmElts) {
2804 Src = OuterV1;
2805 M -= NumImmElts;
2806 if (Match1) {
2807 M = InnerMask1[M];
2808 Src = M >= (int)NumSrcElts ? Y1 : X1;
2809 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2810 }
2811 }
2812 if (Src && M != PoisonMaskElem) {
2813 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2814 if (isa<UndefValue>(Src)) {
2815 // We've referenced an undef element - if it's poison, update the shuffle
2816 // mask, else bail.
2817 if (!isa<PoisonValue>(Src))
2818 return false;
2819 M = PoisonMaskElem;
2820 continue;
2821 }
2822 if (!NewX || NewX == Src) {
2823 NewX = Src;
2824 continue;
2825 }
2826 if (!NewY || NewY == Src) {
2827 M += NumSrcElts;
2828 NewY = Src;
2829 continue;
2830 }
2831 return false;
2832 }
2833 }
2834
2835 if (!NewX)
2836 return PoisonValue::get(ShuffleDstTy);
2837 if (!NewY)
2838 NewY = PoisonValue::get(ShuffleSrcTy);
2839
2840 // Have we folded to an Identity shuffle?
2841 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2842 replaceValue(I, *NewX);
2843 return true;
2844 }
2845
2846 // Try to merge the shuffles if the new shuffle is not costly.
2847 InstructionCost InnerCost0 = 0;
2848 if (Match0)
2849 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2850
2851 InstructionCost InnerCost1 = 0;
2852 if (Match1)
2853 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2854
2854
2855 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
2856
2857 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2858
2859 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2860 TargetTransformInfo::ShuffleKind SK =
2861 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2862 : TargetTransformInfo::SK_PermuteTwoSrc;
2863 InstructionCost NewCost =
2864 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2865 nullptr, {NewX, NewY});
2866 if (!OuterV0->hasOneUse())
2867 NewCost += InnerCost0;
2868 if (!OuterV1->hasOneUse())
2869 NewCost += InnerCost1;
2870
2871 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2872 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2873 << "\n");
2874 if (NewCost > OldCost)
2875 return false;
2876
2877 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2878 replaceValue(I, *Shuf);
2879 return true;
2880}
2881
2882/// Try to convert a chain of length-preserving shuffles that are fed by
2883/// length-changing shuffles from the same source, e.g. a chain of length 3:
2884///
2885/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
2886/// (shuffle y, undef)),
2887///          (shuffle y, undef)"
2888///
2889/// into a single shuffle fed by a length-changing shuffle:
2890///
2891/// "shuffle x, (shuffle y, undef)"
2892///
2893/// Such chains arise e.g. from folding extract/insert sequences.
2894bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
2895 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
2896 if (!TrunkType)
2897 return false;
2898
2899 unsigned ChainLength = 0;
2900 SmallVector<int> Mask;
2901 SmallVector<int> YMask;
2902 InstructionCost OldCost = 0;
2903 InstructionCost NewCost = 0;
2904 Value *Trunk = &I;
2905 unsigned NumTrunkElts = TrunkType->getNumElements();
2906 Value *Y = nullptr;
2907
2908 for (;;) {
2909 // Match the current trunk against (commutations of) the pattern
2910 // "shuffle trunk', (shuffle y, undef)"
2911 ArrayRef<int> OuterMask;
2912 Value *OuterV0, *OuterV1;
2913 if (ChainLength != 0 && !Trunk->hasOneUse())
2914 break;
2915 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
2916 m_Mask(OuterMask))))
2917 break;
2918 if (OuterV0->getType() != TrunkType) {
2919 // This shuffle is not length-preserving, so it cannot be part of the
2920 // chain.
2921 break;
2922 }
2923
2924 ArrayRef<int> InnerMask0, InnerMask1;
2925 Value *A0, *A1, *B0, *B1;
2926 bool Match0 =
2927 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
2928 bool Match1 =
2929 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
2930 bool Match0Leaf = Match0 && A0->getType() != I.getType();
2931 bool Match1Leaf = Match1 && A1->getType() != I.getType();
2932 if (Match0Leaf == Match1Leaf) {
2933 // Only handle the case of exactly one leaf in each step. The "two leaves"
2934 // case is handled by foldShuffleOfShuffles.
2935 break;
2936 }
2937
2938 SmallVector<int> CommutedOuterMask;
2939 if (Match0Leaf) {
2940 std::swap(OuterV0, OuterV1);
2941 std::swap(InnerMask0, InnerMask1);
2942 std::swap(A0, A1);
2943 std::swap(B0, B1);
2944 llvm::append_range(CommutedOuterMask, OuterMask);
2945 for (int &M : CommutedOuterMask) {
2946 if (M == PoisonMaskElem)
2947 continue;
2948 if (M < (int)NumTrunkElts)
2949 M += NumTrunkElts;
2950 else
2951 M -= NumTrunkElts;
2952 }
2953 OuterMask = CommutedOuterMask;
2954 }
2955 if (!OuterV1->hasOneUse())
2956 break;
2957
2958 if (!isa<UndefValue>(A1)) {
2959 if (!Y)
2960 Y = A1;
2961 else if (Y != A1)
2962 break;
2963 }
2964 if (!isa<UndefValue>(B1)) {
2965 if (!Y)
2966 Y = B1;
2967 else if (Y != B1)
2968 break;
2969 }
2970
2971 auto *YType = cast<FixedVectorType>(A1->getType());
2972 int NumLeafElts = YType->getNumElements();
2973 SmallVector<int> LocalYMask(InnerMask1);
2974 for (int &M : LocalYMask) {
2975 if (M >= NumLeafElts)
2976 M -= NumLeafElts;
2977 }
2978
2979 InstructionCost LocalOldCost =
2982
2983 // Handle the initial (start of chain) case.
2984 if (!ChainLength) {
2985 Mask.assign(OuterMask);
2986 YMask.assign(LocalYMask);
2987 OldCost = NewCost = LocalOldCost;
2988 Trunk = OuterV0;
2989 ChainLength++;
2990 continue;
2991 }
2992
2993 // For the non-root case, first attempt to combine masks.
2994 SmallVector<int> NewYMask(YMask);
2995 bool Valid = true;
2996 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
2997 if (LeafM == -1 || CombinedM == LeafM)
2998 continue;
2999 if (CombinedM == -1) {
3000 CombinedM = LeafM;
3001 } else {
3002 Valid = false;
3003 break;
3004 }
3005 }
3006 if (!Valid)
3007 break;
3008
3009 SmallVector<int> NewMask;
3010 NewMask.reserve(NumTrunkElts);
3011 for (int M : Mask) {
3012 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3013 NewMask.push_back(M);
3014 else
3015 NewMask.push_back(OuterMask[M]);
3016 }
3017
3018 // Break the chain if adding this new step complicates the shuffles such
3019 // that it would increase the new cost by more than the old cost of this
3020 // step.
3021 InstructionCost LocalNewCost =
3023 YType, NewYMask, CostKind) +
3025 TrunkType, NewMask, CostKind);
3026
3027 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3028 break;
3029
3030 LLVM_DEBUG({
3031 if (ChainLength == 1) {
3032 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3033 << I << '\n';
3034 }
3035 dbgs() << " next chain link: " << *Trunk << '\n'
3036 << " old cost: " << (OldCost + LocalOldCost)
3037 << " new cost: " << LocalNewCost << '\n';
3038 });
3039
3040 Mask = NewMask;
3041 YMask = NewYMask;
3042 OldCost += LocalOldCost;
3043 NewCost = LocalNewCost;
3044 Trunk = OuterV0;
3045 ChainLength++;
3046 }
3047 if (ChainLength <= 1)
3048 return false;
3049
3050 if (llvm::all_of(Mask, [&](int M) {
3051 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3052 })) {
3053 // Produce a canonical simplified form if all elements are sourced from Y.
3054 for (int &M : Mask) {
3055 if (M >= static_cast<int>(NumTrunkElts))
3056 M = YMask[M - NumTrunkElts];
3057 }
3058 Value *Root =
3059 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3060 replaceValue(I, *Root);
3061 return true;
3062 }
3063
3064 Value *Leaf =
3065 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3066 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3067 replaceValue(I, *Root);
3068 return true;
3069}
3070
3071/// Try to convert
3072/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
3073bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3074 Value *V0, *V1;
3075 ArrayRef<int> OldMask;
3076 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
3077 m_Mask(OldMask))))
3078 return false;
3079
3080 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3081 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3082 if (!II0 || !II1)
3083 return false;
3084
3085 Intrinsic::ID IID = II0->getIntrinsicID();
3086 if (IID != II1->getIntrinsicID())
3087 return false;
3088
3089 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3090 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3091 if (!ShuffleDstTy || !II0Ty)
3092 return false;
3093
3094 if (!isTriviallyVectorizable(IID))
3095 return false;
3096
3097 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3098 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
3099 II0->getArgOperand(I) != II1->getArgOperand(I))
3100 return false;
3101
3102 InstructionCost OldCost =
3103 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
3104 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
3105 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
3106 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3107
3108 SmallVector<Type *> NewArgsTy;
3109 InstructionCost NewCost = 0;
3110 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3111 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3112 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3113 } else {
3114 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3115 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3116 ShuffleDstTy->getNumElements());
3117 NewArgsTy.push_back(ArgTy);
3118 NewCost += TTI.getShuffleCost(
3119 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3120 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3121 }
3122 }
3123 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3124 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3125
3126 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3127 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3128 << "\n");
3129
3130 if (NewCost > OldCost)
3131 return false;
3132
3133 SmallVector<Value *> NewArgs;
3134 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3135 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3136 NewArgs.push_back(II0->getArgOperand(I));
3137 } else {
3138 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3139 II1->getArgOperand(I), OldMask);
3140 NewArgs.push_back(Shuf);
3141 Worklist.pushValue(Shuf);
3142 }
3143 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3144
3145 // Intersect flags from the old intrinsics.
3146 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3147 NewInst->copyIRFlags(II0);
3148 NewInst->andIRFlags(II1);
3149 }
3150
3151 replaceValue(I, *NewIntrinsic);
3152 return true;
3153}
3154
3155/// Try to convert
3156/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
3157bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3158 Value *V0;
3159 ArrayRef<int> Mask;
3160 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Undef(), m_Mask(Mask))))
3161 return false;
3162
3163 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3164 if (!II0)
3165 return false;
3166
3167 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3168 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3169 if (!ShuffleDstTy || !IntrinsicSrcTy)
3170 return false;
3171
3172 // Validate it's a pure permute, mask should only reference the first vector
3173 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3174 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3175 return false;
3176
3177 Intrinsic::ID IID = II0->getIntrinsicID();
3178 if (!isTriviallyVectorizable(IID))
3179 return false;
3180
3181 // Cost analysis
3182 InstructionCost OldCost =
3183 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
3184 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
3185 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3186
3187 SmallVector<Type *> NewArgsTy;
3188 InstructionCost NewCost = 0;
3189 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3190 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3191 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3192 } else {
3193 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3194 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3195 ShuffleDstTy->getNumElements());
3196 NewArgsTy.push_back(ArgTy);
3197 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3198 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3199 {II0->getArgOperand(I)});
3200 }
3201 }
3202 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3203 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3204
3205 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3206 << OldCost << " vs NewCost: " << NewCost << "\n");
3207
3208 if (NewCost > OldCost)
3209 return false;
3210
3211 // Transform
3212 SmallVector<Value *> NewArgs;
3213 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3214 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3215 NewArgs.push_back(II0->getArgOperand(I));
3216 } else {
3217 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3218 NewArgs.push_back(Shuf);
3219 Worklist.pushValue(Shuf);
3220 }
3221 }
3222
3223 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3224
3225 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3226 NewInst->copyIRFlags(II0);
3227
3228 replaceValue(I, *NewIntrinsic);
3229 return true;
3230}
3231
3232using InstLane = std::pair<Use *, int>;
3233
3234static InstLane lookThroughShuffles(Use *U, int Lane) {
3235 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
3236 unsigned NumElts =
3237 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3238 int M = SV->getMaskValue(Lane);
3239 if (M < 0)
3240 return {nullptr, PoisonMaskElem};
3241 if (static_cast<unsigned>(M) < NumElts) {
3242 U = &SV->getOperandUse(0);
3243 Lane = M;
3244 } else {
3245 U = &SV->getOperandUse(1);
3246 Lane = M - NumElts;
3247 }
3248 }
3249 return InstLane{U, Lane};
3250}
3251
3252 static SmallVector<InstLane>
3253 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
3254 SmallVector<InstLane> NItem;
3255 for (InstLane IL : Item) {
3256 auto [U, Lane] = IL;
3257 InstLane OpLane =
3258 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
3259 Lane)
3260 : InstLane{nullptr, PoisonMaskElem};
3261 NItem.emplace_back(OpLane);
3262 }
3263 return NItem;
3264}
3265
3266/// Detect concat of multiple values into a vector
3267 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
3268 const TargetTransformInfo &TTI) {
3269 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3270 unsigned NumElts = Ty->getNumElements();
3271 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3272 return false;
3273
3274 // Check that the concat is free, usually meaning that the type will be split
3275 // during legalization.
3276 SmallVector<int, 16> ConcatMask(NumElts * 2);
3277 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3278 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3279 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3280 Ty, ConcatMask, CostKind) != 0)
3281 return false;
3282
3283 unsigned NumSlices = Item.size() / NumElts;
3284 // Currently we generate a tree of shuffles for the concats, which limits us
3285 // to a power of 2.
3286 if (!isPowerOf2_32(NumSlices))
3287 return false;
3288 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3289 Use *SliceV = Item[Slice * NumElts].first;
3290 if (!SliceV || SliceV->get()->getType() != Ty)
3291 return false;
3292 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3293 auto [V, Lane] = Item[Slice * NumElts + Elt];
3294 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3295 return false;
3296 }
3297 }
3298 return true;
3299}
3300
3301 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
3302 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3303 const SmallPtrSet<Use *, 4> &SplatLeafs,
3304 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3305 IRBuilderBase &Builder,
3306 const TargetTransformInfo *TTI) {
3307 auto [FrontU, FrontLane] = Item.front();
3308
3309 if (IdentityLeafs.contains(FrontU)) {
3310 return FrontU->get();
3311 }
3312 if (SplatLeafs.contains(FrontU)) {
3313 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3314 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3315 }
3316 if (ConcatLeafs.contains(FrontU)) {
3317 unsigned NumElts =
3318 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3319 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3320 for (unsigned S = 0; S < Values.size(); ++S)
3321 Values[S] = Item[S * NumElts].first->get();
3322
3323 while (Values.size() > 1) {
3324 NumElts *= 2;
3325 SmallVector<int, 16> Mask(NumElts, 0);
3326 std::iota(Mask.begin(), Mask.end(), 0);
3327 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3328 for (unsigned S = 0; S < NewValues.size(); ++S)
3329 NewValues[S] =
3330 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3331 Values = NewValues;
3332 }
3333 return Values[0];
3334 }
3335
3336 auto *I = cast<Instruction>(FrontU->get());
3337 auto *II = dyn_cast<IntrinsicInst>(I);
3338 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3339 SmallVector<Value *> Ops(NumOps, nullptr);
3340 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3341 if (II &&
3342 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3343 Ops[Idx] = II->getOperand(Idx);
3344 continue;
3345 }
3346 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
3347 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3348 Builder, TTI);
3349 }
3350
3351 SmallVector<Value *, 8> ValueList;
3352 for (const auto &Lane : Item)
3353 if (Lane.first)
3354 ValueList.push_back(Lane.first->get());
3355
3356 Type *DstTy =
3357 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3358 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3359 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3360 Ops[0], Ops[1]);
3361 propagateIRFlags(Value, ValueList);
3362 return Value;
3363 }
3364 if (auto *CI = dyn_cast<CmpInst>(I)) {
3365 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3366 propagateIRFlags(Value, ValueList);
3367 return Value;
3368 }
3369 if (auto *SI = dyn_cast<SelectInst>(I)) {
3370 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3371 propagateIRFlags(Value, ValueList);
3372 return Value;
3373 }
3374 if (auto *CI = dyn_cast<CastInst>(I)) {
3375 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3376 propagateIRFlags(Value, ValueList);
3377 return Value;
3378 }
3379 if (II) {
3380 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3381 propagateIRFlags(Value, ValueList);
3382 return Value;
3383 }
3384 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3385 auto *Value =
3386 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3387 propagateIRFlags(Value, ValueList);
3388 return Value;
3389}
3390
3391// Starting from a shuffle, look up through operands tracking the shuffled index
3392// of each lane. If we can simplify away the shuffles to identities then
3393// do so.
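// An illustrative (hand-written) example: the split/concat pair around the
// xors below is superfluous, so the whole tree reduces to one wide xor.
//   %l = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %h = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
//   %nl = xor <2 x i32> %l, <i32 -1, i32 -1>
//   %nh = xor <2 x i32> %h, <i32 -1, i32 -1>
//   %r = shufflevector <2 x i32> %nl, <2 x i32> %nh, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// roughly becomes:
//   %r = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>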
3394bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3395 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3396 if (!Ty || I.use_empty())
3397 return false;
3398
3399 SmallVector<InstLane> Start(Ty->getNumElements());
3400 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3401 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3402
3403 SmallVector<SmallVector<InstLane>> Worklist;
3404 Worklist.push_back(Start);
3405 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3406 unsigned NumVisited = 0;
3407
3408 while (!Worklist.empty()) {
3409 if (++NumVisited > MaxInstrsToScan)
3410 return false;
3411
3412 SmallVector<InstLane> Item = Worklist.pop_back_val();
3413 auto [FrontU, FrontLane] = Item.front();
3414
3415 // If we found an undef first lane then bail out to keep things simple.
3416 if (!FrontU)
3417 return false;
3418
3419 // Helper to peek through bitcasts to the same value.
3420 auto IsEquiv = [&](Value *X, Value *Y) {
3421 return X->getType() == Y->getType() &&
3423 };
3424
3425 // Look for an identity value.
3426 if (FrontLane == 0 &&
3427 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3428 Ty->getNumElements() &&
3429 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3430 Value *FrontV = Item.front().first->get();
3431 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3432 E.value().second == (int)E.index());
3433 })) {
3434 IdentityLeafs.insert(FrontU);
3435 continue;
3436 }
3437 // Look for constants, for the moment only supporting constant splats.
3438 if (auto *C = dyn_cast<Constant>(FrontU);
3439 C && C->getSplatValue() &&
3440 all_of(drop_begin(Item), [Item](InstLane &IL) {
3441 Value *FrontV = Item.front().first->get();
3442 Use *U = IL.first;
3443 return !U || (isa<Constant>(U->get()) &&
3444 cast<Constant>(U->get())->getSplatValue() ==
3445 cast<Constant>(FrontV)->getSplatValue());
3446 })) {
3447 SplatLeafs.insert(FrontU);
3448 continue;
3449 }
3450 // Look for a splat value.
3451 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3452 auto [FrontU, FrontLane] = Item.front();
3453 auto [U, Lane] = IL;
3454 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3455 })) {
3456 SplatLeafs.insert(FrontU);
3457 continue;
3458 }
3459
3460 // We need each element to be the same type of value, and check that each
3461 // element has a single use.
3462 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3463 Value *FrontV = Item.front().first->get();
3464 if (!IL.first)
3465 return true;
3466 Value *V = IL.first->get();
3467 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3468 return false;
3469 if (V->getValueID() != FrontV->getValueID())
3470 return false;
3471 if (auto *CI = dyn_cast<CmpInst>(V))
3472 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3473 return false;
3474 if (auto *CI = dyn_cast<CastInst>(V))
3475 if (CI->getSrcTy()->getScalarType() !=
3476 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3477 return false;
3478 if (auto *SI = dyn_cast<SelectInst>(V))
3479 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3480 SI->getOperand(0)->getType() !=
3481 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3482 return false;
3483 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3484 return false;
3485 auto *II = dyn_cast<IntrinsicInst>(V);
3486 return !II || (isa<IntrinsicInst>(FrontV) &&
3487 II->getIntrinsicID() ==
3488 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3489 !II->hasOperandBundles());
3490 };
3491 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3492 // Check the operator is one that we support.
3493 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3494 // We exclude div/rem in case they hit UB from poison lanes.
3495 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3496 BO && BO->isIntDivRem())
3497 return false;
3498 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3499 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3500 continue;
3501 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3502 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3503 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3504 continue;
3505 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3506 // TODO: Handle vector widening/narrowing bitcasts.
3507 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3508 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3509 if (DstTy && SrcTy &&
3510 SrcTy->getNumElements() == DstTy->getNumElements()) {
3511 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3512 continue;
3513 }
3514 } else if (isa<SelectInst>(FrontU)) {
3515 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3516 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3517 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3518 continue;
3519 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3520 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3521 !II->hasOperandBundles()) {
3522 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3523 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3524 &TTI)) {
3525 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3526 Value *FrontV = Item.front().first->get();
3527 Use *U = IL.first;
3528 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3529 cast<Instruction>(FrontV)->getOperand(Op));
3530 }))
3531 return false;
3532 continue;
3533 }
3534 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3535 }
3536 continue;
3537 }
3538 }
3539
3540 if (isFreeConcat(Item, CostKind, TTI)) {
3541 ConcatLeafs.insert(FrontU);
3542 continue;
3543 }
3544
3545 return false;
3546 }
3547
3548 if (NumVisited <= 1)
3549 return false;
3550
3551 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3552
3553 // If we got this far, we know the shuffles are superfluous and can be
3554 // removed. Scan through again and generate the new tree of instructions.
3555 Builder.SetInsertPoint(&I);
3556 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3557 ConcatLeafs, Builder, &TTI);
3558 replaceValue(I, *V);
3559 return true;
3560}
3561
3562/// Given a commutative reduction, the order of the input lanes does not alter
3563/// the results. We can use this to remove certain shuffles feeding the
3564/// reduction, removing the need to shuffle at all.
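/// An illustrative (hand-written) example with a commutative reduction:
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// Here the sorted mask is the identity mask, so the shuffle can eventually be
/// dropped and the reduction fed directly by %x.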
3565bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3566 auto *II = dyn_cast<IntrinsicInst>(&I);
3567 if (!II)
3568 return false;
3569 switch (II->getIntrinsicID()) {
3570 case Intrinsic::vector_reduce_add:
3571 case Intrinsic::vector_reduce_mul:
3572 case Intrinsic::vector_reduce_and:
3573 case Intrinsic::vector_reduce_or:
3574 case Intrinsic::vector_reduce_xor:
3575 case Intrinsic::vector_reduce_smin:
3576 case Intrinsic::vector_reduce_smax:
3577 case Intrinsic::vector_reduce_umin:
3578 case Intrinsic::vector_reduce_umax:
3579 break;
3580 default:
3581 return false;
3582 }
3583
3584 // Find all the inputs when looking through operations that do not alter the
3585 // lane order (binops, for example). Currently we look for a single shuffle,
3586 // and can ignore splat values.
3587 std::queue<Value *> Worklist;
3588 SmallPtrSet<Value *, 4> Visited;
3589 ShuffleVectorInst *Shuffle = nullptr;
3590 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3591 Worklist.push(Op);
3592
3593 while (!Worklist.empty()) {
3594 Value *CV = Worklist.front();
3595 Worklist.pop();
3596 if (Visited.contains(CV))
3597 continue;
3598
3599 // Splats don't change the order, so can be safely ignored.
3600 if (isSplatValue(CV))
3601 continue;
3602
3603 Visited.insert(CV);
3604
3605 if (auto *CI = dyn_cast<Instruction>(CV)) {
3606 if (CI->isBinaryOp()) {
3607 for (auto *Op : CI->operand_values())
3608 Worklist.push(Op);
3609 continue;
3610 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3611 if (Shuffle && Shuffle != SV)
3612 return false;
3613 Shuffle = SV;
3614 continue;
3615 }
3616 }
3617
3618 // Anything else is currently an unknown node.
3619 return false;
3620 }
3621
3622 if (!Shuffle)
3623 return false;
3624
3625 // Check all uses of the binary ops and shuffles are also included in the
3626 // lane-invariant operations (Visited should be the list of lanewise
3627 // instructions, including the shuffle that we found).
3628 for (auto *V : Visited)
3629 for (auto *U : V->users())
3630 if (!Visited.contains(U) && U != &I)
3631 return false;
3632
3633 FixedVectorType *VecType =
3634 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3635 if (!VecType)
3636 return false;
3637 FixedVectorType *ShuffleInputType =
3638 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3639 if (!ShuffleInputType)
3640 return false;
3641 unsigned NumInputElts = ShuffleInputType->getNumElements();
3642
3643 // Find the mask from sorting the lanes into order. This is most likely to
3644 // become an identity or concat mask. Undef elements are pushed to the end.
3645 SmallVector<int> ConcatMask;
3646 Shuffle->getShuffleMask(ConcatMask);
3647 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3648 bool UsesSecondVec =
3649 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3650
3651 InstructionCost OldCost = TTI.getShuffleCost(
3652 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3653 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3654 InstructionCost NewCost = TTI.getShuffleCost(
3655 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3656 ShuffleInputType, ConcatMask, CostKind);
3657
3658 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3659 << "\n");
3660 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3661 << "\n");
3662 bool MadeChanges = false;
3663 if (NewCost < OldCost) {
3664 Builder.SetInsertPoint(Shuffle);
3665 Value *NewShuffle = Builder.CreateShuffleVector(
3666 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3667 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3668 replaceValue(*Shuffle, *NewShuffle);
3669 return true;
3670 }
3671
3672 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3673 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3674 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3675 return MadeChanges;
3676}
3677
3678/// For a given chain of patterns of the following form:
3679///
3680/// ```
3681/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3682///
3683/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3684/// ty1> %1)
3685/// OR
3686/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3687///
3688/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3689/// ...
3690/// ...
3691/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3692/// 3), <n x ty1> %(i - 2)
3693/// OR
3694/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3695///
3696/// %(i) = extractelement <n x ty1> %(i - 1), 0
3697/// ```
3698///
3699/// Where:
3700/// `mask` follows a partition pattern:
3701///
3702/// Ex:
3703/// [n = 8, p = poison]
3704///
3705/// 4 5 6 7 | p p p p
3706/// 2 3 | p p p p p p
3707/// 1 | p p p p p p p
3708///
3709/// For powers of 2, there's a consistent pattern, but for other cases
3710/// the parity of the current half value at each step decides the
3711/// next partition half (see `ExpectedParityMask` for more logical details
3712/// in generalising this).
3713///
3714/// Ex:
3715/// [n = 6]
3716///
3717/// 3 4 5 | p p p
3718/// 1 2 | p p p p
3719/// 1 | p p p p p
3720bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3721 // Going bottom-up for the pattern.
3722 std::queue<Value *> InstWorklist;
3723 InstructionCost OrigCost = 0;
3724
3725 // Common instruction operation after each shuffle op.
3726 std::optional<unsigned int> CommonCallOp = std::nullopt;
3727 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3728
3729 bool IsFirstCallOrBinInst = true;
3730 bool ShouldBeCallOrBinInst = true;
3731
3732 // This stores the last used instructions for shuffle/common op.
3733 //
3734 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3735 // instructions from either shuffle/common op.
3736 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3737
3738 Value *VecOpEE;
3739 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3740 return false;
3741
3742 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3743 if (!FVT)
3744 return false;
3745
3746 int64_t VecSize = FVT->getNumElements();
3747 if (VecSize < 2)
3748 return false;
3749
3750 // Number of levels would be ~log2(n), considering we always partition
3751 // by half for this fold pattern.
3752 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3753 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3754
3755 // This is how we generalise for all element sizes.
3756 // At each step, if vector size is odd, we need non-poison
3757 // values to cover the dominant half so we don't miss out on any element.
3758 //
3759 // This mask will help us retrieve this as we go from bottom to top:
3760 //
3761 // Mask Set -> N = N * 2 - 1
3762 // Mask Unset -> N = N * 2
3763 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3764 Cur = (Cur + 1) / 2, --Mask) {
3765 if (Cur & 1)
3766 ExpectedParityMask |= (1ll << Mask);
3767 }
3768
3769 InstWorklist.push(VecOpEE);
3770
3771 while (!InstWorklist.empty()) {
3772 Value *CI = InstWorklist.front();
3773 InstWorklist.pop();
3774
3775 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3776 if (!ShouldBeCallOrBinInst)
3777 return false;
3778
3779 if (!IsFirstCallOrBinInst &&
3780 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3781 return false;
3782
3783 // For the first found call/bin op, the vector has to come from the
3784 // extract element op.
3785 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3786 return false;
3787 IsFirstCallOrBinInst = false;
3788
3789 if (!CommonCallOp)
3790 CommonCallOp = II->getIntrinsicID();
3791 if (II->getIntrinsicID() != *CommonCallOp)
3792 return false;
3793
3794 switch (II->getIntrinsicID()) {
3795 case Intrinsic::umin:
3796 case Intrinsic::umax:
3797 case Intrinsic::smin:
3798 case Intrinsic::smax: {
3799 auto *Op0 = II->getOperand(0);
3800 auto *Op1 = II->getOperand(1);
3801 PrevVecV[0] = Op0;
3802 PrevVecV[1] = Op1;
3803 break;
3804 }
3805 default:
3806 return false;
3807 }
3808 ShouldBeCallOrBinInst ^= 1;
3809
3810 IntrinsicCostAttributes ICA(
3811 *CommonCallOp, II->getType(),
3812 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3813 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3814
3815 // We may need a swap here since it can be (a, b) or (b, a)
3816 // and accordingly change as we go up.
3817 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3818 std::swap(PrevVecV[0], PrevVecV[1]);
3819 InstWorklist.push(PrevVecV[1]);
3820 InstWorklist.push(PrevVecV[0]);
3821 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3822 // Similar logic for bin ops.
3823
3824 if (!ShouldBeCallOrBinInst)
3825 return false;
3826
3827 if (!IsFirstCallOrBinInst &&
3828 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3829 return false;
3830
3831 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3832 return false;
3833 IsFirstCallOrBinInst = false;
3834
3835 if (!CommonBinOp)
3836 CommonBinOp = BinOp->getOpcode();
3837
3838 if (BinOp->getOpcode() != *CommonBinOp)
3839 return false;
3840
3841 switch (*CommonBinOp) {
3842 case BinaryOperator::Add:
3843 case BinaryOperator::Mul:
3844 case BinaryOperator::Or:
3845 case BinaryOperator::And:
3846 case BinaryOperator::Xor: {
3847 auto *Op0 = BinOp->getOperand(0);
3848 auto *Op1 = BinOp->getOperand(1);
3849 PrevVecV[0] = Op0;
3850 PrevVecV[1] = Op1;
3851 break;
3852 }
3853 default:
3854 return false;
3855 }
3856 ShouldBeCallOrBinInst ^= 1;
3857
3858 OrigCost +=
3859 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3860
3861 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3862 std::swap(PrevVecV[0], PrevVecV[1]);
3863 InstWorklist.push(PrevVecV[1]);
3864 InstWorklist.push(PrevVecV[0]);
3865 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3866 // We shouldn't have any null values in the previous vectors,
3867 // if so, there was a mismatch in the pattern.
3868 if (ShouldBeCallOrBinInst ||
3869 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3870 return false;
3871
3872 if (SVInst != PrevVecV[1])
3873 return false;
3874
3875 ArrayRef<int> CurMask;
3876 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3877 m_Mask(CurMask))))
3878 return false;
3879
3880 // Subtract the parity mask when checking the condition.
3881 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3882 if (Mask < ShuffleMaskHalf &&
3883 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3884 return false;
3885 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3886 return false;
3887 }
3888
3889 // Update mask values.
3890 ShuffleMaskHalf *= 2;
3891 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3892 ExpectedParityMask >>= 1;
3893
3894 OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3895 SVInst->getType(), SVInst->getType(),
3896 CurMask, CostKind);
3897
3898 VisitedCnt += 1;
3899 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3900 break;
3901
3902 ShouldBeCallOrBinInst ^= 1;
3903 } else {
3904 return false;
3905 }
3906 }
3907
3908 // Pattern should end with a shuffle op.
3909 if (ShouldBeCallOrBinInst)
3910 return false;
3911
3912 assert(VecSize != -1 && "Expected Match for Vector Size");
3913
3914 Value *FinalVecV = PrevVecV[0];
3915 if (!FinalVecV)
3916 return false;
3917
3918 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3919
3920 Intrinsic::ID ReducedOp =
3921 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3922 : getReductionForBinop(*CommonBinOp));
3923 if (!ReducedOp)
3924 return false;
3925
3926 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3927 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3928
3929 if (NewCost >= OrigCost)
3930 return false;
3931
3932 auto *ReducedResult =
3933 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3934 replaceValue(I, *ReducedResult);
3935
3936 return true;
3937}
3938
3939 /// Determine if it's more efficient to fold:
3940/// reduce(trunc(x)) -> trunc(reduce(x)).
3941/// reduce(sext(x)) -> sext(reduce(x)).
3942/// reduce(zext(x)) -> zext(reduce(x)).
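/// An illustrative (hand-written) example of the trunc case:
///   %t = trunc <8 x i32> %x to <8 x i16>
///   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
/// may become:
///   %w = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %w to i16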
3943bool VectorCombine::foldCastFromReductions(Instruction &I) {
3944 auto *II = dyn_cast<IntrinsicInst>(&I);
3945 if (!II)
3946 return false;
3947
3948 bool TruncOnly = false;
3949 Intrinsic::ID IID = II->getIntrinsicID();
3950 switch (IID) {
3951 case Intrinsic::vector_reduce_add:
3952 case Intrinsic::vector_reduce_mul:
3953 TruncOnly = true;
3954 break;
3955 case Intrinsic::vector_reduce_and:
3956 case Intrinsic::vector_reduce_or:
3957 case Intrinsic::vector_reduce_xor:
3958 break;
3959 default:
3960 return false;
3961 }
3962
3963 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3964 Value *ReductionSrc = I.getOperand(0);
3965
3966 Value *Src;
3967 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3968 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3969 return false;
3970
3971 auto CastOpc =
3972 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3973
3974 auto *SrcTy = cast<VectorType>(Src->getType());
3975 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3976 Type *ResultTy = I.getType();
3977
3978 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3979 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3980 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3981 TTI::CastContextHint::None, CostKind,
3982 cast<CastInst>(ReductionSrc));
3983 InstructionCost NewCost =
3984 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3985 CostKind) +
3986 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3987 TTI::CastContextHint::None, CostKind);
3988
3989 if (OldCost <= NewCost || !NewCost.isValid())
3990 return false;
3991
3992 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3993 II->getIntrinsicID(), {Src});
3994 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3995 replaceValue(I, *NewCast);
3996 return true;
3997}
3998
3999/// Returns true if this ShuffleVectorInst eventually feeds into a
4000/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4001/// chains of shuffles and binary operators (in any combination/order).
4002 /// The search gives up after visiting a fixed number of instructions.
4003 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
4004 constexpr unsigned MaxVisited = 32;
4005 SmallPtrSet<Instruction *, 8> Visited;
4006 SmallVector<Instruction *, 4> WorkList;
4007 bool FoundReduction = false;
4008
4009 WorkList.push_back(SVI);
4010 while (!WorkList.empty()) {
4011 Instruction *I = WorkList.pop_back_val();
4012 for (User *U : I->users()) {
4013 auto *UI = cast<Instruction>(U);
4014 if (!UI || !Visited.insert(UI).second)
4015 continue;
4016 if (Visited.size() > MaxVisited)
4017 return false;
4018 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4019 // More than one reduction reached
4020 if (FoundReduction)
4021 return false;
4022 switch (II->getIntrinsicID()) {
4023 case Intrinsic::vector_reduce_add:
4024 case Intrinsic::vector_reduce_mul:
4025 case Intrinsic::vector_reduce_and:
4026 case Intrinsic::vector_reduce_or:
4027 case Intrinsic::vector_reduce_xor:
4028 case Intrinsic::vector_reduce_smin:
4029 case Intrinsic::vector_reduce_smax:
4030 case Intrinsic::vector_reduce_umin:
4031 case Intrinsic::vector_reduce_umax:
4032 FoundReduction = true;
4033 continue;
4034 default:
4035 return false;
4036 }
4037 }
4038
4039 if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
4040 return false;
4041
4042 WorkList.emplace_back(UI);
4043 }
4044 }
4045 return FoundReduction;
4046}
4047
4048/// This method looks for groups of shuffles acting on binops, of the form:
4049/// %x = shuffle ...
4050/// %y = shuffle ...
4051/// %a = binop %x, %y
4052/// %b = binop %x, %y
4053/// shuffle %a, %b, selectmask
4054/// We may, especially if the shuffle is wider than legal, be able to convert
4055/// the shuffle to a form where only parts of a and b need to be computed. On
4056/// architectures with no obvious "select" shuffle, this can reduce the total
4057/// number of operations if the target reports them as cheaper.
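/// An illustrative (hand-written) instance of the pattern above:
///   %x = shufflevector <4 x i32> %i0, <4 x i32> %i1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %y = shufflevector <4 x i32> %i0, <4 x i32> %i1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
///   %a = add <4 x i32> %x, %y
///   %b = sub <4 x i32> %x, %y
///   %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
/// Here only lanes 0/2 of %a and lanes 1/3 of %b are used, so narrower binops
/// may suffice once the used lanes are packed together.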
4058bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
4059 auto *SVI = cast<ShuffleVectorInst>(&I);
4060 auto *VT = cast<FixedVectorType>(I.getType());
4061 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4062 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
4063 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4064 VT != Op0->getType())
4065 return false;
4066
4067 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4068 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4069 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4070 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4071 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
4072 auto checkSVNonOpUses = [&](Instruction *I) {
4073 if (!I || I->getOperand(0)->getType() != VT)
4074 return true;
4075 return any_of(I->users(), [&](User *U) {
4076 return U != Op0 && U != Op1 &&
4077 !(isa<ShuffleVectorInst>(U) &&
4078 (InputShuffles.contains(cast<Instruction>(U)) ||
4079 isInstructionTriviallyDead(cast<Instruction>(U))));
4080 });
4081 };
4082 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4083 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4084 return false;
4085
4086 // Collect all the uses that are shuffles that we can transform together. We
4087 // may not have a single shuffle, but a group that can all be transformed
4088 // together profitably.
4089 SmallVector<ShuffleVectorInst *> Shuffles;
4090 auto collectShuffles = [&](Instruction *I) {
4091 for (auto *U : I->users()) {
4092 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4093 if (!SV || SV->getType() != VT)
4094 return false;
4095 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4096 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4097 return false;
4098 if (!llvm::is_contained(Shuffles, SV))
4099 Shuffles.push_back(SV);
4100 }
4101 return true;
4102 };
4103 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4104 return false;
4105 // From a reduction, we need to be processing a single shuffle, otherwise the
4106 // other uses will not be lane-invariant.
4107 if (FromReduction && Shuffles.size() > 1)
4108 return false;
4109
4110 // Add any shuffle uses for the shuffles we have found, to include them in our
4111 // cost calculations.
4112 if (!FromReduction) {
4113 for (ShuffleVectorInst *SV : Shuffles) {
4114 for (auto *U : SV->users()) {
4115 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4116 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4117 Shuffles.push_back(SSV);
4118 }
4119 }
4120 }
4121
4122 // For each of the output shuffles, we try to sort all the first vector
4123 // elements to the beginning, followed by the second vector's elements at the
4124 // end. If the binops are legalized to smaller vectors, this may reduce the total
4125 // number of binops. We compute the ReconstructMask mask needed to convert
4126 // back to the original lane order.
4127 SmallVector<std::pair<int, int>> V1, V2;
4128 SmallVector<SmallVector<int>> OrigReconstructMasks;
4129 int MaxV1Elt = 0, MaxV2Elt = 0;
4130 unsigned NumElts = VT->getNumElements();
4131 for (ShuffleVectorInst *SVN : Shuffles) {
4132 SmallVector<int> Mask;
4133 SVN->getShuffleMask(Mask);
4134
4135 // Check the operands are the same as the original, or reversed (in which
4136 // case we need to commute the mask).
4137 Value *SVOp0 = SVN->getOperand(0);
4138 Value *SVOp1 = SVN->getOperand(1);
4139 if (isa<UndefValue>(SVOp1)) {
4140 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
4141 SVOp0 = SSV->getOperand(0);
4142 SVOp1 = SSV->getOperand(1);
4143 for (int &Elem : Mask) {
4144 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
4145 return false;
4146 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
4147 }
4148 }
4149 if (SVOp0 == Op1 && SVOp1 == Op0) {
4150 std::swap(SVOp0, SVOp1);
4151 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
4152 }
4153 if (SVOp0 != Op0 || SVOp1 != Op1)
4154 return false;
4155
4156 // Calculate the reconstruction mask for this shuffle, as the mask needed to
4157 // take the packed values from Op0/Op1 and reconstruct the original
4158 // order.
4159 SmallVector<int> ReconstructMask;
4160 for (unsigned I = 0; I < Mask.size(); I++) {
4161 if (Mask[I] < 0) {
4162 ReconstructMask.push_back(-1);
4163 } else if (Mask[I] < static_cast<int>(NumElts)) {
4164 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
4165 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
4166 return Mask[I] == A.first;
4167 });
4168 if (It != V1.end())
4169 ReconstructMask.push_back(It - V1.begin());
4170 else {
4171 ReconstructMask.push_back(V1.size());
4172 V1.emplace_back(Mask[I], V1.size());
4173 }
4174 } else {
4175 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
4176 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
4177 return Mask[I] - static_cast<int>(NumElts) == A.first;
4178 });
4179 if (It != V2.end())
4180 ReconstructMask.push_back(NumElts + It - V2.begin());
4181 else {
4182 ReconstructMask.push_back(NumElts + V2.size());
4183 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
4184 }
4185 }
4186 }
4187
4188 // For reductions, we know that the lane ordering out doesn't alter the
4189 // result. In-order can help simplify the shuffle away.
4190 if (FromReduction)
4191 sort(ReconstructMask);
4192 OrigReconstructMasks.push_back(std::move(ReconstructMask));
4193 }
4194
4195 // If the maximum element used from V1 and V2 is not larger than the new
4196 // vectors, the vectors are already packed and performing the optimization
4197 // again will likely not help any further. This also prevents us from getting
4198 // stuck in a cycle in case the costs do not also rule it out.
4199 if (V1.empty() || V2.empty() ||
4200 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
4201 MaxV2Elt == static_cast<int>(V2.size()) - 1))
4202 return false;
4203
4204 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
4205 // shuffle of another shuffle, or not a shuffle (which is treated like an
4206 // identity shuffle).
4207 auto GetBaseMaskValue = [&](Instruction *I, int M) {
4208 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4209 if (!SV)
4210 return M;
4211 if (isa<UndefValue>(SV->getOperand(1)))
4212 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4213 if (InputShuffles.contains(SSV))
4214 return SSV->getMaskValue(SV->getMaskValue(M));
4215 return SV->getMaskValue(M);
4216 };
4217
4218 // Attempt to sort the inputs by ascending mask values to make simpler input
4219 // shuffles and push complex shuffles down to the uses. We sort on the first
4220 // of the two input shuffle orders, to try and get at least one input into a
4221 // nice order.
4222 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
4223 std::pair<int, int> Y) {
4224 int MXA = GetBaseMaskValue(A, X.first);
4225 int MYA = GetBaseMaskValue(A, Y.first);
4226 return MXA < MYA;
4227 };
4228 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
4229 return SortBase(SVI0A, A, B);
4230 });
4231 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
4232 return SortBase(SVI1A, A, B);
4233 });
4234 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
4235 // modified order of the input shuffles.
4236 SmallVector<SmallVector<int>> ReconstructMasks;
4237 for (const auto &Mask : OrigReconstructMasks) {
4238 SmallVector<int> ReconstructMask;
4239 for (int M : Mask) {
4240 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
4241 auto It = find_if(V, [M](auto A) { return A.second == M; });
4242 assert(It != V.end() && "Expected all entries in Mask");
4243 return std::distance(V.begin(), It);
4244 };
4245 if (M < 0)
4246 ReconstructMask.push_back(-1);
4247 else if (M < static_cast<int>(NumElts)) {
4248 ReconstructMask.push_back(FindIndex(V1, M));
4249 } else {
4250 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
4251 }
4252 }
4253 ReconstructMasks.push_back(std::move(ReconstructMask));
4254 }
4255
4256 // Calculate the masks needed for the new input shuffles, which get padded
4257 // with undef
4258 SmallVector<int> V1A, V1B, V2A, V2B;
4259 for (unsigned I = 0; I < V1.size(); I++) {
4260 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
4261 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
4262 }
4263 for (unsigned I = 0; I < V2.size(); I++) {
4264 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
4265 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
4266 }
4267 while (V1A.size() < NumElts) {
4268 V1A.push_back(PoisonMaskElem);
4269 V1B.push_back(PoisonMaskElem);
4270 }
4271 while (V2A.size() < NumElts) {
4272 V2A.push_back(PoisonMaskElem);
4273 V2B.push_back(PoisonMaskElem);
4274 }
4275
4276 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
4277 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4278 if (!SV)
4279 return C;
4280 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
4281 ? TTI::SK_PermuteSingleSrc
4282 : TTI::SK_PermuteTwoSrc,
4283 VT, VT, SV->getShuffleMask(), CostKind);
4284 };
4285 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4286 return C +
4287 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4288 };
4289
4290 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
4291 unsigned MaxVectorSize =
4292 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector).getFixedValue();
4293 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
4294 if (MaxElementsInVector == 0)
4295 return false;
4296 // When there are multiple shufflevector operations on the same input,
4297 // especially when the vector length is larger than the register size,
4298 // identical shuffle patterns may occur across different groups of elements.
4299 // To avoid overestimating the cost by counting these repeated shuffles more
4300 // than once, we only account for unique shuffle patterns. This adjustment
4301 // prevents inflated costs in the cost model for wide vectors split into
4302 // several register-sized groups.
4303 std::set<SmallVector<int, 4>> UniqueShuffles;
4304 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4305 // Compute the cost for performing the shuffle over the full vector.
4306 auto ShuffleCost =
4307 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4308 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
4309 if (NumFullVectors < 2)
4310 return C + ShuffleCost;
4311 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
4312 unsigned NumUniqueGroups = 0;
4313 unsigned NumGroups = Mask.size() / MaxElementsInVector;
4314 // For each group of MaxElementsInVector contiguous elements,
4315 // collect their shuffle pattern and insert into the set of unique patterns.
4316 for (unsigned I = 0; I < NumFullVectors; ++I) {
4317 for (unsigned J = 0; J < MaxElementsInVector; ++J)
4318 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
4319 if (UniqueShuffles.insert(SubShuffle).second)
4320 NumUniqueGroups += 1;
4321 }
4322 return C + ShuffleCost * NumUniqueGroups / NumGroups;
4323 };
4324 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
4325 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4326 if (!SV)
4327 return C;
4328 SmallVector<int, 16> Mask;
4329 SV->getShuffleMask(Mask);
4330 return AddShuffleMaskAdjustedCost(C, Mask);
4331 };
4332 // Check that input consists of ShuffleVectors applied to the same input
4333 auto AllShufflesHaveSameOperands =
4334 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
4335 if (InputShuffles.size() < 2)
4336 return false;
4337 ShuffleVectorInst *FirstSV =
4338 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
4339 if (!FirstSV)
4340 return false;
4341
4342 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
4343 return std::all_of(
4344 std::next(InputShuffles.begin()), InputShuffles.end(),
4345 [&](Instruction *I) {
4346 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
4347 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
4348 });
4349 };
4350
4351 // Get the costs of the shuffles + binops before and after with the new
4352 // shuffle masks.
4353 InstructionCost CostBefore =
4354 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
4355 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
4356 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
4357 InstructionCost(0), AddShuffleCost);
4358 if (AllShufflesHaveSameOperands(InputShuffles)) {
4359 UniqueShuffles.clear();
4360 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4361 InstructionCost(0), AddShuffleAdjustedCost);
4362 } else {
4363 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4364 InstructionCost(0), AddShuffleCost);
4365 }
4366
4367 // The new binops will be unused for lanes past the used shuffle lengths.
4368 // These types attempt to get the correct cost for that from the target.
4369 FixedVectorType *Op0SmallVT =
4370 FixedVectorType::get(VT->getScalarType(), V1.size());
4371 FixedVectorType *Op1SmallVT =
4372 FixedVectorType::get(VT->getScalarType(), V2.size());
4373 InstructionCost CostAfter =
4374 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4375 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4376 UniqueShuffles.clear();
4377 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4378 InstructionCost(0), AddShuffleMaskAdjustedCost);
4379 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4380 CostAfter +=
4381 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4382 InstructionCost(0), AddShuffleMaskCost);
4383
4384 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4385 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4386 << " vs CostAfter: " << CostAfter << "\n");
4387 if (CostBefore < CostAfter ||
4388 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4389 return false;
4390
4391 // The cost model has passed, create the new instructions.
4392 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4393 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4394 if (!SV)
4395 return I;
4396 if (isa<UndefValue>(SV->getOperand(1)))
4397 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4398 if (InputShuffles.contains(SSV))
4399 return SSV->getOperand(Op);
4400 return SV->getOperand(Op);
4401 };
4402 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4403 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4404 GetShuffleOperand(SVI0A, 1), V1A);
4405 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4406 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4407 GetShuffleOperand(SVI0B, 1), V1B);
4408 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4409 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4410 GetShuffleOperand(SVI1A, 1), V2A);
4411 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4412 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4413 GetShuffleOperand(SVI1B, 1), V2B);
4414 Builder.SetInsertPoint(Op0);
4415 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4416 NSV0A, NSV0B);
4417 if (auto *I = dyn_cast<Instruction>(NOp0))
4418 I->copyIRFlags(Op0, true);
4419 Builder.SetInsertPoint(Op1);
4420 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4421 NSV1A, NSV1B);
4422 if (auto *I = dyn_cast<Instruction>(NOp1))
4423 I->copyIRFlags(Op1, true);
4424
4425 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4426 Builder.SetInsertPoint(Shuffles[S]);
4427 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4428 replaceValue(*Shuffles[S], *NSV, false);
4429 }
4430
4431 Worklist.pushValue(NSV0A);
4432 Worklist.pushValue(NSV0B);
4433 Worklist.pushValue(NSV1A);
4434 Worklist.pushValue(NSV1B);
4435 return true;
4436}
4437
4438/// Check if instruction depends on ZExt and this ZExt can be moved after the
4439/// instruction. Move ZExt if it is profitable. For example:
4440/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4441/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4442 /// The cost model calculations take into account whether zext(x) has other
4443 /// users and whether the zext can be propagated through them too.
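/// An illustrative (hand-written) example:
///   %z = zext <8 x i16> %x to <8 x i32>
///   %r = and <8 x i32> %z, %y
/// may become:
///   %t = trunc <8 x i32> %y to <8 x i16>
///   %a = and <8 x i16> %x, %t
///   %r = zext <8 x i16> %a to <8 x i32>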
4444bool VectorCombine::shrinkType(Instruction &I) {
4445 Value *ZExted, *OtherOperand;
4446 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4447 m_Value(OtherOperand))) &&
4448 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4449 return false;
4450
4451 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4452
4453 auto *BigTy = cast<FixedVectorType>(I.getType());
4454 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4455 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4456
4457 if (I.getOpcode() == Instruction::LShr) {
4458 // Check that the shift amount is less than the number of bits in the
4459 // smaller type. Otherwise, the smaller lshr will return a poison value.
4460 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4461 if (ShAmtKB.getMaxValue().uge(BW))
4462 return false;
4463 } else {
4464 // Check that the expression overall uses at most the same number of bits as
4465 // ZExted
4466 KnownBits KB = computeKnownBits(&I, *DL);
4467 if (KB.countMaxActiveBits() > BW)
4468 return false;
4469 }
4470
4471 // Calculate costs of leaving current IR as it is and moving ZExt operation
4472 // later, along with adding truncates if needed
4473 InstructionCost ZExtCost = TTI.getCastInstrCost(
4474 Instruction::ZExt, BigTy, SmallTy,
4475 TargetTransformInfo::CastContextHint::None, CostKind);
4476 InstructionCost CurrentCost = ZExtCost;
4477 InstructionCost ShrinkCost = 0;
4478
4479 // Calculate total cost and check that we can propagate through all ZExt users
4480 for (User *U : ZExtOperand->users()) {
4481 auto *UI = cast<Instruction>(U);
4482 if (UI == &I) {
4483 CurrentCost +=
4484 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4485 ShrinkCost +=
4486 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4487 ShrinkCost += ZExtCost;
4488 continue;
4489 }
4490
4491 if (!Instruction::isBinaryOp(UI->getOpcode()))
4492 return false;
4493
4494 // Check if we can propagate ZExt through its other users
4495 KnownBits KB = computeKnownBits(UI, *DL);
4496 if (KB.countMaxActiveBits() > BW)
4497 return false;
4498
4499 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4500 ShrinkCost +=
4501 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4502 ShrinkCost += ZExtCost;
4503 }
4504
4505 // If the other instruction operand is not a constant, we'll need to
4506 // generate a truncate instruction. So we have to adjust cost
4507 if (!isa<Constant>(OtherOperand))
4508 ShrinkCost += TTI.getCastInstrCost(
4509 Instruction::Trunc, SmallTy, BigTy,
4510 TargetTransformInfo::CastContextHint::None, CostKind);
4511
4512 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4513 // towards modifying the IR because shrinking opens opportunities for other
4514 // shrinking optimisations.
4515 if (ShrinkCost > CurrentCost)
4516 return false;
4517
4518 Builder.SetInsertPoint(&I);
4519 Value *Op0 = ZExted;
4520 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4521 // Keep the order of operands the same
4522 if (I.getOperand(0) == OtherOperand)
4523 std::swap(Op0, Op1);
4524 Value *NewBinOp =
4525 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4526 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4527 cast<Instruction>(NewBinOp)->copyMetadata(I);
4528 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4529 replaceValue(I, *NewZExtr);
4530 return true;
4531}
4532
4533/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4534/// shuffle (DstVec, SrcVec, Mask)
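/// An illustrative (hand-written) example:
///   %e = extractelement <4 x float> %src, i64 2
///   %r = insertelement <4 x float> %dst, float %e, i64 0
/// may become:
///   %r = shufflevector <4 x float> %dst, <4 x float> %src, <4 x i32> <i32 6, i32 1, i32 2, i32 3>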
4535bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4536 Value *DstVec, *SrcVec;
4537 uint64_t ExtIdx, InsIdx;
4538 if (!match(&I,
4539 m_InsertElt(m_Value(DstVec),
4540 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4541 m_ConstantInt(InsIdx))))
4542 return false;
4543
4544 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4545 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4546 // We can try combining vectors with different element counts, but the element types must match.
4547 if (!DstVecTy || !SrcVecTy ||
4548 SrcVecTy->getElementType() != DstVecTy->getElementType())
4549 return false;
4550
4551 unsigned NumDstElts = DstVecTy->getNumElements();
4552 unsigned NumSrcElts = SrcVecTy->getNumElements();
4553 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4554 return false;
4555
4556 // Insertion into poison is a cheaper single operand shuffle.
4557 TargetTransformInfo::ShuffleKind SK = TargetTransformInfo::SK_PermuteTwoSrc;
4558 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4559
4560 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4561 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4562 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4563 if (NeedDstSrcSwap) {
4564 SK = TargetTransformInfo::SK_PermuteSingleSrc;
4565 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4566 Mask[InsIdx] = 0;
4567 else
4568 Mask[InsIdx] = ExtIdx;
4569 std::swap(DstVec, SrcVec);
4570 } else {
4572 std::iota(Mask.begin(), Mask.end(), 0);
4573 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4574 Mask[InsIdx] = NumDstElts;
4575 else
4576 Mask[InsIdx] = ExtIdx + NumDstElts;
4577 }
4578
4579 // Cost
4580 auto *Ins = cast<InsertElementInst>(&I);
4581 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4582 InstructionCost InsCost =
4583 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4584 InstructionCost ExtCost =
4585 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4586 InstructionCost OldCost = ExtCost + InsCost;
4587
4588 InstructionCost NewCost = 0;
4589 SmallVector<int> ExtToVecMask;
4590 if (!NeedExpOrNarrow) {
4591 // Ignore 'free' identity insertion shuffle.
4592 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4593 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4594 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4595 nullptr, {DstVec, SrcVec});
4596 } else {
4597 // When creating a length-changing vector, always create it with a mask whose
4598 // first element is ExtIdx, so that the first element of the vector
4599 // being created is always the target element to be extracted.
4600 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4601 if (IsExtIdxInBounds)
4602 ExtToVecMask[ExtIdx] = ExtIdx;
4603 else
4604 ExtToVecMask[0] = ExtIdx;
4605 // Add cost for expanding or narrowing
4606 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4607 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4608 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4609 }
4610
4611 if (!Ext->hasOneUse())
4612 NewCost += ExtCost;
4613
4614 LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
4615 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4616 << "\n");
4617
4618 if (OldCost < NewCost)
4619 return false;
4620
4621 if (NeedExpOrNarrow) {
4622 if (!NeedDstSrcSwap)
4623 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4624 else
4625 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4626 }
4627
4628 // Canonicalize undef param to RHS to help further folds.
4629 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4630 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4631 std::swap(DstVec, SrcVec);
4632 }
4633
4634 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4635 replaceValue(I, *Shuf);
4636
4637 return true;
4638}
4639
4640/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4641/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4642/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4643/// before casting it back into `<vscale x 16 x i32>`.
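/// As a worked example of the splat arithmetic: 666 is 0x29A and 777 is 0x309,
/// so the combined 64-bit splat value is (0x309 << 32) | 0x29A =
/// 0x000003090000029A.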
4644bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4645 const APInt *SplatVal0, *SplatVal1;
4646 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4647 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4648 return false;
4649
4650 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4651 << "\n");
4652
4653 auto *VTy =
4654 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4655 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4656 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4657
4658 // Just in case the cost of interleave2 intrinsic and bitcast are both
4659 // invalid, in which case we want to bail out, we use <= rather
4660 // than < here. Even if they both have valid and equal costs, it's probably
4661 // not a good idea to emit a high-cost constant splat.
4662 if (TTI.getInstructionCost(&I, CostKind) <=
4663 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4664 TTI::CastContextHint::None, CostKind)) {
4665 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4666 << *I.getType() << " is too high.\n");
4667 return false;
4668 }
4669
4670 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4671 NewSplatVal <<= Width;
4672 NewSplatVal |= SplatVal0->zext(Width * 2);
4673 auto *NewSplat = ConstantVector::getSplat(
4674 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4675
4676 IRBuilder<> Builder(&I);
4677 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4678 return true;
4679}
4680
4681// Attempt to shrink loads that are only used by shufflevector instructions.
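// An illustrative (hand-written) example: only the low two lanes are used, so
// the load can be narrowed.
//   %v = load <8 x i32>, ptr %p
//   %s = shufflevector <8 x i32> %v, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
// may become:
//   %v2 = load <2 x i32>, ptr %p
//   %s = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 0, i32 1>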
4682bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4683 auto *OldLoad = dyn_cast<LoadInst>(&I);
4684 if (!OldLoad || !OldLoad->isSimple())
4685 return false;
4686
4687 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4688 if (!OldLoadTy)
4689 return false;
4690
4691 unsigned const OldNumElements = OldLoadTy->getNumElements();
4692
4693 // Search all uses of the load. If all uses are shufflevector instructions, and
4694 // the second operands are all poison values, find the minimum and maximum
4695 // indices of the vector elements referenced by all shuffle masks.
4696 // Otherwise return `std::nullopt`.
4697 using IndexRange = std::pair<int, int>;
4698 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4699 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4700 for (llvm::Use &Use : I.uses()) {
4701 // Ensure all uses match the required pattern.
4702 User *Shuffle = Use.getUser();
4703 ArrayRef<int> Mask;
4704
4705 if (!match(Shuffle,
4706 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4707 return std::nullopt;
4708
4709 // Ignore shufflevector instructions that have no uses.
4710 if (Shuffle->use_empty())
4711 continue;
4712
4713 // Find the min and max indices used by the shufflevector instruction.
4714 for (int Index : Mask) {
4715 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4716 OutputRange.first = std::min(Index, OutputRange.first);
4717 OutputRange.second = std::max(Index, OutputRange.second);
4718 }
4719 }
4720 }
4721
4722 if (OutputRange.second < OutputRange.first)
4723 return std::nullopt;
4724
4725 return OutputRange;
4726 };
4727
4728 // Get the range of vector elements used by shufflevector instructions.
4729 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4730 unsigned const NewNumElements = Indices->second + 1u;
4731
4732 // If the range of vector elements is smaller than the full load, attempt
4733 // to create a smaller load.
4734 if (NewNumElements < OldNumElements) {
4735 IRBuilder Builder(&I);
4736 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4737
4738 // Calculate costs of old and new ops.
4739 Type *ElemTy = OldLoadTy->getElementType();
4740 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4741 Value *PtrOp = OldLoad->getPointerOperand();
4742
4743 InstructionCost OldCost = TTI.getMemoryOpCost(
4744 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4745 OldLoad->getPointerAddressSpace(), CostKind);
4746 InstructionCost NewCost =
4747 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4748 OldLoad->getPointerAddressSpace(), CostKind);
4749
4750 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4751 SmallVector<UseEntry, 4u> NewUses;
4752 unsigned const MaxIndex = NewNumElements * 2u;
4753
4754 for (llvm::Use &Use : I.uses()) {
4755 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4756 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4757
4758 // Create entry for new use.
4759 NewUses.push_back({Shuffle, OldMask});
4760
4761 // Validate mask indices.
4762 for (int Index : OldMask) {
4763 if (Index >= static_cast<int>(MaxIndex))
4764 return false;
4765 }
4766
4767 // Update costs.
4768 OldCost +=
4769 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4770 OldLoadTy, OldMask, CostKind);
4771 NewCost +=
4772 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4773 NewLoadTy, OldMask, CostKind);
4774 }
4775
4776 LLVM_DEBUG(
4777 dbgs() << "Found a load used only by shufflevector instructions: "
4778 << I << "\n OldCost: " << OldCost
4779 << " vs NewCost: " << NewCost << "\n");
4780
4781 if (OldCost < NewCost || !NewCost.isValid())
4782 return false;
4783
4784 // Create new load of smaller vector.
4785 auto *NewLoad = cast<LoadInst>(
4786 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4787 NewLoad->copyMetadata(I);
4788
4789 // Replace all uses.
4790 for (UseEntry &Use : NewUses) {
4791 ShuffleVectorInst *Shuffle = Use.first;
4792 std::vector<int> &NewMask = Use.second;
4793
4794 Builder.SetInsertPoint(Shuffle);
4795 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4796 Value *NewShuffle = Builder.CreateShuffleVector(
4797 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4798
4799 replaceValue(*Shuffle, *NewShuffle, false);
4800 }
4801
4802 return true;
4803 }
4804 }
4805 return false;
4806}
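// A minimal illustrative sketch (not part of the original source; the helper
// name is hypothetical): deriving the shrunken element count from the users'
// shuffle masks, as GetIndexRangeInShuffles does above. Only in-range indices
// matter; poison lanes (-1) and second-operand lanes are ignored.
static unsigned neededLeadingEltsSketch(ArrayRef<SmallVector<int>> Masks,
                                        unsigned OldNumElements) {
  int MaxIndex = -1;
  for (const SmallVector<int> &Mask : Masks)
    for (int Index : Mask)
      if (Index >= 0 && Index < static_cast<int>(OldNumElements))
        MaxIndex = std::max(MaxIndex, Index);
  return static_cast<unsigned>(MaxIndex + 1); // 0 if no lane is referenced
}
// For example, a load of <8 x i32> whose only users shuffle lanes {0,1,2,3}
// needs just NewNumElements == 4, so it can be shrunk to a <4 x i32> load.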
4807
4808// Attempt to narrow a phi of shufflevector instructions where the two incoming
4809// values have the same operands but different masks. If the two shuffle masks
4810 // are offsets of one another, we can use one branch to rotate the incoming
4811// vector and perform one larger shuffle after the phi.
4812bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4813 auto *Phi = dyn_cast<PHINode>(&I);
4814 if (!Phi || Phi->getNumIncomingValues() != 2u)
4815 return false;
4816
4817 Value *Op = nullptr;
4818 ArrayRef<int> Mask0;
4819 ArrayRef<int> Mask1;
4820
4821 if (!match(Phi->getOperand(0u),
4822 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4823 !match(Phi->getOperand(1u),
4824 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4825 return false;
4826
4827 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4828
4829 // Ensure result vectors are wider than the argument vector.
4830 auto *InputVT = cast<FixedVectorType>(Op->getType());
4831 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4832 auto const InputNumElements = InputVT->getNumElements();
4833
4834 if (InputNumElements >= ResultVT->getNumElements())
4835 return false;
4836
4837 // Take the difference of the two shuffle masks at each index. Ignore poison
4838 // values at the same index in both masks.
4839 SmallVector<int, 16> NewMask;
4840 NewMask.reserve(Mask0.size());
4841
4842 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4843 if (M0 >= 0 && M1 >= 0)
4844 NewMask.push_back(M0 - M1);
4845 else if (M0 == -1 && M1 == -1)
4846 continue;
4847 else
4848 return false;
4849 }
4850
4851 // Ensure all elements of the new mask are equal. If the difference between
4852 // the incoming mask elements is the same at every index, the two masks must
4853 // be constant offsets of one another.
4854 if (NewMask.empty() || !all_equal(NewMask))
4855 return false;
4856
4857 // Create new mask using difference of the two incoming masks.
4858 int MaskOffset = NewMask[0u];
4859 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
4860 NewMask.clear();
4861
4862 for (unsigned I = 0u; I < InputNumElements; ++I) {
4863 NewMask.push_back(Index);
4864 Index = (Index + 1u) % InputNumElements;
4865 }
4866
4867 // Calculate costs for worst cases and compare.
4868 auto const Kind = TTI::SK_PermuteSingleSrc;
4869 auto OldCost =
4870 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4871 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4872 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4873 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4874
4875 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4876 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4877 << "\n");
4878
4879 if (NewCost > OldCost)
4880 return false;
4881
4882 // Create new shuffles and narrowed phi.
4883 auto Builder = IRBuilder(Shuf);
4884 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4885 auto *PoisonVal = PoisonValue::get(InputVT);
4886 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4887 Worklist.push(cast<Instruction>(NewShuf0));
4888
4889 Builder.SetInsertPoint(Phi);
4890 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4891 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4892 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4893 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4894
4895 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4896 PoisonVal = PoisonValue::get(NewPhi->getType());
4897 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4898
4899 replaceValue(*Phi, *NewShuf1);
4900 return true;
4901}
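// A minimal illustrative sketch (not part of the original source; the helper
// name is hypothetical): building the rotate mask used above. If
// Mask0[i] == Mask1[i] + Offset for every non-poison lane, then rotating the
// incoming vector by Offset on one edge and reapplying Mask1 after the phi
// reproduces Mask0 on that path.
static SmallVector<int> buildRotateMaskSketch(int Offset,
                                              unsigned InputNumElements) {
  SmallVector<int> Mask(InputNumElements);
  unsigned Index = (InputNumElements + Offset) % InputNumElements;
  for (unsigned I = 0; I != InputNumElements; ++I) {
    Mask[I] = static_cast<int>(Index);
    Index = (Index + 1u) % InputNumElements;
  }
  return Mask;
}
// For example, Offset = 1 and InputNumElements = 4 give the mask <1, 2, 3, 0>.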
4902
4903/// This is the entry point for all transforms. Pass manager differences are
4904/// handled in the callers of this function.
4905bool VectorCombine::run() {
4906 if (DisableVectorCombine)
4907 return false;
4908
4909 // Don't attempt vectorization if the target does not support vectors.
4910 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4911 return false;
4912
4913 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4914
4915 auto FoldInst = [this](Instruction &I) {
4916 Builder.SetInsertPoint(&I);
4917 bool IsVectorType = isa<VectorType>(I.getType());
4918 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4919 auto Opcode = I.getOpcode();
4920
4921 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4922
4923 // These folds should be beneficial regardless of when this pass is run
4924 // in the optimization pipeline.
4925 // The type checking is for run-time efficiency. We can avoid wasting time
4926 // dispatching to folding functions if there's no chance of matching.
4927 if (IsFixedVectorType) {
4928 switch (Opcode) {
4929 case Instruction::InsertElement:
4930 if (vectorizeLoadInsert(I))
4931 return true;
4932 break;
4933 case Instruction::ShuffleVector:
4934 if (widenSubvectorLoad(I))
4935 return true;
4936 break;
4937 default:
4938 break;
4939 }
4940 }
4941
4942 // These transforms work with scalable and fixed vectors.
4943 // TODO: Identify and allow other scalable transforms.
4944 if (IsVectorType) {
4945 if (scalarizeOpOrCmp(I))
4946 return true;
4947 if (scalarizeLoad(I))
4948 return true;
4949 if (scalarizeExtExtract(I))
4950 return true;
4951 if (scalarizeVPIntrinsic(I))
4952 return true;
4953 if (foldInterleaveIntrinsics(I))
4954 return true;
4955 }
4956
4957 if (Opcode == Instruction::Store)
4958 if (foldSingleElementStore(I))
4959 return true;
4960
4961 // If this is an early pipeline invocation of this pass, we are done.
4962 if (TryEarlyFoldsOnly)
4963 return false;
4964
4965 // Otherwise, try folds that improve codegen but may interfere with
4966 // early IR canonicalizations.
4967 // The type checking is for run-time efficiency. We can avoid wasting time
4968 // dispatching to folding functions if there's no chance of matching.
4969 if (IsFixedVectorType) {
4970 switch (Opcode) {
4971 case Instruction::InsertElement:
4972 if (foldInsExtFNeg(I))
4973 return true;
4974 if (foldInsExtBinop(I))
4975 return true;
4976 if (foldInsExtVectorToShuffle(I))
4977 return true;
4978 break;
4979 case Instruction::ShuffleVector:
4980 if (foldPermuteOfBinops(I))
4981 return true;
4982 if (foldShuffleOfBinops(I))
4983 return true;
4984 if (foldShuffleOfSelects(I))
4985 return true;
4986 if (foldShuffleOfCastops(I))
4987 return true;
4988 if (foldShuffleOfShuffles(I))
4989 return true;
4990 if (foldPermuteOfIntrinsic(I))
4991 return true;
4992 if (foldShufflesOfLengthChangingShuffles(I))
4993 return true;
4994 if (foldShuffleOfIntrinsics(I))
4995 return true;
4996 if (foldSelectShuffle(I))
4997 return true;
4998 if (foldShuffleToIdentity(I))
4999 return true;
5000 break;
5001 case Instruction::Load:
5002 if (shrinkLoadForShuffles(I))
5003 return true;
5004 break;
5005 case Instruction::BitCast:
5006 if (foldBitcastShuffle(I))
5007 return true;
5008 break;
5009 case Instruction::And:
5010 case Instruction::Or:
5011 case Instruction::Xor:
5012 if (foldBitOpOfCastops(I))
5013 return true;
5014 if (foldBitOpOfCastConstant(I))
5015 return true;
5016 break;
5017 case Instruction::PHI:
5018 if (shrinkPhiOfShuffles(I))
5019 return true;
5020 break;
5021 default:
5022 if (shrinkType(I))
5023 return true;
5024 break;
5025 }
5026 } else {
5027 switch (Opcode) {
5028 case Instruction::Call:
5029 if (foldShuffleFromReductions(I))
5030 return true;
5031 if (foldCastFromReductions(I))
5032 return true;
5033 break;
5034 case Instruction::ExtractElement:
5035 if (foldShuffleChainsToReduce(I))
5036 return true;
5037 break;
5038 case Instruction::ICmp:
5039 case Instruction::FCmp:
5040 if (foldExtractExtract(I))
5041 return true;
5042 break;
5043 case Instruction::Or:
5044 if (foldConcatOfBoolMasks(I))
5045 return true;
5046 [[fallthrough]];
5047 default:
5048 if (Instruction::isBinaryOp(Opcode)) {
5049 if (foldExtractExtract(I))
5050 return true;
5051 if (foldExtractedCmps(I))
5052 return true;
5053 if (foldBinopOfReductions(I))
5054 return true;
5055 }
5056 break;
5057 }
5058 }
5059 return false;
5060 };
5061
5062 bool MadeChange = false;
5063 for (BasicBlock &BB : F) {
5064 // Ignore unreachable basic blocks.
5065 if (!DT.isReachableFromEntry(&BB))
5066 continue;
5067 // We need to erase instructions while iterating over the block, but
5068 // make_early_inc_range is not applicable here, as the next iterator may
5069 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5070 // We manually maintain the next instruction and update it when it is about
5071 // to be deleted.
5072 Instruction *I = &BB.front();
5073 while (I) {
5074 NextInst = I->getNextNode();
5075 if (!I->isDebugOrPseudoInst())
5076 MadeChange |= FoldInst(*I);
5077 I = NextInst;
5078 }
5079 }
5080
5081 NextInst = nullptr;
5082
5083 while (!Worklist.isEmpty()) {
5084 Instruction *I = Worklist.removeOne();
5085 if (!I)
5086 continue;
5087
5088 if (isInstructionTriviallyDead(I)) {
5089 eraseInstruction(*I);
5090 continue;
5091 }
5092
5093 MadeChange |= FoldInst(*I);
5094 }
5095
5096 return MadeChange;
5097}
5098
5099 PreservedAnalyses VectorCombinePass::run(Function &F,
5100 FunctionAnalysisManager &FAM) {
5101 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
5102 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
5103 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5104 AAResults &AA = FAM.getResult<AAManager>(F);
5105 const DataLayout *DL = &F.getDataLayout();
5106 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5107 TryEarlyFoldsOnly);
5108 if (!Combiner.run())
5109 return PreservedAnalyses::all();
5110 PreservedAnalyses PA;
5111 PA.preserveSet<CFGAnalyses>();
5112 return PA;
5113}
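// A minimal sketch (not part of the original source; the helper name is
// hypothetical) of how this pass is typically scheduled with the new pass
// manager; the boolean mirrors the TryEarlyFoldsOnly flag threaded through
// above. From the command line the standalone pass can be run with
// `opt -passes=vector-combine`.
static FunctionPassManager buildVectorCombinePipelineSketch() {
  FunctionPassManager FPM;
  FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true)); // early pipeline
  FPM.addPass(VectorCombinePass());                           // late pipeline
  return FPM;
}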