LLVM 23.0.0git
VectorCombine.cpp
Go to the documentation of this file.
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
40#include <numeric>
41#include <optional>
42#include <queue>
43#include <set>
44
45#define DEBUG_TYPE "vector-combine"
47
48using namespace llvm;
49using namespace llvm::PatternMatch;
50
51STATISTIC(NumVecLoad, "Number of vector loads formed");
52STATISTIC(NumVecCmp, "Number of vector compares formed");
53STATISTIC(NumVecBO, "Number of vector binops formed");
54STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
55STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
56STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
57STATISTIC(NumScalarCmp, "Number of scalar compares formed");
58STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
59
61 "disable-vector-combine", cl::init(false), cl::Hidden,
62 cl::desc("Disable all vector combine transforms"));
63
65 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
66 cl::desc("Disable binop extract to shuffle transforms"));
67
69 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
70 cl::desc("Max number of instructions to scan for vector combining."));
71
72static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
73
74namespace {
75class VectorCombine {
76public:
77 VectorCombine(Function &F, const TargetTransformInfo &TTI,
80 bool TryEarlyFoldsOnly)
81 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
82 DT(DT), AA(AA), DL(DL), CostKind(CostKind),
83 SQ(*DL, /*TLI=*/nullptr, &DT, &AC),
84 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
85
86 bool run();
87
88private:
89 Function &F;
91 const TargetTransformInfo &TTI;
92 const DominatorTree &DT;
93 AAResults &AA;
94 const DataLayout *DL;
95 TTI::TargetCostKind CostKind;
96 const SimplifyQuery SQ;
97
98 /// If true, only perform beneficial early IR transforms. Do not introduce new
99 /// vector operations.
100 bool TryEarlyFoldsOnly;
101
102 InstructionWorklist Worklist;
103
104 /// Next instruction to iterate. It will be updated when it is erased by
105 /// RecursivelyDeleteTriviallyDeadInstructions.
106 Instruction *NextInst;
107
108 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
109 // parameter. That should be updated to specific sub-classes because the
110 // run loop was changed to dispatch on opcode.
111 bool vectorizeLoadInsert(Instruction &I);
112 bool widenSubvectorLoad(Instruction &I);
113 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
114 ExtractElementInst *Ext1,
115 unsigned PreferredExtractIndex) const;
116 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
117 const Instruction &I,
118 ExtractElementInst *&ConvertToShuffle,
119 unsigned PreferredExtractIndex);
120 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
121 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
122 bool foldExtractExtract(Instruction &I);
123 bool foldInsExtFNeg(Instruction &I);
124 bool foldInsExtBinop(Instruction &I);
125 bool foldInsExtVectorToShuffle(Instruction &I);
126 bool foldBitOpOfCastops(Instruction &I);
127 bool foldBitOpOfCastConstant(Instruction &I);
128 bool foldBitcastShuffle(Instruction &I);
129 bool scalarizeOpOrCmp(Instruction &I);
130 bool scalarizeVPIntrinsic(Instruction &I);
131 bool foldExtractedCmps(Instruction &I);
132 bool foldSelectsFromBitcast(Instruction &I);
133 bool foldBinopOfReductions(Instruction &I);
134 bool foldSingleElementStore(Instruction &I);
135 bool scalarizeLoad(Instruction &I);
136 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
137 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
138 bool scalarizeExtExtract(Instruction &I);
139 bool foldConcatOfBoolMasks(Instruction &I);
140 bool foldPermuteOfBinops(Instruction &I);
141 bool foldShuffleOfBinops(Instruction &I);
142 bool foldShuffleOfSelects(Instruction &I);
143 bool foldShuffleOfCastops(Instruction &I);
144 bool foldShuffleOfShuffles(Instruction &I);
145 bool foldPermuteOfIntrinsic(Instruction &I);
146 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
147 bool foldShuffleOfIntrinsics(Instruction &I);
148 bool foldShuffleToIdentity(Instruction &I);
149 bool foldShuffleFromReductions(Instruction &I);
150 bool foldShuffleChainsToReduce(Instruction &I);
151 bool foldCastFromReductions(Instruction &I);
152 bool foldSignBitReductionCmp(Instruction &I);
153 bool foldICmpEqZeroVectorReduce(Instruction &I);
154 bool foldEquivalentReductionCmp(Instruction &I);
155 bool foldReduceAddCmpZero(Instruction &I);
156 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
157 bool foldInterleaveIntrinsics(Instruction &I);
158 bool shrinkType(Instruction &I);
159 bool shrinkLoadForShuffles(Instruction &I);
160 bool shrinkPhiOfShuffles(Instruction &I);
161
162 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
163 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
164 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
165 Old.replaceAllUsesWith(&New);
166 if (auto *NewI = dyn_cast<Instruction>(&New)) {
167 New.takeName(&Old);
168 Worklist.pushUsersToWorkList(*NewI);
169 Worklist.pushValue(NewI);
170 }
171 if (Erase && isInstructionTriviallyDead(&Old)) {
172 eraseInstruction(Old);
173 } else {
174 Worklist.push(&Old);
175 }
176 }
177
178 void eraseInstruction(Instruction &I) {
179 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
180 SmallVector<Value *> Ops(I.operands());
181 Worklist.remove(&I);
182 I.eraseFromParent();
183
184 // Push remaining users of the operands and then the operand itself - allows
185 // further folds that were hindered by OneUse limits.
186 SmallPtrSet<Value *, 4> Visited;
187 for (Value *Op : Ops) {
188 if (!Visited.contains(Op)) {
189 if (auto *OpI = dyn_cast<Instruction>(Op)) {
191 OpI, nullptr, nullptr, [&](Value *V) {
192 if (auto *I = dyn_cast<Instruction>(V)) {
193 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
194 Worklist.remove(I);
195 if (I == NextInst)
196 NextInst = NextInst->getNextNode();
197 Visited.insert(I);
198 }
199 }))
200 continue;
201 Worklist.pushUsersToWorkList(*OpI);
202 Worklist.pushValue(OpI);
203 }
204 }
205 }
206 }
207};
208} // namespace
209
210/// Return the source operand of a potentially bitcasted value. If there is no
211/// bitcast, return the input value itself.
213 while (auto *BitCast = dyn_cast<BitCastInst>(V))
214 V = BitCast->getOperand(0);
215 return V;
216}
217
218static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
219 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
220 // The widened load may load data from dirty regions or create data races
221 // non-existent in the source.
222 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
223 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
225 return false;
226
227 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
228 // sure we have all of our type-based constraints in place for this target.
229 Type *ScalarTy = Load->getType()->getScalarType();
230 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
231 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
232 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
233 ScalarSize % 8 != 0)
234 return false;
235
236 return true;
237}
238
239bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
240 // Match insert into fixed vector of scalar value.
241 // TODO: Handle non-zero insert index.
242 Value *Scalar;
243 if (!match(&I,
245 return false;
246
247 // Optionally match an extract from another vector.
248 Value *X;
249 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
250 if (!HasExtract)
251 X = Scalar;
252
253 auto *Load = dyn_cast<LoadInst>(X);
254 if (!canWidenLoad(Load, TTI))
255 return false;
256
257 Type *ScalarTy = Scalar->getType();
258 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
259 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
260
261 // Check safety of replacing the scalar load with a larger vector load.
262 // We use minimal alignment (maximum flexibility) because we only care about
263 // the dereferenceable region. When calculating cost and creating a new op,
264 // we may use a larger value based on alignment attributes.
265 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
266 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
267
268 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
269 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
270 unsigned OffsetEltIndex = 0;
271 Align Alignment = Load->getAlign();
272 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, SQ.AC,
273 SQ.DT)) {
274 // It is not safe to load directly from the pointer, but we can still peek
275 // through gep offsets and check if it safe to load from a base address with
276 // updated alignment. If it is, we can shuffle the element(s) into place
277 // after loading.
278 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
279 APInt Offset(OffsetBitWidth, 0);
281
282 // We want to shuffle the result down from a high element of a vector, so
283 // the offset must be positive.
284 if (Offset.isNegative())
285 return false;
286
287 // The offset must be a multiple of the scalar element to shuffle cleanly
288 // in the element's size.
289 uint64_t ScalarSizeInBytes = ScalarSize / 8;
290 if (Offset.urem(ScalarSizeInBytes) != 0)
291 return false;
292
293 // If we load MinVecNumElts, will our target element still be loaded?
294 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
295 if (OffsetEltIndex >= MinVecNumElts)
296 return false;
297
298 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load,
299 SQ.AC, SQ.DT))
300 return false;
301
302 // Update alignment with offset value. Note that the offset could be negated
303 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
304 // negation does not change the result of the alignment calculation.
305 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
306 }
307
308 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
309 // Use the greater of the alignment on the load or its source pointer.
310 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
311 Type *LoadTy = Load->getType();
312 unsigned AS = Load->getPointerAddressSpace();
313 InstructionCost OldCost =
314 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
315 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
316 OldCost +=
317 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
318 /* Insert */ true, HasExtract, CostKind);
319
320 // New pattern: load VecPtr
321 InstructionCost NewCost =
322 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
323 // Optionally, we are shuffling the loaded vector element(s) into place.
324 // For the mask set everything but element 0 to undef to prevent poison from
325 // propagating from the extra loaded memory. This will also optionally
326 // shrink/grow the vector from the loaded size to the output size.
327 // We assume this operation has no cost in codegen if there was no offset.
328 // Note that we could use freeze to avoid poison problems, but then we might
329 // still need a shuffle to change the vector size.
330 auto *Ty = cast<FixedVectorType>(I.getType());
331 unsigned OutputNumElts = Ty->getNumElements();
332 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
333 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
334 Mask[0] = OffsetEltIndex;
335 if (OffsetEltIndex)
336 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
337 CostKind);
338
339 // We can aggressively convert to the vector form because the backend can
340 // invert this transform if it does not result in a performance win.
341 if (OldCost < NewCost || !NewCost.isValid())
342 return false;
343
344 // It is safe and potentially profitable to load a vector directly:
345 // inselt undef, load Scalar, 0 --> load VecPtr
346 IRBuilder<> Builder(Load);
347 Value *CastedPtr =
348 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
349 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
350 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
351
352 replaceValue(I, *VecLd);
353 ++NumVecLoad;
354 return true;
355}
356
357/// If we are loading a vector and then inserting it into a larger vector with
358/// undefined elements, try to load the larger vector and eliminate the insert.
359/// This removes a shuffle in IR and may allow combining of other loaded values.
360bool VectorCombine::widenSubvectorLoad(Instruction &I) {
361 // Match subvector insert of fixed vector.
362 auto *Shuf = cast<ShuffleVectorInst>(&I);
363 if (!Shuf->isIdentityWithPadding())
364 return false;
365
366 // Allow a non-canonical shuffle mask that is choosing elements from op1.
367 unsigned NumOpElts =
368 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
369 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
370 return M >= (int)(NumOpElts);
371 });
372
373 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
374 if (!canWidenLoad(Load, TTI))
375 return false;
376
377 // We use minimal alignment (maximum flexibility) because we only care about
378 // the dereferenceable region. When calculating cost and creating a new op,
379 // we may use a larger value based on alignment attributes.
380 auto *Ty = cast<FixedVectorType>(I.getType());
381 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
382 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
383 Align Alignment = Load->getAlign();
384 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, SQ.AC,
385 SQ.DT))
386 return false;
387
388 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
389 Type *LoadTy = Load->getType();
390 unsigned AS = Load->getPointerAddressSpace();
391
392 // Original pattern: insert_subvector (load PtrOp)
393 // This conservatively assumes that the cost of a subvector insert into an
394 // undef value is 0. We could add that cost if the cost model accurately
395 // reflects the real cost of that operation.
396 InstructionCost OldCost =
397 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
398
399 // New pattern: load PtrOp
400 InstructionCost NewCost =
401 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
402
403 // We can aggressively convert to the vector form because the backend can
404 // invert this transform if it does not result in a performance win.
405 if (OldCost < NewCost || !NewCost.isValid())
406 return false;
407
408 IRBuilder<> Builder(Load);
409 Value *CastedPtr =
410 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
411 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
412 replaceValue(I, *VecLd);
413 ++NumVecLoad;
414 return true;
415}
416
417/// Determine which, if any, of the inputs should be replaced by a shuffle
418/// followed by extract from a different index.
419ExtractElementInst *VectorCombine::getShuffleExtract(
420 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
421 unsigned PreferredExtractIndex = InvalidIndex) const {
422 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
423 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
424 assert(Index0C && Index1C && "Expected constant extract indexes");
425
426 unsigned Index0 = Index0C->getZExtValue();
427 unsigned Index1 = Index1C->getZExtValue();
428
429 // If the extract indexes are identical, no shuffle is needed.
430 if (Index0 == Index1)
431 return nullptr;
432
433 Type *VecTy = Ext0->getVectorOperand()->getType();
434 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
435 InstructionCost Cost0 =
436 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
437 InstructionCost Cost1 =
438 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
439
440 // If both costs are invalid no shuffle is needed
441 if (!Cost0.isValid() && !Cost1.isValid())
442 return nullptr;
443
444 // We are extracting from 2 different indexes, so one operand must be shuffled
445 // before performing a vector operation and/or extract. The more expensive
446 // extract will be replaced by a shuffle.
447 if (Cost0 > Cost1)
448 return Ext0;
449 if (Cost1 > Cost0)
450 return Ext1;
451
452 // If the costs are equal and there is a preferred extract index, shuffle the
453 // opposite operand.
454 if (PreferredExtractIndex == Index0)
455 return Ext1;
456 if (PreferredExtractIndex == Index1)
457 return Ext0;
458
459 // Otherwise, replace the extract with the higher index.
460 return Index0 > Index1 ? Ext0 : Ext1;
461}
462
463/// Compare the relative costs of 2 extracts followed by scalar operation vs.
464/// vector operation(s) followed by extract. Return true if the existing
465/// instructions are cheaper than a vector alternative. Otherwise, return false
466/// and if one of the extracts should be transformed to a shufflevector, set
467/// \p ConvertToShuffle to that extract instruction.
468bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
469 ExtractElementInst *Ext1,
470 const Instruction &I,
471 ExtractElementInst *&ConvertToShuffle,
472 unsigned PreferredExtractIndex) {
473 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
474 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
475 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
476
477 unsigned Opcode = I.getOpcode();
478 Value *Ext0Src = Ext0->getVectorOperand();
479 Value *Ext1Src = Ext1->getVectorOperand();
480 Type *ScalarTy = Ext0->getType();
481 auto *VecTy = cast<VectorType>(Ext0Src->getType());
482 InstructionCost ScalarOpCost, VectorOpCost;
483
484 // Get cost estimates for scalar and vector versions of the operation.
485 bool IsBinOp = Instruction::isBinaryOp(Opcode);
486 if (IsBinOp) {
487 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
488 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
489 } else {
490 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
491 "Expected a compare");
492 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
493 ScalarOpCost = TTI.getCmpSelInstrCost(
494 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
495 VectorOpCost = TTI.getCmpSelInstrCost(
496 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
497 }
498
499 // Get cost estimates for the extract elements. These costs will factor into
500 // both sequences.
501 unsigned Ext0Index = Ext0IndexC->getZExtValue();
502 unsigned Ext1Index = Ext1IndexC->getZExtValue();
503
504 InstructionCost Extract0Cost =
505 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
506 InstructionCost Extract1Cost =
507 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
508
509 // A more expensive extract will always be replaced by a splat shuffle.
510 // For example, if Ext0 is more expensive:
511 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
512 // extelt (opcode (splat V0, Ext0), V1), Ext1
513 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
514 // check the cost of creating a broadcast shuffle and shuffling both
515 // operands to element 0.
516 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
517 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
518 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
519
520 // Extra uses of the extracts mean that we include those costs in the
521 // vector total because those instructions will not be eliminated.
522 InstructionCost OldCost, NewCost;
523 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
524 // Handle a special case. If the 2 extracts are identical, adjust the
525 // formulas to account for that. The extra use charge allows for either the
526 // CSE'd pattern or an unoptimized form with identical values:
527 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
528 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
529 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
530 OldCost = CheapExtractCost + ScalarOpCost;
531 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
532 } else {
533 // Handle the general case. Each extract is actually a different value:
534 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
535 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
536 NewCost = VectorOpCost + CheapExtractCost +
537 !Ext0->hasOneUse() * Extract0Cost +
538 !Ext1->hasOneUse() * Extract1Cost;
539 }
540
541 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
542 if (ConvertToShuffle) {
543 if (IsBinOp && DisableBinopExtractShuffle)
544 return true;
545
546 // If we are extracting from 2 different indexes, then one operand must be
547 // shuffled before performing the vector operation. The shuffle mask is
548 // poison except for 1 lane that is being translated to the remaining
549 // extraction lane. Therefore, it is a splat shuffle. Ex:
550 // ShufMask = { poison, poison, 0, poison }
551 // TODO: The cost model has an option for a "broadcast" shuffle
552 // (splat-from-element-0), but no option for a more general splat.
553 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
554 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
556 ShuffleMask[BestInsIndex] = BestExtIndex;
558 VecTy, VecTy, ShuffleMask, CostKind, 0,
559 nullptr, {ConvertToShuffle});
560 } else {
562 VecTy, VecTy, {}, CostKind, 0, nullptr,
563 {ConvertToShuffle});
564 }
565 }
566
567 // Aggressively form a vector op if the cost is equal because the transform
568 // may enable further optimization.
569 // Codegen can reverse this transform (scalarize) if it was not profitable.
570 return OldCost < NewCost;
571}
572
573/// Create a shuffle that translates (shifts) 1 element from the input vector
574/// to a new element location.
575static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
576 unsigned NewIndex, IRBuilderBase &Builder) {
577 // The shuffle mask is poison except for 1 lane that is being translated
578 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
579 // ShufMask = { 2, poison, poison, poison }
580 auto *VecTy = cast<FixedVectorType>(Vec->getType());
581 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
582 ShufMask[NewIndex] = OldIndex;
583 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
584}
585
586/// Given an extract element instruction with constant index operand, shuffle
587/// the source vector (shift the scalar element) to a NewIndex for extraction.
588/// Return null if the input can be constant folded, so that we are not creating
589/// unnecessary instructions.
590static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
591 IRBuilderBase &Builder) {
592 // Shufflevectors can only be created for fixed-width vectors.
593 Value *X = ExtElt->getVectorOperand();
594 if (!isa<FixedVectorType>(X->getType()))
595 return nullptr;
596
597 // If the extract can be constant-folded, this code is unsimplified. Defer
598 // to other passes to handle that.
599 Value *C = ExtElt->getIndexOperand();
600 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
601 if (isa<Constant>(X))
602 return nullptr;
603
604 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
605 NewIndex, Builder);
606 return Shuf;
607}
608
609/// Try to reduce extract element costs by converting scalar compares to vector
610/// compares followed by extract.
611/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
612Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
613 Instruction &I) {
614 assert(isa<CmpInst>(&I) && "Expected a compare");
615
616 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
617 // --> extelt (cmp Pred V0, V1), ExtIndex
618 ++NumVecCmp;
619 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
620 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
621 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
622}
623
624/// Try to reduce extract element costs by converting scalar binops to vector
625/// binops followed by extract.
626/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
627Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
628 Instruction &I) {
629 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
630
631 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
632 // --> extelt (bo V0, V1), ExtIndex
633 ++NumVecBO;
634 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
635 V1, "foldExtExtBinop");
636
637 // All IR flags are safe to back-propagate because any potential poison
638 // created in unused vector elements is discarded by the extract.
639 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
640 VecBOInst->copyIRFlags(&I);
641
642 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
643}
644
645/// Match an instruction with extracted vector operands.
646bool VectorCombine::foldExtractExtract(Instruction &I) {
647 // It is not safe to transform things like div, urem, etc. because we may
648 // create undefined behavior when executing those on unknown vector elements.
650 return false;
651
652 Instruction *I0, *I1;
653 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
654 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
656 return false;
657
658 Value *V0, *V1;
659 uint64_t C0, C1;
660 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
661 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
662 V0->getType() != V1->getType())
663 return false;
664
665 // For fixed-width vectors, reject out-of-bounds extract indexes
666 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(V0->getType())) {
667 unsigned NumElts = FixedVecTy->getNumElements();
668 if (C0 >= NumElts || C1 >= NumElts)
669 return false;
670 }
671
672 // If the scalar value 'I' is going to be re-inserted into a vector, then try
673 // to create an extract to that same element. The extract/insert can be
674 // reduced to a "select shuffle".
675 // TODO: If we add a larger pattern match that starts from an insert, this
676 // probably becomes unnecessary.
677 auto *Ext0 = cast<ExtractElementInst>(I0);
678 auto *Ext1 = cast<ExtractElementInst>(I1);
679 uint64_t InsertIndex = InvalidIndex;
680 if (I.hasOneUse())
681 match(I.user_back(),
682 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
683
684 ExtractElementInst *ExtractToChange;
685 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
686 return false;
687
688 Value *ExtOp0 = Ext0->getVectorOperand();
689 Value *ExtOp1 = Ext1->getVectorOperand();
690
691 if (ExtractToChange) {
692 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
693 Value *NewExtOp =
694 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
695 if (!NewExtOp)
696 return false;
697 if (ExtractToChange == Ext0)
698 ExtOp0 = NewExtOp;
699 else
700 ExtOp1 = NewExtOp;
701 }
702
703 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
704 : Ext0->getIndexOperand();
705 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
706 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
707 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
708 Worklist.push(Ext0);
709 Worklist.push(Ext1);
710 replaceValue(I, *NewExt);
711 return true;
712}
713
714/// Try to replace an extract + scalar fneg + insert with a vector fneg +
715/// shuffle.
716bool VectorCombine::foldInsExtFNeg(Instruction &I) {
717 // Match an insert (op (extract)) pattern.
718 Value *DstVec;
719 uint64_t ExtIdx, InsIdx;
720 Instruction *FNeg;
721 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
722 m_ConstantInt(InsIdx))))
723 return false;
724
725 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
726 Value *SrcVec;
727 Instruction *Extract;
728 if (!match(FNeg, m_FNeg(m_CombineAnd(
729 m_Instruction(Extract),
730 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
731 return false;
732
733 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
734 auto *DstVecScalarTy = DstVecTy->getScalarType();
735 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
736 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
737 return false;
738
739 // Ignore if insert/extract index is out of bounds or destination vector has
740 // one element
741 unsigned NumDstElts = DstVecTy->getNumElements();
742 unsigned NumSrcElts = SrcVecTy->getNumElements();
743 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
744 return false;
745
746 // We are inserting the negated element into the same lane that we extracted
747 // from. This is equivalent to a select-shuffle that chooses all but the
748 // negated element from the destination vector.
749 SmallVector<int> Mask(NumDstElts);
750 std::iota(Mask.begin(), Mask.end(), 0);
751 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
752 InstructionCost OldCost =
753 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
754 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
755
756 // If the extract has one use, it will be eliminated, so count it in the
757 // original cost. If it has more than one use, ignore the cost because it will
758 // be the same before/after.
759 if (Extract->hasOneUse())
760 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
761
762 InstructionCost NewCost =
763 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
765 DstVecTy, Mask, CostKind);
766
767 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
768 // If the lengths of the two vectors are not equal,
769 // we need to add a length-change vector. Add this cost.
770 SmallVector<int> SrcMask;
771 if (NeedLenChg) {
772 SrcMask.assign(NumDstElts, PoisonMaskElem);
773 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
775 DstVecTy, SrcVecTy, SrcMask, CostKind);
776 }
777
778 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
779 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
780 << "\n");
781 if (NewCost > OldCost)
782 return false;
783
784 Value *NewShuf, *LenChgShuf = nullptr;
785 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
786 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
787 if (NeedLenChg) {
788 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
789 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
790 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
791 Worklist.pushValue(LenChgShuf);
792 } else {
793 // shuffle DstVec, (fneg SrcVec), Mask
794 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
795 }
796
797 Worklist.pushValue(VecFNeg);
798 replaceValue(I, *NewShuf);
799 return true;
800}
801
802/// Try to fold insert(binop(x,y),binop(a,b),idx)
803/// --> binop(insert(x,a,idx),insert(y,b,idx))
804bool VectorCombine::foldInsExtBinop(Instruction &I) {
805 BinaryOperator *VecBinOp, *SclBinOp;
806 uint64_t Index;
807 if (!match(&I,
808 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
809 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
810 return false;
811
812 // TODO: Add support for addlike etc.
813 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
814 if (BinOpcode != SclBinOp->getOpcode())
815 return false;
816
817 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
818 if (!ResultTy)
819 return false;
820
821 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
822 // shuffle?
823
825 TTI.getInstructionCost(VecBinOp, CostKind) +
827 InstructionCost NewCost =
828 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
829 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
830 Index, VecBinOp->getOperand(0),
831 SclBinOp->getOperand(0)) +
832 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
833 Index, VecBinOp->getOperand(1),
834 SclBinOp->getOperand(1));
835
836 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
837 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
838 << "\n");
839 if (NewCost > OldCost)
840 return false;
841
842 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
843 SclBinOp->getOperand(0), Index);
844 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
845 SclBinOp->getOperand(1), Index);
846 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
847
848 // Intersect flags from the old binops.
849 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
850 NewInst->copyIRFlags(VecBinOp);
851 NewInst->andIRFlags(SclBinOp);
852 }
853
854 Worklist.pushValue(NewIns0);
855 Worklist.pushValue(NewIns1);
856 replaceValue(I, *NewBO);
857 return true;
858}
859
860/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
861/// Supports: bitcast, trunc, sext, zext
862bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
863 // Check if this is a bitwise logic operation
864 auto *BinOp = dyn_cast<BinaryOperator>(&I);
865 if (!BinOp || !BinOp->isBitwiseLogicOp())
866 return false;
867
868 // Get the cast instructions
869 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
870 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
871 if (!LHSCast || !RHSCast) {
872 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
873 return false;
874 }
875
876 // Both casts must be the same type
877 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
878 if (CastOpcode != RHSCast->getOpcode())
879 return false;
880
881 // Only handle supported cast operations
882 switch (CastOpcode) {
883 case Instruction::BitCast:
884 case Instruction::Trunc:
885 case Instruction::SExt:
886 case Instruction::ZExt:
887 break;
888 default:
889 return false;
890 }
891
892 Value *LHSSrc = LHSCast->getOperand(0);
893 Value *RHSSrc = RHSCast->getOperand(0);
894
895 // Source types must match
896 if (LHSSrc->getType() != RHSSrc->getType())
897 return false;
898
899 auto *SrcTy = LHSSrc->getType();
900 auto *DstTy = I.getType();
901 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
902 // Other casts only handle vector types with integer elements.
903 if (CastOpcode != Instruction::BitCast &&
904 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
905 return false;
906
907 // Only integer scalar/vector values are legal for bitwise logic operations.
908 if (!SrcTy->getScalarType()->isIntegerTy() ||
909 !DstTy->getScalarType()->isIntegerTy())
910 return false;
911
912 // Cost Check :
913 // OldCost = bitlogic + 2*casts
914 // NewCost = bitlogic + cast
915
916 // Calculate specific costs for each cast with instruction context
918 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
920 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
921
922 InstructionCost OldCost =
923 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
924 LHSCastCost + RHSCastCost;
925
926 // For new cost, we can't provide an instruction (it doesn't exist yet)
927 InstructionCost GenericCastCost = TTI.getCastInstrCost(
928 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
929
930 InstructionCost NewCost =
931 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
932 GenericCastCost;
933
934 // Account for multi-use casts using specific costs
935 if (!LHSCast->hasOneUse())
936 NewCost += LHSCastCost;
937 if (!RHSCast->hasOneUse())
938 NewCost += RHSCastCost;
939
940 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
941 << " NewCost=" << NewCost << "\n");
942
943 if (NewCost > OldCost)
944 return false;
945
946 // Create the operation on the source type
947 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
948 BinOp->getName() + ".inner");
949 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
950 NewBinOp->copyIRFlags(BinOp);
951
952 Worklist.pushValue(NewOp);
953
954 // Create the cast operation directly to ensure we get a new instruction
955 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
956
957 // Preserve cast instruction flags
958 NewCast->copyIRFlags(LHSCast);
959 NewCast->andIRFlags(RHSCast);
960
961 // Insert the new instruction
962 Value *Result = Builder.Insert(NewCast);
963
964 replaceValue(I, *Result);
965 return true;
966}
967
968/// Match:
969// bitop(castop(x), C) ->
970// bitop(castop(x), castop(InvC)) ->
971// castop(bitop(x, InvC))
972// Supports: bitcast
973bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
975 Constant *C;
976
977 // Check if this is a bitwise logic operation
979 return false;
980
981 // Get the cast instructions
982 auto *LHSCast = dyn_cast<CastInst>(LHS);
983 if (!LHSCast)
984 return false;
985
986 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
987
988 // Only handle supported cast operations
989 switch (CastOpcode) {
990 case Instruction::BitCast:
991 case Instruction::ZExt:
992 case Instruction::SExt:
993 case Instruction::Trunc:
994 break;
995 default:
996 return false;
997 }
998
999 Value *LHSSrc = LHSCast->getOperand(0);
1000
1001 auto *SrcTy = LHSSrc->getType();
1002 auto *DstTy = I.getType();
1003 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
1004 // Other casts only handle vector types with integer elements.
1005 if (CastOpcode != Instruction::BitCast &&
1006 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
1007 return false;
1008
1009 // Only integer scalar/vector values are legal for bitwise logic operations.
1010 if (!SrcTy->getScalarType()->isIntegerTy() ||
1011 !DstTy->getScalarType()->isIntegerTy())
1012 return false;
1013
1014 // Find the constant InvC, such that castop(InvC) equals to C.
1015 PreservedCastFlags RHSFlags;
1016 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1017 if (!InvC)
1018 return false;
1019
1020 // Cost Check :
1021 // OldCost = bitlogic + cast
1022 // NewCost = bitlogic + cast
1023
1024 // Calculate specific costs for each cast with instruction context
1025 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1026 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1027
1028 InstructionCost OldCost =
1029 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1030
1031 // For new cost, we can't provide an instruction (it doesn't exist yet)
1032 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1033 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1034
1035 InstructionCost NewCost =
1036 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1037 GenericCastCost;
1038
1039 // Account for multi-use casts using specific costs
1040 if (!LHSCast->hasOneUse())
1041 NewCost += LHSCastCost;
1042
1043 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1044 << " NewCost=" << NewCost << "\n");
1045
1046 if (NewCost > OldCost)
1047 return false;
1048
1049 // Create the operation on the source type
1050 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1051 LHSSrc, InvC, I.getName() + ".inner");
1052 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1053 NewBinOp->copyIRFlags(&I);
1054
1055 Worklist.pushValue(NewOp);
1056
1057 // Create the cast operation directly to ensure we get a new instruction
1058 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1059
1060 // Preserve cast instruction flags
1061 if (RHSFlags.NNeg)
1062 NewCast->setNonNeg();
1063 if (RHSFlags.NUW)
1064 NewCast->setHasNoUnsignedWrap();
1065 if (RHSFlags.NSW)
1066 NewCast->setHasNoSignedWrap();
1067
1068 NewCast->andIRFlags(LHSCast);
1069
1070 // Insert the new instruction
1071 Value *Result = Builder.Insert(NewCast);
1072
1073 replaceValue(I, *Result);
1074 return true;
1075}
1076
1077/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1078/// destination type followed by shuffle. This can enable further transforms by
1079/// moving bitcasts or shuffles together.
1080bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1081 Value *V0, *V1;
1082 ArrayRef<int> Mask;
1083 if (!match(&I, m_BitCast(m_OneUse(
1084 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1085 return false;
1086
1087 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1088 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1089 // mask for scalable type is a splat or not.
1090 // 2) Disallow non-vector casts.
1091 // TODO: We could allow any shuffle.
1092 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1093 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1094 if (!DestTy || !SrcTy)
1095 return false;
1096
1097 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1098 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1099 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1100 return false;
1101
1102 bool IsUnary = isa<UndefValue>(V1);
1103
1104 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1105 // if it won't increase the number of bitcasts.
1106 if (!IsUnary) {
1109 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1110 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1111 return false;
1112 }
1113
1114 SmallVector<int, 16> NewMask;
1115 if (DestEltSize <= SrcEltSize) {
1116 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1117 // always be expanded to the equivalent form choosing narrower elements.
1118 if (SrcEltSize % DestEltSize != 0)
1119 return false;
1120 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1121 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1122 } else {
1123 // The bitcast is from narrow elements to wide elements. The shuffle mask
1124 // must choose consecutive elements to allow casting first.
1125 if (DestEltSize % SrcEltSize != 0)
1126 return false;
1127 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1128 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1129 return false;
1130 }
1131
1132 // Bitcast the shuffle src - keep its original width but using the destination
1133 // scalar type.
1134 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1135 auto *NewShuffleTy =
1136 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1137 auto *OldShuffleTy =
1138 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1139 unsigned NumOps = IsUnary ? 1 : 2;
1140
1141 // The new shuffle must not cost more than the old shuffle.
1145
1146 InstructionCost NewCost =
1147 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1148 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1149 TargetTransformInfo::CastContextHint::None,
1150 CostKind));
1151 InstructionCost OldCost =
1152 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1153 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1154 TargetTransformInfo::CastContextHint::None,
1155 CostKind);
1156
1157 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1158 << OldCost << " vs NewCost: " << NewCost << "\n");
1159
1160 if (NewCost > OldCost || !NewCost.isValid())
1161 return false;
1162
1163 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1164 ++NumShufOfBitcast;
1165 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1166 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1167 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1168 replaceValue(I, *Shuf);
1169 return true;
1170}
1171
1172/// VP Intrinsics whose vector operands are both splat values may be simplified
1173/// into the scalar version of the operation and the result splatted. This
1174/// can lead to scalarization down the line.
1175bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1176 if (!isa<VPIntrinsic>(I))
1177 return false;
1178 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1179 Value *Op0 = VPI.getArgOperand(0);
1180 Value *Op1 = VPI.getArgOperand(1);
1181
1182 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1183 return false;
1184
1185 // Check getSplatValue early in this function, to avoid doing unnecessary
1186 // work.
1187 Value *ScalarOp0 = getSplatValue(Op0);
1188 Value *ScalarOp1 = getSplatValue(Op1);
1189 if (!ScalarOp0 || !ScalarOp1)
1190 return false;
1191
1192 // For the binary VP intrinsics supported here, the result on disabled lanes
1193 // is a poison value. For now, only do this simplification if all lanes
1194 // are active.
1195 // TODO: Relax the condition that all lanes are active by using insertelement
1196 // on inactive lanes.
1197 auto IsAllTrueMask = [](Value *MaskVal) {
1198 if (Value *SplattedVal = getSplatValue(MaskVal))
1199 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1200 return ConstValue->isAllOnesValue();
1201 return false;
1202 };
1203 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1204 return false;
1205
1206 // Check to make sure we support scalarization of the intrinsic
1207 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1208 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1209 return false;
1210
1211 // Calculate cost of splatting both operands into vectors and the vector
1212 // intrinsic
1213 VectorType *VecTy = cast<VectorType>(VPI.getType());
1214 SmallVector<int> Mask;
1215 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1216 Mask.resize(FVTy->getNumElements(), 0);
1217 InstructionCost SplatCost =
1218 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1220 CostKind);
1221
1222 // Calculate the cost of the VP Intrinsic
1224 for (Value *V : VPI.args())
1225 Args.push_back(V->getType());
1226 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1227 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1228 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1229
1230 // Determine scalar opcode
1231 std::optional<unsigned> FunctionalOpcode =
1232 VPI.getFunctionalOpcode();
1233 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1234 if (!FunctionalOpcode) {
1235 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1236 if (!ScalarIntrID)
1237 return false;
1238 }
1239
1240 // Calculate cost of scalarizing
1241 InstructionCost ScalarOpCost = 0;
1242 if (ScalarIntrID) {
1243 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1244 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1245 } else {
1246 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1247 VecTy->getScalarType(), CostKind);
1248 }
1249
1250 // The existing splats may be kept around if other instructions use them.
1251 InstructionCost CostToKeepSplats =
1252 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1253 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1254
1255 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1256 << "\n");
1257 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1258 << ", Cost of scalarizing:" << NewCost << "\n");
1259
1260 // We want to scalarize unless the vector variant actually has lower cost.
1261 if (OldCost < NewCost || !NewCost.isValid())
1262 return false;
1263
1264 // Scalarize the intrinsic
1265 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1266 Value *EVL = VPI.getArgOperand(3);
1267
1268 // If the VP op might introduce UB or poison, we can scalarize it provided
1269 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1270 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1271 // scalarizing it.
1272 bool SafeToSpeculate;
1273 if (ScalarIntrID)
1274 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1275 .hasAttribute(Attribute::AttrKind::Speculatable);
1276 else
1278 *FunctionalOpcode, &VPI, nullptr, SQ.AC, SQ.DT);
1279 if (!SafeToSpeculate &&
1280 !isKnownNonZero(EVL, SimplifyQuery(*DL, SQ.DT, SQ.AC, &VPI)))
1281 return false;
1282
1283 Value *ScalarVal =
1284 ScalarIntrID
1285 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1286 {ScalarOp0, ScalarOp1})
1287 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1288 ScalarOp0, ScalarOp1);
1289
1290 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1291 return true;
1292}
1293
1294/// Match a vector op/compare/intrinsic with at least one
1295/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1296/// by insertelement.
1297bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1298 auto *UO = dyn_cast<UnaryOperator>(&I);
1299 auto *BO = dyn_cast<BinaryOperator>(&I);
1300 auto *CI = dyn_cast<CmpInst>(&I);
1301 auto *II = dyn_cast<IntrinsicInst>(&I);
1302 if (!UO && !BO && !CI && !II)
1303 return false;
1304
1305 // TODO: Allow intrinsics with different argument types
1306 if (II) {
1307 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1308 return false;
1309 for (auto [Idx, Arg] : enumerate(II->args()))
1310 if (Arg->getType() != II->getType() &&
1311 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1312 return false;
1313 }
1314
1315 // Do not convert the vector condition of a vector select into a scalar
1316 // condition. That may cause problems for codegen because of differences in
1317 // boolean formats and register-file transfers.
1318 // TODO: Can we account for that in the cost model?
1319 if (CI)
1320 for (User *U : I.users())
1321 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1322 return false;
1323
1324 // Match constant vectors or scalars being inserted into constant vectors:
1325 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1326 SmallVector<Value *> VecCs, ScalarOps;
1327 std::optional<uint64_t> Index;
1328
1329 auto Ops = II ? II->args() : I.operands();
1330 for (auto [OpNum, Op] : enumerate(Ops)) {
1331 Constant *VecC;
1332 Value *V;
1333 uint64_t InsIdx = 0;
1334 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1335 m_ConstantInt(InsIdx)))) {
1336 // Bail if any inserts are out of bounds.
1337 VectorType *OpTy = cast<VectorType>(Op->getType());
1338 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1339 return false;
1340 // All inserts must have the same index.
1341 // TODO: Deal with mismatched index constants and variable indexes?
1342 if (!Index)
1343 Index = InsIdx;
1344 else if (InsIdx != *Index)
1345 return false;
1346 VecCs.push_back(VecC);
1347 ScalarOps.push_back(V);
1348 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1349 OpNum, &TTI)) {
1350 VecCs.push_back(Op.get());
1351 ScalarOps.push_back(Op.get());
1352 } else if (match(Op.get(), m_Constant(VecC))) {
1353 VecCs.push_back(VecC);
1354 ScalarOps.push_back(nullptr);
1355 } else {
1356 return false;
1357 }
1358 }
1359
1360 // Bail if all operands are constant.
1361 if (!Index.has_value())
1362 return false;
1363
1364 VectorType *VecTy = cast<VectorType>(I.getType());
1365 Type *ScalarTy = VecTy->getScalarType();
1366 assert(VecTy->isVectorTy() &&
1367 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1368 ScalarTy->isPointerTy()) &&
1369 "Unexpected types for insert element into binop or cmp");
1370
1371 unsigned Opcode = I.getOpcode();
1372 InstructionCost ScalarOpCost, VectorOpCost;
1373 if (CI) {
1374 CmpInst::Predicate Pred = CI->getPredicate();
1375 ScalarOpCost = TTI.getCmpSelInstrCost(
1376 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1377 VectorOpCost = TTI.getCmpSelInstrCost(
1378 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1379 } else if (UO || BO) {
1380 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1381 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1382 } else {
1383 IntrinsicCostAttributes ScalarICA(
1384 II->getIntrinsicID(), ScalarTy,
1385 SmallVector<Type *>(II->arg_size(), ScalarTy));
1386 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1387 IntrinsicCostAttributes VectorICA(
1388 II->getIntrinsicID(), VecTy,
1389 SmallVector<Type *>(II->arg_size(), VecTy));
1390 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1391 }
1392
1393 // Fold the vector constants in the original vectors into a new base vector to
1394 // get more accurate cost modelling.
1395 Value *NewVecC = nullptr;
1396 if (CI)
1397 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1398 else if (UO)
1399 NewVecC =
1400 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1401 else if (BO)
1402 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1403 else if (II)
1404 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1405
1406 if (!NewVecC)
1407 return false;
1408
1409 // Get cost estimate for the insert element. This cost will factor into
1410 // both sequences.
1411 InstructionCost OldCost = VectorOpCost;
1412 InstructionCost NewCost =
1413 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1414 CostKind, *Index, NewVecC);
1415
1416 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1417 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1418 II->getIntrinsicID(), Idx, &TTI)))
1419 continue;
1421 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1422 OldCost += InsertCost;
1423 NewCost += !Op->hasOneUse() * InsertCost;
1424 }
1425
1426 // We want to scalarize unless the vector variant actually has lower cost.
1427 if (OldCost < NewCost || !NewCost.isValid())
1428 return false;
1429
1430 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1431 // inselt NewVecC, (scalar_op V0, V1), Index
1432 if (CI)
1433 ++NumScalarCmp;
1434 else if (UO || BO)
1435 ++NumScalarOps;
1436 else
1437 ++NumScalarIntrinsic;
1438
1439 // For constant cases, extract the scalar element, this should constant fold.
1440 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1441 if (!Scalar)
1443 cast<Constant>(VecC), Builder.getInt64(*Index));
1444
1445 Value *Scalar;
1446 if (CI)
1447 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1448 else if (UO || BO)
1449 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1450 else
1451 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1452
1453 Scalar->setName(I.getName() + ".scalar");
1454
1455 // All IR flags are safe to back-propagate. There is no potential for extra
1456 // poison to be created by the scalar instruction.
1457 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1458 ScalarInst->copyIRFlags(&I);
1459
1460 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1461 replaceValue(I, *Insert);
1462 return true;
1463}
1464
1465/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1466/// a vector into vector operations followed by extract. Note: The SLP pass
1467/// may miss this pattern because of implementation problems.
1468bool VectorCombine::foldExtractedCmps(Instruction &I) {
1469 auto *BI = dyn_cast<BinaryOperator>(&I);
1470
1471 // We are looking for a scalar binop of booleans.
1472 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1473 if (!BI || !I.getType()->isIntegerTy(1))
1474 return false;
1475
1476 // The compare predicates should match, and each compare should have a
1477 // constant operand.
1478 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1479 Instruction *I0, *I1;
1480 Constant *C0, *C1;
1481 CmpPredicate P0, P1;
1482 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1483 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1484 return false;
1485
1486 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1487 if (!MatchingPred)
1488 return false;
1489
1490 // The compare operands must be extracts of the same vector with constant
1491 // extract indexes.
1492 Value *X;
1493 uint64_t Index0, Index1;
1494 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1495 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1496 return false;
1497
1498 auto *Ext0 = cast<ExtractElementInst>(I0);
1499 auto *Ext1 = cast<ExtractElementInst>(I1);
1500 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1501 if (!ConvertToShuf)
1502 return false;
1503 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1504 "Unknown ExtractElementInst");
1505
1506 // The original scalar pattern is:
1507 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1508 CmpInst::Predicate Pred = *MatchingPred;
1509 unsigned CmpOpcode =
1510 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1511 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1512 if (!VecTy)
1513 return false;
1514
1515 InstructionCost Ext0Cost =
1516 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1517 InstructionCost Ext1Cost =
1518 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1520 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1521 CostKind);
1522
1523 InstructionCost OldCost =
1524 Ext0Cost + Ext1Cost + CmpCost * 2 +
1525 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1526
1527 // The proposed vector pattern is:
1528 // vcmp = cmp Pred X, VecC
1529 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1530 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1531 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1534 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1535 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1536 ShufMask[CheapIndex] = ExpensiveIndex;
1538 CmpTy, ShufMask, CostKind);
1539 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1540 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1541 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1542 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1543
1544 // Aggressively form vector ops if the cost is equal because the transform
1545 // may enable further optimization.
1546 // Codegen can reverse this transform (scalarize) if it was not profitable.
1547 if (OldCost < NewCost || !NewCost.isValid())
1548 return false;
1549
1550 // Create a vector constant from the 2 scalar constants.
1551 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1552 PoisonValue::get(VecTy->getElementType()));
1553 CmpC[Index0] = C0;
1554 CmpC[Index1] = C1;
1555 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1556 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1557 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1558 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1559 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1560 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1561 replaceValue(I, *NewExt);
1562 ++NumVecCmpBO;
1563 return true;
1564}
1565
1566/// Try to fold scalar selects that select between extracted elements and zero
1567/// into extracting from a vector select. This is rooted at the bitcast.
1568///
1569/// This pattern arises when a vector is bitcast to a smaller element type,
1570/// elements are extracted, and then conditionally selected with zero:
1571///
1572/// %bc = bitcast <4 x i32> %src to <16 x i8>
1573/// %e0 = extractelement <16 x i8> %bc, i32 0
1574/// %s0 = select i1 %cond, i8 %e0, i8 0
1575/// %e1 = extractelement <16 x i8> %bc, i32 1
1576/// %s1 = select i1 %cond, i8 %e1, i8 0
1577/// ...
1578///
1579/// Transforms to:
1580/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
1581/// %bc = bitcast <4 x i32> %sel to <16 x i8>
1582/// %e0 = extractelement <16 x i8> %bc, i32 0
1583/// %e1 = extractelement <16 x i8> %bc, i32 1
1584/// ...
1585///
1586/// This is profitable because vector select on wider types produces fewer
1587/// select/cndmask instructions than scalar selects on each element.
1588bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
1589 auto *BC = dyn_cast<BitCastInst>(&I);
1590 if (!BC)
1591 return false;
1592
1593 FixedVectorType *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
1594 FixedVectorType *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
1595 if (!SrcVecTy || !DstVecTy)
1596 return false;
1597
1598 // Source must be 32-bit or 64-bit elements, destination must be smaller
1599 // integer elements. Zero in all these types is all-bits-zero.
1600 Type *SrcEltTy = SrcVecTy->getElementType();
1601 Type *DstEltTy = DstVecTy->getElementType();
1602 unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
1603 unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
1604
1605 if (SrcEltBits != 32 && SrcEltBits != 64)
1606 return false;
1607
1608 if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
1609 return false;
1610
1611 // Check profitability using TTI before collecting users.
1612 Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
1613 Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
1614
1615 InstructionCost ScalarSelCost =
1616 TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
1618 InstructionCost VecSelCost =
1619 TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
1621
1622 // We need at least this many selects for vectorization to be profitable.
1623 // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
1624 // ScalarSelCost
1625 if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
1626 return false;
1627
1628 unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
1629
1630 // Quick check: if bitcast doesn't have enough users, bail early.
1631 if (!BC->hasNUsesOrMore(MinSelects))
1632 return false;
1633
1634 // Collect all select users that match the pattern, grouped by condition.
1635 // Pattern: select i1 %cond, (extractelement %bc, idx), 0
1636 DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
1637
1638 for (User *U : BC->users()) {
1639 auto *Ext = dyn_cast<ExtractElementInst>(U);
1640 if (!Ext)
1641 continue;
1642
1643 for (User *ExtUser : Ext->users()) {
1644 Value *Cond;
1645 // Match: select i1 %cond, %ext, 0
1646 if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
1647 Cond->getType()->isIntegerTy(1))
1648 CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
1649 }
1650 }
1651
1652 if (CondToSelects.empty())
1653 return false;
1654
1655 bool MadeChange = false;
1656 Value *SrcVec = BC->getOperand(0);
1657
1658 // Process each group of selects with the same condition.
1659 for (auto [Cond, Selects] : CondToSelects) {
1660 // Only profitable if vector select cost < total scalar select cost.
1661 if (Selects.size() < MinSelects) {
1662 LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
1663 << "profitable (VecCost=" << VecSelCost
1664 << ", ScalarCost=" << ScalarSelCost
1665 << ", NumSelects=" << Selects.size() << ")\n");
1666 continue;
1667 }
1668
1669 // Create the vector select and bitcast once for this condition.
1670 auto InsertPt = std::next(BC->getIterator());
1671
1672 if (auto *CondInst = dyn_cast<Instruction>(Cond))
1673 if (DT.dominates(BC, CondInst))
1674 InsertPt = std::next(CondInst->getIterator());
1675
1676 Builder.SetInsertPoint(InsertPt);
1677 Value *VecSel =
1678 Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
1679 Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
1680
1681 // Replace each scalar select with an extract from the new bitcast.
1682 for (SelectInst *Sel : Selects) {
1683 auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
1684 Value *Idx = Ext->getIndexOperand();
1685
1686 Builder.SetInsertPoint(Sel);
1687 Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
1688 replaceValue(*Sel, *NewExt);
1689 MadeChange = true;
1690 }
1691
1692 LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
1693 << " selects into vector select\n");
1694 }
1695
1696 return MadeChange;
1697}
1698
// NOTE(review): the opening of this definition is not present in this listing
// (the embedded numbering jumps from 1698 to 1701). foldBinopOfReductions
// calls analyzeCostOfVecReduction with an intrinsic, CostKind, TTI and two
// cost out-parameters, which presumably is this function -- confirm against
// the upstream file. Embedded lines 1715, 1723, 1735 and 1740 are absent as
// well, so several calls below appear truncated.
//
// Splits the cost of the reduction intrinsic II into the cost of the
// instructions feeding it (CostBeforeReduction) and the cost of the reduction
// itself (CostAfterReduction). Both are returned through the reference
// parameters; CostBeforeReduction is only written when one of the two
// extend-based patterns below matches.
1701 const TargetTransformInfo &TTI,
1702 InstructionCost &CostBeforeReduction,
1703 InstructionCost &CostAfterReduction) {
1704 Instruction *Op0, *Op1;
1705 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1706 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1707 unsigned ReductionOpc =
1708 getArithmeticReductionInstruction(II.getIntrinsicID());
// Case 1: the reduced value is a zext/sext. The extend is charged as the
// pre-reduction cost and the target is asked for an extended-reduction cost.
1709 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1710 bool IsUnsigned = isa<ZExtInst>(RedOp);
1711 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1712
1713 CostBeforeReduction =
1714 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1716 CostAfterReduction =
1717 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1718 ExtType, FastMathFlags(), CostKind);
1719 return;
1720 }
// Case 2 (vector_reduce_add only): an extended multiply of two extends, see
// the pattern comment inside. Op0/Op1 are presumably bound by the matcher on
// the absent line 1723 -- confirm against upstream.
1721 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1722 match(RedOp,
1724 match(Op0, m_ZExtOrSExt(m_Value())) &&
1725 Op0->getOpcode() == Op1->getOpcode() &&
1726 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1727 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1728 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1729 bool IsUnsigned = isa<ZExtInst>(Op0);
1730 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1731 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1732
1733 InstructionCost ExtCost =
1734 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1736 InstructionCost MulCost =
1737 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1738 InstructionCost Ext2Cost =
1739 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1741
// Pre-reduction cost: the two inner extends, the multiply and the outer
// extend. The reduction itself is costed as a multiply-accumulate reduction.
1742 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1743 CostAfterReduction = TTI.getMulAccReductionCost(
1744 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1745 return;
1746 }
// Fallback: a plain arithmetic reduction. CostBeforeReduction is not written
// on this path.
1747 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1748 std::nullopt, CostKind);
1749}
1750
1751bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1752 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1753 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1754 if (BinOpOpc == Instruction::Sub)
1755 ReductionIID = Intrinsic::vector_reduce_add;
1756 if (ReductionIID == Intrinsic::not_intrinsic)
1757 return false;
1758
1759 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1760 Intrinsic::ID IID) -> Value * {
1761 auto *II = dyn_cast<IntrinsicInst>(V);
1762 if (!II)
1763 return nullptr;
1764 if (II->getIntrinsicID() == IID && II->hasOneUse())
1765 return II->getArgOperand(0);
1766 return nullptr;
1767 };
1768
1769 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1770 if (!V0)
1771 return false;
1772 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1773 if (!V1)
1774 return false;
1775
1776 auto *VTy = cast<VectorType>(V0->getType());
1777 if (V1->getType() != VTy)
1778 return false;
1779 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1780 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1781 unsigned ReductionOpc =
1782 getArithmeticReductionInstruction(II0.getIntrinsicID());
1783
1784 InstructionCost OldCost = 0;
1785 InstructionCost NewCost = 0;
1786 InstructionCost CostOfRedOperand0 = 0;
1787 InstructionCost CostOfRed0 = 0;
1788 InstructionCost CostOfRedOperand1 = 0;
1789 InstructionCost CostOfRed1 = 0;
1790 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1791 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1792 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1793 NewCost =
1794 CostOfRedOperand0 + CostOfRedOperand1 +
1795 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1796 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1797 if (NewCost >= OldCost || !NewCost.isValid())
1798 return false;
1799
1800 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1801 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1802 << "\n");
1803 Value *VectorBO;
1804 if (BinOpOpc == Instruction::Or)
1805 VectorBO = Builder.CreateOr(V0, V1, "",
1806 cast<PossiblyDisjointInst>(I).isDisjoint());
1807 else
1808 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1809
1810 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1811 replaceValue(I, *Rdx);
1812 return true;
1813}
1814
1815// Check if memory loc modified between two instrs in the same BB
// NOTE(review): the start of the signature is not present in this listing
// (embedded lines 1816-1817 are absent). Callers in this file invoke
// isMemModifiedBetween(BeginIt, EndIt, Loc, AA), which presumably is this
// function -- confirm against the upstream file.
//
// Returns true if AA reports that any instruction in [Begin, End) may modify
// Loc. It also returns true, conservatively, once more than MaxInstrsToScan
// instructions have been counted, so the scan is bounded.
1818 const MemoryLocation &Loc, AAResults &AA) {
1819 unsigned NumScanned = 0;
1820 return std::any_of(Begin, End, [&](const Instruction &Instr) {
// The counter is only advanced for instructions that do not modify Loc
// (short-circuit evaluation of '||').
1821 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1822 ++NumScanned > MaxInstrsToScan;
1823 });
1824}
1825
1826namespace {
1827/// Helper class to indicate whether a vector index can be safely scalarized and
1828/// if a freeze needs to be inserted.
1829class ScalarizationResult {
1830 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1831
1832 StatusTy Status;
1833 Value *ToFreeze;
1834
1835 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1836 : Status(Status), ToFreeze(ToFreeze) {}
1837
1838public:
1839 ScalarizationResult(const ScalarizationResult &Other) = default;
1840 ~ScalarizationResult() {
1841 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1842 }
1843
1844 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1845 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1846 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1847 return {StatusTy::SafeWithFreeze, ToFreeze};
1848 }
1849
1850 /// Returns true if the index can be scalarize without requiring a freeze.
1851 bool isSafe() const { return Status == StatusTy::Safe; }
1852 /// Returns true if the index cannot be scalarized.
1853 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1854 /// Returns true if the index can be scalarize, but requires inserting a
1855 /// freeze.
1856 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1857
1858 /// Reset the state of Unsafe and clear ToFreze if set.
1859 void discard() {
1860 ToFreeze = nullptr;
1861 Status = StatusTy::Unsafe;
1862 }
1863
1864 /// Freeze the ToFreeze and update the use in \p User to use it.
1865 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1866 assert(isSafeWithFreeze() &&
1867 "should only be used when freezing is required");
1868 assert(is_contained(ToFreeze->users(), &UserI) &&
1869 "UserI must be a user of ToFreeze");
1870 IRBuilder<>::InsertPointGuard Guard(Builder);
1871 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1872 Value *Frozen =
1873 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1874 for (Use &U : make_early_inc_range((UserI.operands())))
1875 if (U.get() == ToFreeze)
1876 U.set(Frozen);
1877
1878 ToFreeze = nullptr;
1879 }
1880};
1881} // namespace
1882
1883/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1884/// Idx. \p Idx must access a valid vector element.
1885static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1886 const SimplifyQuery &SQ) {
1887 // We do checks for both fixed vector types and scalable vector types.
1888 // This is the number of elements of fixed vector types,
1889 // or the minimum number of elements of scalable vector types.
1890 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1891 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1892
1893 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1894 if (C->getValue().ult(NumElements))
1895 return ScalarizationResult::safe();
1896 return ScalarizationResult::unsafe();
1897 }
1898
1899 // Always unsafe if the index type can't handle all inbound values.
1900 if (!llvm::isUIntN(IntWidth, NumElements))
1901 return ScalarizationResult::unsafe();
1902
1903 APInt Zero(IntWidth, 0);
1904 APInt MaxElts(IntWidth, NumElements);
1905 ConstantRange ValidIndices(Zero, MaxElts);
1906 ConstantRange IdxRange(IntWidth, true);
1907
1908 if (isGuaranteedNotToBePoison(Idx, SQ.AC, SQ.CxtI, SQ.DT)) {
1909 if (ValidIndices.contains(
1910 computeConstantRange(Idx, /*ForSigned=*/false, SQ)))
1911 return ScalarizationResult::safe();
1912 return ScalarizationResult::unsafe();
1913 }
1914
1915 // If the index may be poison, check if we can insert a freeze before the
1916 // range of the index is restricted.
1917 Value *IdxBase;
1918 ConstantInt *CI;
1919 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1920 IdxRange = IdxRange.binaryAnd(CI->getValue());
1921 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1922 IdxRange = IdxRange.urem(CI->getValue());
1923 }
1924
1925 if (ValidIndices.contains(IdxRange))
1926 return ScalarizationResult::safeWithFreeze(IdxBase);
1927 return ScalarizationResult::unsafe();
1928}
1929
1930/// The memory operation on a vector of \p ScalarType had alignment of
1931/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1932/// alignment that will be valid for the memory operation on a single scalar
1933/// element of the same type with index \p Idx.
// NOTE(review): the first line of the signature is not present in this
// listing (embedded line 1934 is absent). Callers in this file invoke
// computeAlignmentAfterScalarization(VectorAlignment, ScalarType, Idx, DL),
// which presumably is this function -- confirm against the upstream file.
1935 Type *ScalarType, Value *Idx,
1936 const DataLayout &DL) {
// Constant index: the byte offset of the element is Idx * store size of the
// scalar type, so combine the vector alignment with that exact offset.
1937 if (auto *C = dyn_cast<ConstantInt>(Idx))
1938 return commonAlignment(VectorAlignment,
1939 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
// Non-constant index: only the store size of one element is used as the
// offset.
1940 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1941}
1942
1943// Combine patterns like:
1944// %0 = load <4 x i32>, <4 x i32>* %a
1945// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1946// store <4 x i32> %1, <4 x i32>* %a
1947// to:
1948// %0 = bitcast <4 x i32>* %a to i32*
1949// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1950// store i32 %b, i32* %1
//
// Returns true if the vector store was replaced by a scalar store.
// NOTE(review): embedded lines 1952, 1960 and 2001 are absent from this
// listing. The guard of the first 'return false', the declaration of 'Source'
// and one statement before the final 'return true' are therefore not visible
// here -- consult the upstream file.
1951bool VectorCombine::foldSingleElementStore(Instruction &I) {
1953 return false;
1954 auto *SI = cast<StoreInst>(&I);
// Only simple stores of a vector value are considered.
1955 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1956 return false;
1957
1958 // TODO: Combine more complicated patterns (multiple insert) by referencing
1959 // TargetTransformInfo.
1961 Value *NewElement;
1962 Value *Idx;
// The stored value must be an insertelement of NewElement at Idx into the
// result of some instruction (Source).
1963 if (!match(SI->getValueOperand(),
1964 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1965 m_Value(Idx))))
1966 return false;
1967
1968 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1969 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1970 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1971 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1972 // modified between, vector type matches store size, and index is inbounds.
// The load and the store must also be in the same block and use the same
// address once pointer casts are stripped.
1973 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1974 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1975 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1976 return false;
1977
1978 auto ScalarizableIdx =
1979 canScalarizeAccess(VecTy, Idx, SQ.getWithInstruction(Load));
// NOTE(review): if the index is SafeWithFreeze and isMemModifiedBetween()
// returns true, ScalarizableIdx is destroyed on this return without freeze()
// or discard() having been called. That looks like it would trip the
// assertion in ~ScalarizationResult() -- confirm, and consider discarding the
// result on this path.
1980 if (ScalarizableIdx.isUnsafe() ||
1981 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1982 MemoryLocation::get(SI), AA))
1983 return false;
1984
1985 // Ensure we add the load back to the worklist BEFORE its users so they can
1986 // be erased in the correct order.
1987 Worklist.push(Load);
1988
// When a freeze is required it is inserted before the instruction that
// defines Idx, whose matching operand is rewritten to the frozen value.
1989 if (ScalarizableIdx.isSafeWithFreeze())
1990 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
// Address of the selected element: GEP into the stored vector type with
// indices {0, Idx}.
1991 Value *GEP = Builder.CreateInBoundsGEP(
1992 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1993 {ConstantInt::get(Idx->getType(), 0), Idx});
1994 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1995 NSI->copyMetadata(*SI);
// The scalar store's alignment is derived from the larger of the original
// load and store alignments.
1996 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1997 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1998 *DL);
1999 NSI->setAlignment(ScalarOpAlignment);
2000 replaceValue(I, *NSI);
2002 return true;
2003 }
2004
2005 return false;
2006}
2007
2008/// Try to scalarize vector loads feeding extractelement or bitcast
2009/// instructions.
2010bool VectorCombine::scalarizeLoad(Instruction &I) {
2011 Value *Ptr;
2012 if (!match(&I, m_Load(m_Value(Ptr))))
2013 return false;
2014
2015 auto *LI = cast<LoadInst>(&I);
2016 auto *VecTy = cast<VectorType>(LI->getType());
2017 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
2018 return false;
2019
2020 bool AllExtracts = true;
2021 bool AllBitcasts = true;
2022 Instruction *LastCheckedInst = LI;
2023 unsigned NumInstChecked = 0;
2024
2025 // Check what type of users we have (must either all be extracts or
2026 // bitcasts) and ensure no memory modifications between the load and
2027 // its users.
2028 for (User *U : LI->users()) {
2029 auto *UI = dyn_cast<Instruction>(U);
2030 if (!UI || UI->getParent() != LI->getParent())
2031 return false;
2032
2033 // If any user is waiting to be erased, then bail out as this will
2034 // distort the cost calculation and possibly lead to infinite loops.
2035 if (UI->use_empty())
2036 return false;
2037
2038 if (!isa<ExtractElementInst>(UI))
2039 AllExtracts = false;
2040 if (!isa<BitCastInst>(UI))
2041 AllBitcasts = false;
2042
2043 // Check if any instruction between the load and the user may modify memory.
2044 if (LastCheckedInst->comesBefore(UI)) {
2045 for (Instruction &I :
2046 make_range(std::next(LI->getIterator()), UI->getIterator())) {
2047 // Bail out if we reached the check limit or the instruction may write
2048 // to memory.
2049 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
2050 return false;
2051 NumInstChecked++;
2052 }
2053 LastCheckedInst = UI;
2054 }
2055 }
2056
2057 if (AllExtracts)
2058 return scalarizeLoadExtract(LI, VecTy, Ptr);
2059 if (AllBitcasts)
2060 return scalarizeLoadBitcast(LI, VecTy, Ptr);
2061 return false;
2062}
2063
2064/// Try to scalarize vector loads feeding extractelement instructions.
///
/// Every user of \p LI is treated as an extractelement. Each extract is
/// replaced by a scalar load from a GEP into \p Ptr when the summed scalar
/// cost is lower than the cost of the vector load plus the extracts.
/// Returns true if the rewrite was performed.
// NOTE(review): embedded lines 2067, 2079 and 2100 are absent from this
// listing, so the guard of the first 'return false' and the tails of two
// getMemoryOpCost calls are not visible here -- consult the upstream file.
2065bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
2066 Value *Ptr) {
2068 return false;
2069
// Extracts whose index needs a freeze, with the pending result for each.
2070 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
2071 llvm::scope_exit FailureGuard([&]() {
2072 // If the transform is aborted, discard the ScalarizationResults.
2073 for (auto &Pair : NeedFreeze)
2074 Pair.second.discard();
2075 });
2076
2077 InstructionCost OriginalCost =
2078 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2080 InstructionCost ScalarizedCost = 0;
2081
// First pass: check each extract's index and accumulate both costs.
2082 for (User *U : LI->users()) {
2083 auto *UI = cast<ExtractElementInst>(U);
2084
2085 auto ScalarIdx = canScalarizeAccess(VecTy, UI->getIndexOperand(),
2086 SQ.getWithInstruction(LI));
2087 if (ScalarIdx.isUnsafe())
2088 return false;
// A copy of the result is kept in the map; the local is then discarded so
// that its destructor does not see a pending ToFreeze.
2089 if (ScalarIdx.isSafeWithFreeze()) {
2090 NeedFreeze.try_emplace(UI, ScalarIdx);
2091 ScalarIdx.discard();
2092 }
2093
// -1 is passed as the lane when the index is not a constant.
2094 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
2095 OriginalCost +=
2096 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
2097 Index ? Index->getZExtValue() : -1);
2098 ScalarizedCost +=
2099 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
2101 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
2102 nullptr, nullptr, CostKind);
2103 }
2104
2105 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
2106 << "\n LoadExtractCost: " << OriginalCost
2107 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2108
// Only transform when the scalarized form is strictly cheaper.
2109 if (ScalarizedCost >= OriginalCost)
2110 return false;
2111
2112 // Ensure we add the load back to the worklist BEFORE its users so they can
2113 // be erased in the correct order.
2114 Worklist.push(LI);
2115
2116 Type *ElemType = VecTy->getElementType();
2117
2118 // Replace extracts with narrow scalar loads.
2119 for (User *U : LI->users()) {
2120 auto *EI = cast<ExtractElementInst>(U);
2121 Value *Idx = EI->getIndexOperand();
2122
2123 // Insert 'freeze' for poison indexes.
2124 auto It = NeedFreeze.find(EI);
2125 if (It != NeedFreeze.end())
2126 It->second.freeze(Builder, *cast<Instruction>(Idx));
2127
// The scalar load is emitted at the position of the extract it replaces.
2128 Builder.SetInsertPoint(EI);
2129 Value *GEP =
2130 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
2131 auto *NewLoad = cast<LoadInst>(
2132 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
2133
2134 Align ScalarOpAlignment =
2135 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
2136 NewLoad->setAlignment(ScalarOpAlignment);
2137
// AA metadata is only carried over for constant indices, adjusted by the
// element's byte offset.
2138 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
2139 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
2140 AAMDNodes OldAAMD = LI->getAAMetadata();
2141 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
2142 }
2143
2144 replaceValue(*EI, *NewLoad, false);
2145 }
2146
// Success: the discard-on-exit guard is no longer needed.
2147 FailureGuard.release();
2148 return true;
2149}
2150
2151/// Try to scalarize vector loads feeding bitcast instructions.
///
/// Every user of \p LI is treated as a bitcast. All bitcasts must produce the
/// same integer or floating-point scalar type, and that type must have the
/// same bit width as \p VecTy. When a scalar load of that type is cheaper
/// than the vector load plus the bitcasts, one scalar load from \p Ptr
/// replaces all bitcasts. Returns true if the rewrite was performed.
// NOTE(review): embedded lines 2156, 2180 and 2189 are absent from this
// listing, so the tails of the cost queries below are not visible here --
// consult the upstream file.
2152bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2153 Value *Ptr) {
2154 InstructionCost OriginalCost =
2155 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2157
2158 Type *TargetScalarType = nullptr;
2159 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2160
2161 for (User *U : LI->users()) {
2162 auto *BC = cast<BitCastInst>(U);
2163
// Only bitcasts to a scalar integer or floating-point type qualify.
2164 Type *DestTy = BC->getDestTy();
2165 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2166 return false;
2167
// The scalar must cover exactly the bits of the vector.
2168 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2169 if (DestBitWidth != VecBitWidth)
2170 return false;
2171
2172 // All bitcasts must target the same scalar type.
2173 if (!TargetScalarType)
2174 TargetScalarType = DestTy;
2175 else if (TargetScalarType != DestTy)
2176 return false;
2177
2178 OriginalCost +=
2179 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2181 }
2182
// No users at all: nothing to scalarize.
2183 if (!TargetScalarType)
2184 return false;
2185
2186 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2187 InstructionCost ScalarizedCost =
2188 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2190
2191 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2192 << "\n OriginalCost: " << OriginalCost
2193 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2194
// Only transform when the scalar load is strictly cheaper.
2195 if (ScalarizedCost >= OriginalCost)
2196 return false;
2197
2198 // Ensure we add the load back to the worklist BEFORE its users so they can
2199 // be erased in the correct order.
2200 Worklist.push(LI);
2201
// The scalar load is created at the vector load's position and inherits its
// alignment and metadata.
2202 Builder.SetInsertPoint(LI);
2203 auto *ScalarLoad =
2204 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2205 ScalarLoad->setAlignment(LI->getAlign());
2206 ScalarLoad->copyMetadata(*LI);
2207
2208 // Replace all bitcast users with the scalar load.
2209 for (User *U : LI->users()) {
2210 auto *BC = cast<BitCastInst>(U);
2211 replaceValue(*BC, *ScalarLoad, false);
2212 }
2213
2214 return true;
2215}
2216
// Replaces the extracts of a vector zext by scalar shift-and-mask operations
// on the zext's source vector bitcast to a single integer. Returns true if
// the extracts were rewritten.
// NOTE(review): embedded lines 2218, 2238, 2256-2257, 2259, 2261-2262 and 2289
// are absent from this listing. The guard of the first 'return false', parts
// of the cost expressions and part of the freeze condition are therefore not
// visible here -- consult the upstream file.
2217bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2219 return false;
2220 auto *Ext = dyn_cast<ZExtInst>(&I);
2221 if (!Ext)
2222 return false;
2223
2224 // Try to convert a vector zext feeding only extracts to a set of scalar
2225 // (Src >> (ExtIdx * Size)) & ((1 << Size) - 1)
2226 // operations, if profitable.
2227 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2228 if (!SrcTy)
2229 return false;
2230 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2231
// The whole source vector must be exactly as wide as one destination
// element, so the bitcast source has the destination's scalar width.
2232 Type *ScalarDstTy = DstTy->getElementType();
2233 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2234 return false;
2235
2236 InstructionCost VectorCost =
2237 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2239 unsigned ExtCnt = 0;
2240 bool ExtLane0 = false;
// All users must be extracts with a constant index. Extracts without uses
// are not counted towards the costs.
2241 for (User *U : Ext->users()) {
2242 uint64_t Idx;
2243 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2244 return false;
2245 if (cast<Instruction>(U)->use_empty())
2246 continue;
2247 ExtCnt += 1;
2248 ExtLane0 |= !Idx;
2249 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2250 CostKind, Idx, U);
2251 }
2252
// Scalar cost: one 'and' per counted extract, plus shifts for all counted
// extracts except a lane-0 extract (ExtCnt - ExtLane0).
2253 InstructionCost ScalarCost =
2254 ExtCnt * TTI.getArithmeticInstrCost(
2255 Instruction::And, ScalarDstTy, CostKind,
2258 (ExtCnt - ExtLane0) *
2260 Instruction::LShr, ScalarDstTy, CostKind,
2263 if (ScalarCost > VectorCost)
2264 return false;
2265
2266 Value *ScalarV = Ext->getOperand(0);
2267 if (!isGuaranteedNotToBePoison(ScalarV, SQ.AC, dyn_cast<Instruction>(ScalarV),
2268 SQ.DT)) {
2269 // Check whether all lanes are extracted, all extracts trigger UB
2270 // on poison, and the last extract (and hence all previous ones)
2271 // are guaranteed to execute if Ext executes. If so, we do not
2272 // need to insert a freeze.
2273 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2274 bool AllExtractsTriggerUB = true;
2275 ExtractElementInst *LastExtract = nullptr;
2276 BasicBlock *ExtBB = Ext->getParent();
2277 for (User *U : Ext->users()) {
2278 auto *Extract = cast<ExtractElementInst>(U);
2279 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2280 AllExtractsTriggerUB = false;
2281 break;
2282 }
2283 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2284 if (!LastExtract || LastExtract->comesBefore(Extract))
2285 LastExtract = Extract;
2286 }
2287 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2288 !AllExtractsTriggerUB ||
2290 LastExtract->getIterator()))
2291 ScalarV = Builder.CreateFreeze(ScalarV);
2292 }
// Reinterpret the (possibly frozen) source vector as one integer of the
// vector's total bit width.
2293 ScalarV = Builder.CreateBitCast(
2294 ScalarV,
2295 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2296 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2297 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
// Mask that keeps the low SrcEltSizeInBits bits, i.e. a single source element.
2298 APInt EltBitMask = APInt::getLowBitsSet(TotalBits, SrcEltSizeInBits);
2299 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2300 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2301 for (User *U : Ext->users()) {
2302 auto *Extract = cast<ExtractElementInst>(U);
2303 uint64_t Idx =
2304 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
// The shift amount counts from the low bits on little-endian targets and from
// the high bits on big-endian targets.
2305 uint64_t ShiftAmt =
2306 DL->isBigEndian()
2307 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2308 : (Idx * SrcEltSizeInBits);
2309 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2310 Value *And = Builder.CreateAnd(LShr, Mask);
2311 U->replaceAllUsesWith(And);
2312 }
2313 return true;
2314}
2315
2316/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2317/// to "(bitcast (concat X, Y))"
2318/// where X/Y are bitcasted from i1 mask vectors.
///
/// Returns true if \p I was replaced. Only little-endian targets and integer
/// results are handled.
// NOTE(review): embedded lines 2330, 2339, 2347, 2384, 2386, 2389, 2392 and
// 2395 are absent from this listing. The matcher that binds X and Y, parts of
// the shl matchers and the tails of several cost queries are therefore not
// visible here -- consult the upstream file.
2319bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2320 Type *Ty = I.getType();
2321 if (!Ty->isIntegerTy())
2322 return false;
2323
2324 // TODO: Add big endian test coverage
2325 if (DL->isBigEndian())
2326 return false;
2327
2328 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2329 Instruction *X, *Y;
2331 return false;
2332
2333 // Allow both sources to contain shl, to handle more generic pattern:
2334 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
// ShAmtX/ShAmtY stay 0 when the corresponding side has no shl.
2335 Value *SrcX;
2336 uint64_t ShAmtX = 0;
2337 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2338 !match(X, m_OneUse(
2340 m_ConstantInt(ShAmtX)))))
2341 return false;
2342
2343 Value *SrcY;
2344 uint64_t ShAmtY = 0;
2345 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2346 !match(Y, m_OneUse(
2348 m_ConstantInt(ShAmtY)))))
2349 return false;
2350
2351 // Canonicalize larger shift to the RHS.
2352 if (ShAmtX > ShAmtY) {
2353 std::swap(X, Y);
2354 std::swap(SrcX, SrcY);
2355 std::swap(ShAmtX, ShAmtY);
2356 }
2357
2358 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2359 // difference is the mask width so they can be easily concatenated together.
// The mask width is also limited to half the result width, so the
// concatenation (twice the mask width) fits into the result type.
2360 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2361 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2362 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2363 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2364 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2365 !MaskTy->getElementType()->isIntegerTy(1) ||
2366 MaskTy->getNumElements() != ShAmtDiff ||
2367 MaskTy->getNumElements() > (BitWidth / 2))
2368 return false;
2369
2370 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2371 auto *ConcatIntTy =
2372 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2373 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2374
// Shuffle mask 0, 1, ..., 2N-1: all elements of SrcX followed by all elements
// of SrcY.
2375 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2376 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2377
2378 // TODO: Is it worth supporting multi use cases?
// Old cost: the or, the shifts that are present, two zexts and two bitcasts.
2379 InstructionCost OldCost = 0;
2380 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2381 OldCost +=
2382 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2383 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2385 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2387
// New cost: one bitcast of the concatenation, a zext if the result is wider,
// and a shl if the lower mask was itself shifted (plus whatever is added on
// the absent line 2389).
2388 InstructionCost NewCost = 0;
2390 MaskTy, ConcatMask, CostKind);
2391 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2393 if (Ty != ConcatIntTy)
2394 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2396 if (ShAmtX > 0)
2397 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2398
2399 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2400 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2401 << "\n");
2402
// The fold is still performed when the costs are equal.
2403 if (NewCost > OldCost)
2404 return false;
2405
2406 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2407 // any residual zero-extension or shifting.
2408 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2409 Worklist.pushValue(Concat);
2410
2411 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2412
2413 if (Ty != ConcatIntTy) {
2414 Worklist.pushValue(Result);
2415 Result = Builder.CreateZExt(Result, Ty);
2416 }
2417
2418 if (ShAmtX > 0) {
2419 Worklist.pushValue(Result);
2420 Result = Builder.CreateShl(Result, ShAmtX);
2421 }
2422
2423 replaceValue(I, *Result);
2424 return true;
2425}
2426
2427/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2428/// --> "binop (shuffle), (shuffle)".
///
/// At least one operand of the binop must itself be a shuffle. The outer mask
/// is merged into the inner masks, and the fold is performed when the new
/// sequence is not more expensive than the old one. Returns true if \p I was
/// replaced.
// NOTE(review): embedded lines 2494, 2501, 2509, 2521 and 2525 are absent
// from this listing, so the heads of several shuffle cost queries (including
// the declarations of Shuf0Cost and Shuf1Cost) are not visible here -- consult
// the upstream file.
2429bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2430 BinaryOperator *BinOp;
2431 ArrayRef<int> OuterMask;
2432 if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2433 return false;
2434
2435 // Don't introduce poison into div/rem.
2436 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2437 return false;
2438
2439 Value *Op00, *Op01, *Op10, *Op11;
2440 ArrayRef<int> Mask0, Mask1;
2441 bool Match0 = match(BinOp->getOperand(0),
2442 m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2443 bool Match1 = match(BinOp->getOperand(1),
2444 m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2445 if (!Match0 && !Match1)
2446 return false;
2447
// For an operand that is not a shuffle, use the operand itself as both
// shuffle sources; its merged mask is then just the outer mask.
2448 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2449 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2450 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2451 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2452
2453 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2454 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2455 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2456 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2457 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2458 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2459 return false;
2460
2461 unsigned NumSrcElts = BinOpTy->getNumElements();
2462
2463 // Don't accept shuffles that reference the second operand in
2464 // div/rem or if the second operand is not poison (e.g. plain undef).
2465 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2466 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2467 return false;
2468
2469 // Merge outer / inner (or identity if no match) shuffles.
// Outer lanes that are poison or select from the second operand become poison.
2470 SmallVector<int> NewMask0, NewMask1;
2471 for (int M : OuterMask) {
2472 if (M < 0 || M >= (int)NumSrcElts) {
2473 NewMask0.push_back(PoisonMaskElem);
2474 NewMask1.push_back(PoisonMaskElem);
2475 } else {
2476 NewMask0.push_back(Match0 ? Mask0[M] : M);
2477 NewMask1.push_back(Match1 ? Mask1[M] : M);
2478 }
2479 }
2480
// A merged mask that is an identity over the first source needs no new
// shuffle; that source is then used directly.
// NOTE(review): NumOpElts is taken from Op0Ty but is also used in the Op1
// identity check below -- verify that this is intended.
2481 unsigned NumOpElts = Op0Ty->getNumElements();
2482 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2483 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2484 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2485 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2486 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2487 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2488
2489 InstructionCost NewCost = 0;
2490 // Try to merge shuffles across the binop if the new shuffles are not costly.
2491 InstructionCost BinOpCost =
2492 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2493 InstructionCost OldCost =
2495 ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2496 0, nullptr, {BinOp}, &I);
// With other users the original binop is still charged to the new sequence.
2497 if (!BinOp->hasOneUse())
2498 NewCost += BinOpCost;
2499
// An inner shuffle is likewise charged to the new sequence when the binop or
// that shuffle has other users.
2500 if (Match0) {
2502 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2503 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2504 OldCost += Shuf0Cost;
2505 if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2506 NewCost += Shuf0Cost;
2507 }
2508 if (Match1) {
2510 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2511 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2512 OldCost += Shuf1Cost;
2513 if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2514 NewCost += Shuf1Cost;
2515 }
2516
// The new binop is costed on the shuffle's result type.
2517 NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2518
2519 if (!IsIdentity0)
2520 NewCost +=
2522 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2523 if (!IsIdentity1)
2524 NewCost +=
2526 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2527
2528 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2529 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2530 << "\n");
2531
2532 // If costs are equal, still fold as we reduce instruction count.
2533 if (NewCost > OldCost)
2534 return false;
2535
2536 Value *LHS =
2537 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2538 Value *RHS =
2539 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2540 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2541
2542 // Intersect flags from the old binops.
2543 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2544 NewInst->copyIRFlags(BinOp);
2545
2546 Worklist.pushValue(LHS);
2547 Worklist.pushValue(RHS);
2548 replaceValue(I, *NewBO);
2549 return true;
2550}
2551
2552/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2553/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
///
/// Both shuffle inputs must have the same opcode (and, for compares, the same
/// predicate). Returns true only after \p I has been replaced by the new
/// binop/compare; every bail-out path returns false.
///
/// NOTE(review): several source lines are absent from this listing. LHSCost,
/// RHSCost, SK0, SK1 and NewCost are used below without a visible declaration,
/// and a few call heads are missing. Restore them before building.
2554bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2555 ArrayRef<int> OldMask;
2556 Instruction *LHS, *RHS;
// NOTE(review): the head of the match(...) condition is missing here;
// presumably it binds LHS and RHS to the two shuffle operands - confirm.
2558 m_Mask(OldMask))))
2559 return false;
2560
2561 // TODO: Add support for addlike etc.
2562 if (LHS->getOpcode() != RHS->getOpcode())
2563 return false;
2564
// X/Y are the operands of LHS, Z/W the operands of RHS.
2565 Value *X, *Y, *Z, *W;
2566 bool IsCommutative = false;
// PredLHS stays BAD_ICMP_PREDICATE on the binop path; the code below uses that
// to tell the binop case from the compare case.
2567 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2568 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2569 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2570 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2571 auto *BO = cast<BinaryOperator>(LHS);
2572 // Don't introduce poison into div/rem.
2573 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2574 return false;
2575 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2576 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2577 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2578 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2579 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2580 } else
2581 return false;
2582
// ShuffleDstTy: type of I. BinResTy: result type of LHS. BinOpTy: type of the
// first operand of LHS. All three must be fixed vectors.
2583 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2584 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2585 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2586 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2587 return false;
2588
2589 bool SameBinOp = LHS == RHS;
2590 unsigned NumSrcElts = BinOpTy->getNumElements();
2591
2592 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2593 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2594 std::swap(X, Y);
2595
// Rebases a mask index that selects from the second shuffle source onto the
// same lane of the first source.
2596 auto ConvertToUnary = [NumSrcElts](int &M) {
2597 if (M >= (int)NumSrcElts)
2598 M -= NumSrcElts;
2599 };
2600
// NewMask0 drives the shuffle of the first operands (X, Z). When both sides
// share that operand the mask is made unary and Z is replaced by poison.
2601 SmallVector<int> NewMask0(OldMask);
2603 TTI::OperandValueInfo Op0Info = TTI.commonOperandInfo(X, Z);
2604 if (X == Z) {
2605 llvm::for_each(NewMask0, ConvertToUnary);
2607 Z = PoisonValue::get(BinOpTy);
2608 }
2609
// Same treatment for the second operands (Y, W) with NewMask1.
2610 SmallVector<int> NewMask1(OldMask);
2612 TTI::OperandValueInfo Op1Info = TTI.commonOperandInfo(Y, W);
2613 if (Y == W) {
2614 llvm::for_each(NewMask1, ConvertToUnary);
2616 W = PoisonValue::get(BinOpTy);
2617 }
2618
2619 // Try to replace a binop with a shuffle if the shuffle is not costly.
2620 // When SameBinOp, only count the binop cost once.
// NOTE(review): the definitions of LHSCost and RHSCost are missing here.
2623
2624 InstructionCost OldCost = LHSCost;
2625 if (!SameBinOp) {
2626 OldCost += RHSCost;
2627 }
// NOTE(review): the head of this statement is missing; judging by the
// arguments it is presumably the cost of the original shuffle I - confirm.
2629 ShuffleDstTy, BinResTy, OldMask, CostKind, 0,
2630 nullptr, {LHS, RHS}, &I);
2631
2632 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2633 // where one use shuffles have gotten split across the binop/cmp. These
2634 // often allow a major reduction in total cost that wouldn't happen as
2635 // individual folds.
// On success, entries of Mask in [Offset, Offset + NumSrcElts) are rewritten
// through the inner shuffle's mask, Op is replaced by the inner shuffle's
// source, and true is returned. Otherwise nothing is changed.
2636 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2637 TTI::TargetCostKind CostKind) -> bool {
2638 Value *InnerOp;
2639 ArrayRef<int> InnerMask;
2640 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2641 m_Mask(InnerMask)))) &&
2642 InnerOp->getType() == Op->getType() &&
2643 all_of(InnerMask,
2644 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2645 for (int &M : Mask)
2646 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2647 M = InnerMask[M - Offset];
// Negative (poison) inner mask entries are kept as they are.
2648 M = 0 <= M ? M + Offset : M;
2649 }
// NOTE(review): one source line is missing at this point in the listing.
2651 Op = InnerOp;
2652 return true;
2653 }
2654 return false;
2655 };
2656 bool ReducedInstCount = false;
2657 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2658 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2659 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2660 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
// Both new shuffles would be identical, so a single one can feed both operands
// of the new binop/compare.
2661 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2662 // SingleSrcBinOp only reduces instruction count if we also eliminate the
2663 // original binop(s). If binops have multiple uses, they won't be eliminated.
2664 ReducedInstCount |= SingleSrcBinOp && LHS->hasOneUser() && RHS->hasOneUser();
2665
// Type of the shuffled operands: element type of the binop operands with the
// element count of the shuffle result.
2666 auto *ShuffleCmpTy =
2667 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
// NOTE(review): the head of this statement is missing; NewCost is used below,
// so this is presumably its initialization - confirm.
2669 SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2670 if (!SingleSrcBinOp)
2671 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2672 CostKind, 0, nullptr, {Y, W});
2673
2674 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2675 NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy,
2676 CostKind, Op0Info, Op1Info);
2677 } else {
2678 NewCost +=
2679 TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, ShuffleDstTy,
2680 PredLHS, CostKind, Op0Info, Op1Info);
2681 }
2682 // If LHS/RHS have other uses, we need to account for the cost of keeping
2683 // the original instructions. When SameBinOp, only add the cost once.
2684 if (!LHS->hasOneUser())
2685 NewCost += LHSCost;
2686 if (!SameBinOp && !RHS->hasOneUser())
2687 NewCost += RHSCost;
2688
2689 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2690 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2691 << "\n");
2692
2693 // If either shuffle will constant fold away, then fold for the same cost as
2694 // we will reduce the instruction count.
2695 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2696 (isa<Constant>(Y) && isa<Constant>(W));
// Equal cost is only accepted when the instruction count goes down; otherwise
// the new sequence must be strictly cheaper.
2697 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2698 return false;
2699
2700 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2701 Value *Shuf1 =
2702 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2703 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2704 ? Builder.CreateBinOp(
2705 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2706 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2707
2708 // Intersect flags from the old binops.
2709 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2710 NewInst->copyIRFlags(LHS);
2711 NewInst->andIRFlags(RHS);
2712 }
2713
// Queue the new shuffles so they are visited again.
2714 Worklist.pushValue(Shuf0);
2715 Worklist.pushValue(Shuf1);
2716 replaceValue(I, *NewBO);
2717 return true;
2718}
2719
2720/// Try to convert,
2721/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2722/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
///
/// Requires both select conditions to be fixed vectors of the same type, and
/// both selects to carry identical fast-math flags (or neither to be an
/// FPMathOperator). Returns true iff \p I was replaced.
///
/// NOTE(review): several source lines are absent from this listing. SK,
/// CostSel1, CostSel2 and NewCost are used without a visible declaration.
/// Restore them before building.
2723bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2724 ArrayRef<int> Mask;
2725 Value *C1, *T1, *F1, *C2, *T2, *F2;
2726 if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2727 m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2728 m_Mask(Mask))))
2729 return false;
2730
2731 auto *Sel1 = cast<Instruction>(I.getOperand(0));
2732 auto *Sel2 = cast<Instruction>(I.getOperand(1));
2733
// A non-vector condition (dyn_cast yields null) or mismatched condition types
// are rejected.
2734 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2735 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2736 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2737 return false;
2738
2739 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2740 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2741 // SelectInsts must have the same FMF.
2742 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2743 ((SI0FOp != nullptr) &&
2744 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2745 return false;
2746
2747 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2748 auto *DstVecTy = cast<FixedVectorType>(I.getType());
// NOTE(review): the declaration of SK is missing here.
2750 auto SelOp = Instruction::Select;
2751
// NOTE(review): the heads of the next two statements are missing; CostSel1 and
// CostSel2 are used below, so these are presumably their initializations.
2753 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2755 SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2756
// Old sequence: both selects plus the shuffle I.
2757 InstructionCost OldCost =
2758 CostSel1 + CostSel2 +
2759 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2760 {I.getOperand(0), I.getOperand(1)}, &I);
2761
// New sequence: one shuffle each for the conditions, the true values and the
// false values, plus one select on the shuffled vectors.
// NOTE(review): the head of the next statement (presumably the initialization
// of NewCost) is missing.
2763 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2764 Mask, CostKind, 0, nullptr, {C1, C2});
2765 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2766 nullptr, {T1, T2});
2767 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2768 nullptr, {F1, F2});
2769 auto *C1C2ShuffledVecTy = FixedVectorType::get(
2770 Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements());
// NOTE(review): the remaining arguments of this call are missing.
2771 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2773
// A select with other uses stays alive, so its cost is charged to the new
// sequence as well.
2774 if (!Sel1->hasOneUse())
2775 NewCost += CostSel1;
2776 if (!Sel2->hasOneUse())
2777 NewCost += CostSel2;
2778
2779 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2780 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2781 << "\n");
// Equal cost is accepted.
2782 if (NewCost > OldCost)
2783 return false;
2784
2785 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2786 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2787 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2788 Value *NewSel;
2789 // We presuppose that the SelectInsts have the same FMF.
2790 if (SI0FOp)
2791 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2792 SI0FOp->getFastMathFlags());
2793 else
2794 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2795
2796 Worklist.pushValue(ShuffleCmp);
2797 Worklist.pushValue(ShuffleTrue);
2798 Worklist.pushValue(ShuffleFalse);
2799 replaceValue(I, *NewSel);
2800 return true;
2801}
2802
2803/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2804/// into "castop (shuffle)".
///
/// Also handles the unary form where the second shuffle operand is undef,
/// except for bitcasts. Returns true iff \p I was replaced.
///
/// NOTE(review): several source lines are absent from this listing. The
/// declaration and assignments of ShuffleKind and the tails of the
/// getCastInstrCost calls are not visible. Restore them before building.
2805bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2806 Value *V0, *V1;
2807 ArrayRef<int> OldMask;
2808 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2809 return false;
2810
2811 // Check whether this is a binary shuffle.
2812 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2813
// C1 is only required (and only dereferenced) for binary shuffles.
2814 auto *C0 = dyn_cast<CastInst>(V0);
2815 auto *C1 = dyn_cast<CastInst>(V1);
2816 if (!C0 || (IsBinaryShuffle && !C1))
2817 return false;
2818
2819 Instruction::CastOps Opcode = C0->getOpcode();
2820
2821 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2822 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2823 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2824 return false;
2825
2826 if (IsBinaryShuffle) {
2827 if (C0->getSrcTy() != C1->getSrcTy())
2828 return false;
2829 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2830 if (Opcode != C1->getOpcode()) {
2831 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2832 Opcode = Instruction::SExt;
2833 else
2834 return false;
2835 }
2836 }
2837
2838 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2839 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2840 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2841 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2842 return false;
2843
2844 unsigned NumSrcElts = CastSrcTy->getNumElements();
2845 unsigned NumDstElts = CastDstTy->getNumElements();
2846 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2847 "Only bitcasts expected to alter src/dst element counts");
2848
2849 // Bail on bitcasts whose source and destination element counts are not
2850 // whole multiples of one another, e.g. <32 x i40> -> <40 x i32>
2851 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2852 (NumDstElts % NumSrcElts) != 0)
2853 return false;
2854
// NewMask is OldMask re-expressed in terms of the cast source's elements.
2855 SmallVector<int, 16> NewMask;
2856 if (NumSrcElts >= NumDstElts) {
2857 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2858 // always be expanded to the equivalent form choosing narrower elements.
2859 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2860 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2861 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2862 } else {
2863 // The bitcast is from narrow elements to wide elements. The shuffle mask
2864 // must choose consecutive elements to allow casting first.
2865 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2866 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2867 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2868 return false;
2869 }
2870
2871 auto *NewShuffleDstTy =
2872 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2873
2874 // Try to replace a castop with a shuffle if the shuffle is not costly.
2875 InstructionCost CostC0 =
// NOTE(review): the tail of this call is missing.
2876 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2878
// NOTE(review): the declaration of ShuffleKind and the bodies of both branches
// below are missing; presumably they select a two-source or single-source
// shuffle kind - confirm.
2880 if (IsBinaryShuffle)
2882 else
2884
2885 InstructionCost OldCost = CostC0;
2886 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2887 CostKind, 0, nullptr, {}, &I);
2888
2889 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2890 CastSrcTy, NewMask, CostKind);
// NOTE(review): the tail of this call is missing.
2891 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
// A cast with other uses stays alive, so its cost is charged to the new
// sequence as well.
2893 if (!C0->hasOneUse())
2894 NewCost += CostC0;
2895 if (IsBinaryShuffle) {
2896 InstructionCost CostC1 =
// NOTE(review): the tail of this call is missing.
2897 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2899 OldCost += CostC1;
2900 if (!C1->hasOneUse())
2901 NewCost += CostC1;
2902 }
2903
2904 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2905 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2906 << "\n");
// Equal cost is accepted.
2907 if (NewCost > OldCost)
2908 return false;
2909
2910 Value *Shuf;
2911 if (IsBinaryShuffle)
2912 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2913 NewMask);
2914 else
2915 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2916
2917 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2918
2919 // Intersect flags from the old casts.
2920 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2921 NewInst->copyIRFlags(C0);
2922 if (IsBinaryShuffle)
2923 NewInst->andIRFlags(C1);
2924 }
2925
2926 Worklist.pushValue(Shuf);
2927 replaceValue(I, *Cast);
2928 return true;
2929}
2930
2931/// Try to convert any of:
2932/// "shuffle (shuffle x, y), (shuffle y, x)"
2933/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2934/// "shuffle (shuffle x, undef), y"
2935/// "shuffle x, (shuffle y, undef)"
2936/// into "shuffle x, y".
2937bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2938 ArrayRef<int> OuterMask;
2939 Value *OuterV0, *OuterV1;
2940 if (!match(&I,
2941 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2942 return false;
2943
2944 ArrayRef<int> InnerMask0, InnerMask1;
2945 Value *X0, *X1, *Y0, *Y1;
2946 bool Match0 =
2947 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2948 bool Match1 =
2949 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2950 if (!Match0 && !Match1)
2951 return false;
2952
2953 // If the outer shuffle is a permute, then create a fake inner all-poison
2954 // shuffle. This is easier than accounting for length-changing shuffles below.
2955 SmallVector<int, 16> PoisonMask1;
2956 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2957 X1 = X0;
2958 Y1 = Y0;
2959 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2960 InnerMask1 = PoisonMask1;
2961 Match1 = true; // fake match
2962 }
2963
2964 X0 = Match0 ? X0 : OuterV0;
2965 Y0 = Match0 ? Y0 : OuterV0;
2966 X1 = Match1 ? X1 : OuterV1;
2967 Y1 = Match1 ? Y1 : OuterV1;
2968 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2969 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2970 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2971 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2972 X0->getType() != X1->getType())
2973 return false;
2974
2975 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2976 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2977
2978 // Attempt to merge shuffles, matching upto 2 source operands.
2979 // Replace index to a poison arg with PoisonMaskElem.
2980 // Bail if either inner masks reference an undef arg.
2981 SmallVector<int, 16> NewMask(OuterMask);
2982 Value *NewX = nullptr, *NewY = nullptr;
2983 for (int &M : NewMask) {
2984 Value *Src = nullptr;
2985 if (0 <= M && M < (int)NumImmElts) {
2986 Src = OuterV0;
2987 if (Match0) {
2988 M = InnerMask0[M];
2989 Src = M >= (int)NumSrcElts ? Y0 : X0;
2990 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2991 }
2992 } else if (M >= (int)NumImmElts) {
2993 Src = OuterV1;
2994 M -= NumImmElts;
2995 if (Match1) {
2996 M = InnerMask1[M];
2997 Src = M >= (int)NumSrcElts ? Y1 : X1;
2998 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2999 }
3000 }
3001 if (Src && M != PoisonMaskElem) {
3002 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
3003 if (isa<UndefValue>(Src)) {
3004 // We've referenced an undef element - if its poison, update the shuffle
3005 // mask, else bail.
3006 if (!isa<PoisonValue>(Src))
3007 return false;
3008 M = PoisonMaskElem;
3009 continue;
3010 }
3011 if (!NewX || NewX == Src) {
3012 NewX = Src;
3013 continue;
3014 }
3015 if (!NewY || NewY == Src) {
3016 M += NumSrcElts;
3017 NewY = Src;
3018 continue;
3019 }
3020 return false;
3021 }
3022 }
3023
3024 if (!NewX)
3025 return PoisonValue::get(ShuffleDstTy);
3026 if (!NewY)
3027 NewY = PoisonValue::get(ShuffleSrcTy);
3028
3029 // Have we folded to an Identity shuffle?
3030 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
3031 replaceValue(I, *NewX);
3032 return true;
3033 }
3034
3035 // Try to merge the shuffles if the new shuffle is not costly.
3036 InstructionCost InnerCost0 = 0;
3037 if (Match0)
3038 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
3039
3040 InstructionCost InnerCost1 = 0;
3041 if (Match1)
3042 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
3043
3045
3046 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
3047
3048 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
3052 InstructionCost NewCost =
3053 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
3054 nullptr, {NewX, NewY});
3055 if (!OuterV0->hasOneUse())
3056 NewCost += InnerCost0;
3057 if (!OuterV1->hasOneUse())
3058 NewCost += InnerCost1;
3059
3060 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
3061 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3062 << "\n");
3063 if (NewCost > OldCost)
3064 return false;
3065
3066 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
3067 replaceValue(I, *Shuf);
3068 return true;
3069}
3070
3071/// Try to convert a chain of length-preserving shuffles that are fed by
3072/// length-changing shuffles from the same source, e.g. a chain of length 3:
3073///
3074/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
3075/// (shuffle y, undef)),
3076/// (shuffle y, undef)"
3077///
3078/// into a single shuffle fed by a length-changing shuffle:
3079///
3080/// "shuffle x, (shuffle y, undef)"
3081///
3082/// Such chains arise e.g. from folding extract/insert sequences.
///
/// Returns true iff a chain of at least two links was found and \p I was
/// replaced.
///
/// NOTE(review): some source lines are absent from this listing; the
/// right-hand sides of LocalOldCost and LocalNewCost are incomplete. Restore
/// them before building.
3083bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
3084 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
3085 if (!TrunkType)
3086 return false;
3087
// Mask: accumulated mask of the merged trunk shuffle.
// YMask: accumulated mask of the merged leaf shuffle of Y.
// Trunk: the chain link currently being examined, starting at I.
// Y: the common non-undef source of all leaf shuffles seen so far.
3088 unsigned ChainLength = 0;
3089 SmallVector<int> Mask;
3090 SmallVector<int> YMask;
3091 InstructionCost OldCost = 0;
3092 InstructionCost NewCost = 0;
3093 Value *Trunk = &I;
3094 unsigned NumTrunkElts = TrunkType->getNumElements();
3095 Value *Y = nullptr;
3096
3097 for (;;) {
3098 // Match the current trunk against (commutations of) the pattern
3099 // "shuffle trunk', (shuffle y, undef)"
3100 ArrayRef<int> OuterMask;
3101 Value *OuterV0, *OuterV1;
// Links other than the root I must have a single use.
3102 if (ChainLength != 0 && !Trunk->hasOneUse())
3103 break;
3104 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
3105 m_Mask(OuterMask))))
3106 break;
3107 if (OuterV0->getType() != TrunkType) {
3108 // This shuffle is not length-preserving, so it cannot be part of the
3109 // chain.
3110 break;
3111 }
3112
3113 ArrayRef<int> InnerMask0, InnerMask1;
3114 Value *A0, *A1, *B0, *B1;
3115 bool Match0 =
3116 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
3117 bool Match1 =
3118 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
// A "leaf" is an operand shuffle whose source type differs from I's type.
3119 bool Match0Leaf = Match0 && A0->getType() != I.getType();
3120 bool Match1Leaf = Match1 && A1->getType() != I.getType();
3121 if (Match0Leaf == Match1Leaf) {
3122 // Only handle the case of exactly one leaf in each step. The "two leaves"
3123 // case is handled by foldShuffleOfShuffles.
3124 break;
3125 }
3126
// Normalize so the leaf is always the second operand, swapping the two halves
// of the outer mask's index space accordingly.
3127 SmallVector<int> CommutedOuterMask;
3128 if (Match0Leaf) {
3129 std::swap(OuterV0, OuterV1);
3130 std::swap(InnerMask0, InnerMask1);
3131 std::swap(A0, A1);
3132 std::swap(B0, B1);
3133 llvm::append_range(CommutedOuterMask, OuterMask);
3134 for (int &M : CommutedOuterMask) {
3135 if (M == PoisonMaskElem)
3136 continue;
3137 if (M < (int)NumTrunkElts)
3138 M += NumTrunkElts;
3139 else
3140 M -= NumTrunkElts;
3141 }
3142 OuterMask = CommutedOuterMask;
3143 }
3144 if (!OuterV1->hasOneUse())
3145 break;
3146
// Every non-undef leaf operand must be the same value Y across the chain.
3147 if (!isa<UndefValue>(A1)) {
3148 if (!Y)
3149 Y = A1;
3150 else if (Y != A1)
3151 break;
3152 }
3153 if (!isa<UndefValue>(B1)) {
3154 if (!Y)
3155 Y = B1;
3156 else if (Y != B1)
3157 break;
3158 }
3159
// Fold indices into the leaf's second operand down onto the first operand.
3160 auto *YType = cast<FixedVectorType>(A1->getType());
3161 int NumLeafElts = YType->getNumElements();
3162 SmallVector<int> LocalYMask(InnerMask1);
3163 for (int &M : LocalYMask) {
3164 if (M >= NumLeafElts)
3165 M -= NumLeafElts;
3166 }
3167
// NOTE(review): the right-hand side of this initialization is missing.
3168 InstructionCost LocalOldCost =
3171
3172 // Handle the initial (start of chain) case.
3173 if (!ChainLength) {
3174 Mask.assign(OuterMask);
3175 YMask.assign(LocalYMask);
3176 OldCost = NewCost = LocalOldCost;
3177 Trunk = OuterV0;
3178 ChainLength++;
3179 continue;
3180 }
3181
3182 // For the non-root case, first attempt to combine masks.
// Two leaf masks are compatible when, lane by lane, they agree or at least one
// of them is -1; any other mismatch ends the chain.
3183 SmallVector<int> NewYMask(YMask);
3184 bool Valid = true;
3185 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3186 if (LeafM == -1 || CombinedM == LeafM)
3187 continue;
3188 if (CombinedM == -1) {
3189 CombinedM = LeafM;
3190 } else {
3191 Valid = false;
3192 break;
3193 }
3194 }
3195 if (!Valid)
3196 break;
3197
// Entries of Mask in [0, NumTrunkElts) are looked up through this link's outer
// mask; all other entries are copied unchanged.
3198 SmallVector<int> NewMask;
3199 NewMask.reserve(NumTrunkElts);
3200 for (int M : Mask) {
3201 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3202 NewMask.push_back(M);
3203 else
3204 NewMask.push_back(OuterMask[M]);
3205 }
3206
3207 // Break the chain if adding this new step complicates the shuffles such
3208 // that it would increase the new cost by more than the old cost of this
3209 // step.
// NOTE(review): the heads of both summed cost calls are missing.
3210 InstructionCost LocalNewCost =
3212 YType, NewYMask, CostKind) +
3214 TrunkType, NewMask, CostKind);
3215
3216 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3217 break;
3218
3219 LLVM_DEBUG({
3220 if (ChainLength == 1) {
3221 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3222 << I << '\n';
3223 }
3224 dbgs() << " next chain link: " << *Trunk << '\n'
3225 << " old cost: " << (OldCost + LocalOldCost)
3226 << " new cost: " << LocalNewCost << '\n';
3227 });
3228
// Commit this link and continue with its first operand.
3229 Mask = NewMask;
3230 YMask = NewYMask;
3231 OldCost += LocalOldCost;
3232 NewCost = LocalNewCost;
3233 Trunk = OuterV0;
3234 ChainLength++;
3235 }
// A single link is not a chain; nothing to merge.
3236 if (ChainLength <= 1)
3237 return false;
3238
3239 if (llvm::all_of(Mask, [&](int M) {
3240 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3241 })) {
3242 // Produce a canonical simplified form if all elements are sourced from Y.
3243 for (int &M : Mask) {
3244 if (M >= static_cast<int>(NumTrunkElts))
3245 M = YMask[M - NumTrunkElts];
3246 }
3247 Value *Root =
3248 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3249 replaceValue(I, *Root);
3250 return true;
3251 }
3252
// General case: one merged leaf shuffle of Y feeding one merged trunk shuffle.
3253 Value *Leaf =
3254 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3255 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3256 replaceValue(I, *Root);
3257 return true;
3258}
3259
3260/// Try to convert
3261/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
///
/// Both shuffle operands must be calls to the same trivially vectorizable
/// intrinsic. Returns true iff \p I was replaced.
///
/// NOTE(review): several source lines are absent from this listing. The
/// per-argument `if` conditions that pair with the `} else {` branches below
/// and the head of one getShuffleCost call are not visible. Restore them
/// before building.
3262bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3263 Value *V0, *V1;
3264 ArrayRef<int> OldMask;
3265 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3266 return false;
3267
3268 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3269 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3270 if (!II0 || !II1)
3271 return false;
3272
3273 Intrinsic::ID IID = II0->getIntrinsicID();
3274 if (IID != II1->getIntrinsicID())
3275 return false;
3276 InstructionCost CostII0 =
3277 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3278 InstructionCost CostII1 =
3279 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3280
3281 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3282 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3283 if (!ShuffleDstTy || !II0Ty)
3284 return false;
3285
3286 if (!isTriviallyVectorizable(IID))
3287 return false;
3288
// NOTE(review): the first half of the condition inside this loop is missing;
// what remains bails when the two calls differ in the argument at index I.
3289 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3291 II0->getArgOperand(I) != II1->getArgOperand(I))
3292 return false;
3293
// Old sequence: both intrinsic calls plus a shuffle.
// NOTE(review): the head of the shuffle-cost call is missing.
3294 InstructionCost OldCost =
3295 CostII0 + CostII1 +
3297 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3298
// New sequence: one shuffle per distinct pair of shuffled arguments, plus one
// intrinsic call on the shuffled arguments.
3299 SmallVector<Type *> NewArgsTy;
3300 InstructionCost NewCost = 0;
3301 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3302 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
// NOTE(review): the `if (...) {` head of this branch is missing; in this
// branch the argument's type is passed through unchanged.
3304 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3305 } else {
3306 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3307 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3308 ShuffleDstTy->getNumElements());
3309 NewArgsTy.push_back(ArgTy);
3310 std::pair<Value *, Value *> OperandPair =
3311 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3312 if (!SeenOperandPairs.insert(OperandPair).second) {
3313 // We've already computed the cost for this operand pair.
3314 continue;
3315 }
3316 NewCost += TTI.getShuffleCost(
3317 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3318 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3319 }
3320 }
3321 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3322
3323 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
// Calls with other uses stay alive, so charge them to the new sequence too.
// When both operands are the same call it is charged once.
3324 if (!II0->hasOneUse())
3325 NewCost += CostII0;
3326 if (II1 != II0 && !II1->hasOneUse())
3327 NewCost += CostII1;
3328
3329 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3330 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3331 << "\n");
3332
// Equal cost is accepted.
3333 if (NewCost > OldCost)
3334 return false;
3335
// Build the new argument list, creating each distinct shuffle only once.
3336 SmallVector<Value *> NewArgs;
3337 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3338 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
// NOTE(review): the `if (...) {` head of this branch is missing; in this
// branch the argument of II0 is reused as is.
3340 NewArgs.push_back(II0->getArgOperand(I));
3341 } else {
3342 std::pair<Value *, Value *> OperandPair =
3343 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3344 auto It = ShuffleCache.find(OperandPair);
3345 if (It != ShuffleCache.end()) {
3346 // Reuse previously created shuffle for this operand pair.
3347 NewArgs.push_back(It->second);
3348 continue;
3349 }
3350 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3351 II1->getArgOperand(I), OldMask);
3352 ShuffleCache[OperandPair] = Shuf;
3353 NewArgs.push_back(Shuf);
3354 Worklist.pushValue(Shuf);
3355 }
3356 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3357
3358 // Intersect flags from the old intrinsics.
3359 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3360 NewInst->copyIRFlags(II0);
3361 NewInst->andIRFlags(II1);
3362 }
3363
3364 replaceValue(I, *NewIntrinsic);
3365 return true;
3366}
3367
3368/// Try to convert
3369/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
///
/// The mask must only reference the first shuffle operand and the intrinsic
/// must be trivially vectorizable. Returns true iff \p I was replaced.
///
/// NOTE(review): several source lines are absent from this listing. The
/// declaration of IntrinsicCost, the per-argument `if` conditions that pair
/// with the `} else {` branches below, and the heads of the getShuffleCost
/// calls are not visible. Restore them before building.
3370bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3371 Value *V0;
3372 ArrayRef<int> Mask;
3373 if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
3374 return false;
3375
3376 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3377 if (!II0)
3378 return false;
3379
3380 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3381 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3382 if (!ShuffleDstTy || !IntrinsicSrcTy)
3383 return false;
3384
3385 // Validate it's a pure permute, mask should only reference the first vector
3386 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3387 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3388 return false;
3389
3390 Intrinsic::ID IID = II0->getIntrinsicID();
3391 if (!isTriviallyVectorizable(IID))
3392 return false;
3393
3394 // Cost analysis
// NOTE(review): the head of this statement is missing; IntrinsicCost is used
// below, so this is presumably its initialization - confirm.
3396 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
// NOTE(review): most of the right-hand side of OldCost is missing.
3397 InstructionCost OldCost =
3400 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3401
// New sequence: one permute per shuffled argument plus one intrinsic call on
// the permuted arguments.
3402 SmallVector<Type *> NewArgsTy;
3403 InstructionCost NewCost = 0;
3404 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
// NOTE(review): the `if (...) {` head of this branch is missing; in this
// branch the argument's type is passed through unchanged.
3406 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3407 } else {
3408 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3409 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3410 ShuffleDstTy->getNumElements());
3411 NewArgsTy.push_back(ArgTy);
// NOTE(review): the head of this cost call is missing.
3413 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3414 {II0->getArgOperand(I)});
3415 }
3416 }
3417 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3418 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3419
3420 // If the intrinsic has multiple uses, we need to account for the cost of
3421 // keeping the original intrinsic around.
3422 if (!II0->hasOneUse())
3423 NewCost += IntrinsicCost;
3424
3425 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3426 << OldCost << " vs NewCost: " << NewCost << "\n");
3427
// Equal cost is accepted.
3428 if (NewCost > OldCost)
3429 return false;
3430
3431 // Transform
3432 SmallVector<Value *> NewArgs;
3433 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
// NOTE(review): the `if (...) {` head of this branch is missing; in this
// branch the argument of II0 is reused as is.
3435 NewArgs.push_back(II0->getArgOperand(I));
3436 } else {
3437 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3438 NewArgs.push_back(Shuf);
3439 Worklist.pushValue(Shuf);
3440 }
3441 }
3442
3443 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3444
// Carry the IR flags of the original call over to the new one.
3445 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3446 NewInst->copyIRFlags(II0);
3447
3448 replaceValue(I, *NewIntrinsic);
3449 return true;
3450}
3451
/// A value paired with a lane index into it. lookThroughShuffles returns
/// {nullptr, PoisonMaskElem} when it hits a negative shuffle mask element.
3452using InstLane = std::pair<Value *, int>;
3453
3454static InstLane lookThroughShuffles(Value *V, int Lane) {
3455 while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
3456 unsigned NumElts =
3457 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3458 int M = SV->getMaskValue(Lane);
3459 if (M < 0)
3460 return {nullptr, PoisonMaskElem};
3461 if (static_cast<unsigned>(M) < NumElts) {
3462 V = SV->getOperand(0);
3463 Lane = M;
3464 } else {
3465 V = SV->getOperand(1);
3466 Lane = M - NumElts;
3467 }
3468 }
3469 return InstLane{V, Lane};
3470}
3471
3475 for (InstLane IL : Item) {
3476 auto [U, Lane] = IL;
3477 InstLane OpLane =
3478 U ? lookThroughShuffles(cast<Instruction>(U)->getOperand(Op), Lane)
3479 : InstLane{nullptr, PoisonMaskElem};
3480 NItem.emplace_back(OpLane);
3481 }
3482 return NItem;
3483}
3484
3485/// Detect concat of multiple values into a vector
3487 const TargetTransformInfo &TTI) {
3488 auto *Ty = cast<FixedVectorType>(Item.front().first->getType());
3489 unsigned NumElts = Ty->getNumElements();
3490 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3491 return false;
3492
3493 // Check that the concat is free, usually meaning that the type will be split
3494 // during legalization.
3495 SmallVector<int, 16> ConcatMask(NumElts * 2);
3496 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3497 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3498 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3499 Ty, ConcatMask, CostKind) != 0)
3500 return false;
3501
3502 unsigned NumSlices = Item.size() / NumElts;
3503 // Currently we generate a tree of shuffles for the concats, which limits us
3504 // to a power2.
3505 if (!isPowerOf2_32(NumSlices))
3506 return false;
3507 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3508 Value *SliceV = Item[Slice * NumElts].first;
3509 if (!SliceV || SliceV->getType() != Ty)
3510 return false;
3511 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3512 auto [V, Lane] = Item[Slice * NumElts + Elt];
3513 if (Lane != static_cast<int>(Elt) || SliceV != V)
3514 return false;
3515 }
3516 }
3517 return true;
3518}
3519
3520static Value *
3522 const DenseSet<std::pair<Value *, Use *>> &IdentityLeafs,
3523 const DenseSet<std::pair<Value *, Use *>> &SplatLeafs,
3524 const DenseSet<std::pair<Value *, Use *>> &ConcatLeafs,
3525 IRBuilderBase &Builder, const TargetTransformInfo *TTI) {
3526 auto [FrontV, FrontLane] = Item.front();
3527
3528 if (IdentityLeafs.contains(std::make_pair(FrontV, From))) {
3529 return FrontV;
3530 }
3531 if (SplatLeafs.contains(std::make_pair(FrontV, From))) {
3532 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3533 return Builder.CreateShuffleVector(FrontV, Mask);
3534 }
3535 if (ConcatLeafs.contains(std::make_pair(FrontV, From))) {
3536 unsigned NumElts =
3537 cast<FixedVectorType>(FrontV->getType())->getNumElements();
3538 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3539 for (unsigned S = 0; S < Values.size(); ++S)
3540 Values[S] = Item[S * NumElts].first;
3541
3542 while (Values.size() > 1) {
3543 NumElts *= 2;
3544 SmallVector<int, 16> Mask(NumElts, 0);
3545 std::iota(Mask.begin(), Mask.end(), 0);
3546 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3547 for (unsigned S = 0; S < NewValues.size(); ++S)
3548 NewValues[S] =
3549 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3550 Values = NewValues;
3551 }
3552 return Values[0];
3553 }
3554
3555 auto *I = cast<Instruction>(FrontV);
3556 auto *II = dyn_cast<IntrinsicInst>(I);
3557 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3559 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3560 if (II &&
3561 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3562 Ops[Idx] = II->getOperand(Idx);
3563 continue;
3564 }
3566 &I->getOperandUse(Idx), Ty, IdentityLeafs,
3567 SplatLeafs, ConcatLeafs, Builder, TTI);
3568 }
3569
3570 SmallVector<Value *, 8> ValueList;
3571 for (const auto &Lane : Item)
3572 if (Lane.first)
3573 ValueList.push_back(Lane.first);
3574
3575 Type *DstTy =
3576 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3577 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3578 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3579 Ops[0], Ops[1]);
3580 propagateIRFlags(Value, ValueList);
3581 return Value;
3582 }
3583 if (auto *CI = dyn_cast<CmpInst>(I)) {
3584 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3585 propagateIRFlags(Value, ValueList);
3586 return Value;
3587 }
3588 if (auto *SI = dyn_cast<SelectInst>(I)) {
3589 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3590 propagateIRFlags(Value, ValueList);
3591 return Value;
3592 }
3593 if (auto *CI = dyn_cast<CastInst>(I)) {
3594 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3595 propagateIRFlags(Value, ValueList);
3596 return Value;
3597 }
3598 if (II) {
3599 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3600 propagateIRFlags(Value, ValueList);
3601 return Value;
3602 }
3603 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3604 auto *Value =
3605 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3606 propagateIRFlags(Value, ValueList);
3607 return Value;
3608}
3609
3610// Starting from a shuffle, look up through operands tracking the shuffled index
3611// of each lane. If we can simplify away the shuffles to identities then
3612// do so.
3613bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3614 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3615 if (!Ty || I.use_empty())
3616 return false;
3617
3618 SmallVector<InstLane> Start(Ty->getNumElements());
3619 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3620 Start[M] = lookThroughShuffles(&I, M);
3621
3623 Worklist.push_back(std::make_pair(Start, &*I.use_begin()));
3624 DenseSet<std::pair<Value *, Use *>> IdentityLeafs, SplatLeafs, ConcatLeafs;
3625 unsigned NumVisited = 0;
3626
3627 while (!Worklist.empty()) {
3628 if (++NumVisited > MaxInstrsToScan)
3629 return false;
3630
3631 auto ItemFrom = Worklist.pop_back_val();
3632 auto Item = ItemFrom.first;
3633 auto From = ItemFrom.second;
3634 auto [FrontV, FrontLane] = Item.front();
3635
3636 // If we found an undef first lane then bail out to keep things simple.
3637 if (!FrontV)
3638 return false;
3639
3640 // Helper to peek through bitcasts to the same value.
3641 auto IsEquiv = [&](Value *X, Value *Y) {
3642 return X->getType() == Y->getType() &&
3644 };
3645
3646 // Look for an identity value.
3647 if (FrontLane == 0 &&
3648 cast<FixedVectorType>(FrontV->getType())->getNumElements() ==
3649 Ty->getNumElements() &&
3650 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3651 Value *FrontV = Item.front().first;
3652 return !E.value().first || (IsEquiv(E.value().first, FrontV) &&
3653 E.value().second == (int)E.index());
3654 })) {
3655 IdentityLeafs.insert(std::make_pair(FrontV, From));
3656 continue;
3657 }
3658 // Look for constants, for the moment only supporting constant splats.
3659 if (auto *C = dyn_cast<Constant>(FrontV);
3660 C && C->getSplatValue() &&
3661 all_of(drop_begin(Item), [Item](InstLane &IL) {
3662 Value *FrontV = Item.front().first;
3663 Value *V = IL.first;
3664 return !V || (isa<Constant>(V) &&
3665 cast<Constant>(V)->getSplatValue() ==
3666 cast<Constant>(FrontV)->getSplatValue());
3667 })) {
3668 SplatLeafs.insert(std::make_pair(FrontV, From));
3669 continue;
3670 }
3671 // Look for a splat value.
3672 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3673 auto [FrontV, FrontLane] = Item.front();
3674 auto [V, Lane] = IL;
3675 return !V || (V == FrontV && Lane == FrontLane);
3676 })) {
3677 SplatLeafs.insert(std::make_pair(FrontV, From));
3678 continue;
3679 }
3680
3681 // We need each element to be the same type of value, and check that each
3682 // element has a single use.
3683 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3684 Value *FrontV = Item.front().first;
3685 if (!IL.first)
3686 return true;
3687 Value *V = IL.first;
3688 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3689 return false;
3690 if (V->getValueID() != FrontV->getValueID())
3691 return false;
3692 if (auto *CI = dyn_cast<CmpInst>(V))
3693 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3694 return false;
3695 if (auto *CI = dyn_cast<CastInst>(V))
3696 if (CI->getSrcTy()->getScalarType() !=
3697 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3698 return false;
3699 if (auto *SI = dyn_cast<SelectInst>(V))
3700 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3701 SI->getOperand(0)->getType() !=
3702 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3703 return false;
3704 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3705 return false;
3706 auto *II = dyn_cast<IntrinsicInst>(V);
3707 return !II || (isa<IntrinsicInst>(FrontV) &&
3708 II->getIntrinsicID() ==
3709 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3710 !II->hasOperandBundles());
3711 };
3712 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3713 // Check the operator is one that we support.
3714 if (isa<BinaryOperator, CmpInst>(FrontV)) {
3715 // We exclude div/rem in case they hit UB from poison lanes.
3716 if (auto *BO = dyn_cast<BinaryOperator>(FrontV);
3717 BO && BO->isIntDivRem())
3718 return false;
3720 &cast<Instruction>(FrontV)->getOperandUse(0));
3722 &cast<Instruction>(FrontV)->getOperandUse(1));
3723 continue;
3724 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3725 FPToUIInst, SIToFPInst, UIToFPInst>(FrontV)) {
3727 &cast<Instruction>(FrontV)->getOperandUse(0));
3728 continue;
3729 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontV)) {
3730 // TODO: Handle vector widening/narrowing bitcasts.
3731 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3732 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3733 if (DstTy && SrcTy &&
3734 SrcTy->getNumElements() == DstTy->getNumElements()) {
3736 &BitCast->getOperandUse(0));
3737 continue;
3738 }
3739 } else if (auto *Sel = dyn_cast<SelectInst>(FrontV)) {
3741 &Sel->getOperandUse(0));
3743 &Sel->getOperandUse(1));
3745 &Sel->getOperandUse(2));
3746 continue;
3747 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
3748 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3749 !II->hasOperandBundles()) {
3750 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3751 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3752 &TTI)) {
3753 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3754 Value *FrontV = Item.front().first;
3755 Value *V = IL.first;
3756 return !V || (cast<Instruction>(V)->getOperand(Op) ==
3757 cast<Instruction>(FrontV)->getOperand(Op));
3758 }))
3759 return false;
3760 continue;
3761 }
3763 &cast<Instruction>(FrontV)->getOperandUse(Op));
3764 }
3765 continue;
3766 }
3767 }
3768
3769 if (isFreeConcat(Item, CostKind, TTI)) {
3770 ConcatLeafs.insert(std::make_pair(FrontV, From));
3771 continue;
3772 }
3773
3774 return false;
3775 }
3776
3777 if (NumVisited <= 1)
3778 return false;
3779
3780 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3781
3782 // If we got this far, we know the shuffles are superfluous and can be
3783 // removed. Scan through again and generate the new tree of instructions.
3784 Builder.SetInsertPoint(&I);
3785 Value *V = generateNewInstTree(Start, &*I.use_begin(), Ty, IdentityLeafs,
3786 SplatLeafs, ConcatLeafs, Builder, &TTI);
3787 replaceValue(I, *V);
3788 return true;
3789}
3790
3791/// Given a commutative reduction, the order of the input lanes does not alter
3792/// the results. We can use this to remove certain shuffles feeding the
3793/// reduction, removing the need to shuffle at all.
3794bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3795 auto *II = dyn_cast<IntrinsicInst>(&I);
3796 if (!II)
3797 return false;
3798 switch (II->getIntrinsicID()) {
3799 case Intrinsic::vector_reduce_add:
3800 case Intrinsic::vector_reduce_mul:
3801 case Intrinsic::vector_reduce_and:
3802 case Intrinsic::vector_reduce_or:
3803 case Intrinsic::vector_reduce_xor:
3804 case Intrinsic::vector_reduce_smin:
3805 case Intrinsic::vector_reduce_smax:
3806 case Intrinsic::vector_reduce_umin:
3807 case Intrinsic::vector_reduce_umax:
3808 break;
3809 default:
3810 return false;
3811 }
3812
3813 // Find all the inputs when looking through operations that do not alter the
3814 // lane order (binops, for example). Currently we look for a single shuffle,
3815 // and can ignore splat values.
3816 std::queue<Value *> Worklist;
3817 SmallPtrSet<Value *, 4> Visited;
3818 ShuffleVectorInst *Shuffle = nullptr;
3819 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3820 Worklist.push(Op);
3821
3822 while (!Worklist.empty()) {
3823 Value *CV = Worklist.front();
3824 Worklist.pop();
3825 if (Visited.contains(CV))
3826 continue;
3827
3828 // Splats don't change the order, so can be safely ignored.
3829 if (isSplatValue(CV))
3830 continue;
3831
3832 Visited.insert(CV);
3833
3834 if (auto *CI = dyn_cast<Instruction>(CV)) {
3835 if (CI->isBinaryOp()) {
3836 for (auto *Op : CI->operand_values())
3837 Worklist.push(Op);
3838 continue;
3839 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3840 if (Shuffle && Shuffle != SV)
3841 return false;
3842 Shuffle = SV;
3843 continue;
3844 }
3845 }
3846
3847 // Anything else is currently an unknown node.
3848 return false;
3849 }
3850
3851 if (!Shuffle)
3852 return false;
3853
3854 // Check all uses of the binary ops and shuffles are also included in the
3855 // lane-invariant operations (Visited should be the list of lanewise
3856 // instructions, including the shuffle that we found).
3857 for (auto *V : Visited)
3858 for (auto *U : V->users())
3859 if (!Visited.contains(U) && U != &I)
3860 return false;
3861
3862 FixedVectorType *VecType =
3863 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3864 if (!VecType)
3865 return false;
3866 FixedVectorType *ShuffleInputType =
3868 if (!ShuffleInputType)
3869 return false;
3870 unsigned NumInputElts = ShuffleInputType->getNumElements();
3871
3872 // Find the mask from sorting the lanes into order. This is most likely to
3873 // become a identity or concat mask. Undef elements are pushed to the end.
3874 SmallVector<int> ConcatMask;
3875 Shuffle->getShuffleMask(ConcatMask);
3876 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3877 bool UsesSecondVec =
3878 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3879
3881 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3882 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3884 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3885 ShuffleInputType, ConcatMask, CostKind);
3886
3887 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3888 << "\n");
3889 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3890 << "\n");
3891 bool MadeChanges = false;
3892 if (NewCost < OldCost) {
3893 Builder.SetInsertPoint(Shuffle);
3894 Value *NewShuffle = Builder.CreateShuffleVector(
3895 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3896 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3897 replaceValue(*Shuffle, *NewShuffle);
3898 return true;
3899 }
3900
3901 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3902 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3903 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3904 return MadeChanges;
3905}
3906
3907/// For a given chain of patterns of the following form:
3908///
3909/// ```
3910/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3911///
3912/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3913/// ty1> %1)
3914/// OR
3915/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3916///
3917/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3918/// ...
3919/// ...
3920/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3921/// 3), <n x ty1> %(i - 2)
3922/// OR
3923/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3924///
3925/// %(i) = extractelement <n x ty1> %(i - 1), 0
3926/// ```
3927///
3928/// Where:
3929/// `mask` follows a partition pattern:
3930///
3931/// Ex:
3932/// [n = 8, p = poison]
3933///
3934/// 4 5 6 7 | p p p p
3935/// 2 3 | p p p p p p
3936/// 1 | p p p p p p p
3937///
3938/// For powers of 2, there's a consistent pattern, but for other cases
3939/// the parity of the current half value at each step decides the
3940/// next partition half (see `ExpectedParityMask` for more logical details
3941/// in generalising this).
3942///
3943/// Ex:
3944/// [n = 6]
3945///
3946/// 3 4 5 | p p p
3947/// 1 2 | p p p p
3948/// 1 | p p p p p
3949bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3950 // Going bottom-up for the pattern.
3951 std::queue<Value *> InstWorklist;
3952 InstructionCost OrigCost = 0;
3953
3954 // Common instruction operation after each shuffle op.
3955 std::optional<unsigned int> CommonCallOp = std::nullopt;
3956 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3957
3958 bool IsFirstCallOrBinInst = true;
3959 bool ShouldBeCallOrBinInst = true;
3960
3961 // This stores the last used instructions for shuffle/common op.
3962 //
3963 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3964 // instructions from either shuffle/common op.
3965 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3966
3967 Value *VecOpEE;
3968 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3969 return false;
3970
3971 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3972 if (!FVT)
3973 return false;
3974
3975 int64_t VecSize = FVT->getNumElements();
3976 if (VecSize < 2)
3977 return false;
3978
3979 // Number of levels would be ~log2(n), considering we always partition
3980 // by half for this fold pattern.
3981 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3982 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3983
3984 // This is how we generalise for all element sizes.
3985 // At each step, if vector size is odd, we need non-poison
3986 // values to cover the dominant half so we don't miss out on any element.
3987 //
3988 // This mask will help us retrieve this as we go from bottom to top:
3989 //
3990 // Mask Set -> N = N * 2 - 1
3991 // Mask Unset -> N = N * 2
3992 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3993 Cur = (Cur + 1) / 2, --Mask) {
3994 if (Cur & 1)
3995 ExpectedParityMask |= (1ll << Mask);
3996 }
3997
3998 InstWorklist.push(VecOpEE);
3999
4000 bool IsPartialReduction = false;
4001
4002 while (!InstWorklist.empty()) {
4003 Value *CI = InstWorklist.front();
4004 InstWorklist.pop();
4005
4006 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
4007 if (!ShouldBeCallOrBinInst)
4008 return false;
4009
4010 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
4011 return false;
4012
4013 // For the first found call/bin op, the vector has to come from the
4014 // extract element op.
4015 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
4016 return false;
4017 IsFirstCallOrBinInst = false;
4018
4019 if (!CommonCallOp)
4020 CommonCallOp = II->getIntrinsicID();
4021 if (II->getIntrinsicID() != *CommonCallOp)
4022 return false;
4023
4024 switch (II->getIntrinsicID()) {
4025 case Intrinsic::umin:
4026 case Intrinsic::umax:
4027 case Intrinsic::smin:
4028 case Intrinsic::smax: {
4029 auto *Op0 = II->getOperand(0);
4030 auto *Op1 = II->getOperand(1);
4031 PrevVecV[0] = Op0;
4032 PrevVecV[1] = Op1;
4033 break;
4034 }
4035 default:
4036 return false;
4037 }
4038 ShouldBeCallOrBinInst ^= 1;
4039
4040 IntrinsicCostAttributes ICA(
4041 *CommonCallOp, II->getType(),
4042 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
4043 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
4044
4045 // We may need a swap here since it can be (a, b) or (b, a)
4046 // and accordingly change as we go up.
4047 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4048 std::swap(PrevVecV[0], PrevVecV[1]);
4049 InstWorklist.push(PrevVecV[1]);
4050 InstWorklist.push(PrevVecV[0]);
4051 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
4052 // Similar logic for bin ops.
4053
4054 if (!ShouldBeCallOrBinInst)
4055 return false;
4056
4057 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
4058 return false;
4059
4060 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
4061 return false;
4062 IsFirstCallOrBinInst = false;
4063
4064 if (!CommonBinOp)
4065 CommonBinOp = BinOp->getOpcode();
4066
4067 if (BinOp->getOpcode() != *CommonBinOp)
4068 return false;
4069
4070 switch (*CommonBinOp) {
4071 case BinaryOperator::Add:
4072 case BinaryOperator::Mul:
4073 case BinaryOperator::Or:
4074 case BinaryOperator::And:
4075 case BinaryOperator::Xor: {
4076 auto *Op0 = BinOp->getOperand(0);
4077 auto *Op1 = BinOp->getOperand(1);
4078 PrevVecV[0] = Op0;
4079 PrevVecV[1] = Op1;
4080 break;
4081 }
4082 default:
4083 return false;
4084 }
4085 ShouldBeCallOrBinInst ^= 1;
4086
4087 OrigCost +=
4088 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
4089
4090 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4091 std::swap(PrevVecV[0], PrevVecV[1]);
4092 InstWorklist.push(PrevVecV[1]);
4093 InstWorklist.push(PrevVecV[0]);
4094 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
4095 // We shouldn't have any null values in the previous vectors,
4096 // is so, there was a mismatch in pattern.
4097 if (ShouldBeCallOrBinInst || any_of(PrevVecV, equal_to(nullptr)))
4098 return false;
4099
4100 if (SVInst != PrevVecV[1])
4101 return false;
4102
4103 ArrayRef<int> CurMask;
4104 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
4105 m_Mask(CurMask))))
4106 return false;
4107
4108 // Subtract the parity mask when checking the condition.
4109 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
4110 if (Mask < ShuffleMaskHalf &&
4111 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
4112 return false;
4113 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
4114 return false;
4115 }
4116
4117 // Update mask values.
4118 ShuffleMaskHalf *= 2;
4119 ShuffleMaskHalf -= (ExpectedParityMask & 1);
4120 ExpectedParityMask >>= 1;
4121
4123 SVInst->getType(), SVInst->getType(),
4124 CurMask, CostKind);
4125
4126 VisitedCnt += 1;
4127 if (!ExpectedParityMask && VisitedCnt == NumLevels)
4128 break;
4129
4130 ShouldBeCallOrBinInst ^= 1;
4131 } else {
4132 // Check if this is a partial reduction - the chain ended because
4133 // the source vector is not a recognized op/shuffle.
4134 // Reject non-power-of-2 vectors because parity-based masks cause
4135 // lane duplication in the reduction tree, making the partial result
4136 // not a simple subvector reduction.
4137 if (ShouldBeCallOrBinInst && VisitedCnt >= 1 && CI == PrevVecV[0] &&
4138 isPowerOf2_64(VecSize)) {
4139 IsPartialReduction = true;
4140 break;
4141 }
4142 return false;
4143 }
4144 }
4145
4146 // Full reduction pattern should end with a shuffle op.
4147 // Partial reduction ends when the source vector is reached.
4148 if (ShouldBeCallOrBinInst && !IsPartialReduction)
4149 return false;
4150
4151 assert(VecSize != -1 && "Expected Match for Vector Size");
4152
4153 Value *FinalVecV = PrevVecV[0];
4154 if (!FinalVecV)
4155 return false;
4156
4157 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
4158
4159 Intrinsic::ID ReducedOp =
4160 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
4161 : getReductionForBinop(*CommonBinOp));
4162 if (!ReducedOp)
4163 return false;
4164
4165 InstructionCost NewCost = 0;
4166 FixedVectorType *ReduceVecTy = FinalVecVTy;
4167 SmallVector<int> ExtractMask;
4168
4169 if (IsPartialReduction) {
4170 unsigned SubVecSize = ShuffleMaskHalf;
4171 ReduceVecTy = FixedVectorType::get(FVT->getElementType(), SubVecSize);
4172 ExtractMask.resize(SubVecSize);
4173 std::iota(ExtractMask.begin(), ExtractMask.end(), 0);
4175 ReduceVecTy, FinalVecVTy, ExtractMask,
4176 CostKind, 0, ReduceVecTy);
4177 }
4178
4179 IntrinsicCostAttributes ICA(ReducedOp, ReduceVecTy, {ReduceVecTy});
4180 NewCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
4181
4182 LLVM_DEBUG(dbgs() << "Found reduction shuffle chain: " << I << "\n OldCost : "
4183 << OrigCost << " vs NewCost: " << NewCost << "\n");
4184
4185 if (VecOpEE->hasOneUse() ? (NewCost > OrigCost) : (NewCost >= OrigCost))
4186 return false;
4187
4188 Value *ReduceInput = FinalVecV;
4189 if (IsPartialReduction)
4190 ReduceInput = Builder.CreateShuffleVector(FinalVecV, ExtractMask);
4191
4192 auto *ReducedResult = Builder.CreateIntrinsic(
4193 ReducedOp, {ReduceInput->getType()}, {ReduceInput});
4194 replaceValue(I, *ReducedResult);
4195
4196 return true;
4197}
4198
4199/// Determine if its more efficient to fold:
4200/// reduce(trunc(x)) -> trunc(reduce(x)).
4201/// reduce(sext(x)) -> sext(reduce(x)).
4202/// reduce(zext(x)) -> zext(reduce(x)).
4203bool VectorCombine::foldCastFromReductions(Instruction &I) {
4204 auto *II = dyn_cast<IntrinsicInst>(&I);
4205 if (!II)
4206 return false;
4207
4208 bool TruncOnly = false;
4209 Intrinsic::ID IID = II->getIntrinsicID();
4210 switch (IID) {
4211 case Intrinsic::vector_reduce_add:
4212 case Intrinsic::vector_reduce_mul:
4213 TruncOnly = true;
4214 break;
4215 case Intrinsic::vector_reduce_and:
4216 case Intrinsic::vector_reduce_or:
4217 case Intrinsic::vector_reduce_xor:
4218 break;
4219 default:
4220 return false;
4221 }
4222
4223 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4224 Value *ReductionSrc = I.getOperand(0);
4225
4226 Value *Src;
4227 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
4228 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
4229 return false;
4230
4231 auto CastOpc =
4232 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
4233
4234 auto *SrcTy = cast<VectorType>(Src->getType());
4235 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
4236 Type *ResultTy = I.getType();
4237
4239 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4240 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
4242 cast<CastInst>(ReductionSrc));
4243 InstructionCost NewCost =
4244 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4245 CostKind) +
4246 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4248
4249 if (OldCost <= NewCost || !NewCost.isValid())
4250 return false;
4251
4252 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4253 II->getIntrinsicID(), {Src});
4254 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4255 replaceValue(I, *NewCast);
4256 return true;
4257}
4258
4259/// Fold:
4260/// icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
4261/// into:
4262/// icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
4263///
4264/// Sign-bit reductions produce values with known semantics:
4265/// - reduce.{or,umax}: 0 if no element is negative, 1 if any is
4266/// - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
4267/// - reduce.add: count of negative elements (0 to NumElts)
4268///
4269/// Both lshr and ashr are supported:
4270/// - lshr produces 0 or 1, so reduce.add range is [0, N]
4271/// - ashr produces 0 or -1, so reduce.add range is [-N, 0]
4272///
4273/// The fold generalizes to multiple source vectors combined with the same
4274/// operation as the reduction. For example:
4275/// reduce.or(or(shr A, shr B)) conceptually extends the vector
4276/// For reduce.add, this changes the count to M*N where M is the number of
4277/// source vectors.
4278///
4279/// We transform to a direct sign check on the original vector using
4280/// reduce.{or,umax} or reduce.{and,umin}.
4281///
4282/// In spirit, it's similar to foldSignBitCheck in InstCombine.
4283bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
4284 CmpPredicate Pred;
4285 IntrinsicInst *ReduceOp;
4286 const APInt *CmpVal;
4287 if (!match(&I,
4288 m_ICmp(Pred, m_OneUse(m_AnyIntrinsic(ReduceOp)), m_APInt(CmpVal))))
4289 return false;
4290
4291 Intrinsic::ID OrigIID = ReduceOp->getIntrinsicID();
4292 switch (OrigIID) {
4293 case Intrinsic::vector_reduce_or:
4294 case Intrinsic::vector_reduce_umax:
4295 case Intrinsic::vector_reduce_and:
4296 case Intrinsic::vector_reduce_umin:
4297 case Intrinsic::vector_reduce_add:
4298 break;
4299 default:
4300 return false;
4301 }
4302
4303 Value *ReductionSrc = ReduceOp->getArgOperand(0);
4304 auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
4305 if (!VecTy)
4306 return false;
4307
4308 unsigned BitWidth = VecTy->getScalarSizeInBits();
4309 if (BitWidth == 1)
4310 return false;
4311
4312 unsigned NumElts = VecTy->getNumElements();
4313
4314 // Determine the expected tree opcode for multi-vector patterns.
4315 // The tree opcode must match the reduction's underlying operation.
4316 //
4317 // TODO: for pairs of equivalent operators, we should match both,
4318 // not only the most common.
4319 Instruction::BinaryOps TreeOpcode;
4320 switch (OrigIID) {
4321 case Intrinsic::vector_reduce_or:
4322 case Intrinsic::vector_reduce_umax:
4323 TreeOpcode = Instruction::Or;
4324 break;
4325 case Intrinsic::vector_reduce_and:
4326 case Intrinsic::vector_reduce_umin:
4327 TreeOpcode = Instruction::And;
4328 break;
4329 case Intrinsic::vector_reduce_add:
4330 TreeOpcode = Instruction::Add;
4331 break;
4332 default:
4333 llvm_unreachable("Unexpected intrinsic");
4334 }
4335
4336 // Collect sign-bit extraction leaves from an associative tree of TreeOpcode.
4337 // The tree conceptually extends the vector being reduced.
4338 SmallVector<Value *, 8> Worklist;
4339 SmallVector<Value *, 8> Sources; // Original vectors (X in shr X, BW-1)
4340 Worklist.push_back(ReductionSrc);
4341 std::optional<bool> IsAShr;
4342 constexpr unsigned MaxSources = 8;
4343
4344 // Calculate old cost: all shifts + tree ops + reduction
4345 InstructionCost OldCost = TTI.getInstructionCost(ReduceOp, CostKind);
4346
4347 while (!Worklist.empty() && Worklist.size() <= MaxSources &&
4348 Sources.size() <= MaxSources) {
4349 Value *V = Worklist.pop_back_val();
4350
4351 // Try to match sign-bit extraction: shr X, (bitwidth-1)
4352 Value *X;
4353 if (match(V, m_OneUse(m_Shr(m_Value(X), m_SpecificInt(BitWidth - 1))))) {
4354 auto *Shr = cast<Instruction>(V);
4355
4356 // All shifts must be the same type (all lshr or all ashr)
4357 bool ThisIsAShr = Shr->getOpcode() == Instruction::AShr;
4358 if (!IsAShr)
4359 IsAShr = ThisIsAShr;
4360 else if (*IsAShr != ThisIsAShr)
4361 return false;
4362
4363 Sources.push_back(X);
4364
4365 // As part of the fold, we remove all of the shifts, so we need to keep
4366 // track of their costs.
4367 OldCost += TTI.getInstructionCost(Shr, CostKind);
4368
4369 continue;
4370 }
4371
4372 // Try to extend through a tree node of the expected opcode
4373 Value *A, *B;
4374 if (!match(V, m_OneUse(m_BinOp(TreeOpcode, m_Value(A), m_Value(B)))))
4375 return false;
4376
4377 // We are potentially replacing these operations as well, so we add them
4378 // to the costs.
4380
4381 Worklist.push_back(A);
4382 Worklist.push_back(B);
4383 }
4384
4385 // Must have at least one source and not exceed limit
4386 if (Sources.empty() || Sources.size() > MaxSources ||
4387 Worklist.size() > MaxSources || !IsAShr)
4388 return false;
4389
4390 unsigned NumSources = Sources.size();
4391
4392 // For reduce.add, the total count must fit as a signed integer.
4393 // Range is [0, M*N] for lshr or [-M*N, 0] for ashr.
4394 if (OrigIID == Intrinsic::vector_reduce_add &&
4395 !isIntN(BitWidth, NumSources * NumElts))
4396 return false;
4397
4398 // Compute the boundary value when all elements are negative:
4399 // - Per-element contribution: 1 for lshr, -1 for ashr
4400 // - For add: M*N (total elements across all sources); for others: just 1
4401 unsigned Count =
4402 (OrigIID == Intrinsic::vector_reduce_add) ? NumSources * NumElts : 1;
4403 APInt NegativeVal(CmpVal->getBitWidth(), Count);
4404 if (*IsAShr)
4405 NegativeVal.negate();
4406
4407 // Range is [min(0, AllNegVal), max(0, AllNegVal)]
4408 APInt Zero = APInt::getZero(CmpVal->getBitWidth());
4409 APInt RangeLow = APIntOps::smin(Zero, NegativeVal);
4410 APInt RangeHigh = APIntOps::smax(Zero, NegativeVal);
4411
4412 // Determine comparison semantics:
4413 // - IsEq: true for equality test, false for inequality
4414 // - TestsNegative: true if testing against AllNegVal, false for zero
4415 //
4416 // In addition to EQ/NE against 0 or AllNegVal, we support inequalities
4417 // that fold to boundary tests given the narrow value range:
4418 // < RangeHigh -> != RangeHigh
4419 // > RangeHigh-1 -> == RangeHigh
4420 // > RangeLow -> != RangeLow
4421 // < RangeLow+1 -> == RangeLow
4422 //
4423 // For inequalities, we work with signed predicates only. Unsigned predicates
4424 // are canonicalized to signed when the range is non-negative (where they are
4425 // equivalent). When the range includes negative values, unsigned predicates
4426 // would have different semantics due to wrap-around, so we reject them.
4427 if (!ICmpInst::isEquality(Pred) && !ICmpInst::isSigned(Pred)) {
4428 if (RangeLow.isNegative())
4429 return false;
4430 Pred = ICmpInst::getSignedPredicate(Pred);
4431 }
4432
4433 bool IsEq;
4434 bool TestsNegative;
4435 if (ICmpInst::isEquality(Pred)) {
4436 if (CmpVal->isZero()) {
4437 TestsNegative = false;
4438 } else if (*CmpVal == NegativeVal) {
4439 TestsNegative = true;
4440 } else {
4441 return false;
4442 }
4443 IsEq = Pred == ICmpInst::ICMP_EQ;
4444 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeHigh) {
4445 IsEq = false;
4446 TestsNegative = (RangeHigh == NegativeVal);
4447 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeHigh - 1) {
4448 IsEq = true;
4449 TestsNegative = (RangeHigh == NegativeVal);
4450 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeLow) {
4451 IsEq = false;
4452 TestsNegative = (RangeLow == NegativeVal);
4453 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeLow + 1) {
4454 IsEq = true;
4455 TestsNegative = (RangeLow == NegativeVal);
4456 } else {
4457 return false;
4458 }
4459
4460 // For this fold we support four types of checks:
4461 //
4462 // 1. All lanes are negative - AllNeg
4463 // 2. All lanes are non-negative - AllNonNeg
4464 // 3. At least one negative lane - AnyNeg
4465 // 4. At least one non-negative lane - AnyNonNeg
4466 //
4467 // For each case, we can generate the following code:
4468 //
4469 // 1. AllNeg - reduce.and/umin(X) < 0
4470 // 2. AllNonNeg - reduce.or/umax(X) > -1
4471 // 3. AnyNeg - reduce.or/umax(X) < 0
4472 // 4. AnyNonNeg - reduce.and/umin(X) > -1
4473 //
4474 // The table below shows the aggregation of all supported cases
4475 // using these four cases.
4476 //
4477 // Reduction | == 0 | != 0 | == MAX | != MAX
4478 // ------------+-----------+-----------+-----------+-----------
4479 // or/umax | AllNonNeg | AnyNeg | AnyNeg | AllNonNeg
4480 // and/umin | AnyNonNeg | AllNeg | AllNeg | AnyNonNeg
4481 // add | AllNonNeg | AnyNeg | AllNeg | AnyNonNeg
4482 //
4483 // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
4484 //
4485 // For easier codegen and check inversion, we use the following encoding:
4486 //
4487 // 1. Bit-3 === requires or/umax (1) or and/umin (0) check
4488 // 2. Bit-2 === checks < 0 (1) or > -1 (0)
4489 // 3. Bit-1 === universal (1) or existential (0) check
4490 //
4491 // AnyNeg = 0b110: uses or/umax, checks negative, any-check
4492 // AllNonNeg = 0b101: uses or/umax, checks non-neg, all-check
4493 // AnyNonNeg = 0b000: uses and/umin, checks non-neg, any-check
4494 // AllNeg = 0b011: uses and/umin, checks negative, all-check
4495 //
4496 // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
4497 //
4498 enum CheckKind : unsigned {
4499 AnyNonNeg = 0b000,
4500 AllNeg = 0b011,
4501 AllNonNeg = 0b101,
4502 AnyNeg = 0b110,
4503 };
4504 // Return true if we fold this check into or/umax and false for and/umin
4505 auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
4506 // Return true if we should check if result is negative and false otherwise
4507 auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
4508 // Logically invert the check
4509 auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
4510
4511 CheckKind Base;
4512 switch (OrigIID) {
4513 case Intrinsic::vector_reduce_or:
4514 case Intrinsic::vector_reduce_umax:
4515 Base = TestsNegative ? AnyNeg : AllNonNeg;
4516 break;
4517 case Intrinsic::vector_reduce_and:
4518 case Intrinsic::vector_reduce_umin:
4519 Base = TestsNegative ? AllNeg : AnyNonNeg;
4520 break;
4521 case Intrinsic::vector_reduce_add:
4522 Base = TestsNegative ? AllNeg : AllNonNeg;
4523 break;
4524 default:
4525 llvm_unreachable("Unexpected intrinsic");
4526 }
4527
4528 CheckKind Check = IsEq ? Base : Invert(Base);
4529
4530 auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
4531 InstructionCost ArithCost =
4533 VecTy, std::nullopt, CostKind);
4534 InstructionCost MinMaxCost =
4536 FastMathFlags(), CostKind);
4537 return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
4538 : std::make_pair(MinMax, MinMaxCost);
4539 };
4540
4541 // Choose output reduction based on encoding's MSB
4542 auto [NewIID, NewCost] = RequiresOr(Check)
4543 ? PickCheaper(Intrinsic::vector_reduce_or,
4544 Intrinsic::vector_reduce_umax)
4545 : PickCheaper(Intrinsic::vector_reduce_and,
4546 Intrinsic::vector_reduce_umin);
4547
4548 // Add cost of combining multiple sources with or/and
4549 if (NumSources > 1) {
4550 unsigned CombineOpc =
4551 RequiresOr(Check) ? Instruction::Or : Instruction::And;
4552 NewCost += TTI.getArithmeticInstrCost(CombineOpc, VecTy, CostKind) *
4553 (NumSources - 1);
4554 }
4555
4556 LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n OldCost: "
4557 << OldCost << " vs NewCost: " << NewCost << "\n");
4558
4559 if (NewCost > OldCost)
4560 return false;
4561
4562 // Generate the combined input and reduction
4563 Builder.SetInsertPoint(&I);
4564 Type *ScalarTy = VecTy->getScalarType();
4565
4566 Value *Input;
4567 if (NumSources == 1) {
4568 Input = Sources[0];
4569 } else {
4570 // Combine sources with or/and based on check type
4571 Input = RequiresOr(Check) ? Builder.CreateOr(Sources)
4572 : Builder.CreateAnd(Sources);
4573 }
4574
4575 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {Input});
4576 Value *NewCmp = IsNegativeCheck(Check) ? Builder.CreateIsNeg(NewReduce)
4577 : Builder.CreateIsNotNeg(NewReduce);
4578 replaceValue(I, *NewCmp);
4579 return true;
4580}
4581
4582/// vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
4583///
4584/// We can prove it for cases when:
4585///
4586/// 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
4587/// 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
4588/// 2. f(x) == 0 <=> x == 0
4589///
4590/// From 1 and 2 (or 1' and 2), we can infer that
4591///
4592/// OP f(X_i) == 0 <=> OP X_i == 0.
4593///
4594/// (1)
4595/// OP f(X_i) == 0 <=> \forall i \in [1, N] f(X_i) == 0
4596/// (2)
4597/// <=> \forall i \in [1, N] X_i == 0
4598/// (1)
4599/// <=> OP(X_i) == 0
4600///
4601/// For some of the OP's and f's, we need to have domain constraints on X
4602/// to ensure properties 1 (or 1') and 2.
4603bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
4604 CmpPredicate Pred;
4605 Value *Op;
4606 if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
4607 !ICmpInst::isEquality(Pred))
4608 return false;
4609
4610 auto *II = dyn_cast<IntrinsicInst>(Op);
4611 if (!II)
4612 return false;
4613
4614 switch (II->getIntrinsicID()) {
4615 case Intrinsic::vector_reduce_add:
4616 case Intrinsic::vector_reduce_or:
4617 case Intrinsic::vector_reduce_umin:
4618 case Intrinsic::vector_reduce_umax:
4619 case Intrinsic::vector_reduce_smin:
4620 case Intrinsic::vector_reduce_smax:
4621 break;
4622 default:
4623 return false;
4624 }
4625
4626 Value *InnerOp = II->getArgOperand(0);
4627
4628 // TODO: fixed vector type might be too restrictive
4629 if (!II->hasOneUse() || !isa<FixedVectorType>(InnerOp->getType()))
4630 return false;
4631
4632 Value *X = nullptr;
4633
4634 // Check for zero-preserving operations where f(x) = 0 <=> x = 0
4635 //
4636 // 1. f(x) = shl nuw x, y for arbitrary y
4637 // 2. f(x) = mul nuw x, c for defined c != 0
4638 // 3. f(x) = zext x
4639 // 4. f(x) = sext x
4640 // 5. f(x) = neg x
4641 //
4642 if (!(match(InnerOp, m_NUWShl(m_Value(X), m_Value())) || // Case 1
4643 match(InnerOp, m_NUWMul(m_Value(X), m_NonZeroInt())) || // Case 2
4644 match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
4645 match(InnerOp, m_SExt(m_Value(X))) || // Case 4
4646 match(InnerOp, m_Neg(m_Value(X))) // Case 5
4647 ))
4648 return false;
4649
4650 SimplifyQuery S = SQ.getWithInstruction(&I);
4651 auto *XTy = cast<FixedVectorType>(X->getType());
4652
4653 // Check for domain constraints for all supported reductions.
4654 //
4655 // a. OR X_i - has property 1 for every X
4656 // b. UMAX X_i - has property 1 for every X
4657 // c. UMIN X_i - has property 1' for every X
4658 // d. SMAX X_i - has property 1 for X >= 0
4659 // e. SMIN X_i - has property 1' for X >= 0
4660 // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
4661 //
4662 // In order for the proof to work, we need 1 (or 1') to be true for both
4663 // OP f(X_i) and OP X_i and that's why below we check constraints twice.
4664 //
4665 // NOTE: ADD X_i holds property 1 for a mirror case as well, i.e. when
4666 // X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
4667 // of known bits, we can't reasonably hold knowledge of "either 0
4668 // or negative".
4669 switch (II->getIntrinsicID()) {
4670 case Intrinsic::vector_reduce_add: {
4671 // We need to check that both X_i and f(X_i) have enough leading
4672 // zeros to not overflow.
4673 KnownBits KnownX = computeKnownBits(X, S);
4674 KnownBits KnownFX = computeKnownBits(InnerOp, S);
4675 unsigned NumElems = XTy->getNumElements();
4676 // Adding N elements loses at most ceil(log2(N)) leading bits.
4677 unsigned LostBits = Log2_32_Ceil(NumElems);
4678 unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
4679 unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
4680 // Need at least one leading zero left after summation to ensure no overflow
4681 if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
4682 return false;
4683
4684 // We are not checking whether X or f(X) are positive explicitly because
4685 // we implicitly checked for it when we checked if both cases have enough
4686 // leading zeros to not wrap addition.
4687 break;
4688 }
4689 case Intrinsic::vector_reduce_smin:
4690 case Intrinsic::vector_reduce_smax:
4691 // Check whether X >= 0 and f(X) >= 0
4692 if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
4693 return false;
4694
4695 break;
4696 default:
4697 break;
4698 };
4699
4700 LLVM_DEBUG(dbgs() << "Found a reduction to 0 comparison with removable op: "
4701 << *II << "\n");
4702
4703 // For zext/sext, check if the transform is profitable using cost model.
4704 // For other operations (shl, mul, neg), we're removing an instruction
4705 // while keeping the same reduction type, so it's always profitable.
4706 if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
4707 auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
4708 Intrinsic::ID IID = II->getIntrinsicID();
4709
4711 cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
4713
4714 InstructionCost OldReduceCost, NewReduceCost;
4715 switch (IID) {
4716 case Intrinsic::vector_reduce_add:
4717 case Intrinsic::vector_reduce_or:
4718 OldReduceCost = TTI.getArithmeticReductionCost(
4719 getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
4720 NewReduceCost = TTI.getArithmeticReductionCost(
4721 getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
4722 break;
4723 case Intrinsic::vector_reduce_umin:
4724 case Intrinsic::vector_reduce_umax:
4725 case Intrinsic::vector_reduce_smin:
4726 case Intrinsic::vector_reduce_smax:
4727 OldReduceCost = TTI.getMinMaxReductionCost(
4728 getMinMaxReductionIntrinsicOp(IID), FXTy, FastMathFlags(), CostKind);
4729 NewReduceCost = TTI.getMinMaxReductionCost(
4730 getMinMaxReductionIntrinsicOp(IID), XTy, FastMathFlags(), CostKind);
4731 break;
4732 default:
4733 llvm_unreachable("Unexpected reduction");
4734 }
4735
4736 InstructionCost OldCost = OldReduceCost + ExtCost;
4737 InstructionCost NewCost =
4738 NewReduceCost + (InnerOp->hasOneUse() ? 0 : ExtCost);
4739
4740 LLVM_DEBUG(dbgs() << "Found a removable extension before reduction: "
4741 << *InnerOp << "\n OldCost: " << OldCost
4742 << " vs NewCost: " << NewCost << "\n");
4743
4744 // We consider transformation to still be potentially beneficial even
4745 // when the costs are the same because we might remove a use from f(X)
4746 // and unlock other optimizations. Equal costs would just mean that we
4747 // didn't make it worse in the worst case.
4748 if (NewCost > OldCost)
4749 return false;
4750 }
4751
4752 // Since we support zext and sext as f, we might change the scalar type
4753 // of the intrinsic.
4754 Type *Ty = XTy->getScalarType();
4755 Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
4756 Value *NewCmp =
4757 Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
4758 replaceValue(I, *NewCmp);
4759 return true;
4760}
4761
4762/// Fold comparisons of reduce.or/reduce.and with reduce.umax/reduce.umin
4763/// based on cost, preserving the comparison semantics.
4764///
4765/// We use two fundamental properties for each pair:
4766///
4767/// 1. or(X) == 0 <=> umax(X) == 0
4768/// 2. or(X) == 1 <=> umax(X) == 1
4769/// 3. sign(or(X)) == sign(umax(X))
4770///
4771/// 1. and(X) == -1 <=> umin(X) == -1
4772/// 2. and(X) == -2 <=> umin(X) == -2
4773/// 3. sign(and(X)) == sign(umin(X))
4774///
4775/// From these we can infer the following transformations:
4776/// a. or(X) ==/!= 0 <-> umax(X) ==/!= 0
4777/// b. or(X) s< 0 <-> umax(X) s< 0
4778/// c. or(X) s> -1 <-> umax(X) s> -1
4779/// d. or(X) s< 1 <-> umax(X) s< 1
4780/// e. or(X) ==/!= 1 <-> umax(X) ==/!= 1
4781/// f. or(X) s< 2 <-> umax(X) s< 2
4782/// g. and(X) ==/!= -1 <-> umin(X) ==/!= -1
4783/// h. and(X) s< 0 <-> umin(X) s< 0
4784/// i. and(X) s> -1 <-> umin(X) s> -1
4785/// j. and(X) s> -2 <-> umin(X) s> -2
4786/// k. and(X) ==/!= -2 <-> umin(X) ==/!= -2
4787/// l. and(X) s> -3 <-> umin(X) s> -3
4788///
4789bool VectorCombine::foldEquivalentReductionCmp(Instruction &I) {
4790 CmpPredicate Pred;
4791 Value *ReduceOp;
4792 const APInt *CmpVal;
4793 if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
4794 return false;
4795
4796 auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
4797 if (!II || !II->hasOneUse())
4798 return false;
4799
4800 const auto IsValidOrUmaxCmp = [&]() {
4801 // or === umax for i1
4802 if (CmpVal->getBitWidth() == 1)
4803 return true;
4804
4805 // Cases a and e
4806 bool IsEquality =
4807 (CmpVal->isZero() || CmpVal->isOne()) && ICmpInst::isEquality(Pred);
4808 // Case c
4809 bool IsPositive = CmpVal->isAllOnes() && Pred == ICmpInst::ICMP_SGT;
4810 // Cases b, d, and f
4811 bool IsNegative = (CmpVal->isZero() || CmpVal->isOne() || *CmpVal == 2) &&
4812 Pred == ICmpInst::ICMP_SLT;
4813 return IsEquality || IsPositive || IsNegative;
4814 };
4815
4816 const auto IsValidAndUminCmp = [&]() {
4817 // and === umin for i1
4818 if (CmpVal->getBitWidth() == 1)
4819 return true;
4820
4821 const auto LeadingOnes = CmpVal->countl_one();
4822
4823 // Cases g and k
4824 bool IsEquality =
4825 (CmpVal->isAllOnes() || LeadingOnes + 1 == CmpVal->getBitWidth()) &&
4827 // Case h
4828 bool IsNegative = CmpVal->isZero() && Pred == ICmpInst::ICMP_SLT;
4829 // Cases i, j, and l
4830 bool IsPositive =
4831 // if the number has at least N - 2 leading ones
4832 // and the two LSBs are:
4833 // - 1 x 1 -> -1
4834 // - 1 x 0 -> -2
4835 // - 0 x 1 -> -3
4836 LeadingOnes + 2 >= CmpVal->getBitWidth() &&
4837 ((*CmpVal)[0] || (*CmpVal)[1]) && Pred == ICmpInst::ICMP_SGT;
4838 return IsEquality || IsNegative || IsPositive;
4839 };
4840
4841 Intrinsic::ID OriginalIID = II->getIntrinsicID();
4842 Intrinsic::ID AlternativeIID;
4843
4844 // Check if this is a valid comparison pattern and determine the alternate
4845 // reduction intrinsic.
4846 switch (OriginalIID) {
4847 case Intrinsic::vector_reduce_or:
4848 if (!IsValidOrUmaxCmp())
4849 return false;
4850 AlternativeIID = Intrinsic::vector_reduce_umax;
4851 break;
4852 case Intrinsic::vector_reduce_umax:
4853 if (!IsValidOrUmaxCmp())
4854 return false;
4855 AlternativeIID = Intrinsic::vector_reduce_or;
4856 break;
4857 case Intrinsic::vector_reduce_and:
4858 if (!IsValidAndUminCmp())
4859 return false;
4860 AlternativeIID = Intrinsic::vector_reduce_umin;
4861 break;
4862 case Intrinsic::vector_reduce_umin:
4863 if (!IsValidAndUminCmp())
4864 return false;
4865 AlternativeIID = Intrinsic::vector_reduce_and;
4866 break;
4867 default:
4868 return false;
4869 }
4870
4871 Value *X = II->getArgOperand(0);
4872 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
4873 if (!VecTy)
4874 return false;
4875
4876 const auto GetReductionCost = [&](Intrinsic::ID IID) -> InstructionCost {
4877 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4878 if (ReductionOpc != Instruction::ICmp)
4879 return TTI.getArithmeticReductionCost(ReductionOpc, VecTy, std::nullopt,
4880 CostKind);
4882 FastMathFlags(), CostKind);
4883 };
4884
4885 InstructionCost OrigCost = GetReductionCost(OriginalIID);
4886 InstructionCost AltCost = GetReductionCost(AlternativeIID);
4887
4888 LLVM_DEBUG(dbgs() << "Found equivalent reduction cmp: " << I
4889 << "\n OrigCost: " << OrigCost
4890 << " vs AltCost: " << AltCost << "\n");
4891
4892 if (AltCost >= OrigCost)
4893 return false;
4894
4895 Builder.SetInsertPoint(&I);
4896 Type *ScalarTy = VecTy->getScalarType();
4897 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, AlternativeIID, {X});
4898 Value *NewCmp =
4899 Builder.CreateICmp(Pred, NewReduce, ConstantInt::get(ScalarTy, *CmpVal));
4900
4901 replaceValue(I, *NewCmp);
4902 return true;
4903}
4904
4905/// Used by foldReduceAddCmpZero to check if we can prove that a value is
4906/// non-positive.
4907/// KnownBits cannot see sext <? x i1> as non-positive: each top bit equals a
4908/// single unknown input bit, which a per-bit lattice cannot track. The fold's
4909/// target shape is popcount-style sums of <N x i1> valid/invalid masks (e.g.
4910/// ray-intersection hits) tested for any-hit.
4911/// Previous attempts to approximate the known bits of such expressions were
4912/// using a fully recursive value tracking approach to infer a constant range
4913/// but ultimately turned to be too expensive in compile time.
4914static bool isKnownNonPositive(const Value *V, const SimplifyQuery &SQ,
4915 unsigned Depth = 0) {
4916 constexpr unsigned MaxLocalDepth = 2;
4917 if (Depth > MaxLocalDepth)
4918 return false;
4919
4920 auto NumSignBits = [&](const Value *X) {
4921 return ComputeNumSignBits(X, SQ.DL, SQ.AC, SQ.CxtI, SQ.DT);
4922 };
4923 if (NumSignBits(V) == V->getType()->getScalarSizeInBits())
4924 return true;
4925
4926 Value *A, *B;
4927 if (match(V, m_Add(m_Value(A), m_Value(B))))
4928 return NumSignBits(A) >= 2 && NumSignBits(B) >= 2 &&
4929 isKnownNonPositive(A, SQ, Depth + 1) &&
4930 isKnownNonPositive(B, SQ, Depth + 1);
4931
4932 return computeKnownBits(V, SQ).isNonPositive();
4933}
4934
4935/// Fold (icmp pred (reduce.add X), 0) to (icmp pred' (reduce.or X), 0) when X
4936/// has lanes known to all be non-negative or all non-positive, so that
4937/// sum == 0 iff every lane is 0. Falls back to reduce.umax if reduce.or is
4938/// more expensive on the target.
4939bool VectorCombine::foldReduceAddCmpZero(Instruction &I) {
4940 CmpPredicate Pred;
4941 Value *Vec;
4942 if (!match(&I, m_ICmp(Pred,
4944 m_Value(Vec))),
4945 m_Zero())))
4946 return false;
4947
4948 auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
4949 if (!VecTy || VecTy->getNumElements() < 2)
4950 return false;
4951
4952 SimplifyQuery Q = SQ.getWithInstruction(&I);
4953 bool IsNonNegative = isKnownNonNegative(Vec, Q);
4954 bool IsNonPositive = !IsNonNegative && isKnownNonPositive(Vec, Q);
4955 if (!IsNonNegative && !IsNonPositive)
4956 return false;
4957
4958 // Summing NumElts lanes can consume up to log2(NumElts) sign bits. Require
4959 // strictly more headroom than that so the sum cannot wrap to zero.
4960 unsigned NumElts = VecTy->getNumElements();
4961 unsigned NumSignBits = ComputeNumSignBits(Vec, *DL, SQ.AC, &I, &DT);
4962 if (Log2_32(NumElts) >= NumSignBits)
4963 return false;
4964
4965 ICmpInst::Predicate NewPred;
4966 switch (Pred) {
4967 case ICmpInst::ICMP_EQ:
4968 case ICmpInst::ICMP_ULE:
4969 case ICmpInst::ICMP_SLE:
4970 case ICmpInst::ICMP_SGE:
4971 NewPred = ICmpInst::ICMP_EQ;
4972 break;
4973 case ICmpInst::ICMP_NE:
4974 case ICmpInst::ICMP_UGT:
4975 case ICmpInst::ICMP_SGT:
4976 case ICmpInst::ICMP_SLT:
4977 NewPred = ICmpInst::ICMP_NE;
4978 break;
4979 default:
4980 return false;
4981 }
4982
4983 // SGT and SLE on a non-positive tree, and SLT and SGE on a non-negative
4984 // tree, are tautologies (always true or always false). Leave those to
4985 // InstCombine rather than mapping them here. Remaining signed inequalities
4986 // also need one extra sign bit so the sum cannot flip sign.
4987 if (!IsNonNegative &&
4988 (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE))
4989 return false;
4990 if (!IsNonPositive &&
4991 (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE))
4992 return false;
4993 if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE ||
4994 Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) &&
4995 Log2_32(NumElts) >= NumSignBits - 1)
4996 return false;
4997
4999 Instruction::Add, VecTy, std::nullopt, CostKind);
5001 Instruction::Or, VecTy, std::nullopt, CostKind);
5003 Intrinsic::umax, VecTy, FastMathFlags(), CostKind);
5004 if (!OrCost.isValid() && !UmaxCost.isValid())
5005 return false;
5006 bool UseOr = OrCost.isValid() && (!UmaxCost.isValid() || OrCost <= UmaxCost);
5007 InstructionCost AltCost = UseOr ? OrCost : UmaxCost;
5008 if (AltCost > OrigCost)
5009 return false;
5010
5011 Builder.SetInsertPoint(&I);
5012 Value *NewReduce = UseOr ? Builder.CreateOrReduce(Vec)
5013 : Builder.CreateIntrinsic(
5014 Intrinsic::vector_reduce_umax, {VecTy}, {Vec});
5015 Worklist.pushValue(NewReduce);
5016 Value *NewCmp = Builder.CreateICmp(
5017 NewPred, NewReduce, ConstantInt::getNullValue(VecTy->getScalarType()));
5018 replaceValue(I, *NewCmp);
5019 return true;
5020}
5021
5022/// Returns true if this ShuffleVectorInst eventually feeds into a
5023/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
5024/// chains of shuffles and binary operators (in any combination/order).
5025/// The search returns false once more than MaxVisited users are visited.
5027 constexpr unsigned MaxVisited = 32;
5030 bool FoundReduction = false;
5031
5032 WorkList.push_back(SVI);
5033 while (!WorkList.empty()) {
5034 Instruction *I = WorkList.pop_back_val();
5035 for (User *U : I->users()) {
5036 auto *UI = cast<Instruction>(U);
5037 if (!UI || !Visited.insert(UI).second)
5038 continue;
5039 if (Visited.size() > MaxVisited)
5040 return false;
5041 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
5042 // More than one reduction reached
5043 if (FoundReduction)
5044 return false;
5045 switch (II->getIntrinsicID()) {
5046 case Intrinsic::vector_reduce_add:
5047 case Intrinsic::vector_reduce_mul:
5048 case Intrinsic::vector_reduce_and:
5049 case Intrinsic::vector_reduce_or:
5050 case Intrinsic::vector_reduce_xor:
5051 case Intrinsic::vector_reduce_smin:
5052 case Intrinsic::vector_reduce_smax:
5053 case Intrinsic::vector_reduce_umin:
5054 case Intrinsic::vector_reduce_umax:
5055 FoundReduction = true;
5056 continue;
5057 default:
5058 return false;
5059 }
5060 }
5061
5063 return false;
5064
5065 WorkList.emplace_back(UI);
5066 }
5067 }
5068 return FoundReduction;
5069}
5070
5071/// This method looks for groups of shuffles acting on binops, of the form:
5072/// %x = shuffle ...
5073/// %y = shuffle ...
5074/// %a = binop %x, %y
5075/// %b = binop %x, %y
5076/// shuffle %a, %b, selectmask
5077/// We may, especially if the shuffle is wider than legal, be able to convert
5078/// the shuffle to a form where only parts of a and b need to be computed. On
5079/// architectures with no obvious "select" shuffle, this can reduce the total
5080/// number of operations if the target reports them as cheaper.
5081bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
5082 auto *SVI = cast<ShuffleVectorInst>(&I);
5083 auto *VT = cast<FixedVectorType>(I.getType());
5084 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
5085 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
5086 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
5087 VT != Op0->getType())
5088 return false;
5089
5090 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
5091 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
5092 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
5093 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
5094 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
5095 auto checkSVNonOpUses = [&](Instruction *I) {
5096 if (!I || I->getOperand(0)->getType() != VT)
5097 return true;
5098 return any_of(I->users(), [&](User *U) {
5099 return U != Op0 && U != Op1 &&
5100 !(isa<ShuffleVectorInst>(U) &&
5101 (InputShuffles.contains(cast<Instruction>(U)) ||
5102 isInstructionTriviallyDead(cast<Instruction>(U))));
5103 });
5104 };
5105 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
5106 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
5107 return false;
5108
5109 // Collect all the uses that are shuffles that we can transform together. We
5110 // may not have a single shuffle, but a group that can all be transformed
5111 // together profitably.
5113 auto collectShuffles = [&](Instruction *I) {
5114 for (auto *U : I->users()) {
5115 auto *SV = dyn_cast<ShuffleVectorInst>(U);
5116 if (!SV || SV->getType() != VT)
5117 return false;
5118 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
5119 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
5120 return false;
5121 if (!llvm::is_contained(Shuffles, SV))
5122 Shuffles.push_back(SV);
5123 }
5124 return true;
5125 };
5126 if (!collectShuffles(Op0) || !collectShuffles(Op1))
5127 return false;
5128 // From a reduction, we need to be processing a single shuffle, otherwise the
5129 // other uses will not be lane-invariant.
5130 if (FromReduction && Shuffles.size() > 1)
5131 return false;
5132
5133 // Add any shuffle uses for the shuffles we have found, to include them in our
5134 // cost calculations.
5135 if (!FromReduction) {
5136 for (size_t Idx = 0, E = Shuffles.size(); Idx != E; ++Idx) {
5137 for (auto *U : Shuffles[Idx]->users()) {
5138 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
5139 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
5140 Shuffles.push_back(SSV);
5141 }
5142 }
5143 }
5144
5145 // For each of the output shuffles, we try to sort all the first vector
5146 // elements to the beginning, followed by the second array elements at the
5147 // end. If the binops are legalized to smaller vectors, this may reduce total
5148 // number of binops. We compute the ReconstructMask mask needed to convert
5149 // back to the original lane order.
5151 SmallVector<SmallVector<int>> OrigReconstructMasks;
5152 int MaxV1Elt = 0, MaxV2Elt = 0;
5153 unsigned NumElts = VT->getNumElements();
5154 for (ShuffleVectorInst *SVN : Shuffles) {
5155 SmallVector<int> Mask;
5156 SVN->getShuffleMask(Mask);
5157
5158 // Check the operands are the same as the original, or reversed (in which
5159 // case we need to commute the mask).
5160 Value *SVOp0 = SVN->getOperand(0);
5161 Value *SVOp1 = SVN->getOperand(1);
5162 if (isa<UndefValue>(SVOp1)) {
5163 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
5164 SVOp0 = SSV->getOperand(0);
5165 SVOp1 = SSV->getOperand(1);
5166 for (int &Elem : Mask) {
5167 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
5168 return false;
5169 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
5170 }
5171 }
5172 if (SVOp0 == Op1 && SVOp1 == Op0) {
5173 std::swap(SVOp0, SVOp1);
5175 }
5176 if (SVOp0 != Op0 || SVOp1 != Op1)
5177 return false;
5178
5179 // Calculate the reconstruction mask for this shuffle, as the mask needed to
5180 // take the packed values from Op0/Op1 and reconstructing to the original
5181 // order.
5182 SmallVector<int> ReconstructMask;
5183 for (unsigned I = 0; I < Mask.size(); I++) {
5184 if (Mask[I] < 0) {
5185 ReconstructMask.push_back(-1);
5186 } else if (Mask[I] < static_cast<int>(NumElts)) {
5187 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
5188 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
5189 return Mask[I] == A.first;
5190 });
5191 if (It != V1.end())
5192 ReconstructMask.push_back(It - V1.begin());
5193 else {
5194 ReconstructMask.push_back(V1.size());
5195 V1.emplace_back(Mask[I], V1.size());
5196 }
5197 } else {
5198 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
5199 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
5200 return Mask[I] - static_cast<int>(NumElts) == A.first;
5201 });
5202 if (It != V2.end())
5203 ReconstructMask.push_back(NumElts + It - V2.begin());
5204 else {
5205 ReconstructMask.push_back(NumElts + V2.size());
5206 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
5207 }
5208 }
5209 }
5210
5211 // For reductions, we know that the lane ordering out doesn't alter the
5212 // result. In-order can help simplify the shuffle away.
5213 if (FromReduction)
5214 sort(ReconstructMask);
5215 OrigReconstructMasks.push_back(std::move(ReconstructMask));
5216 }
5217
5218 // If the Maximum element used from V1 and V2 are not larger than the new
5219 // vectors, the vectors are already packed and performing the optimization
5220 // again will likely not help any further. This also prevents us from getting
5221 // stuck in a cycle in case the costs do not also rule it out.
5222 if (V1.empty() || V2.empty() ||
5223 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
5224 MaxV2Elt == static_cast<int>(V2.size()) - 1))
5225 return false;
5226
5227 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
5228 // shuffle of another shuffle, or not a shuffle (that is treated like a
5229 // identity shuffle).
5230 auto GetBaseMaskValue = [&](Instruction *I, int M) {
5231 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5232 if (!SV)
5233 return M;
5234 if (isa<UndefValue>(SV->getOperand(1)))
5235 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5236 if (InputShuffles.contains(SSV))
5237 return SSV->getMaskValue(SV->getMaskValue(M));
5238 return SV->getMaskValue(M);
5239 };
5240
5241 // Attempt to sort the inputs by ascending mask values to make simpler input
5242 // shuffles and push complex shuffles down to the uses. We sort on the first
5243 // of the two input shuffle orders, to try and get at least one input into a
5244 // nice order.
5245 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
5246 std::pair<int, int> Y) {
5247 int MXA = GetBaseMaskValue(A, X.first);
5248 int MYA = GetBaseMaskValue(A, Y.first);
5249 return MXA < MYA;
5250 };
5251 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
5252 return SortBase(SVI0A, A, B);
5253 });
5254 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
5255 return SortBase(SVI1A, A, B);
5256 });
5257 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
5258 // modified order of the input shuffles.
5259 SmallVector<SmallVector<int>> ReconstructMasks;
5260 for (const auto &Mask : OrigReconstructMasks) {
5261 SmallVector<int> ReconstructMask;
5262 for (int M : Mask) {
5263 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
5264 auto It = find_if(V, [M](auto A) { return A.second == M; });
5265 assert(It != V.end() && "Expected all entries in Mask");
5266 return std::distance(V.begin(), It);
5267 };
5268 if (M < 0)
5269 ReconstructMask.push_back(-1);
5270 else if (M < static_cast<int>(NumElts)) {
5271 ReconstructMask.push_back(FindIndex(V1, M));
5272 } else {
5273 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
5274 }
5275 }
5276 ReconstructMasks.push_back(std::move(ReconstructMask));
5277 }
5278
5279 // Calculate the masks needed for the new input shuffles, which get padded
5280 // with undef
5281 SmallVector<int> V1A, V1B, V2A, V2B;
5282 for (unsigned I = 0; I < V1.size(); I++) {
5283 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
5284 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
5285 }
5286 for (unsigned I = 0; I < V2.size(); I++) {
5287 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
5288 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
5289 }
5290 while (V1A.size() < NumElts) {
5293 }
5294 while (V2A.size() < NumElts) {
5297 }
5298
5299 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
5300 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5301 if (!SV)
5302 return C;
5303 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
5306 VT, VT, SV->getShuffleMask(), CostKind);
5307 };
5308 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5309 return C +
5311 };
5312
5313 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
5314 unsigned MaxVectorSize =
5316 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
5317 if (MaxElementsInVector == 0)
5318 return false;
5319 // When there are multiple shufflevector operations on the same input,
5320 // especially when the vector length is larger than the register size,
5321 // identical shuffle patterns may occur across different groups of elements.
5322 // To avoid overestimating the cost by counting these repeated shuffles more
5323 // than once, we only account for unique shuffle patterns. This adjustment
5324 // prevents inflated costs in the cost model for wide vectors split into
5325 // several register-sized groups.
5326 std::set<SmallVector<int, 4>> UniqueShuffles;
5327 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5328 // Compute the cost for performing the shuffle over the full vector.
5329 auto ShuffleCost =
5331 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
5332 if (NumFullVectors < 2)
5333 return C + ShuffleCost;
5334 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
5335 unsigned NumUniqueGroups = 0;
5336 unsigned NumGroups = Mask.size() / MaxElementsInVector;
5337 // For each group of MaxElementsInVector contiguous elements,
5338 // collect their shuffle pattern and insert into the set of unique patterns.
5339 for (unsigned I = 0; I < NumFullVectors; ++I) {
5340 for (unsigned J = 0; J < MaxElementsInVector; ++J)
5341 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
5342 if (UniqueShuffles.insert(SubShuffle).second)
5343 NumUniqueGroups += 1;
5344 }
5345 return C + ShuffleCost * NumUniqueGroups / NumGroups;
5346 };
5347 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
5348 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5349 if (!SV)
5350 return C;
5351 SmallVector<int, 16> Mask;
5352 SV->getShuffleMask(Mask);
5353 return AddShuffleMaskAdjustedCost(C, Mask);
5354 };
5355 // Check that input consists of ShuffleVectors applied to the same input
5356 auto AllShufflesHaveSameOperands =
5357 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
5358 if (InputShuffles.size() < 2)
5359 return false;
5360 ShuffleVectorInst *FirstSV =
5361 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
5362 if (!FirstSV)
5363 return false;
5364
5365 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
5366 return std::all_of(
5367 std::next(InputShuffles.begin()), InputShuffles.end(),
5368 [&](Instruction *I) {
5369 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
5370 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
5371 });
5372 };
5373
5374 // Get the costs of the shuffles + binops before and after with the new
5375 // shuffle masks.
5376 InstructionCost CostBefore =
5377 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
5378 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
5379 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
5380 InstructionCost(0), AddShuffleCost);
5381 if (AllShufflesHaveSameOperands(InputShuffles)) {
5382 UniqueShuffles.clear();
5383 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5384 InstructionCost(0), AddShuffleAdjustedCost);
5385 } else {
5386 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5387 InstructionCost(0), AddShuffleCost);
5388 }
5389
5390 // The new binops will be unused for lanes past the used shuffle lengths.
5391 // These types attempt to get the correct cost for that from the target.
5392 FixedVectorType *Op0SmallVT =
5393 FixedVectorType::get(VT->getScalarType(), V1.size());
5394 FixedVectorType *Op1SmallVT =
5395 FixedVectorType::get(VT->getScalarType(), V2.size());
5396 InstructionCost CostAfter =
5397 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
5398 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
5399 UniqueShuffles.clear();
5400 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
5401 InstructionCost(0), AddShuffleMaskAdjustedCost);
5402 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
5403 CostAfter +=
5404 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
5405 InstructionCost(0), AddShuffleMaskCost);
5406
5407 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
5408 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
5409 << " vs CostAfter: " << CostAfter << "\n");
5410 if (CostBefore < CostAfter ||
5411 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
5412 return false;
5413
5414 // The cost model has passed, create the new instructions.
5415 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
5416 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5417 if (!SV)
5418 return I;
5419 if (isa<UndefValue>(SV->getOperand(1)))
5420 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5421 if (InputShuffles.contains(SSV))
5422 return SSV->getOperand(Op);
5423 return SV->getOperand(Op);
5424 };
5425 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
5426 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
5427 GetShuffleOperand(SVI0A, 1), V1A);
5428 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
5429 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
5430 GetShuffleOperand(SVI0B, 1), V1B);
5431 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
5432 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
5433 GetShuffleOperand(SVI1A, 1), V2A);
5434 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
5435 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
5436 GetShuffleOperand(SVI1B, 1), V2B);
5437 Builder.SetInsertPoint(Op0);
5438 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
5439 NSV0A, NSV0B);
5440 if (auto *I = dyn_cast<Instruction>(NOp0))
5441 I->copyIRFlags(Op0, true);
5442 Builder.SetInsertPoint(Op1);
5443 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
5444 NSV1A, NSV1B);
5445 if (auto *I = dyn_cast<Instruction>(NOp1))
5446 I->copyIRFlags(Op1, true);
5447
5448 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
5449 Builder.SetInsertPoint(Shuffles[S]);
5450 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
5451 replaceValue(*Shuffles[S], *NSV, false);
5452 }
5453
5454 Worklist.pushValue(NSV0A);
5455 Worklist.pushValue(NSV0B);
5456 Worklist.pushValue(NSV1A);
5457 Worklist.pushValue(NSV1B);
5458 return true;
5459}
5460
5461/// Check if instruction depends on ZExt and this ZExt can be moved after the
5462/// instruction. Move ZExt if it is profitable. For example:
5463/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
5464/// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
5465/// Cost model calculations take into account if zext(x) has other users and
5466/// whether it can be propagated through them too.
/// Returns true if \p I was replaced by the narrow operation followed by a
/// zext, and false otherwise.
5467bool VectorCombine::shrinkType(Instruction &I) {
5468 Value *ZExted, *OtherOperand;
5469 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
5470 m_Value(OtherOperand))) &&
5471 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
5472 return false;
5473
// The zext is whichever operand of I is not OtherOperand.
5474 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
5475
5476 auto *BigTy = cast<FixedVectorType>(I.getType());
5477 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
// Element width of the narrow (pre-zext) type.
5478 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
5479
5480 if (I.getOpcode() == Instruction::LShr) {
5481 // Check that the shift amount is less than the number of bits in the
5482 // smaller type. Otherwise, the smaller lshr will return a poison value.
5483 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
5484 if (ShAmtKB.getMaxValue().uge(BW))
5485 return false;
5486 } else {
5487 // Check that the expression overall uses at most the same number of bits as
5488 // ZExted
5489 KnownBits KB = computeKnownBits(&I, *DL);
5490 if (KB.countMaxActiveBits() > BW)
5491 return false;
5492 }
5493
5494 // Calculate costs of leaving current IR as it is and moving ZExt operation
5495 // later, along with adding truncates if needed
// NOTE(review): original line 5496 is missing from this listing. The
// arguments below presumably belong to the call that initializes ZExtCost,
// which is read on the following lines.
5497 Instruction::ZExt, BigTy, SmallTy,
5498 TargetTransformInfo::CastContextHint::None, CostKind);
5499 InstructionCost CurrentCost = ZExtCost;
5500 InstructionCost ShrinkCost = 0;
5501
5502 // Calculate total cost and check that we can propagate through all ZExt users
5503 for (User *U : ZExtOperand->users()) {
5504 auto *UI = cast<Instruction>(U);
5505 if (UI == &I) {
// I itself: one wide op today versus one narrow op plus a zext afterwards.
5506 CurrentCost +=
5507 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5508 ShrinkCost +=
5509 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5510 ShrinkCost += ZExtCost;
5511 continue;
5512 }
5513
// Any other user of the zext must be a binary operator, otherwise give up.
5514 if (!Instruction::isBinaryOp(UI->getOpcode()))
5515 return false;
5516
5517 // Check if we can propagate ZExt through its other users
5518 KnownBits KB = computeKnownBits(UI, *DL);
5519 if (KB.countMaxActiveBits() > BW)
5520 return false;
5521
5522 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5523 ShrinkCost +=
5524 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5525 ShrinkCost += ZExtCost;
5526 }
5527
5528 // If the other instruction operand is not a constant, we'll need to
5529 // generate a truncate instruction. So we have to adjust cost
5530 if (!isa<Constant>(OtherOperand))
5531 ShrinkCost += TTI.getCastInstrCost(
5532 Instruction::Trunc, SmallTy, BigTy,
5533 TargetTransformInfo::CastContextHint::None, CostKind);
5534
5535 // If the cost of shrinking types and leaving the IR is the same, we'll lean
5536 // towards modifying the IR because shrinking opens opportunities for other
5537 // shrinking optimisations.
5538 if (ShrinkCost > CurrentCost)
5539 return false;
5540
// Emit: zext(binop(ZExted, trunc(OtherOperand))) in place of I.
5541 Builder.SetInsertPoint(&I);
5542 Value *Op0 = ZExted;
5543 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
5544 // Keep the order of operands the same
5545 if (I.getOperand(0) == OtherOperand)
5546 std::swap(Op0, Op1);
5547 Value *NewBinOp =
5548 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
5549 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
5550 cast<Instruction>(NewBinOp)->copyMetadata(I);
5551 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
5552 replaceValue(I, *NewZExtr);
5553 return true;
5554}
5555
5556/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
5557/// shuffle (DstVec, SrcVec, Mask)
/// Both indices must be constants. Returns true if \p I was replaced by a
/// shuffle, false if the pattern did not match or the shuffle costs more.
5558bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
5559 Value *DstVec, *SrcVec;
5560 uint64_t ExtIdx, InsIdx;
5561 if (!match(&I,
5562 m_InsertElt(m_Value(DstVec),
5563 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
5564 m_ConstantInt(InsIdx))))
5565 return false;
5566
5567 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
5568 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
5569 // Source and destination may differ in element count (handled below via
// NeedExpOrNarrow), but their element types must match.
5570 if (!DstVecTy || !SrcVecTy ||
5571 SrcVecTy->getElementType() != DstVecTy->getElementType())
5572 return false;
5573
5574 unsigned NumDstElts = DstVecTy->getNumElements();
5575 unsigned NumSrcElts = SrcVecTy->getNumElements();
// Reject out-of-range indices and single-element destinations.
5576 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
5577 return false;
5578
5579 // Insertion into poison is a cheaper single operand shuffle.
// NOTE(review): original lines 5580, 5586 and 5590 are missing from this
// listing. SK, which is passed to getShuffleCost below, is presumably
// declared and assigned on those lines.
5581 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
5582
5583 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
5584 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
5585 if (NeedDstSrcSwap) {
// Destination is poison: only the inserted lane is defined, and it reads
// from what becomes the first shuffle operand after the swap.
5587 Mask[InsIdx] = ExtIdx % NumDstElts;
5588 std::swap(DstVec, SrcVec);
5589 } else {
// Identity over DstVec, with lane InsIdx taken from the second operand.
5591 std::iota(Mask.begin(), Mask.end(), 0);
5592 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
5593 }
5594
5595 // Cost
5596 auto *Ins = cast<InsertElementInst>(&I);
5597 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
5598 InstructionCost InsCost =
5599 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
// NOTE(review): the extract reads from the source vector but is costed on
// DstVecTy. When the element counts differ, SrcVecTy may have been
// intended. Confirm before changing.
5600 InstructionCost ExtCost =
5601 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
5602 InstructionCost OldCost = ExtCost + InsCost;
5603
5604 InstructionCost NewCost = 0;
5605 SmallVector<int> ExtToVecMask;
5606 if (!NeedExpOrNarrow) {
5607 // Ignore 'free' identity insertion shuffle.
5608 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
5609 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
5610 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
5611 nullptr, {DstVec, SrcVec});
5612 } else {
5613 // When creating a length-changing-vector, always try to keep the relevant
5614 // element in an equivalent position, so that bulk shuffles are more likely
5615 // to be useful.
5616 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
5617 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
5618 // Add cost for expanding or narrowing
// NOTE(review): original line 5619 is missing from this listing. The
// arguments below presumably belong to a getShuffleCost call that
// contributes to NewCost.
5620 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
5621 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
5622 }
5623
// An extract with other users is not removed by this fold, so its cost is
// still paid afterwards.
5624 if (!Ext->hasOneUse())
5625 NewCost += ExtCost;
5626
5627 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
5628 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5629 << "\n");
5630
5631 if (OldCost < NewCost)
5632 return false;
5633
5634 if (NeedExpOrNarrow) {
// Resize the original source to the destination length. After the swap
// above the original source is held in DstVec.
5635 if (!NeedDstSrcSwap)
5636 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
5637 else
5638 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
5639 }
5640
5641 // Canonicalize undef param to RHS to help further folds.
5642 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
5643 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
5644 std::swap(DstVec, SrcVec);
5645 }
5646
5647 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
5648 replaceValue(I, *Shuf);
5649
5650 return true;
5651}
5652
5653/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
5654/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
5655/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
5656/// before casting it back into `<vscale x 16 x i32>`.
/// Returns true if \p I was replaced by a bitcast of the wider splat.
5657bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
5658 const APInt *SplatVal0, *SplatVal1;
// NOTE(review): original line 5659 is missing from this listing. It
// presumably opens the match() on I whose operand matchers continue below.
5660 m_APInt(SplatVal0), m_APInt(SplatVal1))))
5661 return false;
5662
5663 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
5664 << "\n");
5665
// VTy is the type of the first intrinsic argument. ExtVTy is the extended
// element vector type obtained from it, and Width is VTy's element width.
5666 auto *VTy =
5667 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
5668 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
5669 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
5670
5671 // Just in case the cost of interleave2 intrinsic and bitcast are both
5672 // invalid, in which case we want to bail out, we use <= rather
5673 // than < here. Even they both have valid and equal costs, it's probably
5674 // not a good idea to emit a high-cost constant splat.
// NOTE(review): original lines 5675 and 5677 are missing from this listing,
// so the cost comparison guarding this early return is only partly visible.
5676 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
5678 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
5679 << *I.getType() << " is too high.\n");
5680 return false;
5681 }
5682
// Build (SplatVal1 << Width) | SplatVal0 at twice the element width.
5683 APInt NewSplatVal = SplatVal1->zext(Width * 2);
5684 NewSplatVal <<= Width;
5685 NewSplatVal |= SplatVal0->zext(Width * 2);
5686 auto *NewSplat = ConstantVector::getSplat(
5687 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
5688
5689 IRBuilder<> Builder(&I);
5690 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
5691 return true;
5692}
5693
5694// Attempt to shrink loads that are only used by shufflevector instructions.
// The load is replaced by a narrower load covering elements [0, max used
// index], and every shuffle with uses is re-created on top of it with its
// original mask. Returns true if the load's users were rewritten.
5695bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
5696 auto *OldLoad = dyn_cast<LoadInst>(&I);
5697 if (!OldLoad || !OldLoad->isSimple())
5698 return false;
5699
5700 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
5701 if (!OldLoadTy)
5702 return false;
5703
5704 unsigned const OldNumElements = OldLoadTy->getNumElements();
5705
5706 // Search all uses of load. If all uses are shufflevector instructions, and
5707 // the second operands are all poison values, find the minimum and maximum
5708 // indices of the vector elements referenced by all shuffle masks.
5709 // Otherwise return `std::nullopt`.
5710 using IndexRange = std::pair<int, int>;
5711 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
// Start with an empty range (min > max) so the first index seen sets both.
5712 IndexRange OutputRange = IndexRange(OldNumElements, -1);
5713 for (llvm::Use &Use : I.uses()) {
5714 // Ensure all uses match the required pattern.
5715 User *Shuffle = Use.getUser();
5716 ArrayRef<int> Mask;
5717
5718 if (!match(Shuffle,
5719 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
5720 return std::nullopt;
5721
5722 // Ignore shufflevector instructions that have no uses.
5723 if (Shuffle->use_empty())
5724 continue;
5725
5726 // Find the min and max indices used by the shufflevector instruction.
// Only indices that select from the load (first operand) are counted.
5727 for (int Index : Mask) {
5728 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
5729 OutputRange.first = std::min(Index, OutputRange.first);
5730 OutputRange.second = std::max(Index, OutputRange.second);
5731 }
5732 }
5733 }
5734
// No shuffle referenced any element of the load.
5735 if (OutputRange.second < OutputRange.first)
5736 return std::nullopt;
5737
5738 return OutputRange;
5739 };
5740
5741 // Get the range of vector elements used by shufflevector instructions.
5742 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
// Only the upper bound is used: the new load still starts at element 0.
5743 unsigned const NewNumElements = Indices->second + 1u;
5744
5745 // If the range of vector elements is smaller than the full load, attempt
5746 // to create a smaller load.
5747 if (NewNumElements < OldNumElements) {
5748 IRBuilder Builder(&I);
5749 Builder.SetCurrentDebugLocation(I.getDebugLoc());
5750
5751 // Calculate costs of old and new ops.
5752 Type *ElemTy = OldLoadTy->getElementType();
5753 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
5754 Value *PtrOp = OldLoad->getPointerOperand();
5755
// NOTE(review): original line 5756 is missing from this listing. The
// arguments below presumably initialize OldCost with the memory-op cost of
// the existing load.
5757 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
5758 OldLoad->getPointerAddressSpace(), CostKind);
5759 InstructionCost NewCost =
5760 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
5761 OldLoad->getPointerAddressSpace(), CostKind);
5762
5763 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
// NOTE(review): original line 5764 is missing from this listing. NewUses,
// used below, is presumably declared there.
// Masks for the new shuffles index two operands of NewNumElements each.
5765 unsigned const MaxIndex = NewNumElements * 2u;
5766
5767 for (llvm::Use &Use : I.uses()) {
5768 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
5769
5770 // Ignore shufflevector instructions that have no uses.
5771 if (Shuffle->use_empty())
5772 continue;
5773
5774 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
5775
5776 // Create entry for new use.
5777 NewUses.push_back({Shuffle, OldMask});
5778
5779 // Validate mask indices.
// The old mask is reused unchanged, so give up if any index would be out
// of range for the narrower operands.
5780 for (int Index : OldMask) {
5781 if (Index >= static_cast<int>(MaxIndex))
5782 return false;
5783 }
5784
5785 // Update costs.
// NOTE(review): original lines 5787 and 5790 are missing from this
// listing. They presumably open the shuffle-cost calls whose trailing
// arguments appear below.
5786 OldCost +=
5788 OldLoadTy, OldMask, CostKind);
5789 NewCost +=
5791 NewLoadTy, OldMask, CostKind);
5792 }
5793
5794 LLVM_DEBUG(
5795 dbgs() << "Found a load used only by shufflevector instructions: "
5796 << I << "\n OldCost: " << OldCost
5797 << " vs NewCost: " << NewCost << "\n");
5798
5799 if (OldCost < NewCost || !NewCost.isValid())
5800 return false;
5801
5802 // Create new load of smaller vector.
5803 auto *NewLoad = cast<LoadInst>(
5804 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
5805 NewLoad->copyMetadata(I);
5806
5807 // Replace all uses.
5808 for (UseEntry &Use : NewUses) {
5809 ShuffleVectorInst *Shuffle = Use.first;
5810 std::vector<int> &NewMask = Use.second;
5811
5812 Builder.SetInsertPoint(Shuffle);
5813 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
5814 Value *NewShuffle = Builder.CreateShuffleVector(
5815 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
5816
5817 replaceValue(*Shuffle, *NewShuffle, false);
5818 }
5819
5820 return true;
5821 }
5822 }
5823 return false;
5824}
5825
5826// Attempt to narrow a phi of shufflevector instructions where the two incoming
5827// values have the same operands but different masks. If the two shuffle masks
5828// are offsets of one another we can use one branch to rotate the incoming
5829// vector and perform one larger shuffle after the phi.
5830bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
5831 auto *Phi = dyn_cast<PHINode>(&I);
5832 if (!Phi || Phi->getNumIncomingValues() != 2u)
5833 return false;
5834
5835 Value *Op = nullptr;
5836 ArrayRef<int> Mask0;
5837 ArrayRef<int> Mask1;
5838
5839 if (!match(Phi->getOperand(0u),
5840 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
5841 !match(Phi->getOperand(1u),
5842 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
5843 return false;
5844
5845 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
5846
5847 // Ensure result vectors are wider than the argument vector.
5848 auto *InputVT = cast<FixedVectorType>(Op->getType());
5849 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
5850 auto const InputNumElements = InputVT->getNumElements();
5851
5852 if (InputNumElements >= ResultVT->getNumElements())
5853 return false;
5854
5855 // Take the difference of the two shuffle masks at each index. Ignore poison
5856 // values at the same index in both masks.
5857 SmallVector<int, 16> NewMask;
5858 NewMask.reserve(Mask0.size());
5859
5860 for (auto [M0, M1] : zip(Mask0, Mask1)) {
5861 if (M0 >= 0 && M1 >= 0)
5862 NewMask.push_back(M0 - M1);
5863 else if (M0 == -1 && M1 == -1)
5864 continue;
5865 else
5866 return false;
5867 }
5868
5869 // Ensure all elements of the new mask are equal. If the difference between
5870 // the incoming mask elements is the same, the two must be constant offsets
5871 // of one another.
5872 if (NewMask.empty() || !all_equal(NewMask))
5873 return false;
5874
5875 // Create new mask using difference of the two incoming masks.
5876 int MaskOffset = NewMask[0u];
5877 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
5878 NewMask.clear();
5879
5880 for (unsigned I = 0u; I < InputNumElements; ++I) {
5881 NewMask.push_back(Index);
5882 Index = (Index + 1u) % InputNumElements;
5883 }
5884
5885 // Calculate costs for worst cases and compare.
5886 auto const Kind = TTI::SK_PermuteSingleSrc;
5887 auto OldCost =
5888 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
5889 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
5890 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
5891 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
5892
5893 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
5894 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5895 << "\n");
5896
5897 if (NewCost > OldCost)
5898 return false;
5899
5900 // Create new shuffles and narrowed phi.
5901 auto Builder = IRBuilder(Shuf);
5902 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
5903 auto *PoisonVal = PoisonValue::get(InputVT);
5904 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
5905 Worklist.push(cast<Instruction>(NewShuf0));
5906
5907 Builder.SetInsertPoint(Phi);
5908 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
5909 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
5910 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
5911 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
5912
5913 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
5914 PoisonVal = PoisonValue::get(NewPhi->getType());
5915 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
5916
5917 replaceValue(*Phi, *NewShuf1);
5918 return true;
5919}
5920
5921/// This is the entry point for all transforms. Pass manager differences are
5922/// handled in the callers of this function.
/// Visits every instruction of every reachable block once, then keeps
/// processing the worklist until it is empty. Returns true if any fold
/// changed the function.
5923bool VectorCombine::run() {
// NOTE(review): original line 5924 is missing from this listing. The return
// below is presumably guarded by a condition on that line.
5925 return false;
5926
5927 // Don't attempt vectorization if the target does not support vectors.
5928 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
5929 return false;
5930
5931 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
5932
// Tries the folds applicable to I one after another, selected by result type
// and opcode. Returns true as soon as one of them succeeds.
5933 auto FoldInst = [this](Instruction &I) {
5934 Builder.SetInsertPoint(&I);
5935 bool IsVectorType = isa<VectorType>(I.getType());
5936 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
5937 auto Opcode = I.getOpcode();
5938
5939 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
5940
5941 // These folds should be beneficial regardless of when this pass is run
5942 // in the optimization pipeline.
5943 // The type checking is for run-time efficiency. We can avoid wasting time
5944 // dispatching to folding functions if there's no chance of matching.
5945 if (IsFixedVectorType) {
5946 switch (Opcode) {
5947 case Instruction::InsertElement:
5948 if (vectorizeLoadInsert(I))
5949 return true;
5950 break;
5951 case Instruction::ShuffleVector:
5952 if (widenSubvectorLoad(I))
5953 return true;
5954 break;
5955 default:
5956 break;
5957 }
5958 }
5959
5960 // This transform works with scalable and fixed vectors
5961 // TODO: Identify and allow other scalable transforms
5962 if (IsVectorType) {
5963 if (scalarizeOpOrCmp(I))
5964 return true;
5965 if (scalarizeLoad(I))
5966 return true;
5967 if (scalarizeExtExtract(I))
5968 return true;
5969 if (scalarizeVPIntrinsic(I))
5970 return true;
5971 if (foldInterleaveIntrinsics(I))
5972 return true;
5973 }
5974
5975 if (Opcode == Instruction::Store)
5976 if (foldSingleElementStore(I))
5977 return true;
5978
5979 // If this is an early pipeline invocation of this pass, we are done.
5980 if (TryEarlyFoldsOnly)
5981 return false;
5982
5983 // Otherwise, try folds that improve codegen but may interfere with
5984 // early IR canonicalizations.
5985 // The type checking is for run-time efficiency. We can avoid wasting time
5986 // dispatching to folding functions if there's no chance of matching.
5987 if (IsFixedVectorType) {
5988 switch (Opcode) {
5989 case Instruction::InsertElement:
5990 if (foldInsExtFNeg(I))
5991 return true;
5992 if (foldInsExtBinop(I))
5993 return true;
5994 if (foldInsExtVectorToShuffle(I))
5995 return true;
5996 break;
5997 case Instruction::ShuffleVector:
5998 if (foldPermuteOfBinops(I))
5999 return true;
6000 if (foldShuffleOfBinops(I))
6001 return true;
6002 if (foldShuffleOfSelects(I))
6003 return true;
6004 if (foldShuffleOfCastops(I))
6005 return true;
6006 if (foldShuffleOfShuffles(I))
6007 return true;
6008 if (foldPermuteOfIntrinsic(I))
6009 return true;
6010 if (foldShufflesOfLengthChangingShuffles(I))
6011 return true;
6012 if (foldShuffleOfIntrinsics(I))
6013 return true;
6014 if (foldSelectShuffle(I))
6015 return true;
6016 if (foldShuffleToIdentity(I))
6017 return true;
6018 break;
6019 case Instruction::Load:
6020 if (shrinkLoadForShuffles(I))
6021 return true;
6022 break;
6023 case Instruction::BitCast:
6024 if (foldBitcastShuffle(I))
6025 return true;
6026 if (foldSelectsFromBitcast(I))
6027 return true;
6028 break;
6029 case Instruction::And:
6030 case Instruction::Or:
6031 case Instruction::Xor:
6032 if (foldBitOpOfCastops(I))
6033 return true;
6034 if (foldBitOpOfCastConstant(I))
6035 return true;
6036 break;
6037 case Instruction::PHI:
6038 if (shrinkPhiOfShuffles(I))
6039 return true;
6040 break;
6041 default:
6042 if (shrinkType(I))
6043 return true;
6044 break;
6045 }
6046 } else {
// The result is not a fixed vector: try the folds for scalar-producing
// patterns (reductions, extracts, compares).
6047 switch (Opcode) {
6048 case Instruction::Call:
6049 if (foldShuffleFromReductions(I))
6050 return true;
6051 if (foldCastFromReductions(I))
6052 return true;
6053 break;
6054 case Instruction::ExtractElement:
6055 if (foldShuffleChainsToReduce(I))
6056 return true;
6057 break;
6058 case Instruction::ICmp:
6059 if (foldSignBitReductionCmp(I))
6060 return true;
6061 if (foldICmpEqZeroVectorReduce(I))
6062 return true;
6063 if (foldEquivalentReductionCmp(I))
6064 return true;
6065 if (foldReduceAddCmpZero(I))
6066 return true;
6067 [[fallthrough]];
6068 case Instruction::FCmp:
6069 if (foldExtractExtract(I))
6070 return true;
6071 break;
6072 case Instruction::Or:
6073 if (foldConcatOfBoolMasks(I))
6074 return true;
6075 [[fallthrough]];
6076 default:
6077 if (Instruction::isBinaryOp(Opcode)) {
6078 if (foldExtractExtract(I))
6079 return true;
6080 if (foldExtractedCmps(I))
6081 return true;
6082 if (foldBinopOfReductions(I))
6083 return true;
6084 }
6085 break;
6086 }
6087 }
6088 return false;
6089 };
6090
6091 bool MadeChange = false;
6092 for (BasicBlock &BB : F) {
6093 // Ignore unreachable basic blocks.
6094 if (!DT.isReachableFromEntry(&BB))
6095 continue;
6096 // Use early increment range so that we can erase instructions in loop.
6097 // make_early_inc_range is not applicable here, as the next iterator may
6098 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
6099 // We manually maintain the next instruction and update it when it is about
6100 // to be deleted.
6101 Instruction *I = &BB.front();
6102 while (I) {
6103 NextInst = I->getNextNode();
6104 if (!I->isDebugOrPseudoInst())
6105 MadeChange |= FoldInst(*I);
6106 I = NextInst;
6107 }
6108 }
6109
// The block walk is finished, so no next instruction is tracked any more.
6110 NextInst = nullptr;
6111
// Revisit instructions that were queued on the worklist.
6112 while (!Worklist.isEmpty()) {
6113 Instruction *I = Worklist.removeOne();
6114 if (!I)
6115 continue;
6116
// NOTE(review): original lines 6117-6118 are missing from this listing. The
// `continue` and closing brace below presumably end a conditional that
// skips I before it reaches FoldInst.
6119 continue;
6120 }
6121
6122 MadeChange |= FoldInst(*I);
6123 }
6124
6125 return MadeChange;
6126}
6127
// NOTE(review): the signature of this function (original lines 6128-6129) and
// the lines that define TTI and PA (6131, 6139-6140) are missing from this
// listing. It is presumably the pass-manager entry point. The visible part
// fetches the required analyses from FAM, runs a VectorCombine instance over
// F, and reports all analyses as preserved when nothing changed.
6130 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
6132 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
6133 AAResults &AA = FAM.getResult<AAManager>(F);
6134 const DataLayout *DL = &F.getDataLayout();
6135 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
6136 TryEarlyFoldsOnly);
6137 if (!Combiner.run())
6138 return PreservedAnalyses::all();
6141 return PA;
6142}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
This is the interface for LLVM's primary stateless and local alias analysis.
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
This file defines the DenseMap class.
#define Check(C,...)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
iv users
Definition IVUsers.cpp:48
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition LICM.cpp:1457
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T1
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
const SmallVectorImpl< MachineOperand > & Cond
unsigned OpIndex
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope exit.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Value * generateNewInstTree(ArrayRef< InstLane > Item, Use *From, FixedVectorType *Ty, const DenseSet< std::pair< Value *, Use * > > &IdentityLeafs, const DenseSet< std::pair< Value *, Use * > > &SplatLeafs, const DenseSet< std::pair< Value *, Use * > > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
std::pair< Value *, int > InstLane
static bool isKnownNonPositive(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Used by foldReduceAddCmpZero to check if we can prove that a value is non-positive.
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, const SimplifyQuery &SQ)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static InstLane lookThroughShuffles(Value *V, int Lane)
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1638
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:409
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
bool isFPPredicate() const
Definition InstrTypes.h:845
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:33
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
Predicate getSignedPredicate() const
For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
bool isEquality() const
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2627
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2615
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1935
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2693
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1554
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2276
Value * CreateIsNotNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg > -1.
Definition IRBuilder.h:2717
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:2018
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2301
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2508
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2539
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:172
Value * CreateIsNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg < 0.
Definition IRBuilder.h:2712
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2242
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1918
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1533
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2120
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2649
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1592
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1931
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2106
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:629
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1856
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2484
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void setHasNoUnsignedWrap(bool b=true)
Set or clear the nuw flag on this instruction, which must be an operator which supports this flag.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
LLVM_ABI void setHasNoSignedWrap(bool b=true)
Set or clear the nsw flag on this instruction, which must be an operator which supports this flag.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI void setNonNeg(bool b=true)
Set or clear the nneg flag on this instruction, which must be a zext instruction.
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Type * getPointerOperandType() const
Align getAlign() const
Return the alignment of the access that is being performed.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
static LLVM_ABI OperandValueInfo commonOperandInfo(const Value *X, const Value *Y)
Collect common data between two OperandValueInfo inputs.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:737
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
bool use_empty() const
Definition Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
bool user_empty() const
Definition Value.h:389
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type size() const
Definition DenseSet.h:87
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition APInt.h:2277
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition APInt.h:2282
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_and< Ty... > m_CombineAnd(const Ty &...Ps)
Combine pattern matchers matching all of Ps patterns.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches right shift operations.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_Constant()
Match an arbitrary Constant and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
cst_pred_ty< is_non_zero_int > m_NonZeroInt()
Match a non-zero integer or a vector with all non-zero elements.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
auto m_AnyIntrinsic()
Matches any intrinsic call and ignore it.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Mul, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWMul(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
@ Valid
The data is already valid.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:558
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2115
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:350
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
scope_exit(Callable) -> scope_exit< Callable >
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:432
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool programUndefinedIfPoison(const Instruction *Inst)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:447
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
const DataLayout & DL
const Instruction * CxtI
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC