1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Vectorize/VectorCombine.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
44#include "llvm/Transforms/Utils/InstructionWorklist.h"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoadExtract(Instruction &I);
133 bool scalarizeExtExtract(Instruction &I);
134 bool foldConcatOfBoolMasks(Instruction &I);
135 bool foldPermuteOfBinops(Instruction &I);
136 bool foldShuffleOfBinops(Instruction &I);
137 bool foldShuffleOfSelects(Instruction &I);
138 bool foldShuffleOfCastops(Instruction &I);
139 bool foldShuffleOfShuffles(Instruction &I);
140 bool foldShuffleOfIntrinsics(Instruction &I);
141 bool foldShuffleToIdentity(Instruction &I);
142 bool foldShuffleFromReductions(Instruction &I);
143 bool foldShuffleChainsToReduce(Instruction &I);
144 bool foldCastFromReductions(Instruction &I);
145 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
146 bool foldInterleaveIntrinsics(Instruction &I);
147 bool shrinkType(Instruction &I);
148 bool shrinkLoadForShuffles(Instruction &I);
149 bool shrinkPhiOfShuffles(Instruction &I);
150
151 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
152 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
153 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
154 Old.replaceAllUsesWith(&New);
155 if (auto *NewI = dyn_cast<Instruction>(&New)) {
156 New.takeName(&Old);
157 Worklist.pushUsersToWorkList(*NewI);
158 Worklist.pushValue(NewI);
159 }
160 if (Erase && isInstructionTriviallyDead(&Old)) {
161 eraseInstruction(Old);
162 } else {
163 Worklist.push(&Old);
164 }
165 }
166
167 void eraseInstruction(Instruction &I) {
168 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
169 SmallVector<Value *> Ops(I.operands());
170 Worklist.remove(&I);
171 I.eraseFromParent();
172
173 // Push remaining users of the operands and then the operand itself - allows
174 // further folds that were hindered by OneUse limits.
175 SmallPtrSet<Value *, 4> Visited;
176 for (Value *Op : Ops) {
177 if (!Visited.contains(Op)) {
178 if (auto *OpI = dyn_cast<Instruction>(Op)) {
179 if (!RecursivelyDeleteTriviallyDeadInstructions(
180 OpI, nullptr, nullptr, [&](Value *V) {
181 if (auto *I = dyn_cast<Instruction>(V)) {
182 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
183 Worklist.remove(I);
184 if (I == NextInst)
185 NextInst = NextInst->getNextNode();
186 Visited.insert(I);
187 }
188 }))
189 continue;
190 Worklist.pushUsersToWorkList(*OpI);
191 Worklist.pushValue(OpI);
192 }
193 }
194 }
195 }
196};
197} // namespace
198
199/// Return the source operand of a potentially bitcasted value. If there is no
200/// bitcast, return the input value itself.
201static Value *peekThroughBitcast(Value *V) {
202 while (auto *BitCast = dyn_cast<BitCastInst>(V))
203 V = BitCast->getOperand(0);
204 return V;
205}
206
207static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
208 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
209 // The widened load may load data from dirty regions or create data races
210 // non-existent in the source.
211 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
212 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
213 mustSuppressSpeculation(*Load))
214 return false;
215
216 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
217 // sure we have all of our type-based constraints in place for this target.
218 Type *ScalarTy = Load->getType()->getScalarType();
219 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
220 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
221 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
222 ScalarSize % 8 != 0)
223 return false;
224
225 return true;
226}
227
228bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
229 // Match insert into fixed vector of scalar value.
230 // TODO: Handle non-zero insert index.
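 // Illustrative example of the intended rewrite (not taken from the source;
 // assumes a 128-bit minimum vector register width and a sufficiently
 // dereferenceable pointer %p):
 //   %s = load float, ptr %p, align 4
 //   %r = insertelement <4 x float> poison, float %s, i64 0
 // -->
 //   %w = load <4 x float>, ptr %p, align 4
 //   %r = shufflevector <4 x float> %w, <4 x float> poison,
 //        <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>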
231 Value *Scalar;
232 if (!match(&I,
233 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
234 return false;
235
236 // Optionally match an extract from another vector.
237 Value *X;
238 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
239 if (!HasExtract)
240 X = Scalar;
241
242 auto *Load = dyn_cast<LoadInst>(X);
243 if (!canWidenLoad(Load, TTI))
244 return false;
245
246 Type *ScalarTy = Scalar->getType();
247 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
248 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
249
250 // Check safety of replacing the scalar load with a larger vector load.
251 // We use minimal alignment (maximum flexibility) because we only care about
252 // the dereferenceable region. When calculating cost and creating a new op,
253 // we may use a larger value based on alignment attributes.
254 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
255 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
256
257 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
258 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
259 unsigned OffsetEltIndex = 0;
260 Align Alignment = Load->getAlign();
261 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
262 &DT)) {
263 // It is not safe to load directly from the pointer, but we can still peek
264 // through gep offsets and check if it is safe to load from a base address
265 // updated alignment. If it is, we can shuffle the element(s) into place
266 // after loading.
267 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
268 APInt Offset(OffsetBitWidth, 0);
269 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
270
271 // We want to shuffle the result down from a high element of a vector, so
272 // the offset must be positive.
273 if (Offset.isNegative())
274 return false;
275
276 // The offset must be a multiple of the scalar element to shuffle cleanly
277 // in the element's size.
278 uint64_t ScalarSizeInBytes = ScalarSize / 8;
279 if (Offset.urem(ScalarSizeInBytes) != 0)
280 return false;
281
282 // If we load MinVecNumElts, will our target element still be loaded?
283 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
284 if (OffsetEltIndex >= MinVecNumElts)
285 return false;
286
287 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
288 &DT))
289 return false;
290
291 // Update alignment with offset value. Note that the offset could be negated
292 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
293 // negation does not change the result of the alignment calculation.
294 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
295 }
296
297 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
298 // Use the greater of the alignment on the load or its source pointer.
299 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
300 Type *LoadTy = Load->getType();
301 unsigned AS = Load->getPointerAddressSpace();
302 InstructionCost OldCost =
303 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
304 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
305 OldCost +=
306 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
307 /* Insert */ true, HasExtract, CostKind);
308
309 // New pattern: load VecPtr
310 InstructionCost NewCost =
311 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
312 // Optionally, we are shuffling the loaded vector element(s) into place.
313 // For the mask set everything but element 0 to undef to prevent poison from
314 // propagating from the extra loaded memory. This will also optionally
315 // shrink/grow the vector from the loaded size to the output size.
316 // We assume this operation has no cost in codegen if there was no offset.
317 // Note that we could use freeze to avoid poison problems, but then we might
318 // still need a shuffle to change the vector size.
319 auto *Ty = cast<FixedVectorType>(I.getType());
320 unsigned OutputNumElts = Ty->getNumElements();
321 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
322 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
323 Mask[0] = OffsetEltIndex;
324 if (OffsetEltIndex)
325 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
326 CostKind);
327
328 // We can aggressively convert to the vector form because the backend can
329 // invert this transform if it does not result in a performance win.
330 if (OldCost < NewCost || !NewCost.isValid())
331 return false;
332
333 // It is safe and potentially profitable to load a vector directly:
334 // inselt undef, load Scalar, 0 --> load VecPtr
335 IRBuilder<> Builder(Load);
336 Value *CastedPtr =
337 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
338 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
339 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
340
341 replaceValue(I, *VecLd);
342 ++NumVecLoad;
343 return true;
344}
345
346/// If we are loading a vector and then inserting it into a larger vector with
347/// undefined elements, try to load the larger vector and eliminate the insert.
348/// This removes a shuffle in IR and may allow combining of other loaded values.
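/// Illustrative example (not taken from the source; assumes the wider load is
/// known dereferenceable and the cost model does not object):
///   %v = load <2 x float>, ptr %p, align 8
///   %r = shufflevector <2 x float> %v, <2 x float> poison,
///        <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// -->
///   %r = load <4 x float>, ptr %p, align 8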
349bool VectorCombine::widenSubvectorLoad(Instruction &I) {
350 // Match subvector insert of fixed vector.
351 auto *Shuf = cast<ShuffleVectorInst>(&I);
352 if (!Shuf->isIdentityWithPadding())
353 return false;
354
355 // Allow a non-canonical shuffle mask that is choosing elements from op1.
356 unsigned NumOpElts =
357 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
358 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
359 return M >= (int)(NumOpElts);
360 });
361
362 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
363 if (!canWidenLoad(Load, TTI))
364 return false;
365
366 // We use minimal alignment (maximum flexibility) because we only care about
367 // the dereferenceable region. When calculating cost and creating a new op,
368 // we may use a larger value based on alignment attributes.
369 auto *Ty = cast<FixedVectorType>(I.getType());
370 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
371 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
372 Align Alignment = Load->getAlign();
373 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
374 return false;
375
376 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
377 Type *LoadTy = Load->getType();
378 unsigned AS = Load->getPointerAddressSpace();
379
380 // Original pattern: insert_subvector (load PtrOp)
381 // This conservatively assumes that the cost of a subvector insert into an
382 // undef value is 0. We could add that cost if the cost model accurately
383 // reflects the real cost of that operation.
384 InstructionCost OldCost =
385 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
386
387 // New pattern: load PtrOp
388 InstructionCost NewCost =
389 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
390
391 // We can aggressively convert to the vector form because the backend can
392 // invert this transform if it does not result in a performance win.
393 if (OldCost < NewCost || !NewCost.isValid())
394 return false;
395
396 IRBuilder<> Builder(Load);
397 Value *CastedPtr =
398 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
399 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
400 replaceValue(I, *VecLd);
401 ++NumVecLoad;
402 return true;
403}
404
405/// Determine which, if any, of the inputs should be replaced by a shuffle
406/// followed by extract from a different index.
407ExtractElementInst *VectorCombine::getShuffleExtract(
408 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
409 unsigned PreferredExtractIndex = InvalidIndex) const {
410 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
411 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
412 assert(Index0C && Index1C && "Expected constant extract indexes");
413
414 unsigned Index0 = Index0C->getZExtValue();
415 unsigned Index1 = Index1C->getZExtValue();
416
417 // If the extract indexes are identical, no shuffle is needed.
418 if (Index0 == Index1)
419 return nullptr;
420
421 Type *VecTy = Ext0->getVectorOperand()->getType();
422 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
423 InstructionCost Cost0 =
424 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
425 InstructionCost Cost1 =
426 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
427
428 // If both costs are invalid, no shuffle is needed.
429 if (!Cost0.isValid() && !Cost1.isValid())
430 return nullptr;
431
432 // We are extracting from 2 different indexes, so one operand must be shuffled
433 // before performing a vector operation and/or extract. The more expensive
434 // extract will be replaced by a shuffle.
435 if (Cost0 > Cost1)
436 return Ext0;
437 if (Cost1 > Cost0)
438 return Ext1;
439
440 // If the costs are equal and there is a preferred extract index, shuffle the
441 // opposite operand.
442 if (PreferredExtractIndex == Index0)
443 return Ext1;
444 if (PreferredExtractIndex == Index1)
445 return Ext0;
446
447 // Otherwise, replace the extract with the higher index.
448 return Index0 > Index1 ? Ext0 : Ext1;
449}
450
451/// Compare the relative costs of 2 extracts followed by scalar operation vs.
452/// vector operation(s) followed by extract. Return true if the existing
453/// instructions are cheaper than a vector alternative. Otherwise, return false
454/// and if one of the extracts should be transformed to a shufflevector, set
455/// \p ConvertToShuffle to that extract instruction.
456bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
457 ExtractElementInst *Ext1,
458 const Instruction &I,
459 ExtractElementInst *&ConvertToShuffle,
460 unsigned PreferredExtractIndex) {
461 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
462 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
463 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
464
465 unsigned Opcode = I.getOpcode();
466 Value *Ext0Src = Ext0->getVectorOperand();
467 Value *Ext1Src = Ext1->getVectorOperand();
468 Type *ScalarTy = Ext0->getType();
469 auto *VecTy = cast<VectorType>(Ext0Src->getType());
470 InstructionCost ScalarOpCost, VectorOpCost;
471
472 // Get cost estimates for scalar and vector versions of the operation.
473 bool IsBinOp = Instruction::isBinaryOp(Opcode);
474 if (IsBinOp) {
475 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
476 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
477 } else {
478 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
479 "Expected a compare");
480 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
481 ScalarOpCost = TTI.getCmpSelInstrCost(
482 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
483 VectorOpCost = TTI.getCmpSelInstrCost(
484 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
485 }
486
487 // Get cost estimates for the extract elements. These costs will factor into
488 // both sequences.
489 unsigned Ext0Index = Ext0IndexC->getZExtValue();
490 unsigned Ext1Index = Ext1IndexC->getZExtValue();
491
492 InstructionCost Extract0Cost =
493 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
494 InstructionCost Extract1Cost =
495 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
496
497 // A more expensive extract will always be replaced by a splat shuffle.
498 // For example, if Ext0 is more expensive:
499 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
500 // extelt (opcode (splat V0, Ext0), V1), Ext1
501 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
502 // check the cost of creating a broadcast shuffle and shuffling both
503 // operands to element 0.
504 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
505 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
506 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
507
508 // Extra uses of the extracts mean that we include those costs in the
509 // vector total because those instructions will not be eliminated.
510 InstructionCost OldCost, NewCost;
511 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
512 // Handle a special case. If the 2 extracts are identical, adjust the
513 // formulas to account for that. The extra use charge allows for either the
514 // CSE'd pattern or an unoptimized form with identical values:
515 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
516 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
517 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
518 OldCost = CheapExtractCost + ScalarOpCost;
519 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
520 } else {
521 // Handle the general case. Each extract is actually a different value:
522 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
523 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
524 NewCost = VectorOpCost + CheapExtractCost +
525 !Ext0->hasOneUse() * Extract0Cost +
526 !Ext1->hasOneUse() * Extract1Cost;
527 }
528
529 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
530 if (ConvertToShuffle) {
531 if (IsBinOp && DisableBinopExtractShuffle)
532 return true;
533
534 // If we are extracting from 2 different indexes, then one operand must be
535 // shuffled before performing the vector operation. The shuffle mask is
536 // poison except for 1 lane that is being translated to the remaining
537 // extraction lane. Therefore, it is a splat shuffle. Ex:
538 // ShufMask = { poison, poison, 0, poison }
539 // TODO: The cost model has an option for a "broadcast" shuffle
540 // (splat-from-element-0), but no option for a more general splat.
541 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
542 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
543 PoisonMaskElem);
544 ShuffleMask[BestInsIndex] = BestExtIndex;
545 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
546 VecTy, VecTy, ShuffleMask, CostKind, 0,
547 nullptr, {ConvertToShuffle});
548 } else {
549 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
550 VecTy, VecTy, {}, CostKind, 0, nullptr,
551 {ConvertToShuffle});
552 }
553 }
554
555 // Aggressively form a vector op if the cost is equal because the transform
556 // may enable further optimization.
557 // Codegen can reverse this transform (scalarize) if it was not profitable.
558 return OldCost < NewCost;
559}
560
561/// Create a shuffle that translates (shifts) 1 element from the input vector
562/// to a new element location.
563static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
564 unsigned NewIndex, IRBuilderBase &Builder) {
565 // The shuffle mask is poison except for 1 lane that is being translated
566 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
567 // ShufMask = { 2, poison, poison, poison }
568 auto *VecTy = cast<FixedVectorType>(Vec->getType());
569 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
570 ShufMask[NewIndex] = OldIndex;
571 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
572}
573
574/// Given an extract element instruction with constant index operand, shuffle
575/// the source vector (shift the scalar element) to a NewIndex for extraction.
576/// Return null if the input can be constant folded, so that we are not creating
577/// unnecessary instructions.
578static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
579 IRBuilderBase &Builder) {
580 // Shufflevectors can only be created for fixed-width vectors.
581 Value *X = ExtElt->getVectorOperand();
582 if (!isa<FixedVectorType>(X->getType()))
583 return nullptr;
584
585 // If the extract can be constant-folded, this code is unsimplified. Defer
586 // to other passes to handle that.
587 Value *C = ExtElt->getIndexOperand();
588 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
589 if (isa<Constant>(X))
590 return nullptr;
591
592 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
593 NewIndex, Builder);
594 return Shuf;
595}
596
597/// Try to reduce extract element costs by converting scalar compares to vector
598/// compares followed by extract.
599/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
600Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
601 Instruction &I) {
602 assert(isa<CmpInst>(&I) && "Expected a compare");
603
604 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
605 // --> extelt (cmp Pred V0, V1), ExtIndex
606 ++NumVecCmp;
607 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
608 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
609 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
610}
611
612/// Try to reduce extract element costs by converting scalar binops to vector
613/// binops followed by extract.
614/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
615Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
616 Instruction &I) {
617 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
618
619 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
620 // --> extelt (bo V0, V1), ExtIndex
621 ++NumVecBO;
622 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
623 V1, "foldExtExtBinop");
624
625 // All IR flags are safe to back-propagate because any potential poison
626 // created in unused vector elements is discarded by the extract.
627 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
628 VecBOInst->copyIRFlags(&I);
629
630 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
631}
632
633/// Match an instruction with extracted vector operands.
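/// Illustrative example (not taken from the source; applies only when the
/// cost model favors the vector operation):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %y, i32 0
///   %r  = add i32 %e0, %e1
/// -->
///   %v = add <4 x i32> %x, %y
///   %r = extractelement <4 x i32> %v, i32 0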
634bool VectorCombine::foldExtractExtract(Instruction &I) {
635 // It is not safe to transform things like div, urem, etc. because we may
636 // create undefined behavior when executing those on unknown vector elements.
637 if (!isSafeToSpeculativelyExecute(&I))
638 return false;
639
640 Instruction *I0, *I1;
641 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
642 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
643 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
644 return false;
645
646 Value *V0, *V1;
647 uint64_t C0, C1;
648 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
649 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
650 V0->getType() != V1->getType())
651 return false;
652
653 // If the scalar value 'I' is going to be re-inserted into a vector, then try
654 // to create an extract to that same element. The extract/insert can be
655 // reduced to a "select shuffle".
656 // TODO: If we add a larger pattern match that starts from an insert, this
657 // probably becomes unnecessary.
658 auto *Ext0 = cast<ExtractElementInst>(I0);
659 auto *Ext1 = cast<ExtractElementInst>(I1);
660 uint64_t InsertIndex = InvalidIndex;
661 if (I.hasOneUse())
662 match(I.user_back(),
663 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
664
665 ExtractElementInst *ExtractToChange;
666 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
667 return false;
668
669 Value *ExtOp0 = Ext0->getVectorOperand();
670 Value *ExtOp1 = Ext1->getVectorOperand();
671
672 if (ExtractToChange) {
673 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
674 Value *NewExtOp =
675 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
676 if (!NewExtOp)
677 return false;
678 if (ExtractToChange == Ext0)
679 ExtOp0 = NewExtOp;
680 else
681 ExtOp1 = NewExtOp;
682 }
683
684 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
685 : Ext0->getIndexOperand();
686 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
687 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
688 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
689 Worklist.push(Ext0);
690 Worklist.push(Ext1);
691 replaceValue(I, *NewExt);
692 return true;
693}
694
695/// Try to replace an extract + scalar fneg + insert with a vector fneg +
696/// shuffle.
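/// Illustrative example (not taken from the source; assumes the cost model
/// favors the vector form):
///   %e = extractelement <4 x float> %src, i64 1
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i64 1
/// -->
///   %v = fneg <4 x float> %src
///   %r = shufflevector <4 x float> %dst, <4 x float> %v,
///        <4 x i32> <i32 0, i32 5, i32 2, i32 3>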
697bool VectorCombine::foldInsExtFNeg(Instruction &I) {
698 // Match an insert (op (extract)) pattern.
699 Value *DestVec;
700 uint64_t Index;
701 Instruction *FNeg;
702 if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
703 m_ConstantInt(Index))))
704 return false;
705
706 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
707 Value *SrcVec;
708 Instruction *Extract;
709 if (!match(FNeg, m_FNeg(m_CombineAnd(
710 m_Instruction(Extract),
711 m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
712 return false;
713
714 auto *VecTy = cast<FixedVectorType>(I.getType());
715 auto *ScalarTy = VecTy->getScalarType();
716 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
717 if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
718 return false;
719
720 // Ignore bogus insert/extract index.
721 unsigned NumElts = VecTy->getNumElements();
722 if (Index >= NumElts)
723 return false;
724
725 // We are inserting the negated element into the same lane that we extracted
726 // from. This is equivalent to a select-shuffle that chooses all but the
727 // negated element from the destination vector.
728 SmallVector<int> Mask(NumElts);
729 std::iota(Mask.begin(), Mask.end(), 0);
730 Mask[Index] = Index + NumElts;
731 InstructionCost OldCost =
732 TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
733 TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
734
735 // If the extract has one use, it will be eliminated, so count it in the
736 // original cost. If it has more than one use, ignore the cost because it will
737 // be the same before/after.
738 if (Extract->hasOneUse())
739 OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
740
741 InstructionCost NewCost =
742 TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
743 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy,
744 Mask, CostKind);
745
746 bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
747 // If the lengths of the two vectors are not equal,
748 // we need to add a length-change vector. Add this cost.
749 SmallVector<int> SrcMask;
750 if (NeedLenChg) {
751 SrcMask.assign(NumElts, PoisonMaskElem);
752 SrcMask[Index] = Index;
753 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
754 VecTy, SrcVecTy, SrcMask, CostKind);
755 }
756
757 if (NewCost > OldCost)
758 return false;
759
760 Value *NewShuf;
761 // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
762 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
763 if (NeedLenChg) {
764 // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
765 Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
766 NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
767 } else {
768 // shuffle DestVec, (fneg SrcVec), Mask
769 NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
770 }
771
772 replaceValue(I, *NewShuf);
773 return true;
774}
775
776/// Try to fold insert(binop(x,y),binop(a,b),idx)
777/// --> binop(insert(x,a,idx),insert(y,b,idx))
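/// Illustrative example (not taken from the source; assumes the cost model
/// favors the rewritten form):
///   %vb = add <4 x i32> %x, %y
///   %sb = add i32 %a, %b
///   %r  = insertelement <4 x i32> %vb, i32 %sb, i64 2
/// -->
///   %i0 = insertelement <4 x i32> %x, i32 %a, i64 2
///   %i1 = insertelement <4 x i32> %y, i32 %b, i64 2
///   %r  = add <4 x i32> %i0, %i1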
778bool VectorCombine::foldInsExtBinop(Instruction &I) {
779 BinaryOperator *VecBinOp, *SclBinOp;
780 uint64_t Index;
781 if (!match(&I,
782 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
783 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
784 return false;
785
786 // TODO: Add support for addlike etc.
787 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
788 if (BinOpcode != SclBinOp->getOpcode())
789 return false;
790
791 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
792 if (!ResultTy)
793 return false;
794
795 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
796 // shuffle?
797
798 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
799 TTI.getInstructionCost(VecBinOp, CostKind) +
800 TTI.getInstructionCost(SclBinOp, CostKind);
801 InstructionCost NewCost =
802 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
803 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
804 Index, VecBinOp->getOperand(0),
805 SclBinOp->getOperand(0)) +
806 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
807 Index, VecBinOp->getOperand(1),
808 SclBinOp->getOperand(1));
809
810 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
811 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
812 << "\n");
813 if (NewCost > OldCost)
814 return false;
815
816 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
817 SclBinOp->getOperand(0), Index);
818 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
819 SclBinOp->getOperand(1), Index);
820 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
821
822 // Intersect flags from the old binops.
823 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
824 NewInst->copyIRFlags(VecBinOp);
825 NewInst->andIRFlags(SclBinOp);
826 }
827
828 Worklist.pushValue(NewIns0);
829 Worklist.pushValue(NewIns1);
830 replaceValue(I, *NewBO);
831 return true;
832}
833
834/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
835/// Supports: bitcast, trunc, sext, zext
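/// Illustrative example (not taken from the source):
///   %zx = zext <4 x i16> %x to <4 x i32>
///   %zy = zext <4 x i16> %y to <4 x i32>
///   %r  = and <4 x i32> %zx, %zy
/// -->
///   %a = and <4 x i16> %x, %y
///   %r = zext <4 x i16> %a to <4 x i32>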
836bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
837 // Check if this is a bitwise logic operation
838 auto *BinOp = dyn_cast<BinaryOperator>(&I);
839 if (!BinOp || !BinOp->isBitwiseLogicOp())
840 return false;
841
842 // Get the cast instructions
843 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
844 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
845 if (!LHSCast || !RHSCast) {
846 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
847 return false;
848 }
849
850 // Both casts must be the same type
851 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
852 if (CastOpcode != RHSCast->getOpcode())
853 return false;
854
855 // Only handle supported cast operations
856 switch (CastOpcode) {
857 case Instruction::BitCast:
858 case Instruction::Trunc:
859 case Instruction::SExt:
860 case Instruction::ZExt:
861 break;
862 default:
863 return false;
864 }
865
866 Value *LHSSrc = LHSCast->getOperand(0);
867 Value *RHSSrc = RHSCast->getOperand(0);
868
869 // Source types must match
870 if (LHSSrc->getType() != RHSSrc->getType())
871 return false;
872
873 auto *SrcTy = LHSSrc->getType();
874 auto *DstTy = I.getType();
875 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
876 // Other casts only handle vector types with integer elements.
877 if (CastOpcode != Instruction::BitCast &&
878 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
879 return false;
880
881 // Only integer scalar/vector values are legal for bitwise logic operations.
882 if (!SrcTy->getScalarType()->isIntegerTy() ||
883 !DstTy->getScalarType()->isIntegerTy())
884 return false;
885
886 // Cost Check :
887 // OldCost = bitlogic + 2*casts
888 // NewCost = bitlogic + cast
889
890 // Calculate specific costs for each cast with instruction context
891 InstructionCost LHSCastCost = TTI.getCastInstrCost(
892 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
893 InstructionCost RHSCastCost = TTI.getCastInstrCost(
894 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
895
896 InstructionCost OldCost =
897 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
898 LHSCastCost + RHSCastCost;
899
900 // For new cost, we can't provide an instruction (it doesn't exist yet)
901 InstructionCost GenericCastCost = TTI.getCastInstrCost(
902 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
903
904 InstructionCost NewCost =
905 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
906 GenericCastCost;
907
908 // Account for multi-use casts using specific costs
909 if (!LHSCast->hasOneUse())
910 NewCost += LHSCastCost;
911 if (!RHSCast->hasOneUse())
912 NewCost += RHSCastCost;
913
914 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
915 << " NewCost=" << NewCost << "\n");
916
917 if (NewCost > OldCost)
918 return false;
919
920 // Create the operation on the source type
921 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
922 BinOp->getName() + ".inner");
923 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
924 NewBinOp->copyIRFlags(BinOp);
925
926 Worklist.pushValue(NewOp);
927
928 // Create the cast operation directly to ensure we get a new instruction
929 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
930
931 // Preserve cast instruction flags
932 NewCast->copyIRFlags(LHSCast);
933 NewCast->andIRFlags(RHSCast);
934
935 // Insert the new instruction
936 Value *Result = Builder.Insert(NewCast);
937
938 replaceValue(I, *Result);
939 return true;
940}
941
942/// Match:
943// bitop(castop(x), C) ->
944// bitop(castop(x), castop(InvC)) ->
945// castop(bitop(x, InvC))
946// Supports: bitcast
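// Illustrative example (not taken from the source): the constant must be
// losslessly representable in the source type.
//   %zx = zext <4 x i16> %x to <4 x i32>
//   %r  = and <4 x i32> %zx, <i32 255, i32 255, i32 255, i32 255>
// -->
//   %a = and <4 x i16> %x, <i16 255, i16 255, i16 255, i16 255>
//   %r = zext <4 x i16> %a to <4 x i32>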
947bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
948 Value *LHS;
949 Constant *C;
950
951 // Check if this is a bitwise logic operation
952 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
953 return false;
954
955 // Get the cast instructions
956 auto *LHSCast = dyn_cast<CastInst>(LHS);
957 if (!LHSCast)
958 return false;
959
960 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
961
962 // Only handle supported cast operations
963 switch (CastOpcode) {
964 case Instruction::BitCast:
965 case Instruction::ZExt:
966 case Instruction::SExt:
967 case Instruction::Trunc:
968 break;
969 default:
970 return false;
971 }
972
973 Value *LHSSrc = LHSCast->getOperand(0);
974
975 auto *SrcTy = LHSSrc->getType();
976 auto *DstTy = I.getType();
977 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
978 // Other casts only handle vector types with integer elements.
979 if (CastOpcode != Instruction::BitCast &&
980 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
981 return false;
982
983 // Only integer scalar/vector values are legal for bitwise logic operations.
984 if (!SrcTy->getScalarType()->isIntegerTy() ||
985 !DstTy->getScalarType()->isIntegerTy())
986 return false;
987
988 // Find the constant InvC, such that castop(InvC) equals to C.
989 PreservedCastFlags RHSFlags;
990 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
991 if (!InvC)
992 return false;
993
994 // Cost Check :
995 // OldCost = bitlogic + cast
996 // NewCost = bitlogic + cast
997
998 // Calculate specific costs for each cast with instruction context
999 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1000 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1001
1002 InstructionCost OldCost =
1003 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1004
1005 // For new cost, we can't provide an instruction (it doesn't exist yet)
1006 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1007 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1008
1009 InstructionCost NewCost =
1010 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1011 GenericCastCost;
1012
1013 // Account for multi-use casts using specific costs
1014 if (!LHSCast->hasOneUse())
1015 NewCost += LHSCastCost;
1016
1017 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1018 << " NewCost=" << NewCost << "\n");
1019
1020 if (NewCost > OldCost)
1021 return false;
1022
1023 // Create the operation on the source type
1024 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1025 LHSSrc, InvC, I.getName() + ".inner");
1026 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1027 NewBinOp->copyIRFlags(&I);
1028
1029 Worklist.pushValue(NewOp);
1030
1031 // Create the cast operation directly to ensure we get a new instruction
1032 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1033
1034 // Preserve cast instruction flags
1035 if (RHSFlags.NNeg)
1036 NewCast->setNonNeg();
1037 if (RHSFlags.NUW)
1038 NewCast->setHasNoUnsignedWrap();
1039 if (RHSFlags.NSW)
1040 NewCast->setHasNoSignedWrap();
1041
1042 NewCast->andIRFlags(LHSCast);
1043
1044 // Insert the new instruction
1045 Value *Result = Builder.Insert(NewCast);
1046
1047 replaceValue(I, *Result);
1048 return true;
1049}
1050
1051/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1052/// destination type followed by shuffle. This can enable further transforms by
1053/// moving bitcasts or shuffles together.
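/// Illustrative example (not taken from the source; element ordering shown for
/// a little-endian target):
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
///        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = bitcast <4 x i32> %s to <8 x i16>
/// -->
///   %b = bitcast <4 x i32> %v to <8 x i16>
///   %r = shufflevector <8 x i16> %b, <8 x i16> poison,
///        <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>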
1054bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1055 Value *V0, *V1;
1056 ArrayRef<int> Mask;
1057 if (!match(&I, m_BitCast(m_OneUse(
1058 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1059 return false;
1060
1061 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1062 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1063 // mask for scalable type is a splat or not.
1064 // 2) Disallow non-vector casts.
1065 // TODO: We could allow any shuffle.
1066 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1067 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1068 if (!DestTy || !SrcTy)
1069 return false;
1070
1071 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1072 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1073 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1074 return false;
1075
1076 bool IsUnary = isa<UndefValue>(V1);
1077
1078 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1079 // if it won't increase the number of bitcasts.
1080 if (!IsUnary) {
1081 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1082 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1083 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1084 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1085 return false;
1086 }
1087
1088 SmallVector<int, 16> NewMask;
1089 if (DestEltSize <= SrcEltSize) {
1090 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1091 // always be expanded to the equivalent form choosing narrower elements.
1092 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1093 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1094 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1095 } else {
1096 // The bitcast is from narrow elements to wide elements. The shuffle mask
1097 // must choose consecutive elements to allow casting first.
1098 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1099 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1100 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1101 return false;
1102 }
1103
1104 // Bitcast the shuffle src - keep its original width but using the destination
1105 // scalar type.
1106 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1107 auto *NewShuffleTy =
1108 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1109 auto *OldShuffleTy =
1110 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1111 unsigned NumOps = IsUnary ? 1 : 2;
1112
1113 // The new shuffle must not cost more than the old shuffle.
1114 TargetTransformInfo::ShuffleKind SK =
1115 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1116 : TargetTransformInfo::SK_PermuteTwoSrc;
1117
1118 InstructionCost NewCost =
1119 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1120 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1121 TargetTransformInfo::CastContextHint::None,
1122 CostKind));
1123 InstructionCost OldCost =
1124 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1125 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1126 TargetTransformInfo::CastContextHint::None,
1127 CostKind);
1128
1129 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1130 << OldCost << " vs NewCost: " << NewCost << "\n");
1131
1132 if (NewCost > OldCost || !NewCost.isValid())
1133 return false;
1134
1135 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1136 ++NumShufOfBitcast;
1137 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1138 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1139 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1140 replaceValue(I, *Shuf);
1141 return true;
1142}
1143
1144/// VP Intrinsics whose vector operands are both splat values may be simplified
1145/// into the scalar version of the operation and the result splatted. This
1146/// can lead to scalarization down the line.
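/// Illustrative example (not taken from the source; requires an all-true mask
/// and, for speculation safety, a known non-zero EVL):
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %x.splat,
///            <4 x i32> %y.splat, <4 x i1> %alltrue, i32 %evl)
/// -->
///   %s        = add i32 %x, %y
///   %s.insert = insertelement <4 x i32> poison, i32 %s, i64 0
///   %r        = shufflevector <4 x i32> %s.insert, <4 x i32> poison,
///               <4 x i32> zeroinitializer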
1147bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1148 if (!isa<VPIntrinsic>(I))
1149 return false;
1150 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1151 Value *Op0 = VPI.getArgOperand(0);
1152 Value *Op1 = VPI.getArgOperand(1);
1153
1154 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1155 return false;
1156
1157 // Check getSplatValue early in this function, to avoid doing unnecessary
1158 // work.
1159 Value *ScalarOp0 = getSplatValue(Op0);
1160 Value *ScalarOp1 = getSplatValue(Op1);
1161 if (!ScalarOp0 || !ScalarOp1)
1162 return false;
1163
1164 // For the binary VP intrinsics supported here, the result on disabled lanes
1165 // is a poison value. For now, only do this simplification if all lanes
1166 // are active.
1167 // TODO: Relax the condition that all lanes are active by using insertelement
1168 // on inactive lanes.
1169 auto IsAllTrueMask = [](Value *MaskVal) {
1170 if (Value *SplattedVal = getSplatValue(MaskVal))
1171 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1172 return ConstValue->isAllOnesValue();
1173 return false;
1174 };
1175 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1176 return false;
1177
1178 // Check to make sure we support scalarization of the intrinsic
1179 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1180 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1181 return false;
1182
1183 // Calculate cost of splatting both operands into vectors and the vector
1184 // intrinsic
1185 VectorType *VecTy = cast<VectorType>(VPI.getType());
1186 SmallVector<int> Mask;
1187 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1188 Mask.resize(FVTy->getNumElements(), 0);
1189 InstructionCost SplatCost =
1190 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1191 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1192 CostKind);
1193
1194 // Calculate the cost of the VP Intrinsic
1195 SmallVector<Type *> Args;
1196 for (Value *V : VPI.args())
1197 Args.push_back(V->getType());
1198 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1199 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1200 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1201
1202 // Determine scalar opcode
1203 std::optional<unsigned> FunctionalOpcode =
1204 VPI.getFunctionalOpcode();
1205 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1206 if (!FunctionalOpcode) {
1207 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1208 if (!ScalarIntrID)
1209 return false;
1210 }
1211
1212 // Calculate cost of scalarizing
1213 InstructionCost ScalarOpCost = 0;
1214 if (ScalarIntrID) {
1215 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1216 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1217 } else {
1218 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1219 VecTy->getScalarType(), CostKind);
1220 }
1221
1222 // The existing splats may be kept around if other instructions use them.
1223 InstructionCost CostToKeepSplats =
1224 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1225 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1226
1227 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1228 << "\n");
1229 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1230 << ", Cost of scalarizing:" << NewCost << "\n");
1231
1232 // We want to scalarize unless the vector variant actually has lower cost.
1233 if (OldCost < NewCost || !NewCost.isValid())
1234 return false;
1235
1236 // Scalarize the intrinsic
1237 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1238 Value *EVL = VPI.getArgOperand(3);
1239
1240 // If the VP op might introduce UB or poison, we can scalarize it provided
1241 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1242 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1243 // scalarizing it.
1244 bool SafeToSpeculate;
1245 if (ScalarIntrID)
1246 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1247 .hasAttribute(Attribute::AttrKind::Speculatable);
1248 else
1249 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1250 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1251 if (!SafeToSpeculate &&
1252 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1253 return false;
1254
1255 Value *ScalarVal =
1256 ScalarIntrID
1257 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1258 {ScalarOp0, ScalarOp1})
1259 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1260 ScalarOp0, ScalarOp1);
1261
1262 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1263 return true;
1264}
1265
1266/// Match a vector op/compare/intrinsic with at least one
1267/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1268/// by insertelement.
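/// Illustrative example (not taken from the source):
///   %v = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %x, i64 0
///   %r = add <4 x i32> %v, <i32 7, i32 7, i32 7, i32 7>
/// -->
///   %s = add i32 %x, 7
///   %r = insertelement <4 x i32> <i32 7, i32 8, i32 9, i32 10>, i32 %s, i64 0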
1269bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1270 auto *UO = dyn_cast<UnaryOperator>(&I);
1271 auto *BO = dyn_cast<BinaryOperator>(&I);
1272 auto *CI = dyn_cast<CmpInst>(&I);
1273 auto *II = dyn_cast<IntrinsicInst>(&I);
1274 if (!UO && !BO && !CI && !II)
1275 return false;
1276
1277 // TODO: Allow intrinsics with different argument types
1278 if (II) {
1279 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1280 return false;
1281 for (auto [Idx, Arg] : enumerate(II->args()))
1282 if (Arg->getType() != II->getType() &&
1283 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1284 return false;
1285 }
1286
1287 // Do not convert the vector condition of a vector select into a scalar
1288 // condition. That may cause problems for codegen because of differences in
1289 // boolean formats and register-file transfers.
1290 // TODO: Can we account for that in the cost model?
1291 if (CI)
1292 for (User *U : I.users())
1293 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1294 return false;
1295
1296 // Match constant vectors or scalars being inserted into constant vectors:
1297 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1298 SmallVector<Value *> VecCs, ScalarOps;
1299 std::optional<uint64_t> Index;
1300
1301 auto Ops = II ? II->args() : I.operands();
1302 for (auto [OpNum, Op] : enumerate(Ops)) {
1303 Constant *VecC;
1304 Value *V;
1305 uint64_t InsIdx = 0;
1306 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1307 m_ConstantInt(InsIdx)))) {
1308 // Bail if any inserts are out of bounds.
1309 VectorType *OpTy = cast<VectorType>(Op->getType());
1310 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1311 return false;
1312 // All inserts must have the same index.
1313 // TODO: Deal with mismatched index constants and variable indexes?
1314 if (!Index)
1315 Index = InsIdx;
1316 else if (InsIdx != *Index)
1317 return false;
1318 VecCs.push_back(VecC);
1319 ScalarOps.push_back(V);
1320 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1321 OpNum, &TTI)) {
1322 VecCs.push_back(Op.get());
1323 ScalarOps.push_back(Op.get());
1324 } else if (match(Op.get(), m_Constant(VecC))) {
1325 VecCs.push_back(VecC);
1326 ScalarOps.push_back(nullptr);
1327 } else {
1328 return false;
1329 }
1330 }
1331
1332 // Bail if all operands are constant.
1333 if (!Index.has_value())
1334 return false;
1335
1336 VectorType *VecTy = cast<VectorType>(I.getType());
1337 Type *ScalarTy = VecTy->getScalarType();
1338 assert(VecTy->isVectorTy() &&
1339 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1340 ScalarTy->isPointerTy()) &&
1341 "Unexpected types for insert element into binop or cmp");
1342
1343 unsigned Opcode = I.getOpcode();
1344 InstructionCost ScalarOpCost, VectorOpCost;
1345 if (CI) {
1346 CmpInst::Predicate Pred = CI->getPredicate();
1347 ScalarOpCost = TTI.getCmpSelInstrCost(
1348 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1349 VectorOpCost = TTI.getCmpSelInstrCost(
1350 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1351 } else if (UO || BO) {
1352 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1353 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1354 } else {
1355 IntrinsicCostAttributes ScalarICA(
1356 II->getIntrinsicID(), ScalarTy,
1357 SmallVector<Type *>(II->arg_size(), ScalarTy));
1358 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1359 IntrinsicCostAttributes VectorICA(
1360 II->getIntrinsicID(), VecTy,
1361 SmallVector<Type *>(II->arg_size(), VecTy));
1362 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1363 }
1364
1365 // Fold the vector constants in the original vectors into a new base vector to
1366 // get more accurate cost modelling.
1367 Value *NewVecC = nullptr;
1368 if (CI)
1369 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1370 else if (UO)
1371 NewVecC =
1372 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1373 else if (BO)
1374 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1375 else if (II)
1376 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1377
1378 if (!NewVecC)
1379 return false;
1380
1381 // Get cost estimate for the insert element. This cost will factor into
1382 // both sequences.
1383 InstructionCost OldCost = VectorOpCost;
1384 InstructionCost NewCost =
1385 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1386 CostKind, *Index, NewVecC);
1387
1388 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1389 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1390 II->getIntrinsicID(), Idx, &TTI)))
1391 continue;
1392 InstructionCost InsertCost = TTI.getVectorInstrCost(
1393 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1394 OldCost += InsertCost;
1395 NewCost += !Op->hasOneUse() * InsertCost;
1396 }
1397
1398 // We want to scalarize unless the vector variant actually has lower cost.
1399 if (OldCost < NewCost || !NewCost.isValid())
1400 return false;
1401
1402 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1403 // inselt NewVecC, (scalar_op V0, V1), Index
1404 if (CI)
1405 ++NumScalarCmp;
1406 else if (UO || BO)
1407 ++NumScalarOps;
1408 else
1409 ++NumScalarIntrinsic;
1410
1411 // For constant cases, extract the scalar element, this should constant fold.
1412 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1413 if (!Scalar)
1415 cast<Constant>(VecC), Builder.getInt64(*Index));
1416
1417 Value *Scalar;
1418 if (CI)
1419 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1420 else if (UO || BO)
1421 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1422 else
1423 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1424
1425 Scalar->setName(I.getName() + ".scalar");
1426
1427 // All IR flags are safe to back-propagate. There is no potential for extra
1428 // poison to be created by the scalar instruction.
1429 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1430 ScalarInst->copyIRFlags(&I);
1431
1432 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1433 replaceValue(I, *Insert);
1434 return true;
1435}
1436
1437/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1438/// a vector into vector operations followed by extract. Note: The SLP pass
1439/// may miss this pattern because of implementation problems.
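/// Illustrative example (not taken from the source; assumes the extract at
/// index 3 is the one converted to a shuffle):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %x, i32 3
///   %c0 = icmp sgt i32 %e0, 42
///   %c1 = icmp sgt i32 %e1, 99
///   %r  = and i1 %c0, %c1
/// -->
///   %vcmp = icmp sgt <4 x i32> %x, <i32 42, i32 poison, i32 poison, i32 99>
///   %shuf = shufflevector <4 x i1> %vcmp, <4 x i1> poison,
///           <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
///   %vand = and <4 x i1> %vcmp, %shuf
///   %r    = extractelement <4 x i1> %vand, i32 0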
1440bool VectorCombine::foldExtractedCmps(Instruction &I) {
1441 auto *BI = dyn_cast<BinaryOperator>(&I);
1442
1443 // We are looking for a scalar binop of booleans.
1444 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1445 if (!BI || !I.getType()->isIntegerTy(1))
1446 return false;
1447
1448 // The compare predicates should match, and each compare should have a
1449 // constant operand.
1450 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1451 Instruction *I0, *I1;
1452 Constant *C0, *C1;
1453 CmpPredicate P0, P1;
1454 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1455 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1456 return false;
1457
1458 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1459 if (!MatchingPred)
1460 return false;
1461
1462 // The compare operands must be extracts of the same vector with constant
1463 // extract indexes.
1464 Value *X;
1465 uint64_t Index0, Index1;
1466 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1467 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1468 return false;
1469
1470 auto *Ext0 = cast<ExtractElementInst>(I0);
1471 auto *Ext1 = cast<ExtractElementInst>(I1);
1472 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1473 if (!ConvertToShuf)
1474 return false;
1475 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1476 "Unknown ExtractElementInst");
1477
1478 // The original scalar pattern is:
1479 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1480 CmpInst::Predicate Pred = *MatchingPred;
1481 unsigned CmpOpcode =
1482 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1483 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1484 if (!VecTy)
1485 return false;
1486
1487 InstructionCost Ext0Cost =
1488 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1489 InstructionCost Ext1Cost =
1490 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1491 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1492 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1493 CostKind);
1494
1495 InstructionCost OldCost =
1496 Ext0Cost + Ext1Cost + CmpCost * 2 +
1497 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1498
1499 // The proposed vector pattern is:
1500 // vcmp = cmp Pred X, VecC
1501 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1502 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1503 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1504 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1505 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1506 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1507 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1508 ShufMask[CheapIndex] = ExpensiveIndex;
1509 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1510 CmpTy, ShufMask, CostKind);
1511 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1512 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1513 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1514 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1515
1516 // Aggressively form vector ops if the cost is equal because the transform
1517 // may enable further optimization.
1518 // Codegen can reverse this transform (scalarize) if it was not profitable.
1519 if (OldCost < NewCost || !NewCost.isValid())
1520 return false;
1521
1522 // Create a vector constant from the 2 scalar constants.
1523 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1524 PoisonValue::get(VecTy->getElementType()));
1525 CmpC[Index0] = C0;
1526 CmpC[Index1] = C1;
1527 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1528 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1529 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1530 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1531 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1532 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1533 replaceValue(I, *NewExt);
1534 ++NumVecCmpBO;
1535 return true;
1536}
1537
1538 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1539 TTI::TargetCostKind CostKind,
1540 const TargetTransformInfo &TTI,
1541 InstructionCost &CostBeforeReduction,
1542 InstructionCost &CostAfterReduction) {
1543 Instruction *Op0, *Op1;
1544 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1545 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1546 unsigned ReductionOpc =
1547 getArithmeticReductionInstruction(II.getIntrinsicID());
1548 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1549 bool IsUnsigned = isa<ZExtInst>(RedOp);
1550 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1551
1552 CostBeforeReduction =
1553 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1554 TTI::CastContextHint::None, CostKind, RedOp);
1555 CostAfterReduction =
1556 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1557 ExtType, FastMathFlags(), CostKind);
1558 return;
1559 }
1560 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1561 match(RedOp,
1562 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1563 match(Op0, m_ZExtOrSExt(m_Value())) &&
1564 Op0->getOpcode() == Op1->getOpcode() &&
1565 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1566 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1567 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1568 bool IsUnsigned = isa<ZExtInst>(Op0);
1569 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1570 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1571
1572 InstructionCost ExtCost =
1573 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1574 TTI::CastContextHint::None, CostKind, Op0);
1575 InstructionCost MulCost =
1576 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1577 InstructionCost Ext2Cost =
1578 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1579 TTI::CastContextHint::None, CostKind, RedOp);
1580
1581 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1582 CostAfterReduction = TTI.getMulAccReductionCost(
1583 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1584 return;
1585 }
1586 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1587 std::nullopt, CostKind);
1588}
1589
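/// Try to fold a binary op of two reductions into one reduction of a vector
/// binary op. A rough sketch of the intent (hypothetical IR, for exposition
/// only):
///   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
///   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
///   %r  = add i32 %r0, %r1
/// -->
///   %v = add <4 x i32> %a, %b
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
/// A 'sub' of two reduce.add calls is handled the same way, with a vector
/// 'sub' feeding a single reduce.add.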
1590bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1591 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1592 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1593 if (BinOpOpc == Instruction::Sub)
1594 ReductionIID = Intrinsic::vector_reduce_add;
1595 if (ReductionIID == Intrinsic::not_intrinsic)
1596 return false;
1597
1598 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1599 Intrinsic::ID IID) -> Value * {
1600 auto *II = dyn_cast<IntrinsicInst>(V);
1601 if (!II)
1602 return nullptr;
1603 if (II->getIntrinsicID() == IID && II->hasOneUse())
1604 return II->getArgOperand(0);
1605 return nullptr;
1606 };
1607
1608 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1609 if (!V0)
1610 return false;
1611 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1612 if (!V1)
1613 return false;
1614
1615 auto *VTy = cast<VectorType>(V0->getType());
1616 if (V1->getType() != VTy)
1617 return false;
1618 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1619 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1620 unsigned ReductionOpc =
1621 getArithmeticReductionInstruction(II0.getIntrinsicID());
1622
1623 InstructionCost OldCost = 0;
1624 InstructionCost NewCost = 0;
1625 InstructionCost CostOfRedOperand0 = 0;
1626 InstructionCost CostOfRed0 = 0;
1627 InstructionCost CostOfRedOperand1 = 0;
1628 InstructionCost CostOfRed1 = 0;
1629 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1630 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1631 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1632 NewCost =
1633 CostOfRedOperand0 + CostOfRedOperand1 +
1634 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1635 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1636 if (NewCost >= OldCost || !NewCost.isValid())
1637 return false;
1638
1639 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1640 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1641 << "\n");
1642 Value *VectorBO;
1643 if (BinOpOpc == Instruction::Or)
1644 VectorBO = Builder.CreateOr(V0, V1, "",
1645 cast<PossiblyDisjointInst>(I).isDisjoint());
1646 else
1647 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1648
1649 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1650 replaceValue(I, *Rdx);
1651 return true;
1652}
1653
1654 // Check if the memory location is modified between two instructions in the same BB.
1655 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1656 BasicBlock::iterator End,
1657 const MemoryLocation &Loc, AAResults &AA) {
1658 unsigned NumScanned = 0;
1659 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1660 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1661 ++NumScanned > MaxInstrsToScan;
1662 });
1663}
1664
1665namespace {
1666/// Helper class to indicate whether a vector index can be safely scalarized and
1667/// if a freeze needs to be inserted.
1668class ScalarizationResult {
1669 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1670
1671 StatusTy Status;
1672 Value *ToFreeze;
1673
1674 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1675 : Status(Status), ToFreeze(ToFreeze) {}
1676
1677public:
1678 ScalarizationResult(const ScalarizationResult &Other) = default;
1679 ~ScalarizationResult() {
1680 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1681 }
1682
1683 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1684 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1685 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1686 return {StatusTy::SafeWithFreeze, ToFreeze};
1687 }
1688
1689 /// Returns true if the index can be scalarized without requiring a freeze.
1690 bool isSafe() const { return Status == StatusTy::Safe; }
1691 /// Returns true if the index cannot be scalarized.
1692 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1693 /// Returns true if the index can be scalarized, but requires inserting a
1694 /// freeze.
1695 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1696
1697 /// Reset the state to Unsafe and clear ToFreeze if set.
1698 void discard() {
1699 ToFreeze = nullptr;
1700 Status = StatusTy::Unsafe;
1701 }
1702
1703 /// Freeze the ToFreeze value and update the use in \p UserI to use it.
1704 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1705 assert(isSafeWithFreeze() &&
1706 "should only be used when freezing is required");
1707 assert(is_contained(ToFreeze->users(), &UserI) &&
1708 "UserI must be a user of ToFreeze");
1709 IRBuilder<>::InsertPointGuard Guard(Builder);
1710 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1711 Value *Frozen =
1712 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1713 for (Use &U : make_early_inc_range((UserI.operands())))
1714 if (U.get() == ToFreeze)
1715 U.set(Frozen);
1716
1717 ToFreeze = nullptr;
1718 }
1719};
1720} // namespace
1721
1722/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1723/// Idx. \p Idx must access a valid vector element.
1724static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1725 Instruction *CtxI,
1726 AssumptionCache &AC,
1727 const DominatorTree &DT) {
1728 // We do checks for both fixed vector types and scalable vector types.
1729 // This is the number of elements of fixed vector types,
1730 // or the minimum number of elements of scalable vector types.
1731 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1732 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1733
1734 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1735 if (C->getValue().ult(NumElements))
1736 return ScalarizationResult::safe();
1737 return ScalarizationResult::unsafe();
1738 }
1739
1740 // Always unsafe if the index type can't handle all inbound values.
1741 if (!llvm::isUIntN(IntWidth, NumElements))
1742 return ScalarizationResult::unsafe();
1743
1744 APInt Zero(IntWidth, 0);
1745 APInt MaxElts(IntWidth, NumElements);
1746 ConstantRange ValidIndices(Zero, MaxElts);
1747 ConstantRange IdxRange(IntWidth, true);
1748
1749 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1750 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1751 true, &AC, CtxI, &DT)))
1752 return ScalarizationResult::safe();
1753 return ScalarizationResult::unsafe();
1754 }
1755
1756 // If the index may be poison, check if we can insert a freeze before the
1757 // range of the index is restricted.
1758 Value *IdxBase;
1759 ConstantInt *CI;
1760 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1761 IdxRange = IdxRange.binaryAnd(CI->getValue());
1762 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1763 IdxRange = IdxRange.urem(CI->getValue());
1764 }
1765
1766 if (ValidIndices.contains(IdxRange))
1767 return ScalarizationResult::safeWithFreeze(IdxBase);
1768 return ScalarizationResult::unsafe();
1769}
1770
1771/// The memory operation on a vector of \p ScalarType had alignment of
1772/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1773/// alignment that will be valid for the memory operation on a single scalar
1774/// element of the same type with index \p Idx.
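/// For example (an illustrative calculation, not from the source): a <4 x i32>
/// access with vector alignment 16 that is scalarized at constant index 2
/// yields commonAlignment(16, 2 * 4) = 8, while an unknown index has to fall
/// back to commonAlignment(16, 4) = 4.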
1775 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1776 Type *ScalarType, Value *Idx,
1777 const DataLayout &DL) {
1778 if (auto *C = dyn_cast<ConstantInt>(Idx))
1779 return commonAlignment(VectorAlignment,
1780 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1781 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1782}
1783
1784// Combine patterns like:
1785// %0 = load <4 x i32>, <4 x i32>* %a
1786// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1787// store <4 x i32> %1, <4 x i32>* %a
1788// to:
1789// %0 = bitcast <4 x i32>* %a to i32*
1790// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1791// store i32 %b, i32* %1
1792bool VectorCombine::foldSingleElementStore(Instruction &I) {
1794 return false;
1795 auto *SI = cast<StoreInst>(&I);
1796 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1797 return false;
1798
1799 // TODO: Combine more complicated patterns (multiple insert) by referencing
1800 // TargetTransformInfo.
1801 Instruction *Source;
1802 Value *NewElement;
1803 Value *Idx;
1804 if (!match(SI->getValueOperand(),
1805 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1806 m_Value(Idx))))
1807 return false;
1808
1809 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1810 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1811 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1812 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1813 // modified in between, the vector type matches the store size, and the index is inbounds.
1814 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1815 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1816 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1817 return false;
1818
1819 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1820 if (ScalarizableIdx.isUnsafe() ||
1821 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1822 MemoryLocation::get(SI), AA))
1823 return false;
1824
1825 // Ensure we add the load back to the worklist BEFORE its users so they can
1826 // be erased in the correct order.
1827 Worklist.push(Load);
1828
1829 if (ScalarizableIdx.isSafeWithFreeze())
1830 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1831 Value *GEP = Builder.CreateInBoundsGEP(
1832 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1833 {ConstantInt::get(Idx->getType(), 0), Idx});
1834 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1835 NSI->copyMetadata(*SI);
1836 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1837 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1838 *DL);
1839 NSI->setAlignment(ScalarOpAlignment);
1840 replaceValue(I, *NSI);
1841 eraseInstruction(I);
1842 return true;
1843 }
1844
1845 return false;
1846}
1847
1848/// Try to scalarize vector loads feeding extractelement instructions.
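/// A minimal illustration (hypothetical IR):
///   %v = load <4 x i32>, ptr %p
///   %e = extractelement <4 x i32> %v, i64 2
/// -->
///   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
///   %e = load i32, ptr %gep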
1849bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1851 return false;
1852
1853 Value *Ptr;
1854 if (!match(&I, m_Load(m_Value(Ptr))))
1855 return false;
1856
1857 auto *LI = cast<LoadInst>(&I);
1858 auto *VecTy = cast<VectorType>(LI->getType());
1859 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1860 return false;
1861
1862 InstructionCost OriginalCost =
1863 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1864 LI->getPointerAddressSpace(), CostKind);
1865 InstructionCost ScalarizedCost = 0;
1866
1867 Instruction *LastCheckedInst = LI;
1868 unsigned NumInstChecked = 0;
1869 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1870 auto FailureGuard = make_scope_exit([&]() {
1871 // If the transform is aborted, discard the ScalarizationResults.
1872 for (auto &Pair : NeedFreeze)
1873 Pair.second.discard();
1874 });
1875
1876 // Check if all users of the load are extracts with no memory modifications
1877 // between the load and the extract. Compute the cost of both the original
1878 // code and the scalarized version.
1879 for (User *U : LI->users()) {
1880 auto *UI = dyn_cast<ExtractElementInst>(U);
1881 if (!UI || UI->getParent() != LI->getParent())
1882 return false;
1883
1884 // If any extract is waiting to be erased, then bail out as this will
1885 // distort the cost calculation and possibly lead to infinite loops.
1886 if (UI->use_empty())
1887 return false;
1888
1889 // Check if any instruction between the load and the extract may modify
1890 // memory.
1891 if (LastCheckedInst->comesBefore(UI)) {
1892 for (Instruction &I :
1893 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1894 // Bail out if we reached the check limit or the instruction may write
1895 // to memory.
1896 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1897 return false;
1898 NumInstChecked++;
1899 }
1900 LastCheckedInst = UI;
1901 }
1902
1903 auto ScalarIdx =
1904 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1905 if (ScalarIdx.isUnsafe())
1906 return false;
1907 if (ScalarIdx.isSafeWithFreeze()) {
1908 NeedFreeze.try_emplace(UI, ScalarIdx);
1909 ScalarIdx.discard();
1910 }
1911
1912 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1913 OriginalCost +=
1914 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1915 Index ? Index->getZExtValue() : -1);
1916 ScalarizedCost +=
1917 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1918 Align(1), LI->getPointerAddressSpace(), CostKind);
1919 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1920 nullptr, nullptr, CostKind);
1921 }
1922
1923 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
1924 << "\n LoadExtractCost: " << OriginalCost
1925 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1926
1927 if (ScalarizedCost >= OriginalCost)
1928 return false;
1929
1930 // Ensure we add the load back to the worklist BEFORE its users so they can
1931 // be erased in the correct order.
1932 Worklist.push(LI);
1933
1934 Type *ElemType = VecTy->getElementType();
1935
1936 // Replace extracts with narrow scalar loads.
1937 for (User *U : LI->users()) {
1938 auto *EI = cast<ExtractElementInst>(U);
1939 Value *Idx = EI->getIndexOperand();
1940
1941 // Insert 'freeze' for poison indexes.
1942 auto It = NeedFreeze.find(EI);
1943 if (It != NeedFreeze.end())
1944 It->second.freeze(Builder, *cast<Instruction>(Idx));
1945
1946 Builder.SetInsertPoint(EI);
1947 Value *GEP =
1948 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1949 auto *NewLoad = cast<LoadInst>(
1950 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1951
1952 Align ScalarOpAlignment =
1953 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1954 NewLoad->setAlignment(ScalarOpAlignment);
1955
1956 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1957 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1958 AAMDNodes OldAAMD = LI->getAAMetadata();
1959 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1960 }
1961
1962 replaceValue(*EI, *NewLoad, false);
1963 }
1964
1965 FailureGuard.release();
1966 return true;
1967}
1968
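/// Try to scalarize a vector zext whose only users are extractelement
/// instructions by bitcasting the source to a wide integer and rewriting each
/// extract as a shift and mask. A sketch on little-endian (hypothetical IR):
///   %z  = zext <4 x i8> %src to <4 x i32>
///   %e1 = extractelement <4 x i32> %z, i64 1
/// -->
///   %s  = bitcast <4 x i8> %src to i32
///   %sh = lshr i32 %s, 8
///   %e1 = and i32 %sh, 255
/// On big-endian targets the shift amounts are mirrored.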
1969bool VectorCombine::scalarizeExtExtract(Instruction &I) {
1971 return false;
1972 auto *Ext = dyn_cast<ZExtInst>(&I);
1973 if (!Ext)
1974 return false;
1975
1976 // Try to convert a vector zext feeding only extracts to a set of scalar
1977 //   (Src >> (ExtIdx * EltBitWidth)) & ((1 << EltBitWidth) - 1)
1978 // operations, if profitable.
1979 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
1980 if (!SrcTy)
1981 return false;
1982 auto *DstTy = cast<FixedVectorType>(Ext->getType());
1983
1984 Type *ScalarDstTy = DstTy->getElementType();
1985 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
1986 return false;
1987
1988 InstructionCost VectorCost =
1989 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
1990 TTI::CastContextHint::None, CostKind);
1991 unsigned ExtCnt = 0;
1992 bool ExtLane0 = false;
1993 for (User *U : Ext->users()) {
1994 uint64_t Idx;
1995 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
1996 return false;
1997 if (cast<Instruction>(U)->use_empty())
1998 continue;
1999 ExtCnt += 1;
2000 ExtLane0 |= !Idx;
2001 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2002 CostKind, Idx, U);
2003 }
2004
2005 InstructionCost ScalarCost =
2006 ExtCnt * TTI.getArithmeticInstrCost(
2007 Instruction::And, ScalarDstTy, CostKind,
2008 {TTI::OK_AnyValue, TTI::OP_None},
2009 {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2010 (ExtCnt - ExtLane0) *
2011 TTI.getArithmeticInstrCost(
2012 Instruction::LShr, ScalarDstTy, CostKind,
2013 {TTI::OK_AnyValue, TTI::OP_None},
2014 {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2015 if (ScalarCost > VectorCost)
2016 return false;
2017
2018 Value *ScalarV = Ext->getOperand(0);
2019 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2020 &DT)) {
2021 // Check whether all lanes are extracted, all extracts trigger UB
2022 // on poison, and the last extract (and hence all previous ones)
2023 // are guaranteed to execute if Ext executes. If so, we do not
2024 // need to insert a freeze.
2025 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2026 bool AllExtractsTriggerUB = true;
2027 ExtractElementInst *LastExtract = nullptr;
2028 BasicBlock *ExtBB = Ext->getParent();
2029 for (User *U : Ext->users()) {
2030 auto *Extract = cast<ExtractElementInst>(U);
2031 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2032 AllExtractsTriggerUB = false;
2033 break;
2034 }
2035 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2036 if (!LastExtract || LastExtract->comesBefore(Extract))
2037 LastExtract = Extract;
2038 }
2039 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2040 !AllExtractsTriggerUB ||
2041 !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2042 LastExtract->getIterator()))
2043 ScalarV = Builder.CreateFreeze(ScalarV);
2044 }
2045 ScalarV = Builder.CreateBitCast(
2046 ScalarV,
2047 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2048 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2049 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2050 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2051 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2052 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2053 for (User *U : Ext->users()) {
2054 auto *Extract = cast<ExtractElementInst>(U);
2055 uint64_t Idx =
2056 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2057 uint64_t ShiftAmt =
2058 DL->isBigEndian()
2059 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2060 : (Idx * SrcEltSizeInBits);
2061 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2062 Value *And = Builder.CreateAnd(LShr, Mask);
2063 U->replaceAllUsesWith(And);
2064 }
2065 return true;
2066}
2067
2068/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2069/// to "(bitcast (concat X, Y))"
2070/// where X/Y are bitcasted from i1 mask vectors.
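/// A small illustration (hypothetical IR, little-endian):
///   %xi = bitcast <4 x i1> %x to i4
///   %yi = bitcast <4 x i1> %y to i4
///   %xz = zext i4 %xi to i8
///   %yz = zext i4 %yi to i8
///   %ys = shl i8 %yz, 4
///   %r  = or disjoint i8 %xz, %ys
/// -->
///   %c = shufflevector <4 x i1> %x, <4 x i1> %y,
///                      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
///   %r = bitcast <8 x i1> %c to i8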
2071bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2072 Type *Ty = I.getType();
2073 if (!Ty->isIntegerTy())
2074 return false;
2075
2076 // TODO: Add big endian test coverage
2077 if (DL->isBigEndian())
2078 return false;
2079
2080 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2081 Instruction *X, *Y;
2082 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2083 return false;
2084
2085 // Allow both sources to contain shl, to handle more generic pattern:
2086 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2087 Value *SrcX;
2088 uint64_t ShAmtX = 0;
2089 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2090 !match(X, m_OneUse(
2091 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2092 m_ConstantInt(ShAmtX)))))
2093 return false;
2094
2095 Value *SrcY;
2096 uint64_t ShAmtY = 0;
2097 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2098 !match(Y, m_OneUse(
2099 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2100 m_ConstantInt(ShAmtY)))))
2101 return false;
2102
2103 // Canonicalize larger shift to the RHS.
2104 if (ShAmtX > ShAmtY) {
2105 std::swap(X, Y);
2106 std::swap(SrcX, SrcY);
2107 std::swap(ShAmtX, ShAmtY);
2108 }
2109
2110 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2111 // difference is the mask width so they can be easily concatenated together.
2112 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2113 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2114 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2115 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2116 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2117 !MaskTy->getElementType()->isIntegerTy(1) ||
2118 MaskTy->getNumElements() != ShAmtDiff ||
2119 MaskTy->getNumElements() > (BitWidth / 2))
2120 return false;
2121
2122 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2123 auto *ConcatIntTy =
2124 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2125 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2126
2127 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2128 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2129
2130 // TODO: Is it worth supporting multi use cases?
2131 InstructionCost OldCost = 0;
2132 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2133 OldCost +=
2134 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2135 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2136 TTI::CastContextHint::None, CostKind);
2137 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2138 TTI::CastContextHint::None, CostKind);
2139
2140 InstructionCost NewCost = 0;
2141 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2142 MaskTy, ConcatMask, CostKind);
2143 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2144 TTI::CastContextHint::None, CostKind);
2145 if (Ty != ConcatIntTy)
2146 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2147 TTI::CastContextHint::None, CostKind);
2148 if (ShAmtX > 0)
2149 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2150
2151 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2152 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2153 << "\n");
2154
2155 if (NewCost > OldCost)
2156 return false;
2157
2158 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2159 // any residual zero-extension or shifting.
2160 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2161 Worklist.pushValue(Concat);
2162
2163 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2164
2165 if (Ty != ConcatIntTy) {
2166 Worklist.pushValue(Result);
2167 Result = Builder.CreateZExt(Result, Ty);
2168 }
2169
2170 if (ShAmtX > 0) {
2171 Worklist.pushValue(Result);
2172 Result = Builder.CreateShl(Result, ShAmtX);
2173 }
2174
2175 replaceValue(I, *Result);
2176 return true;
2177}
2178
2179/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2180/// --> "binop (shuffle), (shuffle)".
2181bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2182 BinaryOperator *BinOp;
2183 ArrayRef<int> OuterMask;
2184 if (!match(&I,
2185 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2186 return false;
2187
2188 // Don't introduce poison into div/rem.
2189 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2190 return false;
2191
2192 Value *Op00, *Op01, *Op10, *Op11;
2193 ArrayRef<int> Mask0, Mask1;
2194 bool Match0 =
2195 match(BinOp->getOperand(0),
2196 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2197 bool Match1 =
2198 match(BinOp->getOperand(1),
2199 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2200 if (!Match0 && !Match1)
2201 return false;
2202
2203 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2204 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2205 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2206 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2207
2208 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2209 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2210 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2211 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2212 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2213 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2214 return false;
2215
2216 unsigned NumSrcElts = BinOpTy->getNumElements();
2217
2218 // Don't accept shuffles that reference the second operand in
2219 // div/rem or if it is an undef arg.
2220 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2221 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2222 return false;
2223
2224 // Merge outer / inner (or identity if no match) shuffles.
2225 SmallVector<int> NewMask0, NewMask1;
2226 for (int M : OuterMask) {
2227 if (M < 0 || M >= (int)NumSrcElts) {
2228 NewMask0.push_back(PoisonMaskElem);
2229 NewMask1.push_back(PoisonMaskElem);
2230 } else {
2231 NewMask0.push_back(Match0 ? Mask0[M] : M);
2232 NewMask1.push_back(Match1 ? Mask1[M] : M);
2233 }
2234 }
2235
2236 unsigned NumOpElts = Op0Ty->getNumElements();
2237 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2238 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2239 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2240 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2241 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2242 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2243
2244 // Try to merge shuffles across the binop if the new shuffles are not costly.
2245 InstructionCost OldCost =
2246 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2247 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2248 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2249 if (Match0)
2250 OldCost += TTI.getShuffleCost(
2251 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2252 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2253 if (Match1)
2254 OldCost += TTI.getShuffleCost(
2255 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2256 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2257
2258 InstructionCost NewCost =
2259 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2260
2261 if (!IsIdentity0)
2262 NewCost +=
2263 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2264 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2265 if (!IsIdentity1)
2266 NewCost +=
2267 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2268 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2269
2270 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2271 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2272 << "\n");
2273
2274 // If costs are equal, still fold as we reduce instruction count.
2275 if (NewCost > OldCost)
2276 return false;
2277
2278 Value *LHS =
2279 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2280 Value *RHS =
2281 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2282 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2283
2284 // Intersect flags from the old binops.
2285 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2286 NewInst->copyIRFlags(BinOp);
2287
2288 Worklist.pushValue(LHS);
2289 Worklist.pushValue(RHS);
2290 replaceValue(I, *NewBO);
2291 return true;
2292}
2293
2294/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2295/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
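/// For example (hypothetical IR; any blend mask works the same way):
///   %b0 = add <4 x i32> %x, %y
///   %b1 = add <4 x i32> %z, %w
///   %r  = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
/// -->
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
///   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
///   %r  = add <4 x i32> %s0, %s1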
2296bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2297 ArrayRef<int> OldMask;
2298 Instruction *LHS, *RHS;
2299 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2300 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2301 return false;
2302
2303 // TODO: Add support for addlike etc.
2304 if (LHS->getOpcode() != RHS->getOpcode())
2305 return false;
2306
2307 Value *X, *Y, *Z, *W;
2308 bool IsCommutative = false;
2309 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2310 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2311 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2312 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2313 auto *BO = cast<BinaryOperator>(LHS);
2314 // Don't introduce poison into div/rem.
2315 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2316 return false;
2317 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2318 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2319 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2320 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2321 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2322 } else
2323 return false;
2324
2325 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2326 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2327 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2328 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2329 return false;
2330
2331 unsigned NumSrcElts = BinOpTy->getNumElements();
2332
2333 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2334 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2335 std::swap(X, Y);
2336
2337 auto ConvertToUnary = [NumSrcElts](int &M) {
2338 if (M >= (int)NumSrcElts)
2339 M -= NumSrcElts;
2340 };
2341
2342 SmallVector<int> NewMask0(OldMask);
2343 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2344 if (X == Z) {
2345 llvm::for_each(NewMask0, ConvertToUnary);
2346 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2347 Z = PoisonValue::get(BinOpTy);
2348 }
2349
2350 SmallVector<int> NewMask1(OldMask);
2351 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2352 if (Y == W) {
2353 llvm::for_each(NewMask1, ConvertToUnary);
2354 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2355 W = PoisonValue::get(BinOpTy);
2356 }
2357
2358 // Try to replace a binop with a shuffle if the shuffle is not costly.
2359 InstructionCost OldCost =
2360 TTI.getInstructionCost(LHS, CostKind) +
2361 TTI.getInstructionCost(RHS, CostKind) +
2362 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2363 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2364 &I);
2365
2366 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2367 // where one use shuffles have gotten split across the binop/cmp. These
2368 // often allow a major reduction in total cost that wouldn't happen as
2369 // individual folds.
2370 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2371 TTI::TargetCostKind CostKind) -> bool {
2372 Value *InnerOp;
2373 ArrayRef<int> InnerMask;
2374 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2375 m_Mask(InnerMask)))) &&
2376 InnerOp->getType() == Op->getType() &&
2377 all_of(InnerMask,
2378 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2379 for (int &M : Mask)
2380 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2381 M = InnerMask[M - Offset];
2382 M = 0 <= M ? M + Offset : M;
2383 }
2385 Op = InnerOp;
2386 return true;
2387 }
2388 return false;
2389 };
2390 bool ReducedInstCount = false;
2391 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2392 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2393 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2394 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2395
2396 auto *ShuffleCmpTy =
2397 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2398 InstructionCost NewCost =
2399 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2400 nullptr, {X, Z}) +
2401 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2402 nullptr, {Y, W});
2403
2404 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2405 NewCost +=
2406 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2407 } else {
2408 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2409 ShuffleDstTy, PredLHS, CostKind);
2410 }
2411
2412 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2413 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2414 << "\n");
2415
2416 // If either shuffle will constant fold away, then fold for the same cost as
2417 // we will reduce the instruction count.
2418 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2419 (isa<Constant>(Y) && isa<Constant>(W));
2420 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2421 return false;
2422
2423 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2424 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2425 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2426 ? Builder.CreateBinOp(
2427 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2428 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2429
2430 // Intersect flags from the old binops.
2431 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2432 NewInst->copyIRFlags(LHS);
2433 NewInst->andIRFlags(RHS);
2434 }
2435
2436 Worklist.pushValue(Shuf0);
2437 Worklist.pushValue(Shuf1);
2438 replaceValue(I, *NewBO);
2439 return true;
2440}
2441
2442 /// Try to convert
2443/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2444/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
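/// For example, with a concrete mask (illustrative only):
///   (shuffle (select c1, t1, f1), (select c2, t2, f2), <0,5,2,7>)
///     --> (select (shuffle c1, c2, <0,5,2,7>),
///                 (shuffle t1, t2, <0,5,2,7>),
///                 (shuffle f1, f2, <0,5,2,7>))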
2445bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2446 ArrayRef<int> Mask;
2447 Value *C1, *T1, *F1, *C2, *T2, *F2;
2448 if (!match(&I, m_Shuffle(
2449 m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2450 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2451 m_Mask(Mask))))
2452 return false;
2453
2454 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2455 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2456 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2457 return false;
2458
2459 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2460 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2461 // SelectInsts must have the same FMF.
2462 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2463 ((SI0FOp != nullptr) &&
2464 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2465 return false;
2466
2467 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2468 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2469 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2470 auto SelOp = Instruction::Select;
2471 InstructionCost OldCost = TTI.getCmpSelInstrCost(
2472 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2473 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2474 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2475 OldCost +=
2476 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2477 {I.getOperand(0), I.getOperand(1)}, &I);
2478
2479 InstructionCost NewCost = TTI.getShuffleCost(
2480 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2481 Mask, CostKind, 0, nullptr, {C1, C2});
2482 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2483 nullptr, {T1, T2});
2484 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2485 nullptr, {F1, F2});
2486 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2487 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2488 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2489 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2490
2491 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2492 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2493 << "\n");
2494 if (NewCost > OldCost)
2495 return false;
2496
2497 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2498 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2499 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2500 Value *NewSel;
2501 // We presuppose that the SelectInsts have the same FMF.
2502 if (SI0FOp)
2503 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2504 SI0FOp->getFastMathFlags());
2505 else
2506 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2507
2508 Worklist.pushValue(ShuffleCmp);
2509 Worklist.pushValue(ShuffleTrue);
2510 Worklist.pushValue(ShuffleFalse);
2511 replaceValue(I, *NewSel);
2512 return true;
2513}
2514
2515/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2516/// into "castop (shuffle)".
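/// For example (hypothetical IR, concatenating two sext results):
///   %c0 = sext <2 x i16> %a to <2 x i32>
///   %c1 = sext <2 x i16> %b to <2 x i32>
///   %r  = shufflevector <2 x i32> %c0, <2 x i32> %c1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// -->
///   %s = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %r = sext <4 x i16> %s to <4 x i32>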
2517bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2518 Value *V0, *V1;
2519 ArrayRef<int> OldMask;
2520 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2521 return false;
2522
2523 // Check whether this is a binary shuffle.
2524 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2525
2526 auto *C0 = dyn_cast<CastInst>(V0);
2527 auto *C1 = dyn_cast<CastInst>(V1);
2528 if (!C0 || (IsBinaryShuffle && !C1))
2529 return false;
2530
2531 Instruction::CastOps Opcode = C0->getOpcode();
2532
2533 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2534 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2535 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2536 return false;
2537
2538 if (IsBinaryShuffle) {
2539 if (C0->getSrcTy() != C1->getSrcTy())
2540 return false;
2541 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2542 if (Opcode != C1->getOpcode()) {
2543 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2544 Opcode = Instruction::SExt;
2545 else
2546 return false;
2547 }
2548 }
2549
2550 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2551 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2552 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2553 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2554 return false;
2555
2556 unsigned NumSrcElts = CastSrcTy->getNumElements();
2557 unsigned NumDstElts = CastDstTy->getNumElements();
2558 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2559 "Only bitcasts expected to alter src/dst element counts");
2560
2561 // Check for bitcasting of unscalable vector types.
2562 // e.g. <32 x i40> -> <40 x i32>
2563 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2564 (NumDstElts % NumSrcElts) != 0)
2565 return false;
2566
2567 SmallVector<int, 16> NewMask;
2568 if (NumSrcElts >= NumDstElts) {
2569 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2570 // always be expanded to the equivalent form choosing narrower elements.
2571 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2572 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2573 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2574 } else {
2575 // The bitcast is from narrow elements to wide elements. The shuffle mask
2576 // must choose consecutive elements to allow casting first.
2577 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2578 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2579 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2580 return false;
2581 }
2582
2583 auto *NewShuffleDstTy =
2584 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2585
2586 // Try to replace a castop with a shuffle if the shuffle is not costly.
2587 InstructionCost CostC0 =
2588 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2589 TTI::CastContextHint::None, CostKind);
2590
2591 TargetTransformInfo::ShuffleKind ShuffleKind;
2592 if (IsBinaryShuffle)
2593 ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2594 else
2595 ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2596
2597 InstructionCost OldCost = CostC0;
2598 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2599 CostKind, 0, nullptr, {}, &I);
2600
2601 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2602 CastSrcTy, NewMask, CostKind);
2603 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2604 TTI::CastContextHint::None, CostKind);
2605 if (!C0->hasOneUse())
2606 NewCost += CostC0;
2607 if (IsBinaryShuffle) {
2608 InstructionCost CostC1 =
2609 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2610 TTI::CastContextHint::None, CostKind);
2611 OldCost += CostC1;
2612 if (!C1->hasOneUse())
2613 NewCost += CostC1;
2614 }
2615
2616 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2617 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2618 << "\n");
2619 if (NewCost > OldCost)
2620 return false;
2621
2622 Value *Shuf;
2623 if (IsBinaryShuffle)
2624 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2625 NewMask);
2626 else
2627 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2628
2629 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2630
2631 // Intersect flags from the old casts.
2632 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2633 NewInst->copyIRFlags(C0);
2634 if (IsBinaryShuffle)
2635 NewInst->andIRFlags(C1);
2636 }
2637
2638 Worklist.pushValue(Shuf);
2639 replaceValue(I, *Cast);
2640 return true;
2641}
2642
2643/// Try to convert any of:
2644/// "shuffle (shuffle x, y), (shuffle y, x)"
2645/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2646/// "shuffle (shuffle x, undef), y"
2647/// "shuffle x, (shuffle y, undef)"
2648/// into "shuffle x, y".
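/// For example (hypothetical IR; the merged mask below is what the per-lane
/// walk produces for these particular inner/outer masks):
///   %i0 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %i1 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
///   %r  = shufflevector <4 x i32> %i0, <4 x i32> %i1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
/// -->
///   %r  = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 4, i32 6, i32 2>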
2649bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2650 ArrayRef<int> OuterMask;
2651 Value *OuterV0, *OuterV1;
2652 if (!match(&I,
2653 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2654 return false;
2655
2656 ArrayRef<int> InnerMask0, InnerMask1;
2657 Value *X0, *X1, *Y0, *Y1;
2658 bool Match0 =
2659 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2660 bool Match1 =
2661 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2662 if (!Match0 && !Match1)
2663 return false;
2664
2665 // If the outer shuffle is a permute, then create a fake inner all-poison
2666 // shuffle. This is easier than accounting for length-changing shuffles below.
2667 SmallVector<int, 16> PoisonMask1;
2668 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2669 X1 = X0;
2670 Y1 = Y0;
2671 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2672 InnerMask1 = PoisonMask1;
2673 Match1 = true; // fake match
2674 }
2675
2676 X0 = Match0 ? X0 : OuterV0;
2677 Y0 = Match0 ? Y0 : OuterV0;
2678 X1 = Match1 ? X1 : OuterV1;
2679 Y1 = Match1 ? Y1 : OuterV1;
2680 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2681 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2682 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2683 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2684 X0->getType() != X1->getType())
2685 return false;
2686
2687 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2688 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2689
2690 // Attempt to merge shuffles, matching up to 2 source operands.
2691 // Replace index to a poison arg with PoisonMaskElem.
2692 // Bail if either inner masks reference an undef arg.
2693 SmallVector<int, 16> NewMask(OuterMask);
2694 Value *NewX = nullptr, *NewY = nullptr;
2695 for (int &M : NewMask) {
2696 Value *Src = nullptr;
2697 if (0 <= M && M < (int)NumImmElts) {
2698 Src = OuterV0;
2699 if (Match0) {
2700 M = InnerMask0[M];
2701 Src = M >= (int)NumSrcElts ? Y0 : X0;
2702 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2703 }
2704 } else if (M >= (int)NumImmElts) {
2705 Src = OuterV1;
2706 M -= NumImmElts;
2707 if (Match1) {
2708 M = InnerMask1[M];
2709 Src = M >= (int)NumSrcElts ? Y1 : X1;
2710 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2711 }
2712 }
2713 if (Src && M != PoisonMaskElem) {
2714 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2715 if (isa<UndefValue>(Src)) {
2716 // We've referenced an undef element - if it is poison, update the shuffle
2717 // mask, else bail.
2718 if (!isa<PoisonValue>(Src))
2719 return false;
2720 M = PoisonMaskElem;
2721 continue;
2722 }
2723 if (!NewX || NewX == Src) {
2724 NewX = Src;
2725 continue;
2726 }
2727 if (!NewY || NewY == Src) {
2728 M += NumSrcElts;
2729 NewY = Src;
2730 continue;
2731 }
2732 return false;
2733 }
2734 }
2735
2736 if (!NewX)
2737 return PoisonValue::get(ShuffleDstTy);
2738 if (!NewY)
2739 NewY = PoisonValue::get(ShuffleSrcTy);
2740
2741 // Have we folded to an Identity shuffle?
2742 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2743 replaceValue(I, *NewX);
2744 return true;
2745 }
2746
2747 // Try to merge the shuffles if the new shuffle is not costly.
2748 InstructionCost InnerCost0 = 0;
2749 if (Match0)
2750 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2751
2752 InstructionCost InnerCost1 = 0;
2753 if (Match1)
2754 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2755
2756 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
2757
2758 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2759
2760 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2761 TargetTransformInfo::ShuffleKind SK = IsUnary
2762 ? TargetTransformInfo::SK_PermuteSingleSrc
2763 : TargetTransformInfo::SK_PermuteTwoSrc;
2764 InstructionCost NewCost =
2765 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2766 nullptr, {NewX, NewY});
2767 if (!OuterV0->hasOneUse())
2768 NewCost += InnerCost0;
2769 if (!OuterV1->hasOneUse())
2770 NewCost += InnerCost1;
2771
2772 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2773 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2774 << "\n");
2775 if (NewCost > OldCost)
2776 return false;
2777
2778 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2779 replaceValue(I, *Shuf);
2780 return true;
2781}
2782
2783/// Try to convert
2784/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
2785bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
2786 Value *V0, *V1;
2787 ArrayRef<int> OldMask;
2788 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2789 m_Mask(OldMask))))
2790 return false;
2791
2792 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2793 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2794 if (!II0 || !II1)
2795 return false;
2796
2797 Intrinsic::ID IID = II0->getIntrinsicID();
2798 if (IID != II1->getIntrinsicID())
2799 return false;
2800
2801 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2802 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2803 if (!ShuffleDstTy || !II0Ty)
2804 return false;
2805
2806 if (!isTriviallyVectorizable(IID))
2807 return false;
2808
2809 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2810 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
2811 II0->getArgOperand(I) != II1->getArgOperand(I))
2812 return false;
2813
2814 InstructionCost OldCost =
2815 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2816 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
2817 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2818 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2819
2820 SmallVector<Type *> NewArgsTy;
2821 InstructionCost NewCost = 0;
2822 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2823 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2824 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2825 } else {
2826 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2827 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2828 ShuffleDstTy->getNumElements());
2829 NewArgsTy.push_back(ArgTy);
2830 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
2831 ArgTy, VecTy, OldMask, CostKind);
2832 }
2833 }
2834 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2835 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2836
2837 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2838 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2839 << "\n");
2840
2841 if (NewCost > OldCost)
2842 return false;
2843
2844 SmallVector<Value *> NewArgs;
2845 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2846 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2847 NewArgs.push_back(II0->getArgOperand(I));
2848 } else {
2849 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2850 II1->getArgOperand(I), OldMask);
2851 NewArgs.push_back(Shuf);
2852 Worklist.pushValue(Shuf);
2853 }
2854 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2855
2856 // Intersect flags from the old intrinsics.
2857 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2858 NewInst->copyIRFlags(II0);
2859 NewInst->andIRFlags(II1);
2860 }
2861
2862 replaceValue(I, *NewIntrinsic);
2863 return true;
2864}
2865
2866using InstLane = std::pair<Use *, int>;
2867
2868static InstLane lookThroughShuffles(Use *U, int Lane) {
2869 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
2870 unsigned NumElts =
2871 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
2872 int M = SV->getMaskValue(Lane);
2873 if (M < 0)
2874 return {nullptr, PoisonMaskElem};
2875 if (static_cast<unsigned>(M) < NumElts) {
2876 U = &SV->getOperandUse(0);
2877 Lane = M;
2878 } else {
2879 U = &SV->getOperandUse(1);
2880 Lane = M - NumElts;
2881 }
2882 }
2883 return InstLane{U, Lane};
2884}
2885
2886 static SmallVector<InstLane>
2887 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
2888 SmallVector<InstLane> NItem;
2889 for (InstLane IL : Item) {
2890 auto [U, Lane] = IL;
2891 InstLane OpLane =
2892 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
2893 Lane)
2894 : InstLane{nullptr, PoisonMaskElem};
2895 NItem.emplace_back(OpLane);
2896 }
2897 return NItem;
2898}
2899
2900/// Detect concat of multiple values into a vector
2901 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
2902 const TargetTransformInfo &TTI) {
2903 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
2904 unsigned NumElts = Ty->getNumElements();
2905 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
2906 return false;
2907
2908 // Check that the concat is free, usually meaning that the type will be split
2909 // during legalization.
2910 SmallVector<int, 16> ConcatMask(NumElts * 2);
2911 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2912 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
2913 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
2914 Ty, ConcatMask, CostKind) != 0)
2915 return false;
2916
2917 unsigned NumSlices = Item.size() / NumElts;
2918 // Currently we generate a tree of shuffles for the concats, which limits us
2919 // to a power2.
2920 if (!isPowerOf2_32(NumSlices))
2921 return false;
2922 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
2923 Use *SliceV = Item[Slice * NumElts].first;
2924 if (!SliceV || SliceV->get()->getType() != Ty)
2925 return false;
2926 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
2927 auto [V, Lane] = Item[Slice * NumElts + Elt];
2928 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
2929 return false;
2930 }
2931 }
2932 return true;
2933}
2934
2935 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
2936 const SmallPtrSet<Use *, 4> &IdentityLeafs,
2937 const SmallPtrSet<Use *, 4> &SplatLeafs,
2938 const SmallPtrSet<Use *, 4> &ConcatLeafs,
2939 IRBuilderBase &Builder,
2940 const TargetTransformInfo *TTI) {
2941 auto [FrontU, FrontLane] = Item.front();
2942
2943 if (IdentityLeafs.contains(FrontU)) {
2944 return FrontU->get();
2945 }
2946 if (SplatLeafs.contains(FrontU)) {
2947 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
2948 return Builder.CreateShuffleVector(FrontU->get(), Mask);
2949 }
2950 if (ConcatLeafs.contains(FrontU)) {
2951 unsigned NumElts =
2952 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
2953 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
2954 for (unsigned S = 0; S < Values.size(); ++S)
2955 Values[S] = Item[S * NumElts].first->get();
2956
2957 while (Values.size() > 1) {
2958 NumElts *= 2;
2959 SmallVector<int, 16> Mask(NumElts, 0);
2960 std::iota(Mask.begin(), Mask.end(), 0);
2961 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
2962 for (unsigned S = 0; S < NewValues.size(); ++S)
2963 NewValues[S] =
2964 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
2965 Values = NewValues;
2966 }
2967 return Values[0];
2968 }
2969
2970 auto *I = cast<Instruction>(FrontU->get());
2971 auto *II = dyn_cast<IntrinsicInst>(I);
2972 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
2973 SmallVector<Value *> Ops(NumOps);
2974 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
2975 if (II &&
2976 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
2977 Ops[Idx] = II->getOperand(Idx);
2978 continue;
2979 }
2980 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
2981 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
2982 Builder, TTI);
2983 }
2984
2985 SmallVector<Value *, 8> ValueList;
2986 for (const auto &Lane : Item)
2987 if (Lane.first)
2988 ValueList.push_back(Lane.first->get());
2989
2990 Type *DstTy =
2991 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
2992 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
2993 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
2994 Ops[0], Ops[1]);
2995 propagateIRFlags(Value, ValueList);
2996 return Value;
2997 }
2998 if (auto *CI = dyn_cast<CmpInst>(I)) {
2999 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3000 propagateIRFlags(Value, ValueList);
3001 return Value;
3002 }
3003 if (auto *SI = dyn_cast<SelectInst>(I)) {
3004 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3005 propagateIRFlags(Value, ValueList);
3006 return Value;
3007 }
3008 if (auto *CI = dyn_cast<CastInst>(I)) {
3009 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3010 propagateIRFlags(Value, ValueList);
3011 return Value;
3012 }
3013 if (II) {
3014 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3015 propagateIRFlags(Value, ValueList);
3016 return Value;
3017 }
3018 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3019 auto *Value =
3020 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3021 propagateIRFlags(Value, ValueList);
3022 return Value;
3023}
3024
3025// Starting from a shuffle, look up through operands tracking the shuffled index
3026// of each lane. If we can simplify away the shuffles to identities then
3027// do so.
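// For example (hypothetical IR), splitting a value into halves and then
// re-concatenating it:
//   %a = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %b = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
//   %c = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// folds back to %x. The same per-lane walk also looks through lane-wise ops
// (binops, casts, selects, trivially vectorizable intrinsics).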
3028bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3029 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3030 if (!Ty || I.use_empty())
3031 return false;
3032
3033 SmallVector<InstLane> Start(Ty->getNumElements());
3034 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3035 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3036
3037 SmallVector<SmallVector<InstLane>> Worklist;
3038 Worklist.push_back(Start);
3039 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3040 unsigned NumVisited = 0;
3041
3042 while (!Worklist.empty()) {
3043 if (++NumVisited > MaxInstrsToScan)
3044 return false;
3045
3046 SmallVector<InstLane> Item = Worklist.pop_back_val();
3047 auto [FrontU, FrontLane] = Item.front();
3048
3049 // If we found an undef first lane then bail out to keep things simple.
3050 if (!FrontU)
3051 return false;
3052
3053 // Helper to peek through bitcasts to the same value.
3054 auto IsEquiv = [&](Value *X, Value *Y) {
3055 return X->getType() == Y->getType() &&
3057 };
3058
3059 // Look for an identity value.
3060 if (FrontLane == 0 &&
3061 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3062 Ty->getNumElements() &&
3063 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3064 Value *FrontV = Item.front().first->get();
3065 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3066 E.value().second == (int)E.index());
3067 })) {
3068 IdentityLeafs.insert(FrontU);
3069 continue;
3070 }
3071 // Look for constants, for the moment only supporting constant splats.
3072 if (auto *C = dyn_cast<Constant>(FrontU);
3073 C && C->getSplatValue() &&
3074 all_of(drop_begin(Item), [Item](InstLane &IL) {
3075 Value *FrontV = Item.front().first->get();
3076 Use *U = IL.first;
3077 return !U || (isa<Constant>(U->get()) &&
3078 cast<Constant>(U->get())->getSplatValue() ==
3079 cast<Constant>(FrontV)->getSplatValue());
3080 })) {
3081 SplatLeafs.insert(FrontU);
3082 continue;
3083 }
3084 // Look for a splat value.
3085 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3086 auto [FrontU, FrontLane] = Item.front();
3087 auto [U, Lane] = IL;
3088 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3089 })) {
3090 SplatLeafs.insert(FrontU);
3091 continue;
3092 }
3093
3094 // We need each element to be the same type of value, and check that each
3095 // element has a single use.
3096 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3097 Value *FrontV = Item.front().first->get();
3098 if (!IL.first)
3099 return true;
3100 Value *V = IL.first->get();
3101 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3102 return false;
3103 if (V->getValueID() != FrontV->getValueID())
3104 return false;
3105 if (auto *CI = dyn_cast<CmpInst>(V))
3106 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3107 return false;
3108 if (auto *CI = dyn_cast<CastInst>(V))
3109 if (CI->getSrcTy()->getScalarType() !=
3110 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3111 return false;
3112 if (auto *SI = dyn_cast<SelectInst>(V))
3113 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3114 SI->getOperand(0)->getType() !=
3115 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3116 return false;
3117 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3118 return false;
3119 auto *II = dyn_cast<IntrinsicInst>(V);
3120 return !II || (isa<IntrinsicInst>(FrontV) &&
3121 II->getIntrinsicID() ==
3122 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3123 !II->hasOperandBundles());
3124 };
3125 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3126 // Check the operator is one that we support.
3127 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3128 // We exclude div/rem in case they hit UB from poison lanes.
3129 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3130 BO && BO->isIntDivRem())
3131 return false;
3132 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3133 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3134 continue;
3135 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3136 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3137 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3138 continue;
3139 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3140 // TODO: Handle vector widening/narrowing bitcasts.
3141 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3142 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3143 if (DstTy && SrcTy &&
3144 SrcTy->getNumElements() == DstTy->getNumElements()) {
3145 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3146 continue;
3147 }
3148 } else if (isa<SelectInst>(FrontU)) {
3149 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3150 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3151 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3152 continue;
3153 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3154 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3155 !II->hasOperandBundles()) {
3156 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3157 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3158 &TTI)) {
3159 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3160 Value *FrontV = Item.front().first->get();
3161 Use *U = IL.first;
3162 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3163 cast<Instruction>(FrontV)->getOperand(Op));
3164 }))
3165 return false;
3166 continue;
3167 }
3168 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3169 }
3170 continue;
3171 }
3172 }
3173
3174 if (isFreeConcat(Item, CostKind, TTI)) {
3175 ConcatLeafs.insert(FrontU);
3176 continue;
3177 }
3178
3179 return false;
3180 }
3181
3182 if (NumVisited <= 1)
3183 return false;
3184
3185 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3186
3187 // If we got this far, we know the shuffles are superfluous and can be
3188 // removed. Scan through again and generate the new tree of instructions.
3189 Builder.SetInsertPoint(&I);
3190 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3191 ConcatLeafs, Builder, &TTI);
3192 replaceValue(I, *V);
3193 return true;
3194}
3195
3196/// Given a commutative reduction, the order of the input lanes does not alter
3197/// the results. We can use this to remove certain shuffles feeding the
3198/// reduction, removing the need to shuffle at all.
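/// For example (illustrative, subject to the cost model):
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison,
///                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// The reduction is insensitive to lane order, so the reversing shuffle can be
/// rewritten with a sorted (here identity) mask and subsequently folded away.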
3199bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3200 auto *II = dyn_cast<IntrinsicInst>(&I);
3201 if (!II)
3202 return false;
3203 switch (II->getIntrinsicID()) {
3204 case Intrinsic::vector_reduce_add:
3205 case Intrinsic::vector_reduce_mul:
3206 case Intrinsic::vector_reduce_and:
3207 case Intrinsic::vector_reduce_or:
3208 case Intrinsic::vector_reduce_xor:
3209 case Intrinsic::vector_reduce_smin:
3210 case Intrinsic::vector_reduce_smax:
3211 case Intrinsic::vector_reduce_umin:
3212 case Intrinsic::vector_reduce_umax:
3213 break;
3214 default:
3215 return false;
3216 }
3217
3218 // Find all the inputs when looking through operations that do not alter the
3219 // lane order (binops, for example). Currently we look for a single shuffle,
3220 // and can ignore splat values.
3221 std::queue<Value *> Worklist;
3222 SmallPtrSet<Value *, 4> Visited;
3223 ShuffleVectorInst *Shuffle = nullptr;
3224 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3225 Worklist.push(Op);
3226
3227 while (!Worklist.empty()) {
3228 Value *CV = Worklist.front();
3229 Worklist.pop();
3230 if (Visited.contains(CV))
3231 continue;
3232
3233 // Splats don't change the order, so can be safely ignored.
3234 if (isSplatValue(CV))
3235 continue;
3236
3237 Visited.insert(CV);
3238
3239 if (auto *CI = dyn_cast<Instruction>(CV)) {
3240 if (CI->isBinaryOp()) {
3241 for (auto *Op : CI->operand_values())
3242 Worklist.push(Op);
3243 continue;
3244 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3245 if (Shuffle && Shuffle != SV)
3246 return false;
3247 Shuffle = SV;
3248 continue;
3249 }
3250 }
3251
3252 // Anything else is currently an unknown node.
3253 return false;
3254 }
3255
3256 if (!Shuffle)
3257 return false;
3258
3259 // Check that all uses of the binary ops and shuffles are also included in the
3260 // lane-invariant operations (Visited should be the list of lanewise
3261 // instructions, including the shuffle that we found).
3262 for (auto *V : Visited)
3263 for (auto *U : V->users())
3264 if (!Visited.contains(U) && U != &I)
3265 return false;
3266
3267 FixedVectorType *VecType =
3268 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3269 if (!VecType)
3270 return false;
3271 FixedVectorType *ShuffleInputType =
3272 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3273 if (!ShuffleInputType)
3274 return false;
3275 unsigned NumInputElts = ShuffleInputType->getNumElements();
3276
3277 // Find the mask from sorting the lanes into order. This is most likely to
3278 // become an identity or concat mask. Undef elements are pushed to the end.
3279 SmallVector<int> ConcatMask;
3280 Shuffle->getShuffleMask(ConcatMask);
3281 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3282 bool UsesSecondVec =
3283 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3284
3285 InstructionCost OldCost = TTI.getShuffleCost(
3286 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3287 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3288 InstructionCost NewCost = TTI.getShuffleCost(
3289 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3290 ShuffleInputType, ConcatMask, CostKind);
3291
3292 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3293 << "\n");
3294 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3295 << "\n");
3296 bool MadeChanges = false;
3297 if (NewCost < OldCost) {
3298 Builder.SetInsertPoint(Shuffle);
3299 Value *NewShuffle = Builder.CreateShuffleVector(
3300 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3301 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3302 replaceValue(*Shuffle, *NewShuffle);
3303 return true;
3304 }
3305
3306 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3307 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3308 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3309 return MadeChanges;
3310}
3311
3312/// For a given chain of patterns of the following form:
3313///
3314/// ```
3315/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3316///
3317/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3318/// ty1> %1)
3319/// OR
3320/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3321///
3322/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3323/// ...
3324/// ...
3325/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3326/// 3), <n x ty1> %(i - 2)
3327/// OR
3328/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3329///
3330/// %(i) = extractelement <n x ty1> %(i - 1), 0
3331/// ```
3332///
3333/// Where:
3334/// `mask` follows a partition pattern:
3335///
3336/// Ex:
3337/// [n = 8, p = poison]
3338///
3339/// 4 5 6 7 | p p p p
3340/// 2 3 | p p p p p p
3341/// 1 | p p p p p p p
3342///
3343/// For powers of 2, there's a consistent pattern, but for other cases
3344/// the parity of the current half value at each step decides the
3345 /// next partition half (see `ExpectedParityMask` below for how this is
3346 /// generalised).
3347///
3348/// Ex:
3349/// [n = 6]
3350///
3351/// 3 4 5 | p p p
3352/// 1 2 | p p p p
3353/// 1 | p p p p p
3354bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3355 // Going bottom-up for the pattern.
3356 std::queue<Value *> InstWorklist;
3357 InstructionCost OrigCost = 0;
3358
3359 // Common instruction operation after each shuffle op.
3360 std::optional<unsigned int> CommonCallOp = std::nullopt;
3361 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3362
3363 bool IsFirstCallOrBinInst = true;
3364 bool ShouldBeCallOrBinInst = true;
3365
3366 // This stores the last used instructions for shuffle/common op.
3367 //
3368 // PrevVecV[0] / PrevVecV[1] store the two operands of the most recently
3369 // visited shuffle/common op.
3370 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3371
3372 Value *VecOpEE;
3373 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3374 return false;
3375
3376 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3377 if (!FVT)
3378 return false;
3379
3380 int64_t VecSize = FVT->getNumElements();
3381 if (VecSize < 2)
3382 return false;
3383
3384 // Number of levels would be ~log2(n), considering we always partition
3385 // by half for this fold pattern.
3386 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3387 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3388
3389 // This is how we generalise for all element sizes.
3390 // At each step, if vector size is odd, we need non-poison
3391 // values to cover the dominant half so we don't miss out on any element.
3392 //
3393 // This mask will help us retrieve this as we go from bottom to top:
3394 //
3395 // Mask Set -> N = N * 2 - 1
3396 // Mask Unset -> N = N * 2
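// Worked example (illustrative): for VecSize = 6, Cur goes 6 -> 3 -> 2 while
// Mask goes 2 -> 1 -> 0; only Cur == 3 is odd, so ExpectedParityMask ends up
// as 0b010.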
3397 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3398 Cur = (Cur + 1) / 2, --Mask) {
3399 if (Cur & 1)
3400 ExpectedParityMask |= (1ll << Mask);
3401 }
3402
3403 InstWorklist.push(VecOpEE);
3404
3405 while (!InstWorklist.empty()) {
3406 Value *CI = InstWorklist.front();
3407 InstWorklist.pop();
3408
3409 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3410 if (!ShouldBeCallOrBinInst)
3411 return false;
3412
3413 if (!IsFirstCallOrBinInst &&
3414 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3415 return false;
3416
3417 // For the first found call/bin op, the vector has to come from the
3418 // extract element op.
3419 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3420 return false;
3421 IsFirstCallOrBinInst = false;
3422
3423 if (!CommonCallOp)
3424 CommonCallOp = II->getIntrinsicID();
3425 if (II->getIntrinsicID() != *CommonCallOp)
3426 return false;
3427
3428 switch (II->getIntrinsicID()) {
3429 case Intrinsic::umin:
3430 case Intrinsic::umax:
3431 case Intrinsic::smin:
3432 case Intrinsic::smax: {
3433 auto *Op0 = II->getOperand(0);
3434 auto *Op1 = II->getOperand(1);
3435 PrevVecV[0] = Op0;
3436 PrevVecV[1] = Op1;
3437 break;
3438 }
3439 default:
3440 return false;
3441 }
3442 ShouldBeCallOrBinInst ^= 1;
3443
3444 IntrinsicCostAttributes ICA(
3445 *CommonCallOp, II->getType(),
3446 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3447 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3448
3449 // The operands can appear as (a, b) or (b, a); swap if needed so that
3450 // PrevVecV[1] holds the shuffle as we walk up the chain.
3451 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3452 std::swap(PrevVecV[0], PrevVecV[1]);
3453 InstWorklist.push(PrevVecV[1]);
3454 InstWorklist.push(PrevVecV[0]);
3455 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3456 // Similar logic for bin ops.
3457
3458 if (!ShouldBeCallOrBinInst)
3459 return false;
3460
3461 if (!IsFirstCallOrBinInst &&
3462 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3463 return false;
3464
3465 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3466 return false;
3467 IsFirstCallOrBinInst = false;
3468
3469 if (!CommonBinOp)
3470 CommonBinOp = BinOp->getOpcode();
3471
3472 if (BinOp->getOpcode() != *CommonBinOp)
3473 return false;
3474
3475 switch (*CommonBinOp) {
3476 case BinaryOperator::Add:
3477 case BinaryOperator::Mul:
3478 case BinaryOperator::Or:
3479 case BinaryOperator::And:
3480 case BinaryOperator::Xor: {
3481 auto *Op0 = BinOp->getOperand(0);
3482 auto *Op1 = BinOp->getOperand(1);
3483 PrevVecV[0] = Op0;
3484 PrevVecV[1] = Op1;
3485 break;
3486 }
3487 default:
3488 return false;
3489 }
3490 ShouldBeCallOrBinInst ^= 1;
3491
3492 OrigCost +=
3493 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3494
3495 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3496 std::swap(PrevVecV[0], PrevVecV[1]);
3497 InstWorklist.push(PrevVecV[1]);
3498 InstWorklist.push(PrevVecV[0]);
3499 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3500 // We shouldn't have any null values in the previous vectors;
3501 // if so, there was a mismatch in the pattern.
3502 if (ShouldBeCallOrBinInst ||
3503 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3504 return false;
3505
3506 if (SVInst != PrevVecV[1])
3507 return false;
3508
3509 ArrayRef<int> CurMask;
3510 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3511 m_Mask(CurMask))))
3512 return false;
3513
3514 // Subtract the parity mask when checking the condition.
3515 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3516 if (Mask < ShuffleMaskHalf &&
3517 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3518 return false;
3519 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3520 return false;
3521 }
3522
3523 // Update mask values.
3524 ShuffleMaskHalf *= 2;
3525 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3526 ExpectedParityMask >>= 1;
3527
3528 OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3529 SVInst->getType(), SVInst->getType(),
3530 CurMask, CostKind);
3531
3532 VisitedCnt += 1;
3533 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3534 break;
3535
3536 ShouldBeCallOrBinInst ^= 1;
3537 } else {
3538 return false;
3539 }
3540 }
3541
3542 // Pattern should end with a shuffle op.
3543 if (ShouldBeCallOrBinInst)
3544 return false;
3545
3546 assert(VecSize != -1 && "Expected Match for Vector Size");
3547
3548 Value *FinalVecV = PrevVecV[0];
3549 if (!FinalVecV)
3550 return false;
3551
3552 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3553
3554 Intrinsic::ID ReducedOp =
3555 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3556 : getReductionForBinop(*CommonBinOp));
3557 if (!ReducedOp)
3558 return false;
3559
3560 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3561 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3562
3563 if (NewCost >= OrigCost)
3564 return false;
3565
3566 auto *ReducedResult =
3567 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3568 replaceValue(I, *ReducedResult);
3569
3570 return true;
3571}
3572
3573 /// Determine if it's more efficient to fold:
3574/// reduce(trunc(x)) -> trunc(reduce(x)).
3575/// reduce(sext(x)) -> sext(reduce(x)).
3576/// reduce(zext(x)) -> zext(reduce(x)).
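/// For example (illustrative, subject to the cost model):
///   %t = trunc <8 x i32> %x to <8 x i8>
///   %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t)
/// can become
///   %s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %s to i8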
3577bool VectorCombine::foldCastFromReductions(Instruction &I) {
3578 auto *II = dyn_cast<IntrinsicInst>(&I);
3579 if (!II)
3580 return false;
3581
3582 bool TruncOnly = false;
3583 Intrinsic::ID IID = II->getIntrinsicID();
3584 switch (IID) {
3585 case Intrinsic::vector_reduce_add:
3586 case Intrinsic::vector_reduce_mul:
3587 TruncOnly = true;
3588 break;
3589 case Intrinsic::vector_reduce_and:
3590 case Intrinsic::vector_reduce_or:
3591 case Intrinsic::vector_reduce_xor:
3592 break;
3593 default:
3594 return false;
3595 }
3596
3597 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3598 Value *ReductionSrc = I.getOperand(0);
3599
3600 Value *Src;
3601 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3602 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3603 return false;
3604
3605 auto CastOpc =
3606 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3607
3608 auto *SrcTy = cast<VectorType>(Src->getType());
3609 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3610 Type *ResultTy = I.getType();
3611
3612 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3613 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3614 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3615 TTI::CastContextHint::None, CostKind,
3616 cast<CastInst>(ReductionSrc));
3617 InstructionCost NewCost =
3618 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3619 CostKind) +
3620 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3621 TTI::CastContextHint::None, CostKind);
3622
3623 if (OldCost <= NewCost || !NewCost.isValid())
3624 return false;
3625
3626 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3627 II->getIntrinsicID(), {Src});
3628 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3629 replaceValue(I, *NewCast);
3630 return true;
3631}
3632
3633/// Returns true if this ShuffleVectorInst eventually feeds into a
3634/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3635/// chains of shuffles and binary operators (in any combination/order).
3636 /// The search visits at most MaxVisited instructions.
3637 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
3638 constexpr unsigned MaxVisited = 32;
3639 SmallPtrSet<Instruction *, 8> Visited;
3640 SmallVector<Instruction *, 4> WorkList;
3641 bool FoundReduction = false;
3642
3643 WorkList.push_back(SVI);
3644 while (!WorkList.empty()) {
3645 Instruction *I = WorkList.pop_back_val();
3646 for (User *U : I->users()) {
3647 auto *UI = cast<Instruction>(U);
3648 if (!UI || !Visited.insert(UI).second)
3649 continue;
3650 if (Visited.size() > MaxVisited)
3651 return false;
3652 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3653 // More than one reduction reached
3654 if (FoundReduction)
3655 return false;
3656 switch (II->getIntrinsicID()) {
3657 case Intrinsic::vector_reduce_add:
3658 case Intrinsic::vector_reduce_mul:
3659 case Intrinsic::vector_reduce_and:
3660 case Intrinsic::vector_reduce_or:
3661 case Intrinsic::vector_reduce_xor:
3662 case Intrinsic::vector_reduce_smin:
3663 case Intrinsic::vector_reduce_smax:
3664 case Intrinsic::vector_reduce_umin:
3665 case Intrinsic::vector_reduce_umax:
3666 FoundReduction = true;
3667 continue;
3668 default:
3669 return false;
3670 }
3671 }
3672
3673 if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
3674 return false;
3675
3676 WorkList.emplace_back(UI);
3677 }
3678 }
3679 return FoundReduction;
3680}
3681
3682/// This method looks for groups of shuffles acting on binops, of the form:
3683/// %x = shuffle ...
3684/// %y = shuffle ...
3685/// %a = binop %x, %y
3686/// %b = binop %x, %y
3687/// shuffle %a, %b, selectmask
3688/// We may, especially if the shuffle is wider than legal, be able to convert
3689/// the shuffle to a form where only parts of a and b need to be computed. On
3690/// architectures with no obvious "select" shuffle, this can reduce the total
3691/// number of operations if the target reports them as cheaper.
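/// For example (illustrative), if the selectmask only takes the low half of %a
/// and the high half of %b, the used lanes of each binop can be packed to the
/// front so that %a and %b are effectively computed on narrower vectors, with
/// new shuffles reconstructing the original lane order afterwards (subject to
/// the cost model).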
3692bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
3693 auto *SVI = cast<ShuffleVectorInst>(&I);
3694 auto *VT = cast<FixedVectorType>(I.getType());
3695 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3696 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3697 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3698 VT != Op0->getType())
3699 return false;
3700
3701 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3702 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3703 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3704 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3705 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
3706 auto checkSVNonOpUses = [&](Instruction *I) {
3707 if (!I || I->getOperand(0)->getType() != VT)
3708 return true;
3709 return any_of(I->users(), [&](User *U) {
3710 return U != Op0 && U != Op1 &&
3711 !(isa<ShuffleVectorInst>(U) &&
3712 (InputShuffles.contains(cast<Instruction>(U)) ||
3713 isInstructionTriviallyDead(cast<Instruction>(U))));
3714 });
3715 };
3716 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3717 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3718 return false;
3719
3720 // Collect all the uses that are shuffles that we can transform together. We
3721 // may not have a single shuffle, but a group that can all be transformed
3722 // together profitably.
3723 SmallVector<ShuffleVectorInst *> Shuffles;
3724 auto collectShuffles = [&](Instruction *I) {
3725 for (auto *U : I->users()) {
3726 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3727 if (!SV || SV->getType() != VT)
3728 return false;
3729 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3730 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3731 return false;
3732 if (!llvm::is_contained(Shuffles, SV))
3733 Shuffles.push_back(SV);
3734 }
3735 return true;
3736 };
3737 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3738 return false;
3739 // From a reduction, we need to be processing a single shuffle, otherwise the
3740 // other uses will not be lane-invariant.
3741 if (FromReduction && Shuffles.size() > 1)
3742 return false;
3743
3744 // Add any shuffle uses for the shuffles we have found, to include them in our
3745 // cost calculations.
3746 if (!FromReduction) {
3747 for (ShuffleVectorInst *SV : Shuffles) {
3748 for (auto *U : SV->users()) {
3749 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3750 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3751 Shuffles.push_back(SSV);
3752 }
3753 }
3754 }
3755
3756 // For each of the output shuffles, we try to sort all the first vector
3757 // elements to the beginning, followed by the second vector's elements at the
3758 // end. If the binops are legalized to smaller vectors, this may reduce the
3759 // total number of binops. We compute the ReconstructMask needed to convert
3760 // back to the original lane order.
3761 SmallVector<std::pair<int, int>> V1, V2;
3762 SmallVector<SmallVector<int>> OrigReconstructMasks;
3763 int MaxV1Elt = 0, MaxV2Elt = 0;
3764 unsigned NumElts = VT->getNumElements();
3765 for (ShuffleVectorInst *SVN : Shuffles) {
3766 SmallVector<int> Mask;
3767 SVN->getShuffleMask(Mask);
3768
3769 // Check the operands are the same as the original, or reversed (in which
3770 // case we need to commute the mask).
3771 Value *SVOp0 = SVN->getOperand(0);
3772 Value *SVOp1 = SVN->getOperand(1);
3773 if (isa<UndefValue>(SVOp1)) {
3774 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3775 SVOp0 = SSV->getOperand(0);
3776 SVOp1 = SSV->getOperand(1);
3777 for (int &Elem : Mask) {
3778 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3779 return false;
3780 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3781 }
3782 }
3783 if (SVOp0 == Op1 && SVOp1 == Op0) {
3784 std::swap(SVOp0, SVOp1);
3785 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
3786 }
3787 if (SVOp0 != Op0 || SVOp1 != Op1)
3788 return false;
3789
3790 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3791 // take the packed values from Op0/Op1 and reconstruct the original
3792 // order.
3793 SmallVector<int> ReconstructMask;
3794 for (unsigned I = 0; I < Mask.size(); I++) {
3795 if (Mask[I] < 0) {
3796 ReconstructMask.push_back(-1);
3797 } else if (Mask[I] < static_cast<int>(NumElts)) {
3798 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3799 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3800 return Mask[I] == A.first;
3801 });
3802 if (It != V1.end())
3803 ReconstructMask.push_back(It - V1.begin());
3804 else {
3805 ReconstructMask.push_back(V1.size());
3806 V1.emplace_back(Mask[I], V1.size());
3807 }
3808 } else {
3809 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3810 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3811 return Mask[I] - static_cast<int>(NumElts) == A.first;
3812 });
3813 if (It != V2.end())
3814 ReconstructMask.push_back(NumElts + It - V2.begin());
3815 else {
3816 ReconstructMask.push_back(NumElts + V2.size());
3817 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3818 }
3819 }
3820 }
3821
3822 // For reductions, we know that the output lane ordering doesn't alter the
3823 // result. Sorting the mask in order can help the shuffle simplify away.
3824 if (FromReduction)
3825 sort(ReconstructMask);
3826 OrigReconstructMasks.push_back(std::move(ReconstructMask));
3827 }
3828
3829 // If the maximum elements used from V1 and V2 are not larger than the new
3830 // vectors, the vectors are already packed and performing the optimization
3831 // again will likely not help any further. This also prevents us from getting
3832 // stuck in a cycle in case the costs do not also rule it out.
3833 if (V1.empty() || V2.empty() ||
3834 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
3835 MaxV2Elt == static_cast<int>(V2.size()) - 1))
3836 return false;
3837
3838 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
3839 // shuffle of another shuffle, or not a shuffle (which is treated like an
3840 // identity shuffle).
3841 auto GetBaseMaskValue = [&](Instruction *I, int M) {
3842 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3843 if (!SV)
3844 return M;
3845 if (isa<UndefValue>(SV->getOperand(1)))
3846 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3847 if (InputShuffles.contains(SSV))
3848 return SSV->getMaskValue(SV->getMaskValue(M));
3849 return SV->getMaskValue(M);
3850 };
3851
3852 // Attempt to sort the inputs by ascending mask values to make simpler input
3853 // shuffles and push complex shuffles down to the uses. We sort on the first
3854 // of the two input shuffle orders, to try and get at least one input into a
3855 // nice order.
3856 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
3857 std::pair<int, int> Y) {
3858 int MXA = GetBaseMaskValue(A, X.first);
3859 int MYA = GetBaseMaskValue(A, Y.first);
3860 return MXA < MYA;
3861 };
3862 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
3863 return SortBase(SVI0A, A, B);
3864 });
3865 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
3866 return SortBase(SVI1A, A, B);
3867 });
3868 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
3869 // modified order of the input shuffles.
3870 SmallVector<SmallVector<int>> ReconstructMasks;
3871 for (const auto &Mask : OrigReconstructMasks) {
3872 SmallVector<int> ReconstructMask;
3873 for (int M : Mask) {
3874 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
3875 auto It = find_if(V, [M](auto A) { return A.second == M; });
3876 assert(It != V.end() && "Expected all entries in Mask");
3877 return std::distance(V.begin(), It);
3878 };
3879 if (M < 0)
3880 ReconstructMask.push_back(-1);
3881 else if (M < static_cast<int>(NumElts)) {
3882 ReconstructMask.push_back(FindIndex(V1, M));
3883 } else {
3884 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
3885 }
3886 }
3887 ReconstructMasks.push_back(std::move(ReconstructMask));
3888 }
3889
3890 // Calculate the masks needed for the new input shuffles, which get padded
3891 // with poison elements.
3892 SmallVector<int> V1A, V1B, V2A, V2B;
3893 for (unsigned I = 0; I < V1.size(); I++) {
3894 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
3895 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
3896 }
3897 for (unsigned I = 0; I < V2.size(); I++) {
3898 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
3899 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
3900 }
3901 while (V1A.size() < NumElts) {
3902 V1A.push_back(PoisonMaskElem);
3903 V1B.push_back(PoisonMaskElem);
3904 }
3905 while (V2A.size() < NumElts) {
3906 V2A.push_back(PoisonMaskElem);
3907 V2B.push_back(PoisonMaskElem);
3908 }
3909
3910 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
3911 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3912 if (!SV)
3913 return C;
3914 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
3915 ? TTI::SK_PermuteSingleSrc
3916 : TTI::SK_PermuteTwoSrc,
3917 VT, VT, SV->getShuffleMask(), CostKind);
3918 };
3919 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3920 return C +
3921 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3922 };
3923
3924 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
3925 unsigned MaxVectorSize =
3926 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3927 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
3928 if (MaxElementsInVector == 0)
3929 return false;
3930 // When there are multiple shufflevector operations on the same input,
3931 // especially when the vector length is larger than the register size,
3932 // identical shuffle patterns may occur across different groups of elements.
3933 // To avoid overestimating the cost by counting these repeated shuffles more
3934 // than once, we only account for unique shuffle patterns. This adjustment
3935 // prevents inflated costs in the cost model for wide vectors split into
3936 // several register-sized groups.
3937 std::set<SmallVector<int, 4>> UniqueShuffles;
3938 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3939 // Compute the cost for performing the shuffle over the full vector.
3940 auto ShuffleCost =
3941 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3942 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
3943 if (NumFullVectors < 2)
3944 return C + ShuffleCost;
3945 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
3946 unsigned NumUniqueGroups = 0;
3947 unsigned NumGroups = Mask.size() / MaxElementsInVector;
3948 // For each group of MaxElementsInVector contiguous elements,
3949 // collect their shuffle pattern and insert into the set of unique patterns.
3950 for (unsigned I = 0; I < NumFullVectors; ++I) {
3951 for (unsigned J = 0; J < MaxElementsInVector; ++J)
3952 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
3953 if (UniqueShuffles.insert(SubShuffle).second)
3954 NumUniqueGroups += 1;
3955 }
3956 return C + ShuffleCost * NumUniqueGroups / NumGroups;
3957 };
3958 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
3959 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3960 if (!SV)
3961 return C;
3962 SmallVector<int, 16> Mask;
3963 SV->getShuffleMask(Mask);
3964 return AddShuffleMaskAdjustedCost(C, Mask);
3965 };
3966 // Check that input consists of ShuffleVectors applied to the same input
3967 auto AllShufflesHaveSameOperands =
3968 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
3969 if (InputShuffles.size() < 2)
3970 return false;
3971 ShuffleVectorInst *FirstSV =
3972 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
3973 if (!FirstSV)
3974 return false;
3975
3976 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
3977 return std::all_of(
3978 std::next(InputShuffles.begin()), InputShuffles.end(),
3979 [&](Instruction *I) {
3980 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
3981 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
3982 });
3983 };
3984
3985 // Get the costs of the shuffles + binops before and after with the new
3986 // shuffle masks.
3987 InstructionCost CostBefore =
3988 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
3989 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
3990 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
3991 InstructionCost(0), AddShuffleCost);
3992 if (AllShufflesHaveSameOperands(InputShuffles)) {
3993 UniqueShuffles.clear();
3994 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3995 InstructionCost(0), AddShuffleAdjustedCost);
3996 } else {
3997 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3998 InstructionCost(0), AddShuffleCost);
3999 }
4000
4001 // The new binops will be unused for lanes past the used shuffle lengths.
4002 // These types attempt to get the correct cost for that from the target.
4003 FixedVectorType *Op0SmallVT =
4004 FixedVectorType::get(VT->getScalarType(), V1.size());
4005 FixedVectorType *Op1SmallVT =
4006 FixedVectorType::get(VT->getScalarType(), V2.size());
4007 InstructionCost CostAfter =
4008 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4009 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4010 UniqueShuffles.clear();
4011 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4012 InstructionCost(0), AddShuffleMaskAdjustedCost);
4013 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4014 CostAfter +=
4015 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4016 InstructionCost(0), AddShuffleMaskCost);
4017
4018 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4019 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4020 << " vs CostAfter: " << CostAfter << "\n");
4021 if (CostBefore < CostAfter ||
4022 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4023 return false;
4024
4025 // The cost model has passed, create the new instructions.
4026 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4027 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4028 if (!SV)
4029 return I;
4030 if (isa<UndefValue>(SV->getOperand(1)))
4031 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4032 if (InputShuffles.contains(SSV))
4033 return SSV->getOperand(Op);
4034 return SV->getOperand(Op);
4035 };
4036 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4037 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4038 GetShuffleOperand(SVI0A, 1), V1A);
4039 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4040 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4041 GetShuffleOperand(SVI0B, 1), V1B);
4042 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4043 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4044 GetShuffleOperand(SVI1A, 1), V2A);
4045 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4046 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4047 GetShuffleOperand(SVI1B, 1), V2B);
4048 Builder.SetInsertPoint(Op0);
4049 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4050 NSV0A, NSV0B);
4051 if (auto *I = dyn_cast<Instruction>(NOp0))
4052 I->copyIRFlags(Op0, true);
4053 Builder.SetInsertPoint(Op1);
4054 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4055 NSV1A, NSV1B);
4056 if (auto *I = dyn_cast<Instruction>(NOp1))
4057 I->copyIRFlags(Op1, true);
4058
4059 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4060 Builder.SetInsertPoint(Shuffles[S]);
4061 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4062 replaceValue(*Shuffles[S], *NSV, false);
4063 }
4064
4065 Worklist.pushValue(NSV0A);
4066 Worklist.pushValue(NSV0B);
4067 Worklist.pushValue(NSV1A);
4068 Worklist.pushValue(NSV1B);
4069 return true;
4070}
4071
4072/// Check if instruction depends on ZExt and this ZExt can be moved after the
4073/// instruction. Move ZExt if it is profitable. For example:
4074/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4075/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4076 /// The cost model takes into account whether zext(x) has other users and
4077/// whether it can be propagated through them too.
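/// For example (illustrative, subject to the cost model):
///   %z = zext <4 x i8> %x to <4 x i32>
///   %r = and <4 x i32> %z, %y
/// can become
///   %t = trunc <4 x i32> %y to <4 x i8>
///   %a = and <4 x i8> %x, %t
///   %r = zext <4 x i8> %a to <4 x i32>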
4078bool VectorCombine::shrinkType(Instruction &I) {
4079 Value *ZExted, *OtherOperand;
4080 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4081 m_Value(OtherOperand))) &&
4082 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4083 return false;
4084
4085 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4086
4087 auto *BigTy = cast<FixedVectorType>(I.getType());
4088 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4089 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4090
4091 if (I.getOpcode() == Instruction::LShr) {
4092 // Check that the shift amount is less than the number of bits in the
4093 // smaller type. Otherwise, the smaller lshr will return a poison value.
4094 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4095 if (ShAmtKB.getMaxValue().uge(BW))
4096 return false;
4097 } else {
4098 // Check that the expression overall uses at most the same number of bits as
4099 // ZExted
4100 KnownBits KB = computeKnownBits(&I, *DL);
4101 if (KB.countMaxActiveBits() > BW)
4102 return false;
4103 }
4104
4105 // Calculate costs of leaving current IR as it is and moving ZExt operation
4106 // later, along with adding truncates if needed
4107 InstructionCost ZExtCost = TTI.getCastInstrCost(
4108 Instruction::ZExt, BigTy, SmallTy,
4109 TargetTransformInfo::CastContextHint::None, CostKind);
4110 InstructionCost CurrentCost = ZExtCost;
4111 InstructionCost ShrinkCost = 0;
4112
4113 // Calculate total cost and check that we can propagate through all ZExt users
4114 for (User *U : ZExtOperand->users()) {
4115 auto *UI = cast<Instruction>(U);
4116 if (UI == &I) {
4117 CurrentCost +=
4118 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4119 ShrinkCost +=
4120 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4121 ShrinkCost += ZExtCost;
4122 continue;
4123 }
4124
4125 if (!Instruction::isBinaryOp(UI->getOpcode()))
4126 return false;
4127
4128 // Check if we can propagate ZExt through its other users
4129 KnownBits KB = computeKnownBits(UI, *DL);
4130 if (KB.countMaxActiveBits() > BW)
4131 return false;
4132
4133 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4134 ShrinkCost +=
4135 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4136 ShrinkCost += ZExtCost;
4137 }
4138
4139 // If the other instruction operand is not a constant, we'll need to
4140 // generate a truncate instruction. So we have to adjust cost
4141 if (!isa<Constant>(OtherOperand))
4142 ShrinkCost += TTI.getCastInstrCost(
4143 Instruction::Trunc, SmallTy, BigTy,
4144 TargetTransformInfo::CastContextHint::None, CostKind);
4145
4146 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4147 // towards modifying the IR because shrinking opens opportunities for other
4148 // shrinking optimisations.
4149 if (ShrinkCost > CurrentCost)
4150 return false;
4151
4152 Builder.SetInsertPoint(&I);
4153 Value *Op0 = ZExted;
4154 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4155 // Keep the order of operands the same
4156 if (I.getOperand(0) == OtherOperand)
4157 std::swap(Op0, Op1);
4158 Value *NewBinOp =
4159 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4160 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4161 cast<Instruction>(NewBinOp)->copyMetadata(I);
4162 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4163 replaceValue(I, *NewZExtr);
4164 return true;
4165}
4166
4167/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4168/// shuffle (DstVec, SrcVec, Mask)
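/// For example (illustrative, subject to the cost model):
///   %e = extractelement <4 x i32> %src, i64 2
///   %r = insertelement <4 x i32> %dst, i32 %e, i64 0
/// becomes
///   %r = shufflevector <4 x i32> %dst, <4 x i32> %src,
///                      <4 x i32> <i32 6, i32 1, i32 2, i32 3>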
4169bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4170 Value *DstVec, *SrcVec;
4171 uint64_t ExtIdx, InsIdx;
4172 if (!match(&I,
4173 m_InsertElt(m_Value(DstVec),
4174 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4175 m_ConstantInt(InsIdx))))
4176 return false;
4177
4178 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4179 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4180 // We can try combining vectors with different numbers of elements.
4181 if (!DstVecTy || !SrcVecTy ||
4182 SrcVecTy->getElementType() != DstVecTy->getElementType())
4183 return false;
4184
4185 unsigned NumDstElts = DstVecTy->getNumElements();
4186 unsigned NumSrcElts = SrcVecTy->getNumElements();
4187 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4188 return false;
4189
4190 // Insertion into poison is a cheaper single operand shuffle.
4191 TargetTransformInfo::ShuffleKind SK = TargetTransformInfo::SK_PermuteTwoSrc;
4192 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4193
4194 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4195 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4196 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4197 if (NeedDstSrcSwap) {
4198 SK = TargetTransformInfo::SK_PermuteSingleSrc;
4199 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4200 Mask[InsIdx] = 0;
4201 else
4202 Mask[InsIdx] = ExtIdx;
4203 std::swap(DstVec, SrcVec);
4204 } else {
4206 std::iota(Mask.begin(), Mask.end(), 0);
4207 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4208 Mask[InsIdx] = NumDstElts;
4209 else
4210 Mask[InsIdx] = ExtIdx + NumDstElts;
4211 }
4212
4213 // Cost
4214 auto *Ins = cast<InsertElementInst>(&I);
4215 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4216 InstructionCost InsCost =
4217 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4218 InstructionCost ExtCost =
4219 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4220 InstructionCost OldCost = ExtCost + InsCost;
4221
4222 InstructionCost NewCost = 0;
4223 SmallVector<int> ExtToVecMask;
4224 if (!NeedExpOrNarrow) {
4225 // Ignore 'free' identity insertion shuffle.
4226 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4227 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4228 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4229 nullptr, {DstVec, SrcVec});
4230 } else {
4231 // When creating the length-changing vector, use a mask that keeps the target
4232 // element at index ExtIdx when it is in bounds, and otherwise places it at
4233 // index 0, so the element to be extracted is always in the new vector.
4234 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4235 if (IsExtIdxInBounds)
4236 ExtToVecMask[ExtIdx] = ExtIdx;
4237 else
4238 ExtToVecMask[0] = ExtIdx;
4239 // Add cost for expanding or narrowing
4240 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
4241 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4242 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4243 }
4244
4245 if (!Ext->hasOneUse())
4246 NewCost += ExtCost;
4247
4248 LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
4249 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4250 << "\n");
4251
4252 if (OldCost < NewCost)
4253 return false;
4254
4255 if (NeedExpOrNarrow) {
4256 if (!NeedDstSrcSwap)
4257 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4258 else
4259 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4260 }
4261
4262 // Canonicalize undef param to RHS to help further folds.
4263 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4264 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4265 std::swap(DstVec, SrcVec);
4266 }
4267
4268 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4269 replaceValue(I, *Shuf);
4270
4271 return true;
4272}
4273
4274/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4275/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4276/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4277/// before casting it back into `<vscale x 16 x i32>`.
4278bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4279 const APInt *SplatVal0, *SplatVal1;
4280 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4281 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4282 return false;
4283
4284 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4285 << "\n");
4286
4287 auto *VTy =
4288 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4289 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4290 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4291
4292 // In case the costs of the interleave2 intrinsic and the bitcast are both
4293 // invalid (in which case we want to bail out), we use <= rather
4294 // than < here. Even if they both have valid and equal costs, it's probably
4295 // not a good idea to emit a high-cost constant splat.
4296 if (TTI.getInstructionCost(&I, CostKind) <=
4297 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4298 TTI::CastContextHint::None, CostKind)) {
4299 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4300 << *I.getType() << " is too high.\n");
4301 return false;
4302 }
4303
4304 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4305 NewSplatVal <<= Width;
4306 NewSplatVal |= SplatVal0->zext(Width * 2);
4307 auto *NewSplat = ConstantVector::getSplat(
4308 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4309
4310 IRBuilder<> Builder(&I);
4311 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4312 return true;
4313}
4314
4315// Attempt to shrink loads that are only used by shufflevector instructions.
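// For example (illustrative, subject to the cost model), if an <8 x float>
// load is only used by shuffles whose masks reference elements 0..3:
//   %v = load <8 x float>, ptr %p
//   %s = shufflevector <8 x float> %v, <8 x float> poison,
//                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// it can be shrunk to
//   %v4 = load <4 x float>, ptr %p
//   %s  = shufflevector <4 x float> %v4, <4 x float> poison,
//                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>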
4316bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4317 auto *OldLoad = dyn_cast<LoadInst>(&I);
4318 if (!OldLoad || !OldLoad->isSimple())
4319 return false;
4320
4321 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4322 if (!OldLoadTy)
4323 return false;
4324
4325 unsigned const OldNumElements = OldLoadTy->getNumElements();
4326
4327 // Search all uses of load. If all uses are shufflevector instructions, and
4328 // the second operands are all poison values, find the minimum and maximum
4329 // indices of the vector elements referenced by all shuffle masks.
4330 // Otherwise return `std::nullopt`.
4331 using IndexRange = std::pair<int, int>;
4332 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4333 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4334 for (llvm::Use &Use : I.uses()) {
4335 // Ensure all uses match the required pattern.
4336 User *Shuffle = Use.getUser();
4337 ArrayRef<int> Mask;
4338
4339 if (!match(Shuffle,
4340 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4341 return std::nullopt;
4342
4343 // Ignore shufflevector instructions that have no uses.
4344 if (Shuffle->use_empty())
4345 continue;
4346
4347 // Find the min and max indices used by the shufflevector instruction.
4348 for (int Index : Mask) {
4349 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4350 OutputRange.first = std::min(Index, OutputRange.first);
4351 OutputRange.second = std::max(Index, OutputRange.second);
4352 }
4353 }
4354 }
4355
4356 if (OutputRange.second < OutputRange.first)
4357 return std::nullopt;
4358
4359 return OutputRange;
4360 };
4361
4362 // Get the range of vector elements used by shufflevector instructions.
4363 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4364 unsigned const NewNumElements = Indices->second + 1u;
4365
4366 // If the range of vector elements is smaller than the full load, attempt
4367 // to create a smaller load.
4368 if (NewNumElements < OldNumElements) {
4369 IRBuilder Builder(&I);
4370 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4371
4372 // Calculate costs of old and new ops.
4373 Type *ElemTy = OldLoadTy->getElementType();
4374 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4375 Value *PtrOp = OldLoad->getPointerOperand();
4376
4377 InstructionCost OldCost = TTI.getMemoryOpCost(
4378 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4379 OldLoad->getPointerAddressSpace(), CostKind);
4380 InstructionCost NewCost =
4381 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4382 OldLoad->getPointerAddressSpace(), CostKind);
4383
4384 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4385 SmallVector<UseEntry, 4u> NewUses;
4386 unsigned const MaxIndex = NewNumElements * 2u;
4387
4388 for (llvm::Use &Use : I.uses()) {
4389 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4390 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4391
4392 // Create entry for new use.
4393 NewUses.push_back({Shuffle, OldMask});
4394
4395 // Validate mask indices.
4396 for (int Index : OldMask) {
4397 if (Index >= static_cast<int>(MaxIndex))
4398 return false;
4399 }
4400
4401 // Update costs.
4402 OldCost +=
4403 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4404 OldLoadTy, OldMask, CostKind);
4405 NewCost +=
4406 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4407 NewLoadTy, OldMask, CostKind);
4408 }
4409
4410 LLVM_DEBUG(
4411 dbgs() << "Found a load used only by shufflevector instructions: "
4412 << I << "\n OldCost: " << OldCost
4413 << " vs NewCost: " << NewCost << "\n");
4414
4415 if (OldCost < NewCost || !NewCost.isValid())
4416 return false;
4417
4418 // Create new load of smaller vector.
4419 auto *NewLoad = cast<LoadInst>(
4420 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4421 NewLoad->copyMetadata(I);
4422
4423 // Replace all uses.
4424 for (UseEntry &Use : NewUses) {
4425 ShuffleVectorInst *Shuffle = Use.first;
4426 std::vector<int> &NewMask = Use.second;
4427
4428 Builder.SetInsertPoint(Shuffle);
4429 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4430 Value *NewShuffle = Builder.CreateShuffleVector(
4431 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4432
4433 replaceValue(*Shuffle, *NewShuffle, false);
4434 }
4435
4436 return true;
4437 }
4438 }
4439 return false;
4440}
4441
4442// Attempt to narrow a phi of shufflevector instructions where the two incoming
4443// values have the same operands but different masks. If the two shuffle masks
4444// are offsets of one another we can use one branch to rotate the incoming
4445// vector and perform one larger shuffle after the phi.
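// For example (illustrative, subject to the cost model), with incoming values
//   %s0 = shufflevector <4 x i32> %v, <4 x i32> poison, <8 x i32> mask0
//   %s1 = shufflevector <4 x i32> %v, <4 x i32> poison, <8 x i32> mask1
// where mask0 = <2, 3, poison x 6> and mask1 = <0, 1, poison x 6>, the
// element-wise difference of the masks is a constant 2, so %v is rotated by
// two lanes on the first incoming edge, the phi is narrowed to <4 x i32>, and
// a single widening shuffle using mask1 is emitted after the phi.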
4446bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4447 auto *Phi = dyn_cast<PHINode>(&I);
4448 if (!Phi || Phi->getNumIncomingValues() != 2u)
4449 return false;
4450
4451 Value *Op = nullptr;
4452 ArrayRef<int> Mask0;
4453 ArrayRef<int> Mask1;
4454
4455 if (!match(Phi->getOperand(0u),
4456 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4457 !match(Phi->getOperand(1u),
4458 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4459 return false;
4460
4461 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4462
4463 // Ensure result vectors are wider than the argument vector.
4464 auto *InputVT = cast<FixedVectorType>(Op->getType());
4465 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4466 auto const InputNumElements = InputVT->getNumElements();
4467
4468 if (InputNumElements >= ResultVT->getNumElements())
4469 return false;
4470
4471 // Take the difference of the two shuffle masks at each index. Ignore poison
4472 // values at the same index in both masks.
4473 SmallVector<int, 16> NewMask;
4474 NewMask.reserve(Mask0.size());
4475
4476 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4477 if (M0 >= 0 && M1 >= 0)
4478 NewMask.push_back(M0 - M1);
4479 else if (M0 == -1 && M1 == -1)
4480 continue;
4481 else
4482 return false;
4483 }
4484
4485 // Ensure all elements of the new mask are equal. If the difference between
4486 // the incoming mask elements is the same, the two must be constant offsets
4487 // of one another.
4488 if (NewMask.empty() || !all_equal(NewMask))
4489 return false;
4490
4491 // Create new mask using difference of the two incoming masks.
4492 int MaskOffset = NewMask[0u];
4493 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
4494 NewMask.clear();
4495
4496 for (unsigned I = 0u; I < InputNumElements; ++I) {
4497 NewMask.push_back(Index);
4498 Index = (Index + 1u) % InputNumElements;
4499 }
4500
4501 // Calculate costs for worst cases and compare.
4502 auto const Kind = TTI::SK_PermuteSingleSrc;
4503 auto OldCost =
4504 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4505 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4506 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4507 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4508
4509 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4510 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4511 << "\n");
4512
4513 if (NewCost > OldCost)
4514 return false;
4515
4516 // Create new shuffles and narrowed phi.
4517 auto Builder = IRBuilder(Shuf);
4518 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4519 auto *PoisonVal = PoisonValue::get(InputVT);
4520 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4521 Worklist.push(cast<Instruction>(NewShuf0));
4522
4523 Builder.SetInsertPoint(Phi);
4524 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4525 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4526 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4527 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4528
4529 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4530 PoisonVal = PoisonValue::get(NewPhi->getType());
4531 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4532
4533 replaceValue(*Phi, *NewShuf1);
4534 return true;
4535}
4536
4537/// This is the entry point for all transforms. Pass manager differences are
4538/// handled in the callers of this function.
4539bool VectorCombine::run() {
4540 if (DisableVectorCombine)
4541 return false;
4542
4543 // Don't attempt vectorization if the target does not support vectors.
4544 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4545 return false;
4546
4547 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4548
4549 auto FoldInst = [this](Instruction &I) {
4550 Builder.SetInsertPoint(&I);
4551 bool IsVectorType = isa<VectorType>(I.getType());
4552 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4553 auto Opcode = I.getOpcode();
4554
4555 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4556
4557 // These folds should be beneficial regardless of when this pass is run
4558 // in the optimization pipeline.
4559 // The type checking is for run-time efficiency. We can avoid wasting time
4560 // dispatching to folding functions if there's no chance of matching.
4561 if (IsFixedVectorType) {
4562 switch (Opcode) {
4563 case Instruction::InsertElement:
4564 if (vectorizeLoadInsert(I))
4565 return true;
4566 break;
4567 case Instruction::ShuffleVector:
4568 if (widenSubvectorLoad(I))
4569 return true;
4570 break;
4571 default:
4572 break;
4573 }
4574 }
4575
4576 // This transform works with scalable and fixed vectors
4577 // TODO: Identify and allow other scalable transforms
4578 if (IsVectorType) {
4579 if (scalarizeOpOrCmp(I))
4580 return true;
4581 if (scalarizeLoadExtract(I))
4582 return true;
4583 if (scalarizeExtExtract(I))
4584 return true;
4585 if (scalarizeVPIntrinsic(I))
4586 return true;
4587 if (foldInterleaveIntrinsics(I))
4588 return true;
4589 }
4590
4591 if (Opcode == Instruction::Store)
4592 if (foldSingleElementStore(I))
4593 return true;
4594
4595 // If this is an early pipeline invocation of this pass, we are done.
4596 if (TryEarlyFoldsOnly)
4597 return false;
4598
4599 // Otherwise, try folds that improve codegen but may interfere with
4600 // early IR canonicalizations.
4601 // The type checking is for run-time efficiency. We can avoid wasting time
4602 // dispatching to folding functions if there's no chance of matching.
4603 if (IsFixedVectorType) {
4604 switch (Opcode) {
4605 case Instruction::InsertElement:
4606 if (foldInsExtFNeg(I))
4607 return true;
4608 if (foldInsExtBinop(I))
4609 return true;
4610 if (foldInsExtVectorToShuffle(I))
4611 return true;
4612 break;
4613 case Instruction::ShuffleVector:
4614 if (foldPermuteOfBinops(I))
4615 return true;
4616 if (foldShuffleOfBinops(I))
4617 return true;
4618 if (foldShuffleOfSelects(I))
4619 return true;
4620 if (foldShuffleOfCastops(I))
4621 return true;
4622 if (foldShuffleOfShuffles(I))
4623 return true;
4624 if (foldShuffleOfIntrinsics(I))
4625 return true;
4626 if (foldSelectShuffle(I))
4627 return true;
4628 if (foldShuffleToIdentity(I))
4629 return true;
4630 break;
4631 case Instruction::Load:
4632 if (shrinkLoadForShuffles(I))
4633 return true;
4634 break;
4635 case Instruction::BitCast:
4636 if (foldBitcastShuffle(I))
4637 return true;
4638 break;
4639 case Instruction::And:
4640 case Instruction::Or:
4641 case Instruction::Xor:
4642 if (foldBitOpOfCastops(I))
4643 return true;
4644 if (foldBitOpOfCastConstant(I))
4645 return true;
4646 break;
4647 case Instruction::PHI:
4648 if (shrinkPhiOfShuffles(I))
4649 return true;
4650 break;
4651 default:
4652 if (shrinkType(I))
4653 return true;
4654 break;
4655 }
4656 } else {
4657 switch (Opcode) {
4658 case Instruction::Call:
4659 if (foldShuffleFromReductions(I))
4660 return true;
4661 if (foldCastFromReductions(I))
4662 return true;
4663 break;
4664 case Instruction::ExtractElement:
4665 if (foldShuffleChainsToReduce(I))
4666 return true;
4667 break;
4668 case Instruction::ICmp:
4669 case Instruction::FCmp:
4670 if (foldExtractExtract(I))
4671 return true;
4672 break;
4673 case Instruction::Or:
4674 if (foldConcatOfBoolMasks(I))
4675 return true;
4676 [[fallthrough]];
4677 default:
4678 if (Instruction::isBinaryOp(Opcode)) {
4679 if (foldExtractExtract(I))
4680 return true;
4681 if (foldExtractedCmps(I))
4682 return true;
4683 if (foldBinopOfReductions(I))
4684 return true;
4685 }
4686 break;
4687 }
4688 }
4689 return false;
4690 };
4691
4692 bool MadeChange = false;
4693 for (BasicBlock &BB : F) {
4694 // Ignore unreachable basic blocks.
4695 if (!DT.isReachableFromEntry(&BB))
4696 continue;
4697     // make_early_inc_range is not applicable here, as the next iterator
4698     // may be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
4699     // Instead, manually maintain the next instruction (NextInst) and
4700     // update it when it is about to be deleted, so instructions can still
4701     // be erased while iterating over the block.
4702 Instruction *I = &BB.front();
4703 while (I) {
4704 NextInst = I->getNextNode();
4705 if (!I->isDebugOrPseudoInst())
4706 MadeChange |= FoldInst(*I);
4707 I = NextInst;
4708 }
4709 }
4710
4711 NextInst = nullptr;
4712
4713 while (!Worklist.isEmpty()) {
4714 Instruction *I = Worklist.removeOne();
4715 if (!I)
4716 continue;
4717
4718     if (isInstructionTriviallyDead(I)) {
4719       eraseInstruction(*I);
4720       continue;
4721 }
4722
4723 MadeChange |= FoldInst(*I);
4724 }
4725
4726 return MadeChange;
4727}
4728
4729 PreservedAnalyses VectorCombinePass::run(Function &F,
4730                                          FunctionAnalysisManager &FAM) {
4731   auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4732   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
4733 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
4734 AAResults &AA = FAM.getResult<AAManager>(F);
4735 const DataLayout *DL = &F.getDataLayout();
4736 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4737 TryEarlyFoldsOnly);
4738 if (!Combiner.run())
4739 return PreservedAnalyses::all();
4740   PreservedAnalyses PA;
4741   PA.preserveSet<CFGAnalyses>();
4742   return PA;
4743}
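
For reference, this pass is registered under the name "vector-combine", so it can be exercised in isolation with `opt -passes=vector-combine input.ll -S`. The driver below is a minimal sketch (not part of VectorCombine.cpp) of scheduling the pass through the new pass manager; the helper name runVectorCombineOnModule and the surrounding analysis-manager setup are illustrative, while VectorCombinePass and the PassBuilder APIs shown are the real LLVM interfaces.

// Minimal sketch: run VectorCombinePass over every function in a module
// using the new pass manager. Assumes the program is linked against LLVM.
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"

using namespace llvm;

static void runVectorCombineOnModule(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  // Register all analyses and wire up the proxies between the managers so
  // the function pass can query DominatorTree, AA, AssumptionCache, and TTI.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // TryEarlyFoldsOnly=false enables the full set of folds, matching the
  // late-pipeline invocation of this pass; the early-pipeline invocation
  // constructs it with TryEarlyFoldsOnly=true instead.
  FunctionPassManager FPM;
  FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/false));

  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}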