VectorCombine.cpp
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Vectorize/VectorCombine.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
44#include "llvm/Transforms/Utils/InstructionWorklist.h"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
58static cl::opt<bool> DisableVectorCombine(
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
62static cl::opt<bool> DisableBinopExtractShuffle(
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
66static cl::opt<unsigned> MaxInstrsToScan(
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoad(Instruction &I);
133 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
134 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
135 bool scalarizeExtExtract(Instruction &I);
136 bool foldConcatOfBoolMasks(Instruction &I);
137 bool foldPermuteOfBinops(Instruction &I);
138 bool foldShuffleOfBinops(Instruction &I);
139 bool foldShuffleOfSelects(Instruction &I);
140 bool foldShuffleOfCastops(Instruction &I);
141 bool foldShuffleOfShuffles(Instruction &I);
142 bool foldShuffleOfIntrinsics(Instruction &I);
143 bool foldShuffleToIdentity(Instruction &I);
144 bool foldShuffleFromReductions(Instruction &I);
145 bool foldShuffleChainsToReduce(Instruction &I);
146 bool foldCastFromReductions(Instruction &I);
147 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
148 bool foldInterleaveIntrinsics(Instruction &I);
149 bool shrinkType(Instruction &I);
150 bool shrinkLoadForShuffles(Instruction &I);
151 bool shrinkPhiOfShuffles(Instruction &I);
152
153 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
154 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
155 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
156 Old.replaceAllUsesWith(&New);
157 if (auto *NewI = dyn_cast<Instruction>(&New)) {
158 New.takeName(&Old);
159 Worklist.pushUsersToWorkList(*NewI);
160 Worklist.pushValue(NewI);
161 }
162 if (Erase && isInstructionTriviallyDead(&Old)) {
163 eraseInstruction(Old);
164 } else {
165 Worklist.push(&Old);
166 }
167 }
168
169 void eraseInstruction(Instruction &I) {
170 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
171 SmallVector<Value *> Ops(I.operands());
172 Worklist.remove(&I);
173 I.eraseFromParent();
174
175 // Push remaining users of the operands and then the operand itself - allows
176 // further folds that were hindered by OneUse limits.
177 SmallPtrSet<Value *, 4> Visited;
178 for (Value *Op : Ops) {
179 if (!Visited.contains(Op)) {
180 if (auto *OpI = dyn_cast<Instruction>(Op)) {
181 if (RecursivelyDeleteTriviallyDeadInstructions(
182 OpI, nullptr, nullptr, [&](Value *V) {
183 if (auto *I = dyn_cast<Instruction>(V)) {
184 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
185 Worklist.remove(I);
186 if (I == NextInst)
187 NextInst = NextInst->getNextNode();
188 Visited.insert(I);
189 }
190 }))
191 continue;
192 Worklist.pushUsersToWorkList(*OpI);
193 Worklist.pushValue(OpI);
194 }
195 }
196 }
197 }
198};
199} // namespace
200
201/// Return the source operand of a potentially bitcasted value. If there is no
202/// bitcast, return the input value itself.
203static Value *peekThroughBitcasts(Value *V) {
204 while (auto *BitCast = dyn_cast<BitCastInst>(V))
205 V = BitCast->getOperand(0);
206 return V;
207}
208
209static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
210 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
211 // The widened load may load data from dirty regions or create data races
212 // non-existent in the source.
213 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
214 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
215 mustSuppressSpeculation(*Load))
216 return false;
217
218 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
219 // sure we have all of our type-based constraints in place for this target.
220 Type *ScalarTy = Load->getType()->getScalarType();
221 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
222 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
223 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
224 ScalarSize % 8 != 0)
225 return false;
226
227 return true;
228}
229
230bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
231 // Match insert into fixed vector of scalar value.
232 // TODO: Handle non-zero insert index.
233 Value *Scalar;
234 if (!match(&I,
235 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
236 return false;
237
238 // Optionally match an extract from another vector.
239 Value *X;
240 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
241 if (!HasExtract)
242 X = Scalar;
243
244 auto *Load = dyn_cast<LoadInst>(X);
245 if (!canWidenLoad(Load, TTI))
246 return false;
247
248 Type *ScalarTy = Scalar->getType();
249 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
250 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
251
252 // Check safety of replacing the scalar load with a larger vector load.
253 // We use minimal alignment (maximum flexibility) because we only care about
254 // the dereferenceable region. When calculating cost and creating a new op,
255 // we may use a larger value based on alignment attributes.
256 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
257 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
258
259 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
260 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
261 unsigned OffsetEltIndex = 0;
262 Align Alignment = Load->getAlign();
263 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
264 &DT)) {
265 // It is not safe to load directly from the pointer, but we can still peek
266 // through gep offsets and check if it safe to load from a base address with
267 // updated alignment. If it is, we can shuffle the element(s) into place
268 // after loading.
269 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
270 APInt Offset(OffsetBitWidth, 0);
271 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
272
273 // We want to shuffle the result down from a high element of a vector, so
274 // the offset must be positive.
275 if (Offset.isNegative())
276 return false;
277
278 // The offset must be a multiple of the scalar element to shuffle cleanly
279 // in the element's size.
280 uint64_t ScalarSizeInBytes = ScalarSize / 8;
281 if (Offset.urem(ScalarSizeInBytes) != 0)
282 return false;
283
284 // If we load MinVecNumElts, will our target element still be loaded?
285 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
286 if (OffsetEltIndex >= MinVecNumElts)
287 return false;
288
289 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
290 &DT))
291 return false;
292
293 // Update alignment with offset value. Note that the offset could be negated
294 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
295 // negation does not change the result of the alignment calculation.
296 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
297 }
298
299 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
300 // Use the greater of the alignment on the load or its source pointer.
301 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
302 Type *LoadTy = Load->getType();
303 unsigned AS = Load->getPointerAddressSpace();
304 InstructionCost OldCost =
305 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
306 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
307 OldCost +=
308 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
309 /* Insert */ true, HasExtract, CostKind);
310
311 // New pattern: load VecPtr
312 InstructionCost NewCost =
313 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
314 // Optionally, we are shuffling the loaded vector element(s) into place.
315 // For the mask set everything but element 0 to undef to prevent poison from
316 // propagating from the extra loaded memory. This will also optionally
317 // shrink/grow the vector from the loaded size to the output size.
318 // We assume this operation has no cost in codegen if there was no offset.
319 // Note that we could use freeze to avoid poison problems, but then we might
320 // still need a shuffle to change the vector size.
321 auto *Ty = cast<FixedVectorType>(I.getType());
322 unsigned OutputNumElts = Ty->getNumElements();
323 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
324 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
325 Mask[0] = OffsetEltIndex;
326 if (OffsetEltIndex)
327 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
328 CostKind);
329
330 // We can aggressively convert to the vector form because the backend can
331 // invert this transform if it does not result in a performance win.
332 if (OldCost < NewCost || !NewCost.isValid())
333 return false;
334
335 // It is safe and potentially profitable to load a vector directly:
336 // inselt undef, load Scalar, 0 --> load VecPtr
337 IRBuilder<> Builder(Load);
338 Value *CastedPtr =
339 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
340 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
341 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
342
343 replaceValue(I, *VecLd);
344 ++NumVecLoad;
345 return true;
346}
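// Illustrative example for vectorizeLoadInsert (added commentary, not part of
// the original source). Assuming a target with 128-bit vector registers and a
// dereferenceable region at %p:
//   %s = load float, ptr %p, align 4
//   %i = insertelement <4 x float> poison, float %s, i64 0
// may become:
//   %v = load <4 x float>, ptr %p, align 4
//   %i = shufflevector <4 x float> %v, <4 x float> poison,
//        <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>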
347
348/// If we are loading a vector and then inserting it into a larger vector with
349/// undefined elements, try to load the larger vector and eliminate the insert.
350/// This removes a shuffle in IR and may allow combining of other loaded values.
351bool VectorCombine::widenSubvectorLoad(Instruction &I) {
352 // Match subvector insert of fixed vector.
353 auto *Shuf = cast<ShuffleVectorInst>(&I);
354 if (!Shuf->isIdentityWithPadding())
355 return false;
356
357 // Allow a non-canonical shuffle mask that is choosing elements from op1.
358 unsigned NumOpElts =
359 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
360 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
361 return M >= (int)(NumOpElts);
362 });
363
364 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
365 if (!canWidenLoad(Load, TTI))
366 return false;
367
368 // We use minimal alignment (maximum flexibility) because we only care about
369 // the dereferenceable region. When calculating cost and creating a new op,
370 // we may use a larger value based on alignment attributes.
371 auto *Ty = cast<FixedVectorType>(I.getType());
372 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
373 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
374 Align Alignment = Load->getAlign();
375 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
376 return false;
377
378 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
379 Type *LoadTy = Load->getType();
380 unsigned AS = Load->getPointerAddressSpace();
381
382 // Original pattern: insert_subvector (load PtrOp)
383 // This conservatively assumes that the cost of a subvector insert into an
384 // undef value is 0. We could add that cost if the cost model accurately
385 // reflects the real cost of that operation.
386 InstructionCost OldCost =
387 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
388
389 // New pattern: load PtrOp
390 InstructionCost NewCost =
391 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
392
393 // We can aggressively convert to the vector form because the backend can
394 // invert this transform if it does not result in a performance win.
395 if (OldCost < NewCost || !NewCost.isValid())
396 return false;
397
398 IRBuilder<> Builder(Load);
399 Value *CastedPtr =
400 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
401 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
402 replaceValue(I, *VecLd);
403 ++NumVecLoad;
404 return true;
405}
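// Illustrative example for widenSubvectorLoad (added commentary, not part of
// the original source). Assuming the wider region is known dereferenceable:
//   %v2 = load <2 x float>, ptr %p, align 8
//   %w  = shufflevector <2 x float> %v2, <2 x float> poison,
//         <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
// may become:
//   %w = load <4 x float>, ptr %p, align 8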
406
407/// Determine which, if any, of the inputs should be replaced by a shuffle
408/// followed by extract from a different index.
409ExtractElementInst *VectorCombine::getShuffleExtract(
410 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
411 unsigned PreferredExtractIndex = InvalidIndex) const {
412 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
413 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
414 assert(Index0C && Index1C && "Expected constant extract indexes");
415
416 unsigned Index0 = Index0C->getZExtValue();
417 unsigned Index1 = Index1C->getZExtValue();
418
419 // If the extract indexes are identical, no shuffle is needed.
420 if (Index0 == Index1)
421 return nullptr;
422
423 Type *VecTy = Ext0->getVectorOperand()->getType();
424 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
425 InstructionCost Cost0 =
426 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
427 InstructionCost Cost1 =
428 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
429
430 // If both costs are invalid, no shuffle is needed.
431 if (!Cost0.isValid() && !Cost1.isValid())
432 return nullptr;
433
434 // We are extracting from 2 different indexes, so one operand must be shuffled
435 // before performing a vector operation and/or extract. The more expensive
436 // extract will be replaced by a shuffle.
437 if (Cost0 > Cost1)
438 return Ext0;
439 if (Cost1 > Cost0)
440 return Ext1;
441
442 // If the costs are equal and there is a preferred extract index, shuffle the
443 // opposite operand.
444 if (PreferredExtractIndex == Index0)
445 return Ext1;
446 if (PreferredExtractIndex == Index1)
447 return Ext0;
448
449 // Otherwise, replace the extract with the higher index.
450 return Index0 > Index1 ? Ext0 : Ext1;
451}
452
453/// Compare the relative costs of 2 extracts followed by scalar operation vs.
454/// vector operation(s) followed by extract. Return true if the existing
455/// instructions are cheaper than a vector alternative. Otherwise, return false
456/// and if one of the extracts should be transformed to a shufflevector, set
457/// \p ConvertToShuffle to that extract instruction.
458bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
459 ExtractElementInst *Ext1,
460 const Instruction &I,
461 ExtractElementInst *&ConvertToShuffle,
462 unsigned PreferredExtractIndex) {
463 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
464 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
465 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
466
467 unsigned Opcode = I.getOpcode();
468 Value *Ext0Src = Ext0->getVectorOperand();
469 Value *Ext1Src = Ext1->getVectorOperand();
470 Type *ScalarTy = Ext0->getType();
471 auto *VecTy = cast<VectorType>(Ext0Src->getType());
472 InstructionCost ScalarOpCost, VectorOpCost;
473
474 // Get cost estimates for scalar and vector versions of the operation.
475 bool IsBinOp = Instruction::isBinaryOp(Opcode);
476 if (IsBinOp) {
477 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
478 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
479 } else {
480 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
481 "Expected a compare");
482 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
483 ScalarOpCost = TTI.getCmpSelInstrCost(
484 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
485 VectorOpCost = TTI.getCmpSelInstrCost(
486 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
487 }
488
489 // Get cost estimates for the extract elements. These costs will factor into
490 // both sequences.
491 unsigned Ext0Index = Ext0IndexC->getZExtValue();
492 unsigned Ext1Index = Ext1IndexC->getZExtValue();
493
494 InstructionCost Extract0Cost =
495 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
496 InstructionCost Extract1Cost =
497 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
498
499 // A more expensive extract will always be replaced by a splat shuffle.
500 // For example, if Ext0 is more expensive:
501 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
502 // extelt (opcode (splat V0, Ext0), V1), Ext1
503 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
504 // check the cost of creating a broadcast shuffle and shuffling both
505 // operands to element 0.
506 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
507 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
508 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
509
510 // Extra uses of the extracts mean that we include those costs in the
511 // vector total because those instructions will not be eliminated.
512 InstructionCost OldCost, NewCost;
513 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
514 // Handle a special case. If the 2 extracts are identical, adjust the
515 // formulas to account for that. The extra use charge allows for either the
516 // CSE'd pattern or an unoptimized form with identical values:
517 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
518 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
519 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
520 OldCost = CheapExtractCost + ScalarOpCost;
521 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
522 } else {
523 // Handle the general case. Each extract is actually a different value:
524 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
525 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
526 NewCost = VectorOpCost + CheapExtractCost +
527 !Ext0->hasOneUse() * Extract0Cost +
528 !Ext1->hasOneUse() * Extract1Cost;
529 }
530
531 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
532 if (ConvertToShuffle) {
533 if (IsBinOp && DisableBinopExtractShuffle)
534 return true;
535
536 // If we are extracting from 2 different indexes, then one operand must be
537 // shuffled before performing the vector operation. The shuffle mask is
538 // poison except for 1 lane that is being translated to the remaining
539 // extraction lane. Therefore, it is a splat shuffle. Ex:
540 // ShufMask = { poison, poison, 0, poison }
541 // TODO: The cost model has an option for a "broadcast" shuffle
542 // (splat-from-element-0), but no option for a more general splat.
543 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
544 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
545 PoisonMaskElem);
546 ShuffleMask[BestInsIndex] = BestExtIndex;
547 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
548 VecTy, VecTy, ShuffleMask, CostKind, 0,
549 nullptr, {ConvertToShuffle});
550 } else {
551 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
552 VecTy, VecTy, {}, CostKind, 0, nullptr,
553 {ConvertToShuffle});
554 }
555 }
556
557 // Aggressively form a vector op if the cost is equal because the transform
558 // may enable further optimization.
559 // Codegen can reverse this transform (scalarize) if it was not profitable.
560 return OldCost < NewCost;
561}
562
563/// Create a shuffle that translates (shifts) 1 element from the input vector
564/// to a new element location.
565static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
566 unsigned NewIndex, IRBuilderBase &Builder) {
567 // The shuffle mask is poison except for 1 lane that is being translated
568 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
569 // ShufMask = { 2, poison, poison, poison }
570 auto *VecTy = cast<FixedVectorType>(Vec->getType());
571 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
572 ShufMask[NewIndex] = OldIndex;
573 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
574}
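// Illustrative example for createShiftShuffle (added commentary, not part of
// the original source): shifting element 2 of a 4-element vector to lane 0
// produces:
//   %shift = shufflevector <4 x i32> %vec, <4 x i32> poison,
//            <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>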
575
576/// Given an extract element instruction with constant index operand, shuffle
577/// the source vector (shift the scalar element) to a NewIndex for extraction.
578/// Return null if the input can be constant folded, so that we are not creating
579/// unnecessary instructions.
580static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
581 IRBuilderBase &Builder) {
582 // Shufflevectors can only be created for fixed-width vectors.
583 Value *X = ExtElt->getVectorOperand();
584 if (!isa<FixedVectorType>(X->getType()))
585 return nullptr;
586
587 // If the extract can be constant-folded, this code is unsimplified. Defer
588 // to other passes to handle that.
589 Value *C = ExtElt->getIndexOperand();
590 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
591 if (isa<Constant>(X))
592 return nullptr;
593
594 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
595 NewIndex, Builder);
596 return Shuf;
597}
598
599/// Try to reduce extract element costs by converting scalar compares to vector
600/// compares followed by extract.
601/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
602Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
603 Instruction &I) {
604 assert(isa<CmpInst>(&I) && "Expected a compare");
605
606 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
607 // --> extelt (cmp Pred V0, V1), ExtIndex
608 ++NumVecCmp;
609 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
610 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
611 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
612}
613
614/// Try to reduce extract element costs by converting scalar binops to vector
615/// binops followed by extract.
616/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
617Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
618 Instruction &I) {
619 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
620
621 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
622 // --> extelt (bo V0, V1), ExtIndex
623 ++NumVecBO;
624 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
625 V1, "foldExtExtBinop");
626
627 // All IR flags are safe to back-propagate because any potential poison
628 // created in unused vector elements is discarded by the extract.
629 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
630 VecBOInst->copyIRFlags(&I);
631
632 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
633}
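// Illustrative example for foldExtExtBinop (added commentary, not part of the
// original source):
//   %e0 = extractelement <4 x i32> %v0, i64 1
//   %e1 = extractelement <4 x i32> %v1, i64 1
//   %r  = add i32 %e0, %e1
// -->
//   %vbo = add <4 x i32> %v0, %v1
//   %r   = extractelement <4 x i32> %vbo, i64 1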
634
635/// Match an instruction with extracted vector operands.
636bool VectorCombine::foldExtractExtract(Instruction &I) {
637 // It is not safe to transform things like div, urem, etc. because we may
638 // create undefined behavior when executing those on unknown vector elements.
639 if (!isSafeToSpeculativelyExecute(&I))
640 return false;
641
642 Instruction *I0, *I1;
643 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
644 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
645 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
646 return false;
647
648 Value *V0, *V1;
649 uint64_t C0, C1;
650 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
651 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
652 V0->getType() != V1->getType())
653 return false;
654
655 // If the scalar value 'I' is going to be re-inserted into a vector, then try
656 // to create an extract to that same element. The extract/insert can be
657 // reduced to a "select shuffle".
658 // TODO: If we add a larger pattern match that starts from an insert, this
659 // probably becomes unnecessary.
660 auto *Ext0 = cast<ExtractElementInst>(I0);
661 auto *Ext1 = cast<ExtractElementInst>(I1);
662 uint64_t InsertIndex = InvalidIndex;
663 if (I.hasOneUse())
664 match(I.user_back(),
665 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
666
667 ExtractElementInst *ExtractToChange;
668 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
669 return false;
670
671 Value *ExtOp0 = Ext0->getVectorOperand();
672 Value *ExtOp1 = Ext1->getVectorOperand();
673
674 if (ExtractToChange) {
675 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
676 Value *NewExtOp =
677 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
678 if (!NewExtOp)
679 return false;
680 if (ExtractToChange == Ext0)
681 ExtOp0 = NewExtOp;
682 else
683 ExtOp1 = NewExtOp;
684 }
685
686 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
687 : Ext0->getIndexOperand();
688 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
689 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
690 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
691 Worklist.push(Ext0);
692 Worklist.push(Ext1);
693 replaceValue(I, *NewExt);
694 return true;
695}
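// Illustrative example for foldExtractExtract with mismatched extract indexes
// (added commentary, not part of the original source). Assuming the extract
// from lane 2 is the more expensive one, that operand is shuffled first so
// both extracts use the same lane:
//   add (extractelement <4 x i32> %v0, i64 2), (extractelement <4 x i32> %v1, i64 0)
// -->
//   %shift = shufflevector <4 x i32> %v0, <4 x i32> poison,
//            <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
//   extractelement (add <4 x i32> %shift, %v1), i64 0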
696
697/// Try to replace an extract + scalar fneg + insert with a vector fneg +
698/// shuffle.
699bool VectorCombine::foldInsExtFNeg(Instruction &I) {
700 // Match an insert (op (extract)) pattern.
701 Value *DstVec;
702 uint64_t ExtIdx, InsIdx;
703 Instruction *FNeg;
704 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
705 m_ConstantInt(InsIdx))))
706 return false;
707
708 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
709 Value *SrcVec;
710 Instruction *Extract;
711 if (!match(FNeg, m_FNeg(m_CombineAnd(
712 m_Instruction(Extract),
713 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
714 return false;
715
716 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
717 auto *DstVecScalarTy = DstVecTy->getScalarType();
718 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
719 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
720 return false;
721
722 // Ignore if insert/extract index is out of bounds or destination vector has
723 // one element
724 unsigned NumDstElts = DstVecTy->getNumElements();
725 unsigned NumSrcElts = SrcVecTy->getNumElements();
726 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
727 return false;
728
729 // We are inserting the negated element into the same lane that we extracted
730 // from. This is equivalent to a select-shuffle that chooses all but the
731 // negated element from the destination vector.
732 SmallVector<int> Mask(NumDstElts);
733 std::iota(Mask.begin(), Mask.end(), 0);
734 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
735 InstructionCost OldCost =
736 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
737 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
738
739 // If the extract has one use, it will be eliminated, so count it in the
740 // original cost. If it has more than one use, ignore the cost because it will
741 // be the same before/after.
742 if (Extract->hasOneUse())
743 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
744
745 InstructionCost NewCost =
746 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
747 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
748 DstVecTy, Mask, CostKind);
749
750 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
751 // If the lengths of the two vectors are not equal,
752 // we need to add a length-change vector. Add this cost.
753 SmallVector<int> SrcMask;
754 if (NeedLenChg) {
755 SrcMask.assign(NumDstElts, PoisonMaskElem);
756 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
757 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
758 DstVecTy, SrcVecTy, SrcMask, CostKind);
759 }
760
761 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
762 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
763 << "\n");
764 if (NewCost > OldCost)
765 return false;
766
767 Value *NewShuf, *LenChgShuf = nullptr;
768 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
769 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
770 if (NeedLenChg) {
771 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
772 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
773 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
774 Worklist.pushValue(LenChgShuf);
775 } else {
776 // shuffle DstVec, (fneg SrcVec), Mask
777 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
778 }
779
780 Worklist.pushValue(VecFNeg);
781 replaceValue(I, *NewShuf);
782 return true;
783}
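// Illustrative example for foldInsExtFNeg with matching vector lengths (added
// commentary, not part of the original source):
//   %e = extractelement <4 x float> %src, i64 1
//   %n = fneg float %e
//   %i = insertelement <4 x float> %dst, float %n, i64 1
// -->
//   %neg = fneg <4 x float> %src
//   %i   = shufflevector <4 x float> %dst, <4 x float> %neg,
//          <4 x i32> <i32 0, i32 5, i32 2, i32 3>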
784
785/// Try to fold insert(binop(x,y),binop(a,b),idx)
786/// --> binop(insert(x,a,idx),insert(y,b,idx))
787bool VectorCombine::foldInsExtBinop(Instruction &I) {
788 BinaryOperator *VecBinOp, *SclBinOp;
789 uint64_t Index;
790 if (!match(&I,
791 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
792 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
793 return false;
794
795 // TODO: Add support for addlike etc.
796 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
797 if (BinOpcode != SclBinOp->getOpcode())
798 return false;
799
800 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
801 if (!ResultTy)
802 return false;
803
804 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
805 // shuffle?
806
807 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
808 TTI.getInstructionCost(VecBinOp, CostKind) +
809 TTI.getInstructionCost(SclBinOp, CostKind);
810 InstructionCost NewCost =
811 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
812 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
813 Index, VecBinOp->getOperand(0),
814 SclBinOp->getOperand(0)) +
815 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
816 Index, VecBinOp->getOperand(1),
817 SclBinOp->getOperand(1));
818
819 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
820 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
821 << "\n");
822 if (NewCost > OldCost)
823 return false;
824
825 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
826 SclBinOp->getOperand(0), Index);
827 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
828 SclBinOp->getOperand(1), Index);
829 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
830
831 // Intersect flags from the old binops.
832 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
833 NewInst->copyIRFlags(VecBinOp);
834 NewInst->andIRFlags(SclBinOp);
835 }
836
837 Worklist.pushValue(NewIns0);
838 Worklist.pushValue(NewIns1);
839 replaceValue(I, *NewBO);
840 return true;
841}
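// Illustrative example for foldInsExtBinop (added commentary, not part of the
// original source):
//   %vec = add <4 x i32> %x, %y
//   %scl = add i32 %a, %b
//   %ins = insertelement <4 x i32> %vec, i32 %scl, i64 2
// -->
//   %ins0 = insertelement <4 x i32> %x, i32 %a, i64 2
//   %ins1 = insertelement <4 x i32> %y, i32 %b, i64 2
//   %ins  = add <4 x i32> %ins0, %ins1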
842
843/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
844/// Supports: bitcast, trunc, sext, zext
845bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
846 // Check if this is a bitwise logic operation
847 auto *BinOp = dyn_cast<BinaryOperator>(&I);
848 if (!BinOp || !BinOp->isBitwiseLogicOp())
849 return false;
850
851 // Get the cast instructions
852 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
853 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
854 if (!LHSCast || !RHSCast) {
855 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
856 return false;
857 }
858
859 // Both casts must be the same type
860 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
861 if (CastOpcode != RHSCast->getOpcode())
862 return false;
863
864 // Only handle supported cast operations
865 switch (CastOpcode) {
866 case Instruction::BitCast:
867 case Instruction::Trunc:
868 case Instruction::SExt:
869 case Instruction::ZExt:
870 break;
871 default:
872 return false;
873 }
874
875 Value *LHSSrc = LHSCast->getOperand(0);
876 Value *RHSSrc = RHSCast->getOperand(0);
877
878 // Source types must match
879 if (LHSSrc->getType() != RHSSrc->getType())
880 return false;
881
882 auto *SrcTy = LHSSrc->getType();
883 auto *DstTy = I.getType();
884 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
885 // Other casts only handle vector types with integer elements.
886 if (CastOpcode != Instruction::BitCast &&
887 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
888 return false;
889
890 // Only integer scalar/vector values are legal for bitwise logic operations.
891 if (!SrcTy->getScalarType()->isIntegerTy() ||
892 !DstTy->getScalarType()->isIntegerTy())
893 return false;
894
895 // Cost Check :
896 // OldCost = bitlogic + 2*casts
897 // NewCost = bitlogic + cast
898
899 // Calculate specific costs for each cast with instruction context
900 InstructionCost LHSCastCost = TTI.getCastInstrCost(
901 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
902 InstructionCost RHSCastCost = TTI.getCastInstrCost(
903 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
904
905 InstructionCost OldCost =
906 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
907 LHSCastCost + RHSCastCost;
908
909 // For new cost, we can't provide an instruction (it doesn't exist yet)
910 InstructionCost GenericCastCost = TTI.getCastInstrCost(
911 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
912
913 InstructionCost NewCost =
914 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
915 GenericCastCost;
916
917 // Account for multi-use casts using specific costs
918 if (!LHSCast->hasOneUse())
919 NewCost += LHSCastCost;
920 if (!RHSCast->hasOneUse())
921 NewCost += RHSCastCost;
922
923 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
924 << " NewCost=" << NewCost << "\n");
925
926 if (NewCost > OldCost)
927 return false;
928
929 // Create the operation on the source type
930 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
931 BinOp->getName() + ".inner");
932 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
933 NewBinOp->copyIRFlags(BinOp);
934
935 Worklist.pushValue(NewOp);
936
937 // Create the cast operation directly to ensure we get a new instruction
938 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
939
940 // Preserve cast instruction flags
941 NewCast->copyIRFlags(LHSCast);
942 NewCast->andIRFlags(RHSCast);
943
944 // Insert the new instruction
945 Value *Result = Builder.Insert(NewCast);
946
947 replaceValue(I, *Result);
948 return true;
949}
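// Illustrative example for foldBitOpOfCastops (added commentary, not part of
// the original source):
//   %zx = zext <4 x i8> %x to <4 x i32>
//   %zy = zext <4 x i8> %y to <4 x i32>
//   %r  = and <4 x i32> %zx, %zy
// -->
//   %inner = and <4 x i8> %x, %y
//   %r     = zext <4 x i8> %inner to <4 x i32>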
950
951/// Match:
952// bitop(castop(x), C) ->
953// bitop(castop(x), castop(InvC)) ->
954// castop(bitop(x, InvC))
955// Supports: bitcast
956bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
957 Value *LHS;
958 Constant *C;
959
960 // Check if this is a bitwise logic operation
961 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
962 return false;
963
964 // Get the cast instructions
965 auto *LHSCast = dyn_cast<CastInst>(LHS);
966 if (!LHSCast)
967 return false;
968
969 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
970
971 // Only handle supported cast operations
972 switch (CastOpcode) {
973 case Instruction::BitCast:
974 case Instruction::ZExt:
975 case Instruction::SExt:
976 case Instruction::Trunc:
977 break;
978 default:
979 return false;
980 }
981
982 Value *LHSSrc = LHSCast->getOperand(0);
983
984 auto *SrcTy = LHSSrc->getType();
985 auto *DstTy = I.getType();
986 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
987 // Other casts only handle vector types with integer elements.
988 if (CastOpcode != Instruction::BitCast &&
989 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
990 return false;
991
992 // Only integer scalar/vector values are legal for bitwise logic operations.
993 if (!SrcTy->getScalarType()->isIntegerTy() ||
994 !DstTy->getScalarType()->isIntegerTy())
995 return false;
996
997 // Find the constant InvC, such that castop(InvC) equals to C.
998 PreservedCastFlags RHSFlags;
999 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1000 if (!InvC)
1001 return false;
1002
1003 // Cost Check :
1004 // OldCost = bitlogic + cast
1005 // NewCost = bitlogic + cast
1006
1007 // Calculate specific costs for each cast with instruction context
1008 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1009 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1010
1011 InstructionCost OldCost =
1012 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1013
1014 // For new cost, we can't provide an instruction (it doesn't exist yet)
1015 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1016 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1017
1018 InstructionCost NewCost =
1019 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1020 GenericCastCost;
1021
1022 // Account for multi-use casts using specific costs
1023 if (!LHSCast->hasOneUse())
1024 NewCost += LHSCastCost;
1025
1026 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1027 << " NewCost=" << NewCost << "\n");
1028
1029 if (NewCost > OldCost)
1030 return false;
1031
1032 // Create the operation on the source type
1033 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1034 LHSSrc, InvC, I.getName() + ".inner");
1035 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1036 NewBinOp->copyIRFlags(&I);
1037
1038 Worklist.pushValue(NewOp);
1039
1040 // Create the cast operation directly to ensure we get a new instruction
1041 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1042
1043 // Preserve cast instruction flags
1044 if (RHSFlags.NNeg)
1045 NewCast->setNonNeg();
1046 if (RHSFlags.NUW)
1047 NewCast->setHasNoUnsignedWrap();
1048 if (RHSFlags.NSW)
1049 NewCast->setHasNoSignedWrap();
1050
1051 NewCast->andIRFlags(LHSCast);
1052
1053 // Insert the new instruction
1054 Value *Result = Builder.Insert(NewCast);
1055
1056 replaceValue(I, *Result);
1057 return true;
1058}
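// Illustrative example for foldBitOpOfCastConstant (added commentary, not part
// of the original source). The constant is mapped back through the cast only
// when that mapping is lossless; the exact InvC below depends on the target's
// element layout:
//   %bc = bitcast <2 x i32> %x to <4 x i16>
//   %r  = and <4 x i16> %bc, C
// -->
//   %inner = and <2 x i32> %x, InvC     ; InvC chosen so that bitcast(InvC) == C
//   %r     = bitcast <2 x i32> %inner to <4 x i16>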
1059
1060/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1061/// destination type followed by shuffle. This can enable further transforms by
1062/// moving bitcasts or shuffles together.
1063bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1064 Value *V0, *V1;
1065 ArrayRef<int> Mask;
1066 if (!match(&I, m_BitCast(m_OneUse(
1067 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1068 return false;
1069
1070 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1071 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1072 // mask for scalable type is a splat or not.
1073 // 2) Disallow non-vector casts.
1074 // TODO: We could allow any shuffle.
1075 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1076 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1077 if (!DestTy || !SrcTy)
1078 return false;
1079
1080 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1081 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1082 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1083 return false;
1084
1085 bool IsUnary = isa<UndefValue>(V1);
1086
1087 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1088 // if it won't increase the number of bitcasts.
1089 if (!IsUnary) {
1090 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1091 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1092 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1093 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1094 return false;
1095 }
1096
1097 SmallVector<int, 16> NewMask;
1098 if (DestEltSize <= SrcEltSize) {
1099 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1100 // always be expanded to the equivalent form choosing narrower elements.
1101 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1102 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1103 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1104 } else {
1105 // The bitcast is from narrow elements to wide elements. The shuffle mask
1106 // must choose consecutive elements to allow casting first.
1107 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1108 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1109 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1110 return false;
1111 }
1112
1113 // Bitcast the shuffle src - keep its original width but using the destination
1114 // scalar type.
1115 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1116 auto *NewShuffleTy =
1117 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1118 auto *OldShuffleTy =
1119 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1120 unsigned NumOps = IsUnary ? 1 : 2;
1121
1122 // The new shuffle must not cost more than the old shuffle.
1123 TargetTransformInfo::ShuffleKind SK =
1124 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1125 : TargetTransformInfo::SK_PermuteTwoSrc;
1126
1127 InstructionCost NewCost =
1128 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1129 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1130 TargetTransformInfo::CastContextHint::None,
1131 CostKind));
1132 InstructionCost OldCost =
1133 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1134 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1135 TargetTransformInfo::CastContextHint::None,
1136 CostKind);
1137
1138 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1139 << OldCost << " vs NewCost: " << NewCost << "\n");
1140
1141 if (NewCost > OldCost || !NewCost.isValid())
1142 return false;
1143
1144 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1145 ++NumShufOfBitcast;
1146 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1147 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1148 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1149 replaceValue(I, *Shuf);
1150 return true;
1151}
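// Illustrative example for foldBitcastShuffle with a widening bitcast (added
// commentary, not part of the original source). The mask must pick consecutive
// narrow elements so it can be widened:
//   %s = shufflevector <4 x i16> %v, <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
//   %b = bitcast <4 x i16> %s to <2 x i32>
// -->
//   %bc = bitcast <4 x i16> %v to <2 x i32>
//   %b  = shufflevector <2 x i32> %bc, <2 x i32> poison, <2 x i32> <i32 1, i32 0>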
1152
1153/// VP Intrinsics whose vector operands are both splat values may be simplified
1154/// into the scalar version of the operation and the result splatted. This
1155/// can lead to scalarization down the line.
1156bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1157 if (!isa<VPIntrinsic>(I))
1158 return false;
1159 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1160 Value *Op0 = VPI.getArgOperand(0);
1161 Value *Op1 = VPI.getArgOperand(1);
1162
1163 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1164 return false;
1165
1166 // Check getSplatValue early in this function, to avoid doing unnecessary
1167 // work.
1168 Value *ScalarOp0 = getSplatValue(Op0);
1169 Value *ScalarOp1 = getSplatValue(Op1);
1170 if (!ScalarOp0 || !ScalarOp1)
1171 return false;
1172
1173 // For the binary VP intrinsics supported here, the result on disabled lanes
1174 // is a poison value. For now, only do this simplification if all lanes
1175 // are active.
1176 // TODO: Relax the condition that all lanes are active by using insertelement
1177 // on inactive lanes.
1178 auto IsAllTrueMask = [](Value *MaskVal) {
1179 if (Value *SplattedVal = getSplatValue(MaskVal))
1180 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1181 return ConstValue->isAllOnesValue();
1182 return false;
1183 };
1184 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1185 return false;
1186
1187 // Check to make sure we support scalarization of the intrinsic
1188 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1189 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1190 return false;
1191
1192 // Calculate cost of splatting both operands into vectors and the vector
1193 // intrinsic
1194 VectorType *VecTy = cast<VectorType>(VPI.getType());
1195 SmallVector<int> Mask;
1196 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1197 Mask.resize(FVTy->getNumElements(), 0);
1198 InstructionCost SplatCost =
1199 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1200 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, {},
1201 CostKind);
1202
1203 // Calculate the cost of the VP Intrinsic
1204 SmallVector<Type *> Args;
1205 for (Value *V : VPI.args())
1206 Args.push_back(V->getType());
1207 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1208 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1209 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1210
1211 // Determine scalar opcode
1212 std::optional<unsigned> FunctionalOpcode =
1213 VPI.getFunctionalOpcode();
1214 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1215 if (!FunctionalOpcode) {
1216 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1217 if (!ScalarIntrID)
1218 return false;
1219 }
1220
1221 // Calculate cost of scalarizing
1222 InstructionCost ScalarOpCost = 0;
1223 if (ScalarIntrID) {
1224 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1225 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1226 } else {
1227 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1228 VecTy->getScalarType(), CostKind);
1229 }
1230
1231 // The existing splats may be kept around if other instructions use them.
1232 InstructionCost CostToKeepSplats =
1233 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1234 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1235
1236 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1237 << "\n");
1238 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1239 << ", Cost of scalarizing:" << NewCost << "\n");
1240
1241 // We want to scalarize unless the vector variant actually has lower cost.
1242 if (OldCost < NewCost || !NewCost.isValid())
1243 return false;
1244
1245 // Scalarize the intrinsic
1246 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1247 Value *EVL = VPI.getArgOperand(3);
1248
1249 // If the VP op might introduce UB or poison, we can scalarize it provided
1250 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1251 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1252 // scalarizing it.
1253 bool SafeToSpeculate;
1254 if (ScalarIntrID)
1255 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1256 .hasAttribute(Attribute::AttrKind::Speculatable);
1257 else
1258 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1259 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1260 if (!SafeToSpeculate &&
1261 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1262 return false;
1263
1264 Value *ScalarVal =
1265 ScalarIntrID
1266 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1267 {ScalarOp0, ScalarOp1})
1268 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1269 ScalarOp0, ScalarOp1);
1270
1271 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1272 return true;
1273}
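// Illustrative example for scalarizeVPIntrinsic (added commentary, not part of
// the original source). With both operands splats of %x and %y and an all-true
// mask, and %evl known non-zero (or a speculatable scalar op):
//   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xsplat, <4 x i32> %ysplat,
//                                          <4 x i1> splat (i1 true), i32 %evl)
// may become:
//   %s   = add i32 %x, %y
//   %ins = insertelement <4 x i32> poison, i32 %s, i64 0
//   %r   = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer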
1274
1275/// Match a vector op/compare/intrinsic with at least one
1276/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1277/// by insertelement.
1278bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1279 auto *UO = dyn_cast<UnaryOperator>(&I);
1280 auto *BO = dyn_cast<BinaryOperator>(&I);
1281 auto *CI = dyn_cast<CmpInst>(&I);
1282 auto *II = dyn_cast<IntrinsicInst>(&I);
1283 if (!UO && !BO && !CI && !II)
1284 return false;
1285
1286 // TODO: Allow intrinsics with different argument types
1287 if (II) {
1288 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1289 return false;
1290 for (auto [Idx, Arg] : enumerate(II->args()))
1291 if (Arg->getType() != II->getType() &&
1292 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1293 return false;
1294 }
1295
1296 // Do not convert the vector condition of a vector select into a scalar
1297 // condition. That may cause problems for codegen because of differences in
1298 // boolean formats and register-file transfers.
1299 // TODO: Can we account for that in the cost model?
1300 if (CI)
1301 for (User *U : I.users())
1302 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1303 return false;
1304
1305 // Match constant vectors or scalars being inserted into constant vectors:
1306 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1307 SmallVector<Value *> VecCs, ScalarOps;
1308 std::optional<uint64_t> Index;
1309
1310 auto Ops = II ? II->args() : I.operands();
1311 for (auto [OpNum, Op] : enumerate(Ops)) {
1312 Constant *VecC;
1313 Value *V;
1314 uint64_t InsIdx = 0;
1315 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1316 m_ConstantInt(InsIdx)))) {
1317 // Bail if any inserts are out of bounds.
1318 VectorType *OpTy = cast<VectorType>(Op->getType());
1319 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1320 return false;
1321 // All inserts must have the same index.
1322 // TODO: Deal with mismatched index constants and variable indexes?
1323 if (!Index)
1324 Index = InsIdx;
1325 else if (InsIdx != *Index)
1326 return false;
1327 VecCs.push_back(VecC);
1328 ScalarOps.push_back(V);
1329 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1330 OpNum, &TTI)) {
1331 VecCs.push_back(Op.get());
1332 ScalarOps.push_back(Op.get());
1333 } else if (match(Op.get(), m_Constant(VecC))) {
1334 VecCs.push_back(VecC);
1335 ScalarOps.push_back(nullptr);
1336 } else {
1337 return false;
1338 }
1339 }
1340
1341 // Bail if all operands are constant.
1342 if (!Index.has_value())
1343 return false;
1344
1345 VectorType *VecTy = cast<VectorType>(I.getType());
1346 Type *ScalarTy = VecTy->getScalarType();
1347 assert(VecTy->isVectorTy() &&
1348 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1349 ScalarTy->isPointerTy()) &&
1350 "Unexpected types for insert element into binop or cmp");
1351
1352 unsigned Opcode = I.getOpcode();
1353 InstructionCost ScalarOpCost, VectorOpCost;
1354 if (CI) {
1355 CmpInst::Predicate Pred = CI->getPredicate();
1356 ScalarOpCost = TTI.getCmpSelInstrCost(
1357 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1358 VectorOpCost = TTI.getCmpSelInstrCost(
1359 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1360 } else if (UO || BO) {
1361 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1362 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1363 } else {
1364 IntrinsicCostAttributes ScalarICA(
1365 II->getIntrinsicID(), ScalarTy,
1366 SmallVector<Type *>(II->arg_size(), ScalarTy));
1367 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1368 IntrinsicCostAttributes VectorICA(
1369 II->getIntrinsicID(), VecTy,
1370 SmallVector<Type *>(II->arg_size(), VecTy));
1371 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1372 }
1373
1374 // Fold the vector constants in the original vectors into a new base vector to
1375 // get more accurate cost modelling.
1376 Value *NewVecC = nullptr;
1377 if (CI)
1378 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1379 else if (UO)
1380 NewVecC =
1381 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1382 else if (BO)
1383 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1384 else if (II)
1385 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1386
1387 if (!NewVecC)
1388 return false;
1389
1390 // Get cost estimate for the insert element. This cost will factor into
1391 // both sequences.
1392 InstructionCost OldCost = VectorOpCost;
1393 InstructionCost NewCost =
1394 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1395 CostKind, *Index, NewVecC);
1396
1397 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1398 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1399 II->getIntrinsicID(), Idx, &TTI)))
1400 continue;
1401 InstructionCost InsertCost = TTI.getVectorInstrCost(
1402 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1403 OldCost += InsertCost;
1404 NewCost += !Op->hasOneUse() * InsertCost;
1405 }
1406
1407 // We want to scalarize unless the vector variant actually has lower cost.
1408 if (OldCost < NewCost || !NewCost.isValid())
1409 return false;
1410
1411 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1412 // inselt NewVecC, (scalar_op V0, V1), Index
1413 if (CI)
1414 ++NumScalarCmp;
1415 else if (UO || BO)
1416 ++NumScalarOps;
1417 else
1418 ++NumScalarIntrinsic;
1419
1420 // For constant cases, extract the scalar element, this should constant fold.
1421 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1422 if (!Scalar)
1423 ScalarOps[OpIdx] = ConstantExpr::getExtractElement(
1424 cast<Constant>(VecC), Builder.getInt64(*Index));
1425
1426 Value *Scalar;
1427 if (CI)
1428 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1429 else if (UO || BO)
1430 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1431 else
1432 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1433
1434 Scalar->setName(I.getName() + ".scalar");
1435
1436 // All IR flags are safe to back-propagate. There is no potential for extra
1437 // poison to be created by the scalar instruction.
1438 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1439 ScalarInst->copyIRFlags(&I);
1440
1441 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1442 replaceValue(I, *Insert);
1443 return true;
1444}
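// Illustrative example for scalarizeOpOrCmp (added commentary, not part of the
// original source):
//   %i0 = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %x, i64 0
//   %i1 = insertelement <4 x i32> <i32 5, i32 6, i32 7, i32 8>, i32 %y, i64 0
//   %r  = add <4 x i32> %i0, %i1
// -->
//   %s = add i32 %x, %y
//   %r = insertelement <4 x i32> <i32 6, i32 8, i32 10, i32 12>, i32 %s, i64 0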
1445
1446/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1447/// a vector into vector operations followed by extract. Note: The SLP pass
1448/// may miss this pattern because of implementation problems.
1449bool VectorCombine::foldExtractedCmps(Instruction &I) {
1450 auto *BI = dyn_cast<BinaryOperator>(&I);
1451
1452 // We are looking for a scalar binop of booleans.
1453 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1454 if (!BI || !I.getType()->isIntegerTy(1))
1455 return false;
1456
1457 // The compare predicates should match, and each compare should have a
1458 // constant operand.
1459 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1460 Instruction *I0, *I1;
1461 Constant *C0, *C1;
1462 CmpPredicate P0, P1;
1463 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1464 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1465 return false;
1466
1467 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1468 if (!MatchingPred)
1469 return false;
1470
1471 // The compare operands must be extracts of the same vector with constant
1472 // extract indexes.
1473 Value *X;
1474 uint64_t Index0, Index1;
1475 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1476 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1477 return false;
1478
1479 auto *Ext0 = cast<ExtractElementInst>(I0);
1480 auto *Ext1 = cast<ExtractElementInst>(I1);
1481 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1482 if (!ConvertToShuf)
1483 return false;
1484 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1485 "Unknown ExtractElementInst");
1486
1487 // The original scalar pattern is:
1488 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1489 CmpInst::Predicate Pred = *MatchingPred;
1490 unsigned CmpOpcode =
1491 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1492 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1493 if (!VecTy)
1494 return false;
1495
1496 InstructionCost Ext0Cost =
1497 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1498 InstructionCost Ext1Cost =
1499 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1500 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1501 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1502 CostKind);
1503
1504 InstructionCost OldCost =
1505 Ext0Cost + Ext1Cost + CmpCost * 2 +
1506 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1507
1508 // The proposed vector pattern is:
1509 // vcmp = cmp Pred X, VecC
1510 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1511 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1512 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1513 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1514 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1515 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1516 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1517 ShufMask[CheapIndex] = ExpensiveIndex;
1519 CmpTy, ShufMask, CostKind);
1520 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1521 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1522 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1523 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1524
1525 // Aggressively form vector ops if the cost is equal because the transform
1526 // may enable further optimization.
1527 // Codegen can reverse this transform (scalarize) if it was not profitable.
1528 if (OldCost < NewCost || !NewCost.isValid())
1529 return false;
1530
1531 // Create a vector constant from the 2 scalar constants.
1532 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1533 PoisonValue::get(VecTy->getElementType()));
1534 CmpC[Index0] = C0;
1535 CmpC[Index1] = C1;
1536 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1537 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1538 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1539 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1540 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1541 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1542 replaceValue(I, *NewExt);
1543 ++NumVecCmpBO;
1544 return true;
1545}
1546
1547 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1548 TTI::TargetCostKind CostKind,
1549 const TargetTransformInfo &TTI,
1550 InstructionCost &CostBeforeReduction,
1551 InstructionCost &CostAfterReduction) {
1552 Instruction *Op0, *Op1;
1553 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1554 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1555 unsigned ReductionOpc =
1556 getArithmeticReductionInstruction(II.getIntrinsicID());
1557 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1558 bool IsUnsigned = isa<ZExtInst>(RedOp);
1559 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1560
1561 CostBeforeReduction =
1562 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1564 CostAfterReduction =
1565 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1566 ExtType, FastMathFlags(), CostKind);
1567 return;
1568 }
1569 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1570 match(RedOp,
1572 match(Op0, m_ZExtOrSExt(m_Value())) &&
1573 Op0->getOpcode() == Op1->getOpcode() &&
1574 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1575 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1576 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1577 bool IsUnsigned = isa<ZExtInst>(Op0);
1578 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1579 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1580
1581 InstructionCost ExtCost =
1582 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1584 InstructionCost MulCost =
1585 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1586 InstructionCost Ext2Cost =
1587 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1589
1590 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1591 CostAfterReduction = TTI.getMulAccReductionCost(
1592 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1593 return;
1594 }
1595 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1596 std::nullopt, CostKind);
1597}
1598
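// Illustrative example of the reduction merge implemented below (added for
// clarity, not part of the original source; names and types are hypothetical):
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
//   %s  = add i32 %r0, %r1
// may become, when the cost model considers it profitable:
//   %v  = add <4 x i32> %a, %b
//   %s  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)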
1599bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1600 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1601 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1602 if (BinOpOpc == Instruction::Sub)
1603 ReductionIID = Intrinsic::vector_reduce_add;
1604 if (ReductionIID == Intrinsic::not_intrinsic)
1605 return false;
1606
1607 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1608 Intrinsic::ID IID) -> Value * {
1609 auto *II = dyn_cast<IntrinsicInst>(V);
1610 if (!II)
1611 return nullptr;
1612 if (II->getIntrinsicID() == IID && II->hasOneUse())
1613 return II->getArgOperand(0);
1614 return nullptr;
1615 };
1616
1617 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1618 if (!V0)
1619 return false;
1620 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1621 if (!V1)
1622 return false;
1623
1624 auto *VTy = cast<VectorType>(V0->getType());
1625 if (V1->getType() != VTy)
1626 return false;
1627 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1628 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1629 unsigned ReductionOpc =
1630 getArithmeticReductionInstruction(II0.getIntrinsicID());
1631
1632 InstructionCost OldCost = 0;
1633 InstructionCost NewCost = 0;
1634 InstructionCost CostOfRedOperand0 = 0;
1635 InstructionCost CostOfRed0 = 0;
1636 InstructionCost CostOfRedOperand1 = 0;
1637 InstructionCost CostOfRed1 = 0;
1638 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1639 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1640 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1641 NewCost =
1642 CostOfRedOperand0 + CostOfRedOperand1 +
1643 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1644 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1645 if (NewCost >= OldCost || !NewCost.isValid())
1646 return false;
1647
1648 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1649 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1650 << "\n");
1651 Value *VectorBO;
1652 if (BinOpOpc == Instruction::Or)
1653 VectorBO = Builder.CreateOr(V0, V1, "",
1654 cast<PossiblyDisjointInst>(I).isDisjoint());
1655 else
1656 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1657
1658 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1659 replaceValue(I, *Rdx);
1660 return true;
1661}
1662
1663// Check if memory loc modified between two instrs in the same BB
1664 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1665 BasicBlock::iterator End,
1666 const MemoryLocation &Loc, AAResults &AA) {
1667 unsigned NumScanned = 0;
1668 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1669 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1670 ++NumScanned > MaxInstrsToScan;
1671 });
1672}
1673
1674namespace {
1675/// Helper class to indicate whether a vector index can be safely scalarized and
1676/// if a freeze needs to be inserted.
1677class ScalarizationResult {
1678 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1679
1680 StatusTy Status;
1681 Value *ToFreeze;
1682
1683 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1684 : Status(Status), ToFreeze(ToFreeze) {}
1685
1686public:
1687 ScalarizationResult(const ScalarizationResult &Other) = default;
1688 ~ScalarizationResult() {
1689 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1690 }
1691
1692 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1693 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1694 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1695 return {StatusTy::SafeWithFreeze, ToFreeze};
1696 }
1697
1698 /// Returns true if the index can be scalarized without requiring a freeze.
1699 bool isSafe() const { return Status == StatusTy::Safe; }
1700 /// Returns true if the index cannot be scalarized.
1701 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1702 /// Returns true if the index can be scalarized, but requires inserting a
1703 /// freeze.
1704 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1705
1706 /// Reset the state to Unsafe and clear ToFreeze if set.
1707 void discard() {
1708 ToFreeze = nullptr;
1709 Status = StatusTy::Unsafe;
1710 }
1711
1712 /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1713 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1714 assert(isSafeWithFreeze() &&
1715 "should only be used when freezing is required");
1716 assert(is_contained(ToFreeze->users(), &UserI) &&
1717 "UserI must be a user of ToFreeze");
1718 IRBuilder<>::InsertPointGuard Guard(Builder);
1719 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1720 Value *Frozen =
1721 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1722 for (Use &U : make_early_inc_range((UserI.operands())))
1723 if (U.get() == ToFreeze)
1724 U.set(Frozen);
1725
1726 ToFreeze = nullptr;
1727 }
1728};
1729} // namespace
1730
1731/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1732/// Idx. \p Idx must access a valid vector element.
1733static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1734 Instruction *CtxI,
1735 AssumptionCache &AC,
1736 const DominatorTree &DT) {
1737 // We do checks for both fixed vector types and scalable vector types.
1738 // This is the number of elements of fixed vector types,
1739 // or the minimum number of elements of scalable vector types.
1740 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1741 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1742
1743 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1744 if (C->getValue().ult(NumElements))
1745 return ScalarizationResult::safe();
1746 return ScalarizationResult::unsafe();
1747 }
1748
1749 // Always unsafe if the index type can't handle all inbound values.
1750 if (!llvm::isUIntN(IntWidth, NumElements))
1751 return ScalarizationResult::unsafe();
1752
1753 APInt Zero(IntWidth, 0);
1754 APInt MaxElts(IntWidth, NumElements);
1755 ConstantRange ValidIndices(Zero, MaxElts);
1756 ConstantRange IdxRange(IntWidth, true);
1757
1758 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1759 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1760 true, &AC, CtxI, &DT)))
1761 return ScalarizationResult::safe();
1762 return ScalarizationResult::unsafe();
1763 }
1764
1765 // If the index may be poison, check if we can insert a freeze before the
1766 // range of the index is restricted.
1767 Value *IdxBase;
1768 ConstantInt *CI;
1769 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1770 IdxRange = IdxRange.binaryAnd(CI->getValue());
1771 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1772 IdxRange = IdxRange.urem(CI->getValue());
1773 }
1774
1775 if (ValidIndices.contains(IdxRange))
1776 return ScalarizationResult::safeWithFreeze(IdxBase);
1777 return ScalarizationResult::unsafe();
1778}
1779
1780/// The memory operation on a vector of \p ScalarType had alignment of
1781/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1782/// alignment that will be valid for the memory operation on a single scalar
1783/// element of the same type with index \p Idx.
1784 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1785 Type *ScalarType, Value *Idx,
1786 const DataLayout &DL) {
1787 if (auto *C = dyn_cast<ConstantInt>(Idx))
1788 return commonAlignment(VectorAlignment,
1789 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1790 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1791}
1792
1793// Combine patterns like:
1794// %0 = load <4 x i32>, <4 x i32>* %a
1795// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1796// store <4 x i32> %1, <4 x i32>* %a
1797// to:
1798// %0 = bitcast <4 x i32>* %a to i32*
1799// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1800// store i32 %b, i32* %1
1801bool VectorCombine::foldSingleElementStore(Instruction &I) {
1803 return false;
1804 auto *SI = cast<StoreInst>(&I);
1805 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1806 return false;
1807
1808 // TODO: Combine more complicated patterns (multiple insert) by referencing
1809 // TargetTransformInfo.
1810 Instruction *Source;
1811 Value *NewElement;
1812 Value *Idx;
1813 if (!match(SI->getValueOperand(),
1814 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1815 m_Value(Idx))))
1816 return false;
1817
1818 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1819 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1820 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1821 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1822 // modified in between, the vector type matches the store size, and the index is inbounds.
1823 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1824 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1825 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1826 return false;
1827
1828 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1829 if (ScalarizableIdx.isUnsafe() ||
1830 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1831 MemoryLocation::get(SI), AA))
1832 return false;
1833
1834 // Ensure we add the load back to the worklist BEFORE its users so they can
1835 // be erased in the correct order.
1836 Worklist.push(Load);
1837
1838 if (ScalarizableIdx.isSafeWithFreeze())
1839 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1840 Value *GEP = Builder.CreateInBoundsGEP(
1841 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1842 {ConstantInt::get(Idx->getType(), 0), Idx});
1843 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1844 NSI->copyMetadata(*SI);
1845 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1846 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1847 *DL);
1848 NSI->setAlignment(ScalarOpAlignment);
1849 replaceValue(I, *NSI);
1851 return true;
1852 }
1853
1854 return false;
1855}
1856
1857/// Try to scalarize vector loads feeding extractelement or bitcast
1858/// instructions.
1859bool VectorCombine::scalarizeLoad(Instruction &I) {
1860 Value *Ptr;
1861 if (!match(&I, m_Load(m_Value(Ptr))))
1862 return false;
1863
1864 auto *LI = cast<LoadInst>(&I);
1865 auto *VecTy = cast<VectorType>(LI->getType());
1866 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1867 return false;
1868
1869 bool AllExtracts = true;
1870 bool AllBitcasts = true;
1871 Instruction *LastCheckedInst = LI;
1872 unsigned NumInstChecked = 0;
1873
1874 // Check what type of users we have (must either all be extracts or
1875 // bitcasts) and ensure no memory modifications between the load and
1876 // its users.
1877 for (User *U : LI->users()) {
1878 auto *UI = dyn_cast<Instruction>(U);
1879 if (!UI || UI->getParent() != LI->getParent())
1880 return false;
1881
1882 // If any user is waiting to be erased, then bail out as this will
1883 // distort the cost calculation and possibly lead to infinite loops.
1884 if (UI->use_empty())
1885 return false;
1886
1887 if (!isa<ExtractElementInst>(UI))
1888 AllExtracts = false;
1889 if (!isa<BitCastInst>(UI))
1890 AllBitcasts = false;
1891
1892 // Check if any instruction between the load and the user may modify memory.
1893 if (LastCheckedInst->comesBefore(UI)) {
1894 for (Instruction &I :
1895 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1896 // Bail out if we reached the check limit or the instruction may write
1897 // to memory.
1898 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1899 return false;
1900 NumInstChecked++;
1901 }
1902 LastCheckedInst = UI;
1903 }
1904 }
1905
1906 if (AllExtracts)
1907 return scalarizeLoadExtract(LI, VecTy, Ptr);
1908 if (AllBitcasts)
1909 return scalarizeLoadBitcast(LI, VecTy, Ptr);
1910 return false;
1911}
1912
1913/// Try to scalarize vector loads feeding extractelement instructions.
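/// Illustrative sketch (added for clarity, not from the original source; names
/// are hypothetical):
///   %v = load <4 x i32>, ptr %p
///   %e = extractelement <4 x i32> %v, i64 1
/// may become, if cheaper on the target:
///   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 1
///   %e   = load i32, ptr %gep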
1914bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
1915 Value *Ptr) {
1917 return false;
1918
1919 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1920 auto FailureGuard = make_scope_exit([&]() {
1921 // If the transform is aborted, discard the ScalarizationResults.
1922 for (auto &Pair : NeedFreeze)
1923 Pair.second.discard();
1924 });
1925
1926 InstructionCost OriginalCost =
1927 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1929 InstructionCost ScalarizedCost = 0;
1930
1931 for (User *U : LI->users()) {
1932 auto *UI = cast<ExtractElementInst>(U);
1933
1934 auto ScalarIdx =
1935 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1936 if (ScalarIdx.isUnsafe())
1937 return false;
1938 if (ScalarIdx.isSafeWithFreeze()) {
1939 NeedFreeze.try_emplace(UI, ScalarIdx);
1940 ScalarIdx.discard();
1941 }
1942
1943 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1944 OriginalCost +=
1945 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1946 Index ? Index->getZExtValue() : -1);
1947 ScalarizedCost +=
1948 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1950 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1951 nullptr, nullptr, CostKind);
1952 }
1953
1954 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
1955 << "\n LoadExtractCost: " << OriginalCost
1956 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1957
1958 if (ScalarizedCost >= OriginalCost)
1959 return false;
1960
1961 // Ensure we add the load back to the worklist BEFORE its users so they can
1962 // be erased in the correct order.
1963 Worklist.push(LI);
1964
1965 Type *ElemType = VecTy->getElementType();
1966
1967 // Replace extracts with narrow scalar loads.
1968 for (User *U : LI->users()) {
1969 auto *EI = cast<ExtractElementInst>(U);
1970 Value *Idx = EI->getIndexOperand();
1971
1972 // Insert 'freeze' for poison indexes.
1973 auto It = NeedFreeze.find(EI);
1974 if (It != NeedFreeze.end())
1975 It->second.freeze(Builder, *cast<Instruction>(Idx));
1976
1977 Builder.SetInsertPoint(EI);
1978 Value *GEP =
1979 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1980 auto *NewLoad = cast<LoadInst>(
1981 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1982
1983 Align ScalarOpAlignment =
1984 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1985 NewLoad->setAlignment(ScalarOpAlignment);
1986
1987 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1988 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1989 AAMDNodes OldAAMD = LI->getAAMetadata();
1990 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1991 }
1992
1993 replaceValue(*EI, *NewLoad, false);
1994 }
1995
1996 FailureGuard.release();
1997 return true;
1998}
1999
2000/// Try to scalarize vector loads feeding bitcast instructions.
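/// Illustrative sketch (added for clarity, not from the original source; names
/// are hypothetical):
///   %v = load <4 x i32>, ptr %p
///   %i = bitcast <4 x i32> %v to i128
/// may become, if the scalar load is no more expensive:
///   %i = load i128, ptr %p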
2001bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2002 Value *Ptr) {
2003 InstructionCost OriginalCost =
2004 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2006
2007 Type *TargetScalarType = nullptr;
2008 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2009
2010 for (User *U : LI->users()) {
2011 auto *BC = cast<BitCastInst>(U);
2012
2013 Type *DestTy = BC->getDestTy();
2014 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2015 return false;
2016
2017 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2018 if (DestBitWidth != VecBitWidth)
2019 return false;
2020
2021 // All bitcasts must target the same scalar type.
2022 if (!TargetScalarType)
2023 TargetScalarType = DestTy;
2024 else if (TargetScalarType != DestTy)
2025 return false;
2026
2027 OriginalCost +=
2028 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2030 }
2031
2032 if (!TargetScalarType)
2033 return false;
2034
2035 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2036 InstructionCost ScalarizedCost =
2037 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2039
2040 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2041 << "\n OriginalCost: " << OriginalCost
2042 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2043
2044 if (ScalarizedCost >= OriginalCost)
2045 return false;
2046
2047 // Ensure we add the load back to the worklist BEFORE its users so they can
2048 // be erased in the correct order.
2049 Worklist.push(LI);
2050
2051 Builder.SetInsertPoint(LI);
2052 auto *ScalarLoad =
2053 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2054 ScalarLoad->setAlignment(LI->getAlign());
2055 ScalarLoad->copyMetadata(*LI);
2056
2057 // Replace all bitcast users with the scalar load.
2058 for (User *U : LI->users()) {
2059 auto *BC = cast<BitCastInst>(U);
2060 replaceValue(*BC, *ScalarLoad, false);
2061 }
2062
2063 return true;
2064}
2065
2066bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2068 return false;
2069 auto *Ext = dyn_cast<ZExtInst>(&I);
2070 if (!Ext)
2071 return false;
2072
2073 // Try to convert a vector zext feeding only extracts to a set of scalar
2074 //   (Src >> (ExtIdx * Size)) & ((1 << Size) - 1)
2075 // operations (Size = source element bit width), if profitable.
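// Illustrative sketch (added for clarity, not from the original source;
// names and types are hypothetical, little-endian layout assumed):
//   %z  = zext <4 x i16> %src to <4 x i64>
//   %e1 = extractelement <4 x i64> %z, i64 1
// may become:
//   %s  = bitcast <4 x i16> %src to i64
//   %sh = lshr i64 %s, 16
//   %e1 = and i64 %sh, 65535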
2076 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2077 if (!SrcTy)
2078 return false;
2079 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2080
2081 Type *ScalarDstTy = DstTy->getElementType();
2082 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2083 return false;
2084
2085 InstructionCost VectorCost =
2086 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2088 unsigned ExtCnt = 0;
2089 bool ExtLane0 = false;
2090 for (User *U : Ext->users()) {
2091 uint64_t Idx;
2092 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2093 return false;
2094 if (cast<Instruction>(U)->use_empty())
2095 continue;
2096 ExtCnt += 1;
2097 ExtLane0 |= !Idx;
2098 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2099 CostKind, Idx, U);
2100 }
2101
2102 InstructionCost ScalarCost =
2103 ExtCnt * TTI.getArithmeticInstrCost(
2104 Instruction::And, ScalarDstTy, CostKind,
2107 (ExtCnt - ExtLane0) *
2109 Instruction::LShr, ScalarDstTy, CostKind,
2112 if (ScalarCost > VectorCost)
2113 return false;
2114
2115 Value *ScalarV = Ext->getOperand(0);
2116 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2117 &DT)) {
2118 // Check whether all lanes are extracted, all extracts trigger UB
2119 // on poison, and the last extract (and hence all previous ones)
2120 // are guaranteed to execute if Ext executes. If so, we do not
2121 // need to insert a freeze.
2122 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2123 bool AllExtractsTriggerUB = true;
2124 ExtractElementInst *LastExtract = nullptr;
2125 BasicBlock *ExtBB = Ext->getParent();
2126 for (User *U : Ext->users()) {
2127 auto *Extract = cast<ExtractElementInst>(U);
2128 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2129 AllExtractsTriggerUB = false;
2130 break;
2131 }
2132 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2133 if (!LastExtract || LastExtract->comesBefore(Extract))
2134 LastExtract = Extract;
2135 }
2136 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2137 !AllExtractsTriggerUB ||
2139 LastExtract->getIterator()))
2140 ScalarV = Builder.CreateFreeze(ScalarV);
2141 }
2142 ScalarV = Builder.CreateBitCast(
2143 ScalarV,
2144 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2145 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2146 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2147 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2148 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2149 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2150 for (User *U : Ext->users()) {
2151 auto *Extract = cast<ExtractElementInst>(U);
2152 uint64_t Idx =
2153 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2154 uint64_t ShiftAmt =
2155 DL->isBigEndian()
2156 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2157 : (Idx * SrcEltSizeInBits);
2158 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2159 Value *And = Builder.CreateAnd(LShr, Mask);
2160 U->replaceAllUsesWith(And);
2161 }
2162 return true;
2163}
2164
2165/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2166/// to "(bitcast (concat X, Y))"
2167/// where X/Y are bitcasted from i1 mask vectors.
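/// Illustrative sketch (added for clarity, not from the original source; names
/// are hypothetical):
///   %xi = bitcast <8 x i1> %x to i8
///   %yi = bitcast <8 x i1> %y to i8
///   %xz = zext i8 %xi to i16
///   %yz = zext i8 %yi to i16
///   %ys = shl i16 %yz, 8
///   %r  = or disjoint i16 %xz, %ys
/// may become (with the identity concat mask <i32 0, i32 1, ..., i32 15>):
///   %c = shufflevector <8 x i1> %x, <8 x i1> %y, <16 x i32> <concat mask>
///   %r = bitcast <16 x i1> %c to i16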
2168bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2169 Type *Ty = I.getType();
2170 if (!Ty->isIntegerTy())
2171 return false;
2172
2173 // TODO: Add big endian test coverage
2174 if (DL->isBigEndian())
2175 return false;
2176
2177 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2178 Instruction *X, *Y;
2180 return false;
2181
2182 // Allow both sources to contain shl, to handle more generic pattern:
2183 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2184 Value *SrcX;
2185 uint64_t ShAmtX = 0;
2186 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2187 !match(X, m_OneUse(
2189 m_ConstantInt(ShAmtX)))))
2190 return false;
2191
2192 Value *SrcY;
2193 uint64_t ShAmtY = 0;
2194 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2195 !match(Y, m_OneUse(
2197 m_ConstantInt(ShAmtY)))))
2198 return false;
2199
2200 // Canonicalize larger shift to the RHS.
2201 if (ShAmtX > ShAmtY) {
2202 std::swap(X, Y);
2203 std::swap(SrcX, SrcY);
2204 std::swap(ShAmtX, ShAmtY);
2205 }
2206
2207 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2208 // difference is the mask width so they can be easily concatenated together.
2209 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2210 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2211 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2212 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2213 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2214 !MaskTy->getElementType()->isIntegerTy(1) ||
2215 MaskTy->getNumElements() != ShAmtDiff ||
2216 MaskTy->getNumElements() > (BitWidth / 2))
2217 return false;
2218
2219 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2220 auto *ConcatIntTy =
2221 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2222 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2223
2224 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2225 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2226
2227 // TODO: Is it worth supporting multi use cases?
2228 InstructionCost OldCost = 0;
2229 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2230 OldCost +=
2231 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2232 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2234 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2236
2237 InstructionCost NewCost = 0;
2239 MaskTy, ConcatMask, CostKind);
2240 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2242 if (Ty != ConcatIntTy)
2243 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2245 if (ShAmtX > 0)
2246 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2247
2248 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2249 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2250 << "\n");
2251
2252 if (NewCost > OldCost)
2253 return false;
2254
2255 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2256 // any residual zero-extension or shifting.
2257 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2258 Worklist.pushValue(Concat);
2259
2260 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2261
2262 if (Ty != ConcatIntTy) {
2263 Worklist.pushValue(Result);
2264 Result = Builder.CreateZExt(Result, Ty);
2265 }
2266
2267 if (ShAmtX > 0) {
2268 Worklist.pushValue(Result);
2269 Result = Builder.CreateShl(Result, ShAmtX);
2270 }
2271
2272 replaceValue(I, *Result);
2273 return true;
2274}
2275
2276/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2277/// --> "binop (shuffle), (shuffle)".
2278bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2279 BinaryOperator *BinOp;
2280 ArrayRef<int> OuterMask;
2281 if (!match(&I,
2282 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2283 return false;
2284
2285 // Don't introduce poison into div/rem.
2286 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2287 return false;
2288
2289 Value *Op00, *Op01, *Op10, *Op11;
2290 ArrayRef<int> Mask0, Mask1;
2291 bool Match0 =
2292 match(BinOp->getOperand(0),
2293 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2294 bool Match1 =
2295 match(BinOp->getOperand(1),
2296 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2297 if (!Match0 && !Match1)
2298 return false;
2299
2300 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2301 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2302 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2303 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2304
2305 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2306 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2307 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2308 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2309 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2310 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2311 return false;
2312
2313 unsigned NumSrcElts = BinOpTy->getNumElements();
2314
2315 // Don't accept shuffles that reference the second operand in
2316 // div/rem or if it's an undef arg.
2317 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2318 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2319 return false;
2320
2321 // Merge outer / inner (or identity if no match) shuffles.
2322 SmallVector<int> NewMask0, NewMask1;
2323 for (int M : OuterMask) {
2324 if (M < 0 || M >= (int)NumSrcElts) {
2325 NewMask0.push_back(PoisonMaskElem);
2326 NewMask1.push_back(PoisonMaskElem);
2327 } else {
2328 NewMask0.push_back(Match0 ? Mask0[M] : M);
2329 NewMask1.push_back(Match1 ? Mask1[M] : M);
2330 }
2331 }
2332
2333 unsigned NumOpElts = Op0Ty->getNumElements();
2334 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2335 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2336 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2337 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2338 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2339 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2340
2341 // Try to merge shuffles across the binop if the new shuffles are not costly.
2342 InstructionCost OldCost =
2343 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2345 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2346 if (Match0)
2347 OldCost += TTI.getShuffleCost(
2348 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2349 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2350 if (Match1)
2351 OldCost += TTI.getShuffleCost(
2352 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2353 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2354
2355 InstructionCost NewCost =
2356 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2357
2358 if (!IsIdentity0)
2359 NewCost +=
2361 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2362 if (!IsIdentity1)
2363 NewCost +=
2365 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2366
2367 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2368 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2369 << "\n");
2370
2371 // If costs are equal, still fold as we reduce instruction count.
2372 if (NewCost > OldCost)
2373 return false;
2374
2375 Value *LHS =
2376 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2377 Value *RHS =
2378 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2379 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2380
2381 // Intersect flags from the old binops.
2382 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2383 NewInst->copyIRFlags(BinOp);
2384
2385 Worklist.pushValue(LHS);
2386 Worklist.pushValue(RHS);
2387 replaceValue(I, *NewBO);
2388 return true;
2389}
2390
2391/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2392/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
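/// Illustrative sketch (added for clarity, not from the original source; names
/// are hypothetical):
///   %b0 = add <4 x i32> %x, %y
///   %b1 = add <4 x i32> %z, %w
///   %s  = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
/// may become:
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %s  = add <4 x i32> %s0, %s1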
2393bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2394 ArrayRef<int> OldMask;
2395 Instruction *LHS, *RHS;
2397 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2398 return false;
2399
2400 // TODO: Add support for addlike etc.
2401 if (LHS->getOpcode() != RHS->getOpcode())
2402 return false;
2403
2404 Value *X, *Y, *Z, *W;
2405 bool IsCommutative = false;
2406 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2407 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2408 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2409 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2410 auto *BO = cast<BinaryOperator>(LHS);
2411 // Don't introduce poison into div/rem.
2412 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2413 return false;
2414 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2415 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2416 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2417 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2418 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2419 } else
2420 return false;
2421
2422 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2423 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2424 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2425 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2426 return false;
2427
2428 unsigned NumSrcElts = BinOpTy->getNumElements();
2429
2430 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2431 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2432 std::swap(X, Y);
2433
2434 auto ConvertToUnary = [NumSrcElts](int &M) {
2435 if (M >= (int)NumSrcElts)
2436 M -= NumSrcElts;
2437 };
2438
2439 SmallVector<int> NewMask0(OldMask);
2441 if (X == Z) {
2442 llvm::for_each(NewMask0, ConvertToUnary);
2444 Z = PoisonValue::get(BinOpTy);
2445 }
2446
2447 SmallVector<int> NewMask1(OldMask);
2449 if (Y == W) {
2450 llvm::for_each(NewMask1, ConvertToUnary);
2452 W = PoisonValue::get(BinOpTy);
2453 }
2454
2455 // Try to replace a binop with a shuffle if the shuffle is not costly.
2456 InstructionCost OldCost =
2460 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2461 &I);
2462
2463 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2464 // where one use shuffles have gotten split across the binop/cmp. These
2465 // often allow a major reduction in total cost that wouldn't happen as
2466 // individual folds.
2467 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2468 TTI::TargetCostKind CostKind) -> bool {
2469 Value *InnerOp;
2470 ArrayRef<int> InnerMask;
2471 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2472 m_Mask(InnerMask)))) &&
2473 InnerOp->getType() == Op->getType() &&
2474 all_of(InnerMask,
2475 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2476 for (int &M : Mask)
2477 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2478 M = InnerMask[M - Offset];
2479 M = 0 <= M ? M + Offset : M;
2480 }
2482 Op = InnerOp;
2483 return true;
2484 }
2485 return false;
2486 };
2487 bool ReducedInstCount = false;
2488 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2489 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2490 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2491 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2492
2493 auto *ShuffleCmpTy =
2494 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2495 InstructionCost NewCost =
2496 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2497 nullptr, {X, Z}) +
2498 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2499 nullptr, {Y, W});
2500
2501 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2502 NewCost +=
2503 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2504 } else {
2505 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2506 ShuffleDstTy, PredLHS, CostKind);
2507 }
2508
2509 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2510 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2511 << "\n");
2512
2513 // If either shuffle will constant fold away, then fold for the same cost as
2514 // we will reduce the instruction count.
2515 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2516 (isa<Constant>(Y) && isa<Constant>(W));
2517 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2518 return false;
2519
2520 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2521 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2522 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2523 ? Builder.CreateBinOp(
2524 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2525 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2526
2527 // Intersect flags from the old binops.
2528 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2529 NewInst->copyIRFlags(LHS);
2530 NewInst->andIRFlags(RHS);
2531 }
2532
2533 Worklist.pushValue(Shuf0);
2534 Worklist.pushValue(Shuf1);
2535 replaceValue(I, *NewBO);
2536 return true;
2537}
2538
2539/// Try to convert,
2540/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2541/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
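/// Illustrative sketch (added for clarity, not from the original source; names
/// are hypothetical):
///   %sel0 = select <4 x i1> %c1, <4 x i32> %t1, <4 x i32> %f1
///   %sel1 = select <4 x i1> %c2, <4 x i32> %t2, <4 x i32> %f2
///   %s    = shufflevector <4 x i32> %sel0, <4 x i32> %sel1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
/// may become:
///   %sc = shufflevector <4 x i1> %c1, <4 x i1> %c2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %st = shufflevector <4 x i32> %t1, <4 x i32> %t2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %sf = shufflevector <4 x i32> %f1, <4 x i32> %f2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %s  = select <4 x i1> %sc, <4 x i32> %st, <4 x i32> %sf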
2542bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2543 ArrayRef<int> Mask;
2544 Value *C1, *T1, *F1, *C2, *T2, *F2;
2545 if (!match(&I, m_Shuffle(
2547 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2548 m_Mask(Mask))))
2549 return false;
2550
2551 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2552 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2553 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2554 return false;
2555
2556 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2557 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2558 // SelectInsts must have the same FMF.
2559 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2560 ((SI0FOp != nullptr) &&
2561 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2562 return false;
2563
2564 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2565 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2567 auto SelOp = Instruction::Select;
2569 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2570 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2572 OldCost +=
2573 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2574 {I.getOperand(0), I.getOperand(1)}, &I);
2575
2577 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2578 Mask, CostKind, 0, nullptr, {C1, C2});
2579 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2580 nullptr, {T1, T2});
2581 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2582 nullptr, {F1, F2});
2583 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2584 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2585 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2587
2588 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2589 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2590 << "\n");
2591 if (NewCost > OldCost)
2592 return false;
2593
2594 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2595 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2596 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2597 Value *NewSel;
2598 // We presuppose that the SelectInsts have the same FMF.
2599 if (SI0FOp)
2600 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2601 SI0FOp->getFastMathFlags());
2602 else
2603 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2604
2605 Worklist.pushValue(ShuffleCmp);
2606 Worklist.pushValue(ShuffleTrue);
2607 Worklist.pushValue(ShuffleFalse);
2608 replaceValue(I, *NewSel);
2609 return true;
2610}
2611
2612/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2613/// into "castop (shuffle)".
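/// Illustrative sketch (added for clarity, not from the original source; names
/// are hypothetical):
///   %c0 = zext <4 x i16> %a to <4 x i32>
///   %c1 = zext <4 x i16> %b to <4 x i32>
///   %s  = shufflevector <4 x i32> %c0, <4 x i32> %c1,
///                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// may become:
///   %sh = shufflevector <4 x i16> %a, <4 x i16> %b,
///                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
///   %s  = zext <8 x i16> %sh to <8 x i32>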
2614bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2615 Value *V0, *V1;
2616 ArrayRef<int> OldMask;
2617 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2618 return false;
2619
2620 // Check whether this is a binary shuffle.
2621 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2622
2623 auto *C0 = dyn_cast<CastInst>(V0);
2624 auto *C1 = dyn_cast<CastInst>(V1);
2625 if (!C0 || (IsBinaryShuffle && !C1))
2626 return false;
2627
2628 Instruction::CastOps Opcode = C0->getOpcode();
2629
2630 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2631 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2632 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2633 return false;
2634
2635 if (IsBinaryShuffle) {
2636 if (C0->getSrcTy() != C1->getSrcTy())
2637 return false;
2638 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2639 if (Opcode != C1->getOpcode()) {
2640 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2641 Opcode = Instruction::SExt;
2642 else
2643 return false;
2644 }
2645 }
2646
2647 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2648 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2649 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2650 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2651 return false;
2652
2653 unsigned NumSrcElts = CastSrcTy->getNumElements();
2654 unsigned NumDstElts = CastDstTy->getNumElements();
2655 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2656 "Only bitcasts expected to alter src/dst element counts");
2657
2658 // Check for bitcasting of unscalable vector types.
2659 // e.g. <32 x i40> -> <40 x i32>
2660 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2661 (NumDstElts % NumSrcElts) != 0)
2662 return false;
2663
2664 SmallVector<int, 16> NewMask;
2665 if (NumSrcElts >= NumDstElts) {
2666 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2667 // always be expanded to the equivalent form choosing narrower elements.
2668 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2669 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2670 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2671 } else {
2672 // The bitcast is from narrow elements to wide elements. The shuffle mask
2673 // must choose consecutive elements to allow casting first.
2674 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2675 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2676 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2677 return false;
2678 }
2679
2680 auto *NewShuffleDstTy =
2681 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2682
2683 // Try to replace a castop with a shuffle if the shuffle is not costly.
2684 InstructionCost CostC0 =
2685 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2687
2689 if (IsBinaryShuffle)
2691 else
2693
2694 InstructionCost OldCost = CostC0;
2695 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2696 CostKind, 0, nullptr, {}, &I);
2697
2698 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2699 CastSrcTy, NewMask, CostKind);
2700 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2702 if (!C0->hasOneUse())
2703 NewCost += CostC0;
2704 if (IsBinaryShuffle) {
2705 InstructionCost CostC1 =
2706 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2708 OldCost += CostC1;
2709 if (!C1->hasOneUse())
2710 NewCost += CostC1;
2711 }
2712
2713 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2714 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2715 << "\n");
2716 if (NewCost > OldCost)
2717 return false;
2718
2719 Value *Shuf;
2720 if (IsBinaryShuffle)
2721 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2722 NewMask);
2723 else
2724 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2725
2726 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2727
2728 // Intersect flags from the old casts.
2729 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2730 NewInst->copyIRFlags(C0);
2731 if (IsBinaryShuffle)
2732 NewInst->andIRFlags(C1);
2733 }
2734
2735 Worklist.pushValue(Shuf);
2736 replaceValue(I, *Cast);
2737 return true;
2738}
2739
2740/// Try to convert any of:
2741/// "shuffle (shuffle x, y), (shuffle y, x)"
2742/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2743/// "shuffle (shuffle x, undef), y"
2744/// "shuffle x, (shuffle y, undef)"
2745/// into "shuffle x, y".
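/// Illustrative sketch (added for clarity, not from the original source; names
/// are hypothetical):
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %s1 = shufflevector <4 x i32> %y, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %s  = shufflevector <4 x i32> %s0, <4 x i32> %s1, <4 x i32> <i32 3, i32 7, i32 2, i32 6>
/// may become:
///   %s  = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 4, i32 1, i32 5>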
2746bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2747 ArrayRef<int> OuterMask;
2748 Value *OuterV0, *OuterV1;
2749 if (!match(&I,
2750 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2751 return false;
2752
2753 ArrayRef<int> InnerMask0, InnerMask1;
2754 Value *X0, *X1, *Y0, *Y1;
2755 bool Match0 =
2756 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2757 bool Match1 =
2758 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2759 if (!Match0 && !Match1)
2760 return false;
2761
2762 // If the outer shuffle is a permute, then create a fake inner all-poison
2763 // shuffle. This is easier than accounting for length-changing shuffles below.
2764 SmallVector<int, 16> PoisonMask1;
2765 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2766 X1 = X0;
2767 Y1 = Y0;
2768 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2769 InnerMask1 = PoisonMask1;
2770 Match1 = true; // fake match
2771 }
2772
2773 X0 = Match0 ? X0 : OuterV0;
2774 Y0 = Match0 ? Y0 : OuterV0;
2775 X1 = Match1 ? X1 : OuterV1;
2776 Y1 = Match1 ? Y1 : OuterV1;
2777 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2778 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2779 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2780 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2781 X0->getType() != X1->getType())
2782 return false;
2783
2784 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2785 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2786
2787 // Attempt to merge shuffles, matching up to 2 source operands.
2788 // Replace indices into a poison arg with PoisonMaskElem.
2789 // Bail if either inner mask references an undef arg.
2790 SmallVector<int, 16> NewMask(OuterMask);
2791 Value *NewX = nullptr, *NewY = nullptr;
2792 for (int &M : NewMask) {
2793 Value *Src = nullptr;
2794 if (0 <= M && M < (int)NumImmElts) {
2795 Src = OuterV0;
2796 if (Match0) {
2797 M = InnerMask0[M];
2798 Src = M >= (int)NumSrcElts ? Y0 : X0;
2799 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2800 }
2801 } else if (M >= (int)NumImmElts) {
2802 Src = OuterV1;
2803 M -= NumImmElts;
2804 if (Match1) {
2805 M = InnerMask1[M];
2806 Src = M >= (int)NumSrcElts ? Y1 : X1;
2807 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2808 }
2809 }
2810 if (Src && M != PoisonMaskElem) {
2811 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2812 if (isa<UndefValue>(Src)) {
2813 // We've referenced an undef element - if it's poison, update the shuffle
2814 // mask, else bail.
2815 if (!isa<PoisonValue>(Src))
2816 return false;
2817 M = PoisonMaskElem;
2818 continue;
2819 }
2820 if (!NewX || NewX == Src) {
2821 NewX = Src;
2822 continue;
2823 }
2824 if (!NewY || NewY == Src) {
2825 M += NumSrcElts;
2826 NewY = Src;
2827 continue;
2828 }
2829 return false;
2830 }
2831 }
2832
2833 if (!NewX)
2834 return PoisonValue::get(ShuffleDstTy);
2835 if (!NewY)
2836 NewY = PoisonValue::get(ShuffleSrcTy);
2837
2838 // Have we folded to an Identity shuffle?
2839 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2840 replaceValue(I, *NewX);
2841 return true;
2842 }
2843
2844 // Try to merge the shuffles if the new shuffle is not costly.
2845 InstructionCost InnerCost0 = 0;
2846 if (Match0)
2847 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2848
2849 InstructionCost InnerCost1 = 0;
2850 if (Match1)
2851 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2852
2854
2855 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2856
2857 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2861 InstructionCost NewCost =
2862 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2863 nullptr, {NewX, NewY});
2864 if (!OuterV0->hasOneUse())
2865 NewCost += InnerCost0;
2866 if (!OuterV1->hasOneUse())
2867 NewCost += InnerCost1;
2868
2869 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2870 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2871 << "\n");
2872 if (NewCost > OldCost)
2873 return false;
2874
2875 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2876 replaceValue(I, *Shuf);
2877 return true;
2878}
2879
2880/// Try to convert
2881/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
2882bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
2883 Value *V0, *V1;
2884 ArrayRef<int> OldMask;
2885 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2886 m_Mask(OldMask))))
2887 return false;
2888
2889 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2890 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2891 if (!II0 || !II1)
2892 return false;
2893
2894 Intrinsic::ID IID = II0->getIntrinsicID();
2895 if (IID != II1->getIntrinsicID())
2896 return false;
2897
2898 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2899 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2900 if (!ShuffleDstTy || !II0Ty)
2901 return false;
2902
2903 if (!isTriviallyVectorizable(IID))
2904 return false;
2905
2906 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2908 II0->getArgOperand(I) != II1->getArgOperand(I))
2909 return false;
2910
2911 InstructionCost OldCost =
2912 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2913 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
2915 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2916
2917 SmallVector<Type *> NewArgsTy;
2918 InstructionCost NewCost = 0;
2919 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2921 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2922 } else {
2923 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2924 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2925 ShuffleDstTy->getNumElements());
2926 NewArgsTy.push_back(ArgTy);
2928 ArgTy, VecTy, OldMask, CostKind);
2929 }
2930 }
2931 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2932 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2933
2934 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2935 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2936 << "\n");
2937
2938 if (NewCost > OldCost)
2939 return false;
2940
2941 SmallVector<Value *> NewArgs;
2942 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2944 NewArgs.push_back(II0->getArgOperand(I));
2945 } else {
2946 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2947 II1->getArgOperand(I), OldMask);
2948 NewArgs.push_back(Shuf);
2949 Worklist.pushValue(Shuf);
2950 }
2951 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2952
2953 // Intersect flags from the old intrinsics.
2954 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2955 NewInst->copyIRFlags(II0);
2956 NewInst->andIRFlags(II1);
2957 }
2958
2959 replaceValue(I, *NewIntrinsic);
2960 return true;
2961}
2962
2963using InstLane = std::pair<Use *, int>;
2964
2965static InstLane lookThroughShuffles(Use *U, int Lane) {
2966 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
2967 unsigned NumElts =
2968 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
2969 int M = SV->getMaskValue(Lane);
2970 if (M < 0)
2971 return {nullptr, PoisonMaskElem};
2972 if (static_cast<unsigned>(M) < NumElts) {
2973 U = &SV->getOperandUse(0);
2974 Lane = M;
2975 } else {
2976 U = &SV->getOperandUse(1);
2977 Lane = M - NumElts;
2978 }
2979 }
2980 return InstLane{U, Lane};
2981}
2982
2986 for (InstLane IL : Item) {
2987 auto [U, Lane] = IL;
2988 InstLane OpLane =
2989 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
2990 Lane)
2991 : InstLane{nullptr, PoisonMaskElem};
2992 NItem.emplace_back(OpLane);
2993 }
2994 return NItem;
2995}
2996
2997/// Detect concat of multiple values into a vector
2999 const TargetTransformInfo &TTI) {
3000 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3001 unsigned NumElts = Ty->getNumElements();
3002 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3003 return false;
3004
3005 // Check that the concat is free, usually meaning that the type will be split
3006 // during legalization.
3007 SmallVector<int, 16> ConcatMask(NumElts * 2);
3008 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3009 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3010 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3011 Ty, ConcatMask, CostKind) != 0)
3012 return false;
3013
3014 unsigned NumSlices = Item.size() / NumElts;
3015 // Currently we generate a tree of shuffles for the concats, which limits us
3016 // to a power of 2.
3017 if (!isPowerOf2_32(NumSlices))
3018 return false;
3019 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3020 Use *SliceV = Item[Slice * NumElts].first;
3021 if (!SliceV || SliceV->get()->getType() != Ty)
3022 return false;
3023 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3024 auto [V, Lane] = Item[Slice * NumElts + Elt];
3025 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3026 return false;
3027 }
3028 }
3029 return true;
3030}
3031
3033 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3034 const SmallPtrSet<Use *, 4> &SplatLeafs,
3035 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3036 IRBuilderBase &Builder,
3037 const TargetTransformInfo *TTI) {
3038 auto [FrontU, FrontLane] = Item.front();
3039
3040 if (IdentityLeafs.contains(FrontU)) {
3041 return FrontU->get();
3042 }
3043 if (SplatLeafs.contains(FrontU)) {
3044 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3045 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3046 }
3047 if (ConcatLeafs.contains(FrontU)) {
3048 unsigned NumElts =
3049 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3050 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3051 for (unsigned S = 0; S < Values.size(); ++S)
3052 Values[S] = Item[S * NumElts].first->get();
3053
3054 while (Values.size() > 1) {
3055 NumElts *= 2;
3056 SmallVector<int, 16> Mask(NumElts, 0);
3057 std::iota(Mask.begin(), Mask.end(), 0);
3058 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3059 for (unsigned S = 0; S < NewValues.size(); ++S)
3060 NewValues[S] =
3061 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3062 Values = NewValues;
3063 }
3064 return Values[0];
3065 }
3066
3067 auto *I = cast<Instruction>(FrontU->get());
3068 auto *II = dyn_cast<IntrinsicInst>(I);
3069 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3070 SmallVector<Value *> Ops(NumOps);
3071 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3072 if (II &&
3073 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3074 Ops[Idx] = II->getOperand(Idx);
3075 continue;
3076 }
3077 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
3078 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3079 Builder, TTI);
3080 }
3081
3082 SmallVector<Value *, 8> ValueList;
3083 for (const auto &Lane : Item)
3084 if (Lane.first)
3085 ValueList.push_back(Lane.first->get());
3086
3087 Type *DstTy =
3088 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3089 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3090 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3091 Ops[0], Ops[1]);
3092 propagateIRFlags(Value, ValueList);
3093 return Value;
3094 }
3095 if (auto *CI = dyn_cast<CmpInst>(I)) {
3096 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3097 propagateIRFlags(Value, ValueList);
3098 return Value;
3099 }
3100 if (auto *SI = dyn_cast<SelectInst>(I)) {
3101 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3102 propagateIRFlags(Value, ValueList);
3103 return Value;
3104 }
3105 if (auto *CI = dyn_cast<CastInst>(I)) {
3106 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3107 propagateIRFlags(Value, ValueList);
3108 return Value;
3109 }
3110 if (II) {
3111 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3112 propagateIRFlags(Value, ValueList);
3113 return Value;
3114 }
3115 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3116 auto *Value =
3117 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3118 propagateIRFlags(Value, ValueList);
3119 return Value;
3120}
3121
3122// Starting from a shuffle, look up through operands tracking the shuffled index
3123// of each lane. If we can simplify away the shuffles to identities then
3124// do so.
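// A sketch of the kind of pattern this removes (illustrative, not copied
// from the regression tests): splitting two <8 x i16> values, adding the
// halves, and concatenating the results back together,
//   %a0 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %a1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
//   %b0 = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %b1 = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
//   %s0 = add <4 x i16> %a0, %b0
//   %s1 = add <4 x i16> %a1, %b1
//   %r  = shufflevector <4 x i16> %s0, <4 x i16> %s1,
//                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// reduces to a single full-width add of <8 x i16> %a and %b, since every
// lane of %r traces back to the matching lane of %a and %b.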
3125bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3126 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3127 if (!Ty || I.use_empty())
3128 return false;
3129
3130 SmallVector<InstLane> Start(Ty->getNumElements());
3131 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3132 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3133
3134 SmallVector<SmallVector<InstLane>> Worklist;
3135 Worklist.push_back(Start);
3136 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3137 unsigned NumVisited = 0;
3138
3139 while (!Worklist.empty()) {
3140 if (++NumVisited > MaxInstrsToScan)
3141 return false;
3142
3143 SmallVector<InstLane> Item = Worklist.pop_back_val();
3144 auto [FrontU, FrontLane] = Item.front();
3145
3146 // If we found an undef first lane then bail out to keep things simple.
3147 if (!FrontU)
3148 return false;
3149
3150 // Helper to peek through bitcasts to the same value.
3151 auto IsEquiv = [&](Value *X, Value *Y) {
3152 return X->getType() == Y->getType() &&
3153 peekThroughBitcasts(X) == peekThroughBitcasts(Y);
3154 };
3155
3156 // Look for an identity value.
3157 if (FrontLane == 0 &&
3158 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3159 Ty->getNumElements() &&
3160 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3161 Value *FrontV = Item.front().first->get();
3162 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3163 E.value().second == (int)E.index());
3164 })) {
3165 IdentityLeafs.insert(FrontU);
3166 continue;
3167 }
3168 // Look for constants, for the moment only supporting constant splats.
3169 if (auto *C = dyn_cast<Constant>(FrontU);
3170 C && C->getSplatValue() &&
3171 all_of(drop_begin(Item), [Item](InstLane &IL) {
3172 Value *FrontV = Item.front().first->get();
3173 Use *U = IL.first;
3174 return !U || (isa<Constant>(U->get()) &&
3175 cast<Constant>(U->get())->getSplatValue() ==
3176 cast<Constant>(FrontV)->getSplatValue());
3177 })) {
3178 SplatLeafs.insert(FrontU);
3179 continue;
3180 }
3181 // Look for a splat value.
3182 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3183 auto [FrontU, FrontLane] = Item.front();
3184 auto [U, Lane] = IL;
3185 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3186 })) {
3187 SplatLeafs.insert(FrontU);
3188 continue;
3189 }
3190
3191 // We need each element to be the same type of value, and check that each
3192 // element has a single use.
3193 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3194 Value *FrontV = Item.front().first->get();
3195 if (!IL.first)
3196 return true;
3197 Value *V = IL.first->get();
3198 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3199 return false;
3200 if (V->getValueID() != FrontV->getValueID())
3201 return false;
3202 if (auto *CI = dyn_cast<CmpInst>(V))
3203 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3204 return false;
3205 if (auto *CI = dyn_cast<CastInst>(V))
3206 if (CI->getSrcTy()->getScalarType() !=
3207 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3208 return false;
3209 if (auto *SI = dyn_cast<SelectInst>(V))
3210 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3211 SI->getOperand(0)->getType() !=
3212 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3213 return false;
3214 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3215 return false;
3216 auto *II = dyn_cast<IntrinsicInst>(V);
3217 return !II || (isa<IntrinsicInst>(FrontV) &&
3218 II->getIntrinsicID() ==
3219 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3220 !II->hasOperandBundles());
3221 };
3222 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3223 // Check the operator is one that we support.
3224 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3225 // We exclude div/rem in case they hit UB from poison lanes.
3226 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3227 BO && BO->isIntDivRem())
3228 return false;
3229 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3230 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3231 continue;
3232 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3233 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3234 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3235 continue;
3236 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3237 // TODO: Handle vector widening/narrowing bitcasts.
3238 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3239 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3240 if (DstTy && SrcTy &&
3241 SrcTy->getNumElements() == DstTy->getNumElements()) {
3242 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3243 continue;
3244 }
3245 } else if (isa<SelectInst>(FrontU)) {
3246 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3247 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3248 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3249 continue;
3250 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3251 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3252 !II->hasOperandBundles()) {
3253 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3254 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3255 &TTI)) {
3256 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3257 Value *FrontV = Item.front().first->get();
3258 Use *U = IL.first;
3259 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3260 cast<Instruction>(FrontV)->getOperand(Op));
3261 }))
3262 return false;
3263 continue;
3264 }
3265 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3266 }
3267 continue;
3268 }
3269 }
3270
3271 if (isFreeConcat(Item, CostKind, TTI)) {
3272 ConcatLeafs.insert(FrontU);
3273 continue;
3274 }
3275
3276 return false;
3277 }
3278
3279 if (NumVisited <= 1)
3280 return false;
3281
3282 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3283
3284 // If we got this far, we know the shuffles are superfluous and can be
3285 // removed. Scan through again and generate the new tree of instructions.
3286 Builder.SetInsertPoint(&I);
3287 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3288 ConcatLeafs, Builder, &TTI);
3289 replaceValue(I, *V);
3290 return true;
3291}
3292
3293/// Given a commutative reduction, the order of the input lanes does not alter
3294/// the results. We can use this to remove certain shuffles feeding the
3295/// reduction, removing the need to shuffle at all.
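/// Illustrative sketch (assumed IR, not from the source): since integer add
/// reduction is lane-order invariant,
///   %rev = shufflevector <4 x i32> %x, <4 x i32> poison,
///                        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %rev)
/// produces the same value as reducing %x directly, so the reverse mask can
/// be replaced by a cheaper (ideally identity) ordering.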
3296bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3297 auto *II = dyn_cast<IntrinsicInst>(&I);
3298 if (!II)
3299 return false;
3300 switch (II->getIntrinsicID()) {
3301 case Intrinsic::vector_reduce_add:
3302 case Intrinsic::vector_reduce_mul:
3303 case Intrinsic::vector_reduce_and:
3304 case Intrinsic::vector_reduce_or:
3305 case Intrinsic::vector_reduce_xor:
3306 case Intrinsic::vector_reduce_smin:
3307 case Intrinsic::vector_reduce_smax:
3308 case Intrinsic::vector_reduce_umin:
3309 case Intrinsic::vector_reduce_umax:
3310 break;
3311 default:
3312 return false;
3313 }
3314
3315 // Find all the inputs when looking through operations that do not alter the
3316 // lane order (binops, for example). Currently we look for a single shuffle,
3317 // and can ignore splat values.
3318 std::queue<Value *> Worklist;
3319 SmallPtrSet<Value *, 4> Visited;
3320 ShuffleVectorInst *Shuffle = nullptr;
3321 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3322 Worklist.push(Op);
3323
3324 while (!Worklist.empty()) {
3325 Value *CV = Worklist.front();
3326 Worklist.pop();
3327 if (Visited.contains(CV))
3328 continue;
3329
3330 // Splats don't change the order, so can be safely ignored.
3331 if (isSplatValue(CV))
3332 continue;
3333
3334 Visited.insert(CV);
3335
3336 if (auto *CI = dyn_cast<Instruction>(CV)) {
3337 if (CI->isBinaryOp()) {
3338 for (auto *Op : CI->operand_values())
3339 Worklist.push(Op);
3340 continue;
3341 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3342 if (Shuffle && Shuffle != SV)
3343 return false;
3344 Shuffle = SV;
3345 continue;
3346 }
3347 }
3348
3349 // Anything else is currently an unknown node.
3350 return false;
3351 }
3352
3353 if (!Shuffle)
3354 return false;
3355
3356 // Check all uses of the binary ops and shuffles are also included in the
3357 // lane-invariant operations (Visited should be the list of lanewise
3358 // instructions, including the shuffle that we found).
3359 for (auto *V : Visited)
3360 for (auto *U : V->users())
3361 if (!Visited.contains(U) && U != &I)
3362 return false;
3363
3364 FixedVectorType *VecType =
3365 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3366 if (!VecType)
3367 return false;
3368 FixedVectorType *ShuffleInputType =
3369 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3370 if (!ShuffleInputType)
3371 return false;
3372 unsigned NumInputElts = ShuffleInputType->getNumElements();
3373
3374 // Find the mask from sorting the lanes into order. This is most likely to
3375 // become an identity or concat mask. Undef elements are pushed to the end.
3376 SmallVector<int> ConcatMask;
3377 Shuffle->getShuffleMask(ConcatMask);
3378 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3379 bool UsesSecondVec =
3380 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3381
3382 InstructionCost OldCost = TTI.getShuffleCost(
3383 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3384 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3385 InstructionCost NewCost = TTI.getShuffleCost(
3386 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3387 ShuffleInputType, ConcatMask, CostKind);
3388
3389 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3390 << "\n");
3391 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3392 << "\n");
3393 bool MadeChanges = false;
3394 if (NewCost < OldCost) {
3395 Builder.SetInsertPoint(Shuffle);
3396 Value *NewShuffle = Builder.CreateShuffleVector(
3397 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3398 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3399 replaceValue(*Shuffle, *NewShuffle);
3400 return true;
3401 }
3402
3403 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3404 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3405 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3406 return MadeChanges;
3407}
3408
3409/// For a given chain of patterns of the following form:
3410///
3411/// ```
3412/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3413///
3414/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3415/// ty1> %1)
3416/// OR
3417/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3418///
3419/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3420/// ...
3421/// ...
3422/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3423/// 3), <n x ty1> %(i - 2)
3424/// OR
3425/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3426///
3427/// %(i) = extractelement <n x ty1> %(i - 1), 0
3428/// ```
3429///
3430/// Where:
3431/// `mask` follows a partition pattern:
3432///
3433/// Ex:
3434/// [n = 8, p = poison]
3435///
3436/// 4 5 6 7 | p p p p
3437/// 2 3 | p p p p p p
3438/// 1 | p p p p p p p
3439///
3440/// For powers of 2, there's a consistent pattern, but for other cases
3441/// the parity of the current half value at each step decides the
3442/// next partition half (see `ExpectedParityMask` for more logical details
3443/// in generalising this).
3444///
3445/// Ex:
3446/// [n = 6]
3447///
3448/// 3 4 5 | p p p
3449/// 1 2 | p p p p
3450/// 1 | p p p p p
3451bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3452 // Going bottom-up for the pattern.
3453 std::queue<Value *> InstWorklist;
3454 InstructionCost OrigCost = 0;
3455
3456 // Common instruction operation after each shuffle op.
3457 std::optional<unsigned int> CommonCallOp = std::nullopt;
3458 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3459
3460 bool IsFirstCallOrBinInst = true;
3461 bool ShouldBeCallOrBinInst = true;
3462
3463 // This stores the last used instructions for shuffle/common op.
3464 //
3465 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3466 // instructions from either shuffle/common op.
3467 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3468
3469 Value *VecOpEE;
3470 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3471 return false;
3472
3473 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3474 if (!FVT)
3475 return false;
3476
3477 int64_t VecSize = FVT->getNumElements();
3478 if (VecSize < 2)
3479 return false;
3480
3481 // Number of levels would be ~log2(n), considering we always partition
3482 // by half for this fold pattern.
3483 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3484 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3485
3486 // This is how we generalise for all element sizes.
3487 // At each step, if vector size is odd, we need non-poison
3488 // values to cover the dominant half so we don't miss out on any element.
3489 //
3490 // This mask will help us retrieve this as we go from bottom to top:
3491 //
3492 // Mask Set -> N = N * 2 - 1
3493 // Mask Unset -> N = N * 2
3494 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3495 Cur = (Cur + 1) / 2, --Mask) {
3496 if (Cur & 1)
3497 ExpectedParityMask |= (1ll << Mask);
3498 }
3499
3500 InstWorklist.push(VecOpEE);
3501
3502 while (!InstWorklist.empty()) {
3503 Value *CI = InstWorklist.front();
3504 InstWorklist.pop();
3505
3506 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3507 if (!ShouldBeCallOrBinInst)
3508 return false;
3509
3510 if (!IsFirstCallOrBinInst &&
3511 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3512 return false;
3513
3514 // For the first found call/bin op, the vector has to come from the
3515 // extract element op.
3516 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3517 return false;
3518 IsFirstCallOrBinInst = false;
3519
3520 if (!CommonCallOp)
3521 CommonCallOp = II->getIntrinsicID();
3522 if (II->getIntrinsicID() != *CommonCallOp)
3523 return false;
3524
3525 switch (II->getIntrinsicID()) {
3526 case Intrinsic::umin:
3527 case Intrinsic::umax:
3528 case Intrinsic::smin:
3529 case Intrinsic::smax: {
3530 auto *Op0 = II->getOperand(0);
3531 auto *Op1 = II->getOperand(1);
3532 PrevVecV[0] = Op0;
3533 PrevVecV[1] = Op1;
3534 break;
3535 }
3536 default:
3537 return false;
3538 }
3539 ShouldBeCallOrBinInst ^= 1;
3540
3541 IntrinsicCostAttributes ICA(
3542 *CommonCallOp, II->getType(),
3543 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3544 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3545
3546 // We may need a swap here since it can be (a, b) or (b, a)
3547 // and accordingly change as we go up.
3548 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3549 std::swap(PrevVecV[0], PrevVecV[1]);
3550 InstWorklist.push(PrevVecV[1]);
3551 InstWorklist.push(PrevVecV[0]);
3552 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3553 // Similar logic for bin ops.
3554
3555 if (!ShouldBeCallOrBinInst)
3556 return false;
3557
3558 if (!IsFirstCallOrBinInst &&
3559 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3560 return false;
3561
3562 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3563 return false;
3564 IsFirstCallOrBinInst = false;
3565
3566 if (!CommonBinOp)
3567 CommonBinOp = BinOp->getOpcode();
3568
3569 if (BinOp->getOpcode() != *CommonBinOp)
3570 return false;
3571
3572 switch (*CommonBinOp) {
3573 case BinaryOperator::Add:
3574 case BinaryOperator::Mul:
3575 case BinaryOperator::Or:
3576 case BinaryOperator::And:
3577 case BinaryOperator::Xor: {
3578 auto *Op0 = BinOp->getOperand(0);
3579 auto *Op1 = BinOp->getOperand(1);
3580 PrevVecV[0] = Op0;
3581 PrevVecV[1] = Op1;
3582 break;
3583 }
3584 default:
3585 return false;
3586 }
3587 ShouldBeCallOrBinInst ^= 1;
3588
3589 OrigCost +=
3590 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3591
3592 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3593 std::swap(PrevVecV[0], PrevVecV[1]);
3594 InstWorklist.push(PrevVecV[1]);
3595 InstWorklist.push(PrevVecV[0]);
3596 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3597 // We shouldn't have any null values in the previous vectors,
3598 // if so, there was a mismatch in the pattern.
3599 if (ShouldBeCallOrBinInst ||
3600 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3601 return false;
3602
3603 if (SVInst != PrevVecV[1])
3604 return false;
3605
3606 ArrayRef<int> CurMask;
3607 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3608 m_Mask(CurMask))))
3609 return false;
3610
3611 // Subtract the parity mask when checking the condition.
3612 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3613 if (Mask < ShuffleMaskHalf &&
3614 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3615 return false;
3616 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3617 return false;
3618 }
3619
3620 // Update mask values.
3621 ShuffleMaskHalf *= 2;
3622 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3623 ExpectedParityMask >>= 1;
3624
3624 OrigCost +=
3625 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
3626 SVInst->getType(), SVInst->getType(),
3627 CurMask, CostKind);
3628
3629 VisitedCnt += 1;
3630 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3631 break;
3632
3633 ShouldBeCallOrBinInst ^= 1;
3634 } else {
3635 return false;
3636 }
3637 }
3638
3639 // Pattern should end with a shuffle op.
3640 if (ShouldBeCallOrBinInst)
3641 return false;
3642
3643 assert(VecSize != -1 && "Expected Match for Vector Size");
3644
3645 Value *FinalVecV = PrevVecV[0];
3646 if (!FinalVecV)
3647 return false;
3648
3649 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3650
3651 Intrinsic::ID ReducedOp =
3652 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3653 : getReductionForBinop(*CommonBinOp));
3654 if (!ReducedOp)
3655 return false;
3656
3657 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3658 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3659
3660 if (NewCost >= OrigCost)
3661 return false;
3662
3663 auto *ReducedResult =
3664 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3665 replaceValue(I, *ReducedResult);
3666
3667 return true;
3668}
3669
3670 /// Determine if it's more efficient to fold:
3671/// reduce(trunc(x)) -> trunc(reduce(x)).
3672/// reduce(sext(x)) -> sext(reduce(x)).
3673/// reduce(zext(x)) -> zext(reduce(x)).
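/// A sketch of the trunc case (illustrative types, not from the source):
///   %t = trunc <8 x i32> %x to <8 x i8>
///   %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t)
/// can become
///   %w = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %w to i8
/// when reducing the wider element type is no more expensive.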
3674bool VectorCombine::foldCastFromReductions(Instruction &I) {
3675 auto *II = dyn_cast<IntrinsicInst>(&I);
3676 if (!II)
3677 return false;
3678
3679 bool TruncOnly = false;
3680 Intrinsic::ID IID = II->getIntrinsicID();
3681 switch (IID) {
3682 case Intrinsic::vector_reduce_add:
3683 case Intrinsic::vector_reduce_mul:
3684 TruncOnly = true;
3685 break;
3686 case Intrinsic::vector_reduce_and:
3687 case Intrinsic::vector_reduce_or:
3688 case Intrinsic::vector_reduce_xor:
3689 break;
3690 default:
3691 return false;
3692 }
3693
3694 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3695 Value *ReductionSrc = I.getOperand(0);
3696
3697 Value *Src;
3698 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3699 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3700 return false;
3701
3702 auto CastOpc =
3703 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3704
3705 auto *SrcTy = cast<VectorType>(Src->getType());
3706 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3707 Type *ResultTy = I.getType();
3708
3709 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3710 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3711 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3712 TTI::CastContextHint::None, CostKind,
3713 cast<CastInst>(ReductionSrc));
3714 InstructionCost NewCost =
3715 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3716 CostKind) +
3717 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3718 TTI::CastContextHint::None, CostKind);
3719
3720 if (OldCost <= NewCost || !NewCost.isValid())
3721 return false;
3722
3723 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3724 II->getIntrinsicID(), {Src});
3725 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3726 replaceValue(I, *NewCast);
3727 return true;
3728}
3729
3730/// Returns true if this ShuffleVectorInst eventually feeds into a
3731/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3732/// chains of shuffles and binary operators (in any combination/order).
3733 /// The search is capped at a fixed number of visited users.
3734 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
3735 constexpr unsigned MaxVisited = 32;
3736 SmallPtrSet<Instruction *, 8> Visited;
3737 SmallVector<Instruction *, 4> WorkList;
3738 bool FoundReduction = false;
3739
3740 WorkList.push_back(SVI);
3741 while (!WorkList.empty()) {
3742 Instruction *I = WorkList.pop_back_val();
3743 for (User *U : I->users()) {
3744 auto *UI = cast<Instruction>(U);
3745 if (!UI || !Visited.insert(UI).second)
3746 continue;
3747 if (Visited.size() > MaxVisited)
3748 return false;
3749 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3750 // More than one reduction reached
3751 if (FoundReduction)
3752 return false;
3753 switch (II->getIntrinsicID()) {
3754 case Intrinsic::vector_reduce_add:
3755 case Intrinsic::vector_reduce_mul:
3756 case Intrinsic::vector_reduce_and:
3757 case Intrinsic::vector_reduce_or:
3758 case Intrinsic::vector_reduce_xor:
3759 case Intrinsic::vector_reduce_smin:
3760 case Intrinsic::vector_reduce_smax:
3761 case Intrinsic::vector_reduce_umin:
3762 case Intrinsic::vector_reduce_umax:
3763 FoundReduction = true;
3764 continue;
3765 default:
3766 return false;
3767 }
3768 }
3769
3770 if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
3771 return false;
3772
3773 WorkList.emplace_back(UI);
3774 }
3775 }
3776 return FoundReduction;
3777}
3778
3779/// This method looks for groups of shuffles acting on binops, of the form:
3780/// %x = shuffle ...
3781/// %y = shuffle ...
3782/// %a = binop %x, %y
3783/// %b = binop %x, %y
3784/// shuffle %a, %b, selectmask
3785/// We may, especially if the shuffle is wider than legal, be able to convert
3786/// the shuffle to a form where only parts of a and b need to be computed. On
3787/// architectures with no obvious "select" shuffle, this can reduce the total
3788/// number of operations if the target reports them as cheaper.
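/// As a hedged illustration (an assumed shape, not lifted from the tests):
/// with <8 x i32> inputs, a select mask such as
///   <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
/// keeps only the even lanes of %a and the odd lanes of %b, so after
/// repacking the inputs each binop only has to produce half of its lanes.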
3789bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
3790 auto *SVI = cast<ShuffleVectorInst>(&I);
3791 auto *VT = cast<FixedVectorType>(I.getType());
3792 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3793 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3794 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3795 VT != Op0->getType())
3796 return false;
3797
3798 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3799 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3800 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3801 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3802 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
3803 auto checkSVNonOpUses = [&](Instruction *I) {
3804 if (!I || I->getOperand(0)->getType() != VT)
3805 return true;
3806 return any_of(I->users(), [&](User *U) {
3807 return U != Op0 && U != Op1 &&
3808 !(isa<ShuffleVectorInst>(U) &&
3809 (InputShuffles.contains(cast<Instruction>(U)) ||
3810 isInstructionTriviallyDead(cast<Instruction>(U))));
3811 });
3812 };
3813 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3814 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3815 return false;
3816
3817 // Collect all the uses that are shuffles that we can transform together. We
3818 // may not have a single shuffle, but a group that can all be transformed
3819 // together profitably.
3820 SmallVector<ShuffleVectorInst *> Shuffles;
3821 auto collectShuffles = [&](Instruction *I) {
3822 for (auto *U : I->users()) {
3823 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3824 if (!SV || SV->getType() != VT)
3825 return false;
3826 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3827 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3828 return false;
3829 if (!llvm::is_contained(Shuffles, SV))
3830 Shuffles.push_back(SV);
3831 }
3832 return true;
3833 };
3834 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3835 return false;
3836 // From a reduction, we need to be processing a single shuffle, otherwise the
3837 // other uses will not be lane-invariant.
3838 if (FromReduction && Shuffles.size() > 1)
3839 return false;
3840
3841 // Add any shuffle uses for the shuffles we have found, to include them in our
3842 // cost calculations.
3843 if (!FromReduction) {
3844 for (ShuffleVectorInst *SV : Shuffles) {
3845 for (auto *U : SV->users()) {
3846 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3847 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3848 Shuffles.push_back(SSV);
3849 }
3850 }
3851 }
3852
3853 // For each of the output shuffles, we try to sort all the first vector
3854 // elements to the beginning, followed by the second array elements at the
3855 // end. If the binops are legalized to smaller vectors, this may reduce total
3856 // number of binops. We compute the ReconstructMask mask needed to convert
3857 // back to the original lane order.
3858 SmallVector<std::pair<int, int>> V1, V2;
3859 SmallVector<SmallVector<int>> OrigReconstructMasks;
3860 int MaxV1Elt = 0, MaxV2Elt = 0;
3861 unsigned NumElts = VT->getNumElements();
3862 for (ShuffleVectorInst *SVN : Shuffles) {
3863 SmallVector<int> Mask;
3864 SVN->getShuffleMask(Mask);
3865
3866 // Check the operands are the same as the original, or reversed (in which
3867 // case we need to commute the mask).
3868 Value *SVOp0 = SVN->getOperand(0);
3869 Value *SVOp1 = SVN->getOperand(1);
3870 if (isa<UndefValue>(SVOp1)) {
3871 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3872 SVOp0 = SSV->getOperand(0);
3873 SVOp1 = SSV->getOperand(1);
3874 for (int &Elem : Mask) {
3875 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3876 return false;
3877 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3878 }
3879 }
3880 if (SVOp0 == Op1 && SVOp1 == Op0) {
3881 std::swap(SVOp0, SVOp1);
3882 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
3883 }
3884 if (SVOp0 != Op0 || SVOp1 != Op1)
3885 return false;
3886
3887 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3888 // take the packed values from Op0/Op1 and reconstructing to the original
3889 // order.
3890 SmallVector<int> ReconstructMask;
3891 for (unsigned I = 0; I < Mask.size(); I++) {
3892 if (Mask[I] < 0) {
3893 ReconstructMask.push_back(-1);
3894 } else if (Mask[I] < static_cast<int>(NumElts)) {
3895 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3896 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3897 return Mask[I] == A.first;
3898 });
3899 if (It != V1.end())
3900 ReconstructMask.push_back(It - V1.begin());
3901 else {
3902 ReconstructMask.push_back(V1.size());
3903 V1.emplace_back(Mask[I], V1.size());
3904 }
3905 } else {
3906 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3907 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3908 return Mask[I] - static_cast<int>(NumElts) == A.first;
3909 });
3910 if (It != V2.end())
3911 ReconstructMask.push_back(NumElts + It - V2.begin());
3912 else {
3913 ReconstructMask.push_back(NumElts + V2.size());
3914 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3915 }
3916 }
3917 }
3918
3919 // For reductions, we know that the output lane ordering doesn't alter the
3920 // result. In-order can help simplify the shuffle away.
3921 if (FromReduction)
3922 sort(ReconstructMask);
3923 OrigReconstructMasks.push_back(std::move(ReconstructMask));
3924 }
3925
3926 // If the maximum elements used from V1 and V2 are not larger than the new
3927 // vectors, the vectors are already packed and performing the optimization
3928 // again will likely not help any further. This also prevents us from getting
3929 // stuck in a cycle in case the costs do not also rule it out.
3930 if (V1.empty() || V2.empty() ||
3931 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
3932 MaxV2Elt == static_cast<int>(V2.size()) - 1))
3933 return false;
3934
3935 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
3936 // shuffle of another shuffle, or not a shuffle (that is treated like an
3937 // identity shuffle).
3938 auto GetBaseMaskValue = [&](Instruction *I, int M) {
3939 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3940 if (!SV)
3941 return M;
3942 if (isa<UndefValue>(SV->getOperand(1)))
3943 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3944 if (InputShuffles.contains(SSV))
3945 return SSV->getMaskValue(SV->getMaskValue(M));
3946 return SV->getMaskValue(M);
3947 };
3948
3949 // Attempt to sort the inputs by ascending mask values to make simpler input
3950 // shuffles and push complex shuffles down to the uses. We sort on the first
3951 // of the two input shuffle orders, to try and get at least one input into a
3952 // nice order.
3953 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
3954 std::pair<int, int> Y) {
3955 int MXA = GetBaseMaskValue(A, X.first);
3956 int MYA = GetBaseMaskValue(A, Y.first);
3957 return MXA < MYA;
3958 };
3959 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
3960 return SortBase(SVI0A, A, B);
3961 });
3962 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
3963 return SortBase(SVI1A, A, B);
3964 });
3965 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
3966 // modified order of the input shuffles.
3967 SmallVector<SmallVector<int>> ReconstructMasks;
3968 for (const auto &Mask : OrigReconstructMasks) {
3969 SmallVector<int> ReconstructMask;
3970 for (int M : Mask) {
3971 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
3972 auto It = find_if(V, [M](auto A) { return A.second == M; });
3973 assert(It != V.end() && "Expected all entries in Mask");
3974 return std::distance(V.begin(), It);
3975 };
3976 if (M < 0)
3977 ReconstructMask.push_back(-1);
3978 else if (M < static_cast<int>(NumElts)) {
3979 ReconstructMask.push_back(FindIndex(V1, M));
3980 } else {
3981 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
3982 }
3983 }
3984 ReconstructMasks.push_back(std::move(ReconstructMask));
3985 }
3986
3987 // Calculate the masks needed for the new input shuffles, which get padded
3988 // with undef
3989 SmallVector<int> V1A, V1B, V2A, V2B;
3990 for (unsigned I = 0; I < V1.size(); I++) {
3991 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
3992 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
3993 }
3994 for (unsigned I = 0; I < V2.size(); I++) {
3995 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
3996 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
3997 }
3998 while (V1A.size() < NumElts) {
3999 V1A.push_back(PoisonMaskElem);
4000 V1B.push_back(PoisonMaskElem);
4001 }
4002 while (V2A.size() < NumElts) {
4003 V2A.push_back(PoisonMaskElem);
4004 V2B.push_back(PoisonMaskElem);
4005 }
4006
4007 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
4008 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4009 if (!SV)
4010 return C;
4011 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
4012 ? TTI::SK_PermuteSingleSrc
4013 : TTI::SK_PermuteTwoSrc,
4014 VT, VT, SV->getShuffleMask(), CostKind);
4015 };
4016 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4017 return C +
4018 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4019 };
4020
4021 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
4022 unsigned MaxVectorSize =
4023 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
4024 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
4025 if (MaxElementsInVector == 0)
4026 return false;
4027 // When there are multiple shufflevector operations on the same input,
4028 // especially when the vector length is larger than the register size,
4029 // identical shuffle patterns may occur across different groups of elements.
4030 // To avoid overestimating the cost by counting these repeated shuffles more
4031 // than once, we only account for unique shuffle patterns. This adjustment
4032 // prevents inflated costs in the cost model for wide vectors split into
4033 // several register-sized groups.
4034 std::set<SmallVector<int, 4>> UniqueShuffles;
4035 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4036 // Compute the cost for performing the shuffle over the full vector.
4037 auto ShuffleCost =
4038 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4039 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
4040 if (NumFullVectors < 2)
4041 return C + ShuffleCost;
4042 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
4043 unsigned NumUniqueGroups = 0;
4044 unsigned NumGroups = Mask.size() / MaxElementsInVector;
4045 // For each group of MaxElementsInVector contiguous elements,
4046 // collect their shuffle pattern and insert into the set of unique patterns.
4047 for (unsigned I = 0; I < NumFullVectors; ++I) {
4048 for (unsigned J = 0; J < MaxElementsInVector; ++J)
4049 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
4050 if (UniqueShuffles.insert(SubShuffle).second)
4051 NumUniqueGroups += 1;
4052 }
4053 return C + ShuffleCost * NumUniqueGroups / NumGroups;
4054 };
4055 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
4056 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4057 if (!SV)
4058 return C;
4059 SmallVector<int, 16> Mask;
4060 SV->getShuffleMask(Mask);
4061 return AddShuffleMaskAdjustedCost(C, Mask);
4062 };
4063 // Check that input consists of ShuffleVectors applied to the same input
4064 auto AllShufflesHaveSameOperands =
4065 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
4066 if (InputShuffles.size() < 2)
4067 return false;
4068 ShuffleVectorInst *FirstSV =
4069 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
4070 if (!FirstSV)
4071 return false;
4072
4073 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
4074 return std::all_of(
4075 std::next(InputShuffles.begin()), InputShuffles.end(),
4076 [&](Instruction *I) {
4077 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
4078 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
4079 });
4080 };
4081
4082 // Get the costs of the shuffles + binops before and after with the new
4083 // shuffle masks.
4084 InstructionCost CostBefore =
4085 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
4086 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
4087 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
4088 InstructionCost(0), AddShuffleCost);
4089 if (AllShufflesHaveSameOperands(InputShuffles)) {
4090 UniqueShuffles.clear();
4091 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4092 InstructionCost(0), AddShuffleAdjustedCost);
4093 } else {
4094 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4095 InstructionCost(0), AddShuffleCost);
4096 }
4097
4098 // The new binops will be unused for lanes past the used shuffle lengths.
4099 // These types attempt to get the correct cost for that from the target.
4100 FixedVectorType *Op0SmallVT =
4101 FixedVectorType::get(VT->getScalarType(), V1.size());
4102 FixedVectorType *Op1SmallVT =
4103 FixedVectorType::get(VT->getScalarType(), V2.size());
4104 InstructionCost CostAfter =
4105 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4106 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4107 UniqueShuffles.clear();
4108 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4109 InstructionCost(0), AddShuffleMaskAdjustedCost);
4110 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4111 CostAfter +=
4112 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4113 InstructionCost(0), AddShuffleMaskCost);
4114
4115 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4116 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4117 << " vs CostAfter: " << CostAfter << "\n");
4118 if (CostBefore < CostAfter ||
4119 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4120 return false;
4121
4122 // The cost model has passed, create the new instructions.
4123 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4124 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4125 if (!SV)
4126 return I;
4127 if (isa<UndefValue>(SV->getOperand(1)))
4128 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4129 if (InputShuffles.contains(SSV))
4130 return SSV->getOperand(Op);
4131 return SV->getOperand(Op);
4132 };
4133 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4134 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4135 GetShuffleOperand(SVI0A, 1), V1A);
4136 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4137 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4138 GetShuffleOperand(SVI0B, 1), V1B);
4139 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4140 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4141 GetShuffleOperand(SVI1A, 1), V2A);
4142 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4143 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4144 GetShuffleOperand(SVI1B, 1), V2B);
4145 Builder.SetInsertPoint(Op0);
4146 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4147 NSV0A, NSV0B);
4148 if (auto *I = dyn_cast<Instruction>(NOp0))
4149 I->copyIRFlags(Op0, true);
4150 Builder.SetInsertPoint(Op1);
4151 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4152 NSV1A, NSV1B);
4153 if (auto *I = dyn_cast<Instruction>(NOp1))
4154 I->copyIRFlags(Op1, true);
4155
4156 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4157 Builder.SetInsertPoint(Shuffles[S]);
4158 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4159 replaceValue(*Shuffles[S], *NSV, false);
4160 }
4161
4162 Worklist.pushValue(NSV0A);
4163 Worklist.pushValue(NSV0B);
4164 Worklist.pushValue(NSV1A);
4165 Worklist.pushValue(NSV1B);
4166 return true;
4167}
4168
4169/// Check if instruction depends on ZExt and this ZExt can be moved after the
4170/// instruction. Move ZExt if it is profitable. For example:
4171/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4172/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4173 /// Cost model calculations take into account whether zext(x) has other users and
4174/// whether it can be propagated through them too.
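/// A minimal sketch of the logic case (illustrative types, not from the
/// source):
///   %zx = zext <8 x i8> %x to <8 x i32>
///   %r  = and <8 x i32> %zx, splat (i32 15)
/// can be rewritten as
///   %n  = and <8 x i8> %x, splat (i8 15)
///   %r  = zext <8 x i8> %n to <8 x i32>
/// because the result never uses more than the low 8 bits.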
4175bool VectorCombine::shrinkType(Instruction &I) {
4176 Value *ZExted, *OtherOperand;
4177 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4178 m_Value(OtherOperand))) &&
4179 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4180 return false;
4181
4182 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4183
4184 auto *BigTy = cast<FixedVectorType>(I.getType());
4185 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4186 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4187
4188 if (I.getOpcode() == Instruction::LShr) {
4189 // Check that the shift amount is less than the number of bits in the
4190 // smaller type. Otherwise, the smaller lshr will return a poison value.
4191 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4192 if (ShAmtKB.getMaxValue().uge(BW))
4193 return false;
4194 } else {
4195 // Check that the expression overall uses at most the same number of bits as
4196 // ZExted
4197 KnownBits KB = computeKnownBits(&I, *DL);
4198 if (KB.countMaxActiveBits() > BW)
4199 return false;
4200 }
4201
4202 // Calculate costs of leaving current IR as it is and moving ZExt operation
4203 // later, along with adding truncates if needed
4204 InstructionCost ZExtCost = TTI.getCastInstrCost(
4205 Instruction::ZExt, BigTy, SmallTy,
4206 TargetTransformInfo::CastContextHint::None, CostKind);
4207 InstructionCost CurrentCost = ZExtCost;
4208 InstructionCost ShrinkCost = 0;
4209
4210 // Calculate total cost and check that we can propagate through all ZExt users
4211 for (User *U : ZExtOperand->users()) {
4212 auto *UI = cast<Instruction>(U);
4213 if (UI == &I) {
4214 CurrentCost +=
4215 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4216 ShrinkCost +=
4217 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4218 ShrinkCost += ZExtCost;
4219 continue;
4220 }
4221
4222 if (!Instruction::isBinaryOp(UI->getOpcode()))
4223 return false;
4224
4225 // Check if we can propagate ZExt through its other users
4226 KnownBits KB = computeKnownBits(UI, *DL);
4227 if (KB.countMaxActiveBits() > BW)
4228 return false;
4229
4230 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4231 ShrinkCost +=
4232 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4233 ShrinkCost += ZExtCost;
4234 }
4235
4236 // If the other instruction operand is not a constant, we'll need to
4237 // generate a truncate instruction. So we have to adjust cost
4238 if (!isa<Constant>(OtherOperand))
4239 ShrinkCost += TTI.getCastInstrCost(
4240 Instruction::Trunc, SmallTy, BigTy,
4241 TargetTransformInfo::CastContextHint::None, CostKind);
4242
4243 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4244 // towards modifying the IR because shrinking opens opportunities for other
4245 // shrinking optimisations.
4246 if (ShrinkCost > CurrentCost)
4247 return false;
4248
4249 Builder.SetInsertPoint(&I);
4250 Value *Op0 = ZExted;
4251 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4252 // Keep the order of operands the same
4253 if (I.getOperand(0) == OtherOperand)
4254 std::swap(Op0, Op1);
4255 Value *NewBinOp =
4256 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4257 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4258 cast<Instruction>(NewBinOp)->copyMetadata(I);
4259 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4260 replaceValue(I, *NewZExtr);
4261 return true;
4262}
4263
4264/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4265/// shuffle (DstVec, SrcVec, Mask)
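/// For instance (an illustrative case, not from the source), with two
/// <4 x i32> vectors:
///   %e = extractelement <4 x i32> %src, i64 2
///   %r = insertelement <4 x i32> %dst, i32 %e, i64 0
/// becomes
///   %r = shufflevector <4 x i32> %dst, <4 x i32> %src,
///                      <4 x i32> <i32 6, i32 1, i32 2, i32 3>
/// where mask element 6 addresses lane 2 of the second operand.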
4266bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4267 Value *DstVec, *SrcVec;
4268 uint64_t ExtIdx, InsIdx;
4269 if (!match(&I,
4270 m_InsertElt(m_Value(DstVec),
4271 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4272 m_ConstantInt(InsIdx))))
4273 return false;
4274
4275 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4276 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4277 // We can try combining vectors with different element counts (same element type).
4278 if (!DstVecTy || !SrcVecTy ||
4279 SrcVecTy->getElementType() != DstVecTy->getElementType())
4280 return false;
4281
4282 unsigned NumDstElts = DstVecTy->getNumElements();
4283 unsigned NumSrcElts = SrcVecTy->getNumElements();
4284 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4285 return false;
4286
4287 // Insertion into poison is a cheaper single operand shuffle.
4288 TTI::ShuffleKind SK = TTI::SK_PermuteTwoSrc;
4289 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4290
4291 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4292 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4293 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4294 if (NeedDstSrcSwap) {
4295 SK = TTI::SK_PermuteSingleSrc;
4296 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4297 Mask[InsIdx] = 0;
4298 else
4299 Mask[InsIdx] = ExtIdx;
4300 std::swap(DstVec, SrcVec);
4301 } else {
4303 std::iota(Mask.begin(), Mask.end(), 0);
4304 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4305 Mask[InsIdx] = NumDstElts;
4306 else
4307 Mask[InsIdx] = ExtIdx + NumDstElts;
4308 }
4309
4310 // Cost
4311 auto *Ins = cast<InsertElementInst>(&I);
4312 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4313 InstructionCost InsCost =
4314 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4315 InstructionCost ExtCost =
4316 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4317 InstructionCost OldCost = ExtCost + InsCost;
4318
4319 InstructionCost NewCost = 0;
4320 SmallVector<int> ExtToVecMask;
4321 if (!NeedExpOrNarrow) {
4322 // Ignore 'free' identity insertion shuffle.
4323 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4324 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4325 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4326 nullptr, {DstVec, SrcVec});
4327 } else {
4328 // When creating a length-changing vector, always create it with a Mask whose
4329 // first element has an ExtIdx, so that the first element of the vector
4330 // being created is always the target to be extracted.
4331 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4332 if (IsExtIdxInBounds)
4333 ExtToVecMask[ExtIdx] = ExtIdx;
4334 else
4335 ExtToVecMask[0] = ExtIdx;
4336 // Add cost for expanding or narrowing
4337 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
4338 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4339 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4340 }
4341
4342 if (!Ext->hasOneUse())
4343 NewCost += ExtCost;
4344
4345 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
4346 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4347 << "\n");
4348
4349 if (OldCost < NewCost)
4350 return false;
4351
4352 if (NeedExpOrNarrow) {
4353 if (!NeedDstSrcSwap)
4354 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4355 else
4356 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4357 }
4358
4359 // Canonicalize undef param to RHS to help further folds.
4360 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4361 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4362 std::swap(DstVec, SrcVec);
4363 }
4364
4365 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4366 replaceValue(I, *Shuf);
4367
4368 return true;
4369}
4370
4371/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4372/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4373/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4374/// before casting it back into `<vscale x 16 x i32>`.
4375bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4376 const APInt *SplatVal0, *SplatVal1;
4377 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4378 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4379 return false;
4380
4381 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4382 << "\n");
4383
4384 auto *VTy =
4385 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4386 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4387 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4388
4389 // Just in case the cost of interleave2 intrinsic and bitcast are both
4390 // invalid, in which case we want to bail out, we use <= rather
4391 // than < here. Even if they both have valid and equal costs, it's probably
4392 // not a good idea to emit a high-cost constant splat.
4393 if (TTI.getInstructionCost(&I, CostKind) <=
4394 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4395 TTI::CastContextHint::None, CostKind)) {
4396 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4397 << *I.getType() << " is too high.\n");
4398 return false;
4399 }
4400
4401 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4402 NewSplatVal <<= Width;
4403 NewSplatVal |= SplatVal0->zext(Width * 2);
4404 auto *NewSplat = ConstantVector::getSplat(
4405 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4406
4407 IRBuilder<> Builder(&I);
4408 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4409 return true;
4410}
4411
4412// Attempt to shrink loads that are only used by shufflevector instructions.
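// For example (a sketch under assumed types, not from the source): if an
// <8 x i32> load %ld is only consumed by
//   %s = shufflevector <8 x i32> %ld, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
// then only the first two elements are ever read, so the load can be
// narrowed to <2 x i32> (subject to the cost model) and the shuffle rewritten
// to read from the narrower load.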
4413bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4414 auto *OldLoad = dyn_cast<LoadInst>(&I);
4415 if (!OldLoad || !OldLoad->isSimple())
4416 return false;
4417
4418 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4419 if (!OldLoadTy)
4420 return false;
4421
4422 unsigned const OldNumElements = OldLoadTy->getNumElements();
4423
4424 // Search all uses of load. If all uses are shufflevector instructions, and
4425 // the second operands are all poison values, find the minimum and maximum
4426 // indices of the vector elements referenced by all shuffle masks.
4427 // Otherwise return `std::nullopt`.
4428 using IndexRange = std::pair<int, int>;
4429 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4430 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4431 for (llvm::Use &Use : I.uses()) {
4432 // Ensure all uses match the required pattern.
4433 User *Shuffle = Use.getUser();
4434 ArrayRef<int> Mask;
4435
4436 if (!match(Shuffle,
4437 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4438 return std::nullopt;
4439
4440 // Ignore shufflevector instructions that have no uses.
4441 if (Shuffle->use_empty())
4442 continue;
4443
4444 // Find the min and max indices used by the shufflevector instruction.
4445 for (int Index : Mask) {
4446 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4447 OutputRange.first = std::min(Index, OutputRange.first);
4448 OutputRange.second = std::max(Index, OutputRange.second);
4449 }
4450 }
4451 }
4452
4453 if (OutputRange.second < OutputRange.first)
4454 return std::nullopt;
4455
4456 return OutputRange;
4457 };
4458
4459 // Get the range of vector elements used by shufflevector instructions.
4460 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4461 unsigned const NewNumElements = Indices->second + 1u;
4462
4463 // If the range of vector elements is smaller than the full load, attempt
4464 // to create a smaller load.
4465 if (NewNumElements < OldNumElements) {
4466 IRBuilder Builder(&I);
4467 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4468
4469 // Calculate costs of old and new ops.
4470 Type *ElemTy = OldLoadTy->getElementType();
4471 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4472 Value *PtrOp = OldLoad->getPointerOperand();
4473
4474 InstructionCost OldCost = TTI.getMemoryOpCost(
4475 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4476 OldLoad->getPointerAddressSpace(), CostKind);
4477 InstructionCost NewCost =
4478 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4479 OldLoad->getPointerAddressSpace(), CostKind);
4480
4481 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4482 SmallVector<UseEntry, 4u> NewUses;
4483 unsigned const MaxIndex = NewNumElements * 2u;
4484
4485 for (llvm::Use &Use : I.uses()) {
4486 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4487 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4488
4489 // Create entry for new use.
4490 NewUses.push_back({Shuffle, OldMask});
4491
4492 // Validate mask indices.
4493 for (int Index : OldMask) {
4494 if (Index >= static_cast<int>(MaxIndex))
4495 return false;
4496 }
4497
4498 // Update costs.
4499 OldCost +=
4500 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4501 OldLoadTy, OldMask, CostKind);
4502 NewCost +=
4503 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4504 NewLoadTy, OldMask, CostKind);
4505 }
4506
4507 LLVM_DEBUG(
4508 dbgs() << "Found a load used only by shufflevector instructions: "
4509 << I << "\n OldCost: " << OldCost
4510 << " vs NewCost: " << NewCost << "\n");
4511
4512 if (OldCost < NewCost || !NewCost.isValid())
4513 return false;
4514
4515 // Create new load of smaller vector.
4516 auto *NewLoad = cast<LoadInst>(
4517 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4518 NewLoad->copyMetadata(I);
4519
4520 // Replace all uses.
4521 for (UseEntry &Use : NewUses) {
4522 ShuffleVectorInst *Shuffle = Use.first;
4523 std::vector<int> &NewMask = Use.second;
4524
4525 Builder.SetInsertPoint(Shuffle);
4526 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4527 Value *NewShuffle = Builder.CreateShuffleVector(
4528 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4529
4530 replaceValue(*Shuffle, *NewShuffle, false);
4531 }
4532
4533 return true;
4534 }
4535 }
4536 return false;
4537}
4538
4539// Attempt to narrow a phi of shufflevector instructions where the two incoming
4540// values have the same operands but different masks. If the two shuffle masks
4541// are offsets of one another we can use one branch to rotate the incoming
4542// vector and perform one larger shuffle after the phi.
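// Illustrative sketch (assumed masks, not from the source): with a <4 x i32>
// input %v and <8 x i32> results, incoming masks
//   %in0: <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
//   %in1: <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
// differ by a constant offset of 1, so one incoming value becomes a narrow
// rotate of %v (mask <1, 2, 3, 0>) and a single wide shuffle using %in1's
// mask is emitted after the phi.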
4543bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4544 auto *Phi = dyn_cast<PHINode>(&I);
4545 if (!Phi || Phi->getNumIncomingValues() != 2u)
4546 return false;
4547
4548 Value *Op = nullptr;
4549 ArrayRef<int> Mask0;
4550 ArrayRef<int> Mask1;
4551
4552 if (!match(Phi->getOperand(0u),
4553 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4554 !match(Phi->getOperand(1u),
4555 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4556 return false;
4557
4558 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4559
4560 // Ensure result vectors are wider than the argument vector.
4561 auto *InputVT = cast<FixedVectorType>(Op->getType());
4562 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4563 auto const InputNumElements = InputVT->getNumElements();
4564
4565 if (InputNumElements >= ResultVT->getNumElements())
4566 return false;
4567
4568 // Take the difference of the two shuffle masks at each index. Ignore poison
4569 // values at the same index in both masks.
4570 SmallVector<int, 16> NewMask;
4571 NewMask.reserve(Mask0.size());
4572
4573 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4574 if (M0 >= 0 && M1 >= 0)
4575 NewMask.push_back(M0 - M1);
4576 else if (M0 == -1 && M1 == -1)
4577 continue;
4578 else
4579 return false;
4580 }
4581
4582 // Ensure all elements of the new mask are equal. If the difference between
4583 // the incoming mask elements is the same, the two must be constant offsets
4584 // of one another.
4585 if (NewMask.empty() || !all_equal(NewMask))
4586 return false;
4587
4588 // Create new mask using difference of the two incoming masks.
4589 int MaskOffset = NewMask[0u];
4590 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
4591 NewMask.clear();
4592
4593 for (unsigned I = 0u; I < InputNumElements; ++I) {
4594 NewMask.push_back(Index);
4595 Index = (Index + 1u) % InputNumElements;
4596 }
4597
4598 // Calculate costs for worst cases and compare.
4599 auto const Kind = TTI::SK_PermuteSingleSrc;
4600 auto OldCost =
4601 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4602 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4603 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4604 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4605
4606 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4607 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4608 << "\n");
4609
4610 if (NewCost > OldCost)
4611 return false;
4612
4613 // Create new shuffles and narrowed phi.
4614 auto Builder = IRBuilder(Shuf);
4615 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4616 auto *PoisonVal = PoisonValue::get(InputVT);
4617 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4618 Worklist.push(cast<Instruction>(NewShuf0));
4619
4620 Builder.SetInsertPoint(Phi);
4621 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4622 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4623 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4624 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4625
4626 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4627 PoisonVal = PoisonValue::get(NewPhi->getType());
4628 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4629
4630 replaceValue(*Phi, *NewShuf1);
4631 return true;
4632}
4633
4634/// This is the entry point for all transforms. Pass manager differences are
4635/// handled in the callers of this function.
4636bool VectorCombine::run() {
4637 if (DisableVectorCombine)
4638 return false;
4639
4640 // Don't attempt vectorization if the target does not support vectors.
4641 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4642 return false;
4643
4644 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4645
4646 auto FoldInst = [this](Instruction &I) {
4647 Builder.SetInsertPoint(&I);
4648 bool IsVectorType = isa<VectorType>(I.getType());
4649 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4650 auto Opcode = I.getOpcode();
4651
4652 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4653
4654 // These folds should be beneficial regardless of when this pass is run
4655 // in the optimization pipeline.
4656 // The type checking is for run-time efficiency. We can avoid wasting time
4657 // dispatching to folding functions if there's no chance of matching.
4658 if (IsFixedVectorType) {
4659 switch (Opcode) {
4660 case Instruction::InsertElement:
4661 if (vectorizeLoadInsert(I))
4662 return true;
4663 break;
4664 case Instruction::ShuffleVector:
4665 if (widenSubvectorLoad(I))
4666 return true;
4667 break;
4668 default:
4669 break;
4670 }
4671 }
4672
4673 // These transforms work with scalable and fixed vectors.
4674 // TODO: Identify and allow other scalable transforms.
4675 if (IsVectorType) {
4676 if (scalarizeOpOrCmp(I))
4677 return true;
4678 if (scalarizeLoad(I))
4679 return true;
4680 if (scalarizeExtExtract(I))
4681 return true;
4682 if (scalarizeVPIntrinsic(I))
4683 return true;
4684 if (foldInterleaveIntrinsics(I))
4685 return true;
4686 }
4687
4688 if (Opcode == Instruction::Store)
4689 if (foldSingleElementStore(I))
4690 return true;
4691
4692 // If this is an early pipeline invocation of this pass, we are done.
4693 if (TryEarlyFoldsOnly)
4694 return false;
4695
4696 // Otherwise, try folds that improve codegen but may interfere with
4697 // early IR canonicalizations.
4698 // The type checking is for run-time efficiency. We can avoid wasting time
4699 // dispatching to folding functions if there's no chance of matching.
4700 if (IsFixedVectorType) {
4701 switch (Opcode) {
4702 case Instruction::InsertElement:
4703 if (foldInsExtFNeg(I))
4704 return true;
4705 if (foldInsExtBinop(I))
4706 return true;
4707 if (foldInsExtVectorToShuffle(I))
4708 return true;
4709 break;
4710 case Instruction::ShuffleVector:
4711 if (foldPermuteOfBinops(I))
4712 return true;
4713 if (foldShuffleOfBinops(I))
4714 return true;
4715 if (foldShuffleOfSelects(I))
4716 return true;
4717 if (foldShuffleOfCastops(I))
4718 return true;
4719 if (foldShuffleOfShuffles(I))
4720 return true;
4721 if (foldShuffleOfIntrinsics(I))
4722 return true;
4723 if (foldSelectShuffle(I))
4724 return true;
4725 if (foldShuffleToIdentity(I))
4726 return true;
4727 break;
4728 case Instruction::Load:
4729 if (shrinkLoadForShuffles(I))
4730 return true;
4731 break;
4732 case Instruction::BitCast:
4733 if (foldBitcastShuffle(I))
4734 return true;
4735 break;
4736 case Instruction::And:
4737 case Instruction::Or:
4738 case Instruction::Xor:
4739 if (foldBitOpOfCastops(I))
4740 return true;
4741 if (foldBitOpOfCastConstant(I))
4742 return true;
4743 break;
4744 case Instruction::PHI:
4745 if (shrinkPhiOfShuffles(I))
4746 return true;
4747 break;
4748 default:
4749 if (shrinkType(I))
4750 return true;
4751 break;
4752 }
4753 } else {
4754 switch (Opcode) {
4755 case Instruction::Call:
4756 if (foldShuffleFromReductions(I))
4757 return true;
4758 if (foldCastFromReductions(I))
4759 return true;
4760 break;
4761 case Instruction::ExtractElement:
4762 if (foldShuffleChainsToReduce(I))
4763 return true;
4764 break;
4765 case Instruction::ICmp:
4766 case Instruction::FCmp:
4767 if (foldExtractExtract(I))
4768 return true;
4769 break;
4770 case Instruction::Or:
4771 if (foldConcatOfBoolMasks(I))
4772 return true;
4773 [[fallthrough]];
4774 default:
4775 if (Instruction::isBinaryOp(Opcode)) {
4776 if (foldExtractExtract(I))
4777 return true;
4778 if (foldExtractedCmps(I))
4779 return true;
4780 if (foldBinopOfReductions(I))
4781 return true;
4782 }
4783 break;
4784 }
4785 }
4786 return false;
4787 };
4788
4789 bool MadeChange = false;
4790 for (BasicBlock &BB : F) {
4791 // Ignore unreachable basic blocks.
4792 if (!DT.isReachableFromEntry(&BB))
4793 continue;
4794 // Use early-increment iteration so that we can erase instructions inside the
4795 // loop. make_early_inc_range is not applicable here, as the next iterator may
4796 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
4797 // We manually maintain the next instruction and update it when it is about
4798 // to be deleted.
4799 Instruction *I = &BB.front();
4800 while (I) {
4801 NextInst = I->getNextNode();
4802 if (!I->isDebugOrPseudoInst())
4803 MadeChange |= FoldInst(*I);
4804 I = NextInst;
4805 }
4806 }
4807
4808 NextInst = nullptr;
4809
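// Added note: folds queue newly created instructions here (for example the
// narrowed shuffle pushed in shrinkPhiOfShuffles above), so they are revisited
// by FoldInst once the initial sweep over the function is done.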
4810 while (!Worklist.isEmpty()) {
4811 Instruction *I = Worklist.removeOne();
4812 if (!I)
4813 continue;
4814
4815 if (isInstructionTriviallyDead(I)) {
4816 eraseInstruction(*I);
4817 continue;
4818 }
4819
4820 MadeChange |= FoldInst(*I);
4821 }
4822
4823 return MadeChange;
4824}
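// Added usage note: with an assertions/debug build, this path can be exercised
// in isolation with something like `opt -passes=vector-combine -S input.ll`,
// and `-debug-only=vector-combine` prints the LLVM_DEBUG output seen above
// (pass and debug-type names assumed from the registration, not documented in
// this file).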
4825
4826PreservedAnalyses VectorCombinePass::run(Function &F,
4827 FunctionAnalysisManager &FAM) {
4828 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4829 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
4830 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
4831 AAResults &AA = FAM.getResult<AAManager>(F);
4832 const DataLayout *DL = &F.getDataLayout();
4833 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4834 TryEarlyFoldsOnly);
4835 if (!Combiner.run())
4836 return PreservedAnalyses::all();
4837 PreservedAnalyses PA;
4838 PA.preserveSet<CFGAnalyses>();
4839 return PA;
4840}
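// Illustrative sketch (added annotation, not part of VectorCombine.cpp): one
// way to drive this pass programmatically through the new pass manager. The
// inline IR string is made up for the example; the parseAssemblyString,
// PassBuilder and VectorCombinePass APIs are the standard ones declared in
// the headers included below, and the program must be linked against the
// usual LLVM libraries.

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include <memory>

int main() {
  llvm::LLVMContext Ctx;
  llvm::SMDiagnostic Err;
  // Any parseable IR works here; this function is just something to visit.
  std::unique_ptr<llvm::Module> M = llvm::parseAssemblyString(
      "define <4 x float> @f(<4 x float> %v) {\n"
      "  ret <4 x float> %v\n"
      "}\n",
      Err, Ctx);
  if (!M)
    return 1;

  // Standard new-pass-manager boilerplate: analysis managers plus proxies.
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  llvm::PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // Run VectorCombine (full mode, i.e. TryEarlyFoldsOnly == false) on every
  // function definition in the module.
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::VectorCombinePass());
  for (llvm::Function &F : *M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);

  M->print(llvm::outs(), nullptr);
  return 0;
}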