LLVM 23.0.0git
AggressiveInstCombine.cpp
Go to the documentation of this file.
1//===- AggressiveInstCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the aggressive expression pattern combiner classes.
10// Currently, it handles expression patterns for:
11// * Truncate instruction
12//
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/Statistic.h"
27#include "llvm/IR/DataLayout.h"
28#include "llvm/IR/Dominators.h"
29#include "llvm/IR/Function.h"
30#include "llvm/IR/IRBuilder.h"
31#include "llvm/IR/Instruction.h"
32#include "llvm/IR/MDBuilder.h"
40
41using namespace llvm;
42using namespace PatternMatch;
43
44#define DEBUG_TYPE "aggressive-instcombine"
45
46namespace llvm {
48}
49
// Transformation counters reported with -stats.
STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
STATISTIC(NumGuardedRotates,
          "Number of guarded rotates transformed into funnel shifts");
STATISTIC(NumGuardedFunnelShifts,
          "Number of guarded funnel shifts transformed into funnel shifts");
STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");

// NOTE(review): the `static cl::opt<...> Name(` declaration line for each of
// the three command-line options below appears to have been dropped by the
// extraction -- confirm against the upstream source.
    "aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden,
    cl::desc("Max number of instructions to scan for aggressive instcombine."));

    "strncmp-inline-threshold", cl::init(3), cl::Hidden,
    cl::desc("The maximum length of a constant string for a builtin string cmp "
             "call eligible for inlining. The default value is 3."));

MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden,
                      cl::desc("The maximum length of a constant string to "
                               "inline a memchr call."));
70
/// Match a pattern for a bitwise funnel/rotate operation that partially guards
/// against undefined behavior by branching around the funnel-shift/rotation
/// when the shift amount is 0.
// NOTE(review): the function header (presumably
// `static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT)
// {`, orig line 74) was dropped by the extraction -- confirm upstream.
  // Only a 2-input phi can merge the guarded and unguarded paths.
  if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
    return false;

  // As with the one-use checks below, this is not strictly necessary, but we
  // are being cautious to avoid potential perf regressions on targets that
  // do not actually have a funnel/rotate instruction (where the funnel shift
  // would be expanded back into math/shift/logic ops).
  if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
    return false;

  // Match V to funnel shift left/right and capture the source operands and
  // shift amount.
  auto matchFunnelShift = [](Value *V, Value *&ShVal0, Value *&ShVal1,
                             Value *&ShAmt) {
    unsigned Width = V->getType()->getScalarSizeInBits();

    // fshl(ShVal0, ShVal1, ShAmt)
    //  == (ShVal0 << ShAmt) | (ShVal1 >> (Width -ShAmt))
    if (match(V, m_OneUse(m_c_Or(
                     m_Shl(m_Value(ShVal0), m_Value(ShAmt)),
                     m_LShr(m_Value(ShVal1), m_Sub(m_SpecificInt(Width),
                                                   m_Deferred(ShAmt))))))) {
      return Intrinsic::fshl;
    }

    // fshr(ShVal0, ShVal1, ShAmt)
    //  == (ShVal0 >> ShAmt) | (ShVal1 << (Width - ShAmt))
    // NOTE(review): the opening of this matcher expression (orig line 103)
    // was dropped by the extraction; the fragment below is incomplete.
    if (match(V,
                          m_Value(ShAmt))),
                 m_LShr(m_Value(ShVal1), m_Deferred(ShAmt)))))) {
      return Intrinsic::fshr;
    }

    // NOTE(review): the fall-through return (presumably
    // `return Intrinsic::not_intrinsic;`, orig line 109) was dropped by the
    // extraction.
  };

  // One phi operand must be a funnel/rotate operation, and the other phi
  // operand must be the source value of that funnel/rotate operation:
  // phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
  // phi [ fshl(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal0, GuardBB ]
  // phi [ fshr(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal1, GuardBB ]
  PHINode &Phi = cast<PHINode>(I);
  unsigned FunnelOp = 0, GuardOp = 1;
  Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
  Value *ShVal0, *ShVal1, *ShAmt;
  Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
  if (IID == Intrinsic::not_intrinsic ||
      (IID == Intrinsic::fshl && ShVal0 != P1) ||
      (IID == Intrinsic::fshr && ShVal1 != P1)) {
    // The first phi operand did not match as the shift; retry with the
    // operands' roles (funnel value vs. guard value) swapped.
    IID = matchFunnelShift(P1, ShVal0, ShVal1, ShAmt);
    if (IID == Intrinsic::not_intrinsic ||
        (IID == Intrinsic::fshl && ShVal0 != P0) ||
        (IID == Intrinsic::fshr && ShVal1 != P0))
      return false;
    assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
           "Pattern must match funnel shift left or right");
    std::swap(FunnelOp, GuardOp);
  }

  // The incoming block with our source operand must be the "guard" block.
  // That must contain a cmp+branch to avoid the funnel/rotate when the shift
  // amount is equal to 0. The other incoming block is the block with the
  // funnel/rotate.
  BasicBlock *GuardBB = Phi.getIncomingBlock(GuardOp);
  BasicBlock *FunnelBB = Phi.getIncomingBlock(FunnelOp);
  Instruction *TermI = GuardBB->getTerminator();

  // Ensure that the shift values dominate each block.
  if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI))
    return false;

  BasicBlock *PhiBB = Phi.getParent();
  // NOTE(review): the opening of this guard-branch match (orig line 148,
  // presumably matching a branch on `icmp eq ShAmt, 0`) was dropped by the
  // extraction; the fragment below is incomplete.
                       m_ZeroInt()),
                  m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB))))
    return false;

  IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());

  if (ShVal0 == ShVal1)
    ++NumGuardedRotates;
  else
    ++NumGuardedFunnelShifts;

  // If this is not a rotate then the select was blocking poison from the
  // 'shift-by-zero' non-TVal, but a funnel shift won't - so freeze it.
  bool IsFshl = IID == Intrinsic::fshl;
  if (ShVal0 != ShVal1) {
    if (IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal1))
      ShVal1 = Builder.CreateFreeze(ShVal1);
    else if (!IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal0))
      ShVal0 = Builder.CreateFreeze(ShVal0);
  }

  // We matched a variation of this IR pattern:
  // GuardBB:
  //   %cmp = icmp eq i32 %ShAmt, 0
  //   br i1 %cmp, label %PhiBB, label %FunnelBB
  // FunnelBB:
  //   %sub = sub i32 32, %ShAmt
  //   %shr = lshr i32 %ShVal1, %sub
  //   %shl = shl i32 %ShVal0, %ShAmt
  //   %fsh = or i32 %shr, %shl
  //   br label %PhiBB
  // PhiBB:
  //   %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
  // -->
  // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
  Phi.replaceAllUsesWith(
      Builder.CreateIntrinsic(IID, Phi.getType(), {ShVal0, ShVal1, ShAmt}));
  return true;
}
188
/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
/// of 'and' ops, then we also need to capture the fact that we saw an
/// "and X, 1", so that's an extra return value for that case.
namespace {
struct MaskOps {
  // The common source value feeding every shifted/masked leaf of the chain.
  Value *Root = nullptr;
  // Accumulated set of bit indexes selected by the chain.
  APInt Mask;
  // True when walking an 'and' chain, false when walking an 'or' chain.
  bool MatchAndChain;
  // Set when an "and X, 1" was seen somewhere in an 'and' chain.
  bool FoundAnd1 = false;

  MaskOps(unsigned BitWidth, bool MatchAnds)
      : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {}
};
} // namespace
204
205/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
206/// chain of 'and' or 'or' instructions looking for shift ops of a common source
207/// value. Examples:
208/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
209/// returns { X, 0x129 }
210/// and (and (X >> 1), 1), (X >> 4)
211/// returns { X, 0x12 }
212static bool matchAndOrChain(Value *V, MaskOps &MOps) {
213 Value *Op0, *Op1;
214 if (MOps.MatchAndChain) {
215 // Recurse through a chain of 'and' operands. This requires an extra check
216 // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
217 // in the chain to know that all of the high bits are cleared.
218 if (match(V, m_And(m_Value(Op0), m_One()))) {
219 MOps.FoundAnd1 = true;
220 return matchAndOrChain(Op0, MOps);
221 }
222 if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
223 return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
224 } else {
225 // Recurse through a chain of 'or' operands.
226 if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
227 return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
228 }
229
230 // We need a shift-right or a bare value representing a compare of bit 0 of
231 // the original source operand.
232 Value *Candidate;
233 const APInt *BitIndex = nullptr;
234 if (!match(V, m_LShr(m_Value(Candidate), m_APInt(BitIndex))))
235 Candidate = V;
236
237 // Initialize result source operand.
238 if (!MOps.Root)
239 MOps.Root = Candidate;
240
241 // The shift constant is out-of-range? This code hasn't been simplified.
242 if (BitIndex && BitIndex->uge(MOps.Mask.getBitWidth()))
243 return false;
244
245 // Fill in the mask bit derived from the shift constant.
246 MOps.Mask.setBit(BitIndex ? BitIndex->getZExtValue() : 0);
247 return MOps.Root == Candidate;
248}
249
/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
/// These will include a chain of 'or' or 'and'-shifted bits from a
/// common source value:
/// and (or  (lshr X, C), ...), 1 --> (X & CMask) != 0
/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
/// that differ only with a final 'not' of the result. We expect that final
/// 'not' to be folded with the compare that we create here (invert predicate).
// NOTE(review): the function header (presumably
// `static bool foldAnyOrAllBitsSet(Instruction &I) {`, orig line 258) was
// dropped by the extraction -- confirm upstream.
  // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
  // final "and X, 1" instruction must be the final op in the sequence.
  bool MatchAllBitsSet;
  bool MatchTrunc;
  Value *X;
  if (I.getType()->isIntOrIntVectorTy(1)) {
    // For an i1 result the chain is consumed by a trunc instead of "and 1".
    if (match(&I, m_Trunc(m_OneUse(m_And(m_Value(), m_Value())))))
      MatchAllBitsSet = true;
    else if (match(&I, m_Trunc(m_OneUse(m_Or(m_Value(), m_Value())))))
      MatchAllBitsSet = false;
    else
      return false;
    MatchTrunc = true;
    X = I.getOperand(0);
  } else {
    if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value()))) {
      X = &I;
      MatchAllBitsSet = true;
    } else if (match(&I,
                     m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One()))) {
      X = I.getOperand(0);
      MatchAllBitsSet = false;
    } else
      return false;
    MatchTrunc = false;
  }
  Type *Ty = X->getType();

  MaskOps MOps(Ty->getScalarSizeInBits(), MatchAllBitsSet);
  if (!matchAndOrChain(X, MOps) ||
      (MatchAllBitsSet && !MatchTrunc && !MOps.FoundAnd1))
    return false;

  // The pattern was found. Create a masked compare that replaces all of the
  // shift and logic ops.
  IRBuilder<> Builder(&I);
  Constant *Mask = ConstantInt::get(Ty, MOps.Mask);
  Value *And = Builder.CreateAnd(MOps.Root, Mask);
  Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
                               : Builder.CreateIsNotNull(And);
  Value *Zext = MatchTrunc ? Cmp : Builder.CreateZExt(Cmp, Ty);
  I.replaceAllUsesWith(Zext);
  ++NumAnyOrAllBitsSet;
  return true;
}
304
// Try to recognize below function as popcount intrinsic.
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
// Also used in TargetLowering::expandCTPOP().
//
// int popcount(unsigned int i) {
//   i = i - ((i >> 1) & 0x55555555);
//   i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
//   i = ((i + (i >> 4)) & 0x0F0F0F0F);
//   return (i * 0x01010101) >> 24;
// }
// NOTE(review): the function header (presumably
// `static bool tryToRecognizePopCount(Instruction &I) {`, orig line 316) was
// dropped by the extraction -- confirm upstream.
  if (I.getOpcode() != Instruction::LShr)
    return false;

  Type *Ty = I.getType();
  if (!Ty->isIntOrIntVectorTy())
    return false;

  unsigned Len = Ty->getScalarSizeInBits();
  // FIXME: fix Len == 8 and other irregular type lengths.
  if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
    return false;

  // Byte-splatted masks for the bit-twiddling steps, plus the final shift
  // amount (Len - 8) that extracts the byte-sum after the multiply.
  APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
  APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
  APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
  APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
  APInt MaskShift = APInt(Len, Len - 8);

  Value *Op0 = I.getOperand(0);
  Value *Op1 = I.getOperand(1);
  Value *MulOp0;
  // Matching "(i * 0x01010101...) >> 24".
  // NOTE(review): the second half of this condition (orig line 340,
  // presumably checking Op1 against MaskShift) was dropped by the extraction.
  if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
    Value *ShiftOp0;
    // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
    if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
                                    m_Deferred(ShiftOp0)),
                            m_SpecificInt(Mask0F)))) {
      Value *AndOp0;
      // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
      // NOTE(review): one matcher line (orig line 350) was dropped here by
      // the extraction.
      if (match(ShiftOp0,
                m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
                        m_SpecificInt(Mask33))))) {
        Value *Root, *SubOp1;
        // Matching "i - ((i >> 1) & 0x55555555...)".
        const APInt *AndMask;
        if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
            match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
                                m_APInt(AndMask)))) {
          auto CheckAndMask = [&]() {
            if (*AndMask == Mask55)
              return true;

            // Exact match failed, see if any bits are known to be 0 where we
            // expect a 1 in the mask.
            if (!AndMask->isSubsetOf(Mask55))
              return false;

            APInt NeededMask = Mask55 & ~*AndMask;
            return MaskedValueIsZero(cast<Instruction>(SubOp1)->getOperand(0),
                                     NeededMask,
                                     SimplifyQuery(I.getDataLayout()));
          };

          if (CheckAndMask()) {
            LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
            IRBuilder<> Builder(&I);
            I.replaceAllUsesWith(
                Builder.CreateIntrinsic(Intrinsic::ctpop, I.getType(), {Root}));
            ++NumPopCountRecognized;
            return true;
          }
        }
      }
    }
  }

  return false;
}
388
// Try to recognize below function as popcount intrinsic.
// Ref. Hackers Delight
// int popcnt(unsigned x) {
//   x = x - ((x >> 1) & 0x55555555);
//   x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
//   x = (x + (x >> 4)) & 0x0F0F0F0F;
//   x = x + (x >> 8);
//   x = x + (x >> 16);
//   return x & 0x0000003F;
// }

// int popcnt(unsigned x) {
//   x = x - ((x >> 1) & 0x55555555);
//   x = x - 3*((x >> 2) & 0x33333333);
//   x = (x + (x >> 4)) & 0x0F0F0F0F;
//   x = x + (x >> 8);
//   x = x + (x >> 16);
//   return x & 0x0000003F;
// }

// NOTE(review): the function header (orig line 409) was dropped by the
// extraction -- confirm upstream.
  if (I.getOpcode() != Instruction::And)
    return false;

  Type *Ty = I.getType();
  if (!Ty->isIntOrIntVectorTy())
    return false;

  unsigned Len = Ty->getScalarSizeInBits();

  if (Len > 64 || Len <= 8 || Len % 8 != 0)
    return false;

  // Len should be a power of 2 for the loop to work correctly
  if (!isPowerOf2_32(Len))
    return false;

  APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
  APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
  APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));

  // The final mask covers the possible popcount results (0..Len).
  APInt MaskRes = APInt(Len, 2 * Len - 1);

  Value *Add1;
  if (!match(&I, m_And(m_Value(Add1), m_SpecificInt(MaskRes))))
    return false;

  // Peel the "x = x + (x >> I/2)" accumulation steps, widest shift first.
  Value *Add2;
  for (unsigned I = Len; I >= 16; I = I / 2) {
    // Matching "x = x + (x >> I/2)" for I-bit.
    if (!match(Add1, m_c_Add(m_LShr(m_Value(Add2), m_SpecificInt(I / 2)),
                             m_Deferred(Add2))))
      return false;
    Add1 = Add2;
  }

  Value *And1 = Add1;
  // Matching "x = (x + (x >> 4)) & 0x0F0F0F0F".
  if (!match(And1, m_And(m_c_Add(m_LShr(m_Value(Add2), m_SpecificInt(4)),
                                 m_Deferred(Add2)),
                         m_SpecificInt(Mask0F))))
    return false;

  Value *Sub1;
  llvm::APInt NegThree(/*BitWidth=*/Len, /*Value=*/-3,
                       /*isSigned=*/true);
  // x = (x & 0x33333333) + ((x >> 2) & 0x33333333)".
  // NOTE(review): two lines of the alternative "x - 3*((x >> 2) & ...)"
  // matcher (orig lines 459-460) were dropped by the extraction; the second
  // disjunct below is incomplete.
  if (!match(Add2, m_c_Add(m_And(m_LShr(m_Value(Sub1), m_SpecificInt(2)),
                                 m_SpecificInt(Mask33)),
                           m_And(m_Deferred(Sub1), m_SpecificInt(Mask33)))) &&
      // Matching "x = x - 3*((x >> 2) & 0x33333333)".
                          m_SpecificInt(Mask33)),
                    m_SpecificInt(NegThree)),
                m_Deferred(Sub1))))
    return false;

  Value *Root;
  // x = x - ((x >> 1) & 0x55555555);
  // NOTE(review): one matcher line (orig line 469, presumably the
  // `m_And(m_LShr(...), ...)` operand) was dropped here by the extraction.
  if (!match(Sub1, m_Sub(m_Value(Root),
                         m_SpecificInt(Mask55)))))
    return false;

  LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
  IRBuilder<> Builder(&I);
  I.replaceAllUsesWith(
      Builder.CreateIntrinsic(Intrinsic::ctpop, I.getType(), {Root}));
  ++NumPopCountRecognized;
  return true;
}
480
/// Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), providing C1 and
/// C2 saturate the value of the fp conversion. The transform is not reversable
/// as the fptosi.sat is more defined than the input - all values produce a
/// valid value for the fptosi.sat, where as some produce poison for original
/// that were out of range of the integer conversion. The reversed pattern may
/// use fmax and fmin instead. As we cannot directly reverse the transform, and
/// it is not always profitable, we make it conditional on the cost being
/// reported as lower by TTI.
// NOTE(review): the function header (presumably
// `static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) {`,
// orig line 489) was dropped by the extraction -- confirm upstream.
  // Look for min(max(fptosi, converting to fptosi_sat.
  Value *In;
  const APInt *MinC, *MaxC;
  // NOTE(review): the opening matcher lines of the smin/smax pattern (orig
  // lines 493 and 496) were dropped by the extraction; the fragments below
  // are incomplete.
                                        m_APInt(MinC))),
                 m_APInt(MaxC))) &&
                                        m_APInt(MaxC))),
                 m_APInt(MinC))))
    return false;

  // Check that the constants clamp a saturate.
  if (!(*MinC + 1).isPowerOf2() || -*MaxC != *MinC + 1)
    return false;

  Type *IntTy = I.getType();
  Type *FpTy = In->getType();
  Type *SatTy =
      IntegerType::get(IntTy->getContext(), (*MinC + 1).exactLogBase2() + 1);
  if (auto *VecTy = dyn_cast<VectorType>(IntTy))
    SatTy = VectorType::get(SatTy, VecTy->getElementCount());

  // Get the cost of the intrinsic, and check that against the cost of
  // fptosi+smin+smax
  // NOTE(review): the trailing cost-kind argument line of each TTI query
  // below (orig lines 516, 518-519, 523, 526, 529) was dropped by the
  // extraction; the calls below are syntactically incomplete.
  InstructionCost SatCost = TTI.getIntrinsicInstrCost(
      IntrinsicCostAttributes(Intrinsic::fptosi_sat, SatTy, {In}, {FpTy}),
  SatCost += TTI.getCastInstrCost(Instruction::SExt, IntTy, SatTy,

  InstructionCost MinMaxCost = TTI.getCastInstrCost(
      Instruction::FPToSI, IntTy, FpTy, TTI::CastContextHint::None,
  MinMaxCost += TTI.getIntrinsicInstrCost(
      IntrinsicCostAttributes(Intrinsic::smin, IntTy, {IntTy}),
  MinMaxCost += TTI.getIntrinsicInstrCost(
      IntrinsicCostAttributes(Intrinsic::smax, IntTy, {IntTy}),

  if (SatCost >= MinMaxCost)
    return false;

  IRBuilder<> Builder(&I);
  Value *Sat =
      Builder.CreateIntrinsic(Intrinsic::fptosi_sat, {SatTy, FpTy}, In);
  I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy));
  return true;
}
540
/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
/// pessimistic codegen that has to account for setting errno and can enable
/// vectorization.
// NOTE(review): the middle line of this parameter list (orig line 545,
// presumably declaring the `TLI` and `AC` parameters used below) was dropped
// by the extraction -- confirm upstream.
static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
                     DominatorTree &DT) {
  // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created
  // (because NNAN or the operand arg must not be less than -0.0) and (2) we
  // would not end up lowering to a libcall anyway (which could change the value
  // of errno), then:
  // (1) errno won't be set.
  // (2) it is safe to convert this to an intrinsic call.
  Type *Ty = Call->getType();
  Value *Arg = Call->getArgOperand(0);
  // NOTE(review): the callee name of the second operand check (orig line 557)
  // was dropped by the extraction; the condition below is incomplete.
  if (TTI.haveFastSqrt(Ty) &&
      (Call->hasNoNaNs() ||
           Arg, SimplifyQuery(Call->getDataLayout(), &TLI, &DT, &AC, Call)))) {
    IRBuilder<> Builder(Call);
    Value *NewSqrt =
        Builder.CreateIntrinsic(Intrinsic::sqrt, Ty, Arg, Call, "sqrt");
    Call->replaceAllUsesWith(NewSqrt);

    // Explicitly erase the old call because a call with side effects is not
    // trivially dead.
    Call->eraseFromParent();
    return true;
  }

  return false;
}
572
// Check if this array of constants represents a cttz table.
// Iterate over the elements from \p Table by trying to find/match all
// the numbers from 0 to \p InputBits that should represent cttz results.
static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift,
                        const APInt &AndMask, Type *AccessTy,
                        unsigned InputBits, const APInt &GEPIdxFactor,
                        const DataLayout &DL) {
  // For every single-bit input (1 << Idx), compute the table index that the
  // matched code would load from and require that element to equal Idx.
  for (unsigned Idx = 0; Idx < InputBits; Idx++) {
    APInt Index =
        (APInt::getOneBitSet(InputBits, Idx) * Mul).lshr(Shift) & AndMask;
    // NOTE(review): the start of this statement (orig line 583, presumably
    // `ConstantInt *C = dyn_cast_or_null<ConstantInt>(`) was dropped by the
    // extraction.
        ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
    if (!C || C->getValue() != Idx)
      return false;
  }

  return true;
}
591
// Try to recognize table-based ctz implementation.
// E.g., an example in C (for more cases please see the llvm/tests):
// int f(unsigned x) {
//    static const char table[32] =
//      {0, 1, 28, 2, 29, 14, 24, 3, 30,
//       22, 20, 15, 25, 17, 4, 8, 31, 27,
//       13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
//    return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
// }
// this can be lowered to `cttz` instruction.
// There is also a special case when the element is 0.
//
// The (x & -x) sets the lowest non-zero bit to 1. The multiply is a de-bruijn
// sequence that contains each pattern of bits in it. The shift extracts
// the top bits after the multiply, and that index into the table should
// represent the number of trailing zeros in the original number.
//
// Here are some examples or LLVM IR for a 64-bit target:
//
// CASE 1:
// %sub = sub i32 0, %x
// %and = and i32 %sub, %x
// %mul = mul i32 %and, 125613361
// %shr = lshr i32 %mul, 27
// %idxprom = zext i32 %shr to i64
// %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @ctz1.table, i64 0,
//     i64 %idxprom
// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
//
// CASE 2:
// %sub = sub i32 0, %x
// %and = and i32 %sub, %x
// %mul = mul i32 %and, 72416175
// %shr = lshr i32 %mul, 26
// %idxprom = zext i32 %shr to i64
// %arrayidx = getelementptr inbounds [64 x i16], [64 x i16]* @ctz2.table,
//     i64 0, i64 %idxprom
// %0 = load i16, i16* %arrayidx, align 2, !tbaa !8
//
// CASE 3:
// %sub = sub i32 0, %x
// %and = and i32 %sub, %x
// %mul = mul i32 %and, 81224991
// %shr = lshr i32 %mul, 27
// %idxprom = zext i32 %shr to i64
// %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @ctz3.table,
//     i64 0, i64 %idxprom
// %0 = load i32, i32* %arrayidx, align 4, !tbaa !8
//
// CASE 4:
// %sub = sub i64 0, %x
// %and = and i64 %sub, %x
// %mul = mul i64 %and, 283881067100198605
// %shr = lshr i64 %mul, 58
// %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @table, i64 0,
//     i64 %shr
// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
//
// All these can be lowered to @llvm.cttz.i32/64 intrinsics.
// NOTE(review): the function header and the `LI` definition (orig lines
// 651-652, presumably `static bool tryToRecognizeTableBasedCttz(...) {` plus
// `LoadInst *LI = dyn_cast<LoadInst>(&I);`) were dropped by the extraction
// -- confirm upstream.
  if (!LI)
    return false;

  Type *AccessType = LI->getType();
  if (!AccessType->isIntegerTy())
    return false;

  // NOTE(review): the `GEP` definition line (orig line 660) was dropped here
  // by the extraction.
  if (!GEP || !GEP->hasNoUnsignedSignedWrap())
    return false;

  GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
  if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
    return false;

  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  APInt ModOffset(BW, 0);
  // NOTE(review): the `VarOffsets` map declaration (orig line 670) was
  // dropped here by the extraction.
  if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
      VarOffsets.size() != 1 || ModOffset != 0)
    return false;
  auto [GepIdx, GEPScale] = VarOffsets.front();

  Value *X1;
  const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
  // Check that the gep variable index is ((x & -x) * MulConst) >> ShiftConst.
  // This might be extended to the pointer index type, and if the gep index type
  // has been replaced with an i8 then a new And (and different ShiftConst) will
  // be present.
  auto MatchInner = m_LShr(
      m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)), m_APInt(MulConst)),
      m_APInt(ShiftConst));
  if (!match(GepIdx, m_CastOrSelf(MatchInner)) &&
      !match(GepIdx, m_CastOrSelf(m_And(MatchInner, m_APInt(AndCst)))))
    return false;

  unsigned InputBits = X1->getType()->getScalarSizeInBits();
  if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
    return false;

  if (!GEPScale.isIntN(InputBits) ||
      !isCTTZTable(GVTable->getInitializer(), *MulConst, *ShiftConst,
                   AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
                   InputBits, GEPScale.zextOrTrunc(InputBits), DL))
    return false;

  // If table[0] == InputBits, the table also defines the zero-input case.
  ConstantInt *ZeroTableElem = cast<ConstantInt>(
      ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
  bool DefinedForZero = ZeroTableElem->getZExtValue() == InputBits;

  IRBuilder<> B(LI);
  ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
  Type *XType = X1->getType();
  auto Cttz = B.CreateIntrinsic(Intrinsic::cttz, {XType}, {X1, BoolConst});
  Value *ZExtOrTrunc = nullptr;

  if (DefinedForZero) {
    ZExtOrTrunc = B.CreateZExtOrTrunc(Cttz, AccessType);
  } else {
    // If the value in elem 0 isn't the same as InputBits, we still want to
    // produce the value from the table.
    auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
    auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz);

    // The true branch of select handles the cttz(0) case, which is rare.
    // NOTE(review): the lines opening this guard (orig lines 718-719,
    // presumably a `dyn_cast<...>(Select)` check binding `SelectI`) were
    // dropped by the extraction.
      SelectI->setMetadata(
          LLVMContext::MD_prof,
          MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
    }

    // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
    // it should be handled as: `cttz(x) & (typeSize - 1)`.

    ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
  }

  LI->replaceAllUsesWith(ZExtOrTrunc);

  return true;
}
735
// Check if this array of constants represents a log2 table.
// Iterate over the elements from \p Table by trying to find/match all
// the numbers from 0 to \p InputBits that should represent log2 results.
static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
                        Type *AccessTy, unsigned InputBits,
                        const APInt &GEPIdxFactor, const DataLayout &DL) {
  // For each value with the low Idx+1 bits set (floor-log2 == Idx), compute
  // the table index the matched code would load from and require that the
  // element equals Idx.
  for (unsigned Idx = 0; Idx < InputBits; Idx++) {
    APInt Index = (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift);
    // NOTE(review): the start of this statement (orig line 744, presumably
    // `ConstantInt *C = dyn_cast_or_null<ConstantInt>(`) was dropped by the
    // extraction.
        ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
    if (!C || C->getValue() != Idx)
      return false;
  }

  // Verify that an input of zero will select table index 0.
  APInt ZeroIndex = Mul.lshr(Shift);
  if (!ZeroIndex.isZero())
    return false;

  return true;
}
757
// Try to recognize table-based log2 implementation.
// E.g., an example in C (for more cases please the llvm/tests):
// int f(unsigned v) {
//   static const char table[32] =
//     {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
//      8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
//
//   v |= v >> 1; // first round down to one less than a power of 2
//   v |= v >> 2;
//   v |= v >> 4;
//   v |= v >> 8;
//   v |= v >> 16;
//
//   return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
// }
// this can be lowered to `ctlz` instruction.
// There is also a special case when the element is 0.
//
// The >> and |= sequence sets all bits below the most significant set bit. The
// multiply is a de-bruijn sequence that contains each pattern of bits in it.
// The shift extracts the top bits after the multiply, and that index into the
// table should represent the floor log base 2 of the original number.
//
// Here are some examples of LLVM IR for a 64-bit target.
//
// CASE 1:
// %shr = lshr i32 %v, 1
// %or = or i32 %shr, %v
// %shr1 = lshr i32 %or, 2
// %or2 = or i32 %shr1, %or
// %shr3 = lshr i32 %or2, 4
// %or4 = or i32 %shr3, %or2
// %shr5 = lshr i32 %or4, 8
// %or6 = or i32 %shr5, %or4
// %shr7 = lshr i32 %or6, 16
// %or8 = or i32 %shr7, %or6
// %mul = mul i32 %or8, 130329821
// %shr9 = lshr i32 %mul, 27
// %idxprom = zext nneg i32 %shr9 to i64
// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %idxprom
// %0 = load i8, ptr %arrayidx, align 1
//
// CASE 2:
// %shr = lshr i64 %v, 1
// %or = or i64 %shr, %v
// %shr1 = lshr i64 %or, 2
// %or2 = or i64 %shr1, %or
// %shr3 = lshr i64 %or2, 4
// %or4 = or i64 %shr3, %or2
// %shr5 = lshr i64 %or4, 8
// %or6 = or i64 %shr5, %or4
// %shr7 = lshr i64 %or6, 16
// %or8 = or i64 %shr7, %or6
// %shr9 = lshr i64 %or8, 32
// %or10 = or i64 %shr9, %or8
// %mul = mul i64 %or10, 285870213051386505
// %shr11 = lshr i64 %mul, 58
// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %shr11
// %0 = load i8, ptr %arrayidx, align 1
//
// All these can be lowered to @llvm.ctlz.i32/64 intrinsics and a subtract.
// NOTE(review): the function header and the `LI` definition (orig lines
// 819-821) were dropped by the extraction -- confirm upstream.
  if (!LI)
    return false;

  Type *AccessType = LI->getType();
  if (!AccessType->isIntegerTy())
    return false;

  // NOTE(review): the `GEP` definition line (orig line 829) was dropped here
  // by the extraction.
  if (!GEP || !GEP->hasNoUnsignedSignedWrap())
    return false;

  GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
  if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
    return false;

  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  APInt ModOffset(BW, 0);
  // NOTE(review): the `VarOffsets` map declaration (orig line 839) was
  // dropped here by the extraction.
  if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
      VarOffsets.size() != 1 || ModOffset != 0)
    return false;
  auto [GepIdx, GEPScale] = VarOffsets.front();

  Value *X;
  const APInt *MulConst, *ShiftConst;
  // Check that the gep variable index is (x * MulConst) >> ShiftConst.
  auto MatchInner =
      m_LShr(m_Mul(m_Value(X), m_APInt(MulConst)), m_APInt(ShiftConst));
  if (!match(GepIdx, m_CastOrSelf(MatchInner)))
    return false;

  unsigned InputBits = X->getType()->getScalarSizeInBits();
  if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
    return false;

  // Verify shift amount.
  // TODO: Allow other shift amounts when we have proper test coverage.
  if (*ShiftConst != InputBits - Log2_32(InputBits))
    return false;

  // Match the sequence of OR operations with right shifts by powers of 2.
  for (unsigned ShiftAmt = InputBits / 2; ShiftAmt != 0; ShiftAmt /= 2) {
    Value *Y;
    if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(ShiftAmt)),
                         m_Deferred(Y))))
      return false;
    X = Y;
  }

  if (!GEPScale.isIntN(InputBits) ||
      !isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
                   AccessType, InputBits, GEPScale.zextOrTrunc(InputBits), DL))
    return false;

  ConstantInt *ZeroTableElem = cast<ConstantInt>(
      ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));

  // Use InputBits - 1 - ctlz(X) to compute log2(X).
  IRBuilder<> B(LI);
  ConstantInt *BoolConst = B.getTrue();
  Type *XType = X->getType();

  // Check that the backend has an efficient ctlz instruction.
  // FIXME: Teach the backend to emit the original code when ctlz isn't
  // supported like we do for cttz.
  // NOTE(review): the start of the cost-attributes declaration (orig line
  // 887, presumably `IntrinsicCostAttributes Attrs(`) was dropped here.
      Intrinsic::ctlz, XType,
      {PoisonValue::get(XType), /*is_zero_poison=*/BoolConst});
  InstructionCost Cost =
      TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
  // NOTE(review): the cost-threshold condition guarding this early return
  // (orig line 892) was dropped here by the extraction.
    return false;

  Value *Ctlz = B.CreateIntrinsic(Intrinsic::ctlz, {XType}, {X, BoolConst});

  Constant *InputBitsM1 = ConstantInt::get(XType, InputBits - 1);
  Value *Sub = B.CreateSub(InputBitsM1, Ctlz);

  // The table won't produce a sensible result for 0.
  Value *Cmp = B.CreateICmpEQ(X, ConstantInt::get(XType, 0));
  Value *Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Sub);

  // The true branch of select handles the log2(0) case, which is rare.
  // NOTE(review): the lines opening this guard (orig lines 905-906,
  // presumably a `dyn_cast<...>(Select)` check binding `SelectI`) were
  // dropped by the extraction.
    SelectI->setMetadata(
        LLVMContext::MD_prof,
        MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
  }

  Value *ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);

  LI->replaceAllUsesWith(ZExtOrTrunc);

  return true;
}
918
919/// This is used by foldLoadsRecursive() to capture a Root Load node which is
920/// of type or(load, load) and recursively build the wide load. Also capture the
921/// shift amount, zero extend type and loadSize.
931
932 // Identify and Merge consecutive loads recursively which is of the form
933 // (ZExt(L1) << shift1) | (ZExt(L2) << shift2) -> ZExt(L3) << shift1
934 // (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3)
// Walks an or-tree of (optionally shifted, zero-extended) loads bottom-up,
// accumulating match state in LOps. Returns true when V (combined with any
// state already in LOps) describes two adjacent simple loads that can be
// merged into a single wider load; foldConsecutiveLoads() materializes it.
// NOTE(review): this listing was extracted from a rendered docs page; a few
// original source lines are absent (parts of the match() patterns at 948/960,
// part of a condition at 968, and the MemoryLocation declaration at 1004).
// Comments below describe only the code that is visible here.
935 static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
936 AliasAnalysis &AA, bool IsRoot = false) {
937 uint64_t ShAmt2;
938 Value *X;
939 Instruction *L1, *L2;
940
941 // For the root instruction, allow multiple uses since the final result
942 // may legitimately be used in multiple places. For intermediate values,
943 // require single use to avoid creating duplicate loads.
944 if (!IsRoot && !V->hasOneUse())
945 return false;
946
// Match V = or(X, shl-or-self(...)); the shifted-operand pattern (original
// line 948) is missing from this listing. ShAmt2 captures L2's shift.
947 if (!match(V, m_c_Or(m_Value(X),
949 ShAmt2)))))
950 return false;
951
// Recurse on the other or-operand first; if a root was already found but
// this operand does not fold, bail rather than merge only part of the chain.
952 if (!foldLoadsRecursive(X, LOps, DL, AA, /*IsRoot=*/false) && LOps.FoundRoot)
953 // Avoid Partial chain merge.
954 return false;
955
956 // Check if the pattern has loads
957 LoadInst *LI1 = LOps.Root;
958 uint64_t ShAmt1 = LOps.Shift;
// First pair in the chain: X itself must be a (shifted) zext'd load.
// (Part of this match pattern, original line 960, is missing here.)
959 if (LOps.FoundRoot == false &&
961 m_ShlOrSelf(m_OneUse(m_ZExt(m_Instruction(L1))), ShAmt1)))) {
962 LI1 = dyn_cast<LoadInst>(L1);
963 }
964 LoadInst *LI2 = dyn_cast<LoadInst>(L2);
965
966 // Check if loads are same, atomic, volatile and having same address space.
// (The address-space comparison, original line 968, is missing from this
// listing; only simple — non-atomic, non-volatile — loads are accepted.)
967 if (LI1 == LI2 || !LI1 || !LI2 || !LI1->isSimple() || !LI2->isSimple() ||
969 return false;
970
971 // Check if Loads come from same BB.
972 if (LI1->getParent() != LI2->getParent())
973 return false;
974
975 // Find the data layout
976 bool IsBigEndian = DL.isBigEndian();
977
978 // Check if loads are consecutive and same size.
979 Value *Load1Ptr = LI1->getPointerOperand();
980 APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
981 Load1Ptr =
982 Load1Ptr->stripAndAccumulateConstantOffsets(DL, Offset1,
983 /* AllowNonInbounds */ true);
984
985 Value *Load2Ptr = LI2->getPointerOperand();
986 APInt Offset2(DL.getIndexTypeSizeInBits(Load2Ptr->getType()), 0);
987 Load2Ptr =
988 Load2Ptr->stripAndAccumulateConstantOffsets(DL, Offset2,
989 /* AllowNonInbounds */ true);
990
991 // Verify if both loads have same base pointers
992 uint64_t LoadSize1 = LI1->getType()->getPrimitiveSizeInBits();
993 uint64_t LoadSize2 = LI2->getType()->getPrimitiveSizeInBits();
994 if (Load1Ptr != Load2Ptr)
995 return false;
996
997 // Make sure that there are no padding bits.
998 if (!DL.typeSizeEqualsStoreSize(LI1->getType()) ||
999 !DL.typeSizeEqualsStoreSize(LI2->getType()))
1000 return false;
1001
1002 // Alias Analysis to check for stores b/w the loads.
// Scan every instruction between the earlier and the later load; any
// instruction that may clobber the merged location kills the transform.
// (The MemoryLocation variable declaration, original line 1004, is missing.)
1003 LoadInst *Start = LOps.FoundRoot ? LOps.RootInsert : LI1, *End = LI2;
1005 if (!Start->comesBefore(End)) {
1006 std::swap(Start, End);
1007 // If LOps.RootInsert comes after LI2, since we use LI2 as the new insert
1008 // point, we should make sure whether the memory region accessed by LOps
1009 // isn't modified.
1010 if (LOps.FoundRoot)
1012 LOps.Root->getPointerOperand(),
1013 LocationSize::precise(DL.getTypeStoreSize(
1014 IntegerType::get(LI1->getContext(), LOps.LoadSize))),
1015 LOps.AATags);
1016 else
1017 Loc = MemoryLocation::get(End);
1018 } else
1019 Loc = MemoryLocation::get(End);
// Bound the scan so pathological blocks do not make this quadratic.
1020 unsigned NumScanned = 0;
1021 for (Instruction &Inst :
1022 make_range(Start->getIterator(), End->getIterator())) {
1023 if (Inst.mayWriteToMemory() && isModSet(AA.getModRefInfo(&Inst, Loc)))
1024 return false;
1025
1026 if (++NumScanned > MaxInstrsToScan)
1027 return false;
1028 }
1029
1030 // Make sure Load with lower Offset is at LI1
1031 bool Reverse = false;
1032 if (Offset2.slt(Offset1)) {
1033 std::swap(LI1, LI2);
1034 std::swap(ShAmt1, ShAmt2);
1035 std::swap(Offset1, Offset2);
1036 std::swap(Load1Ptr, Load2Ptr);
1037 std::swap(LoadSize1, LoadSize2);
1038 Reverse = true;
1039 }
1040
1041 // Big endian swap the shifts
1042 if (IsBigEndian)
1043 std::swap(ShAmt1, ShAmt2);
1044
1045 // First load is always LI1. This is where we put the new load.
1046 // Use the merged load size available from LI1 for forward loads.
1047 if (LOps.FoundRoot) {
1048 if (!Reverse)
1049 LoadSize1 = LOps.LoadSize;
1050 else
1051 LoadSize2 = LOps.LoadSize;
1052 }
1053
1054 // Verify if shift amount and load index aligns and verifies that loads
1055 // are consecutive.
// The shift distance between the two operands must equal the bit-width of
// the lower part, and the pointer offsets must differ by exactly its store
// size — i.e. the loads are byte-adjacent with no gap or overlap.
1056 uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1;
1057 uint64_t PrevSize =
1058 DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1));
1059 if ((ShAmt2 - ShAmt1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
1060 return false;
1061
1062 // Update LOps
1063 AAMDNodes AATags1 = LOps.AATags;
1064 AAMDNodes AATags2 = LI2->getAAMetadata();
1065 if (LOps.FoundRoot == false) {
1066 LOps.FoundRoot = true;
1067 AATags1 = LI1->getAAMetadata();
1068 }
1069 LOps.LoadSize = LoadSize1 + LoadSize2;
1070 LOps.RootInsert = Start;
1071
1072 // Concatenate the AATags of the Merged Loads.
1073 LOps.AATags = AATags1.concat(AATags2);
1074
1075 LOps.Root = LI1;
1076 LOps.Shift = ShAmt1;
1077 LOps.ZextType = X->getType();
1078 return true;
1079 }
1080
1081 // For a given BB instruction, evaluate all loads in the chain that form a
1082 // pattern which suggests that the loads can be combined. The one and only use
1083 // of the loads is to form a wider load.
// NOTE(review): the first signature line(s) (original 1084-1085) are missing
// from this listing; the call site below (foldICmpOrChain) invokes this as
// foldConsecutiveLoads(*OpI, DL, TTI, AA, DT), so the parameters are the
// instruction, DataLayout, TargetTransformInfo, AliasAnalysis and the DT.
1086 const DominatorTree &DT) {
1087 // Only consider load chains of scalar values.
1088 if (isa<VectorType>(I.getType()))
1089 return false;
1090
// Collect the whole or-of-loads chain into LOps via foldLoadsRecursive().
1091 LoadOps LOps;
1092 if (!foldLoadsRecursive(&I, LOps, DL, AA, /*IsRoot=*/true) || !LOps.FoundRoot)
1093 return false;
1094
1095 IRBuilder<> Builder(&I);
1096 LoadInst *NewLoad = nullptr, *LI1 = LOps.Root;
1097
1098 IntegerType *WiderType = IntegerType::get(I.getContext(), LOps.LoadSize);
1099 // TTI based checks if we want to proceed with wider load
// The wide integer type must be legal AND the (possibly misaligned) wide
// access must be both allowed and fast on the target.
1100 bool Allowed = TTI.isTypeLegal(WiderType);
1101 if (!Allowed)
1102 return false;
1103
1104 unsigned AS = LI1->getPointerAddressSpace();
1105 unsigned Fast = 0;
1106 Allowed = TTI.allowsMisalignedMemoryAccesses(I.getContext(), LOps.LoadSize,
1107 AS, LI1->getAlign(), &Fast);
1108 if (!Allowed || !Fast)
1109 return false;
1110
1111 // Get the Index and Ptr for the new GEP.
1112 Value *Load1Ptr = LI1->getPointerOperand();
1113 Builder.SetInsertPoint(LOps.RootInsert);
// If the original pointer doesn't dominate the insertion point, rebuild it
// there from the stripped base plus the accumulated constant offset.
1114 if (!DT.dominates(Load1Ptr, LOps.RootInsert)) {
1115 APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
1116 Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
1117 DL, Offset1, /* AllowNonInbounds */ true);
1118 Load1Ptr = Builder.CreatePtrAdd(Load1Ptr, Builder.getInt(Offset1));
1119 }
1120 // Generate wider load.
1121 NewLoad = Builder.CreateAlignedLoad(WiderType, Load1Ptr, LI1->getAlign(),
1122 LI1->isVolatile(), "");
1123 NewLoad->takeName(LI1);
1124 // Set the New Load AATags Metadata.
1125 if (LOps.AATags)
1126 NewLoad->setAAMetadata(LOps.AATags);
1127
1128 Value *NewOp = NewLoad;
1129 // Check if zero extend needed.
1130 if (LOps.ZextType)
1131 NewOp = Builder.CreateZExt(NewOp, LOps.ZextType);
1132
1133 // Check if shift needed. We need to shift with the amount of load1
1134 // shift if not zero.
1135 if (LOps.Shift)
1136 NewOp = Builder.CreateShl(NewOp, LOps.Shift);
1137 I.replaceAllUsesWith(NewOp);
1138
1139 return true;
1140 }
1141
1142 /// ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset.
// NOTE(review): the struct header and field declarations (original lines
// 1143-1149) are missing from this extracted listing; the methods below and
// their users (matchPartStore/mergePartStores) reference fields PtrBase,
// PtrOffset, Val, ValOffset, ValWidth and Store.
1150
// Two part-stores may be merged only if they store slices of the same value
// through the same (stripped) base pointer.
1151 bool isCompatibleWith(const PartStore &Other) const {
1152 return PtrBase == Other.PtrBase && Val == Other.Val;
1153 }
1154
// Order part-stores by signed pointer offset so llvm::sort() arranges the
// slices from lowest to highest address.
1155 bool operator<(const PartStore &Other) const {
1156 return PtrOffset.slt(Other.PtrOffset);
1157 }
1158 };
1159
// Recognize a store of the form `store (trunc (lshr Val, ValOffset))` —
// i.e. one slice of a wider value — and package it as a PartStore.
// Returns std::nullopt when I is not a simple integer store of that shape.
1160 static std::optional<PartStore> matchPartStore(Instruction &I,
1161 const DataLayout &DL) {
1162 auto *Store = dyn_cast<StoreInst>(&I);
1163 if (!Store || !Store->isSimple())
1164 return std::nullopt;
1165
1166 Value *StoredVal = Store->getValueOperand();
1167 Type *StoredTy = StoredVal->getType();
// Only padding-free integer stores: the type's bit size must equal its
// store size, otherwise slice arithmetic below would be wrong.
1168 if (!StoredTy->isIntegerTy() || !DL.typeSizeEqualsStoreSize(StoredTy))
1169 return std::nullopt;
1170
1171 uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
1172 uint64_t ValOffset;
1173 Value *Val;
// m_LShrOrSelf also accepts a plain Val (ValOffset == 0 slice).
1174 if (!match(StoredVal, m_Trunc(m_LShrOrSelf(m_Value(Val), ValOffset))))
1175 return std::nullopt;
1176
// Strip constant GEP offsets to get a common base for grouping.
// NOTE(review): the line declaring PtrBase (original 1179, the
// stripAndAccumulateConstantOffsets call) is missing from this listing.
1177 Value *Ptr = Store->getPointerOperand();
1178 APInt PtrOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
1180 DL, PtrOffset, /*AllowNonInbounds=*/true);
1181 return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
1182 }
1183
// Merge a run of compatible, contiguous part-stores into one store of
// `Width` bits. Returns true if a combined store was emitted.
// NOTE(review): the first signature line(s) (original 1184/1186, the Parts
// parameter and the TargetTransformInfo parameter) are missing from this
// extracted listing.
1185 unsigned Width, const DataLayout &DL,
1187 if (Parts.size() < 2)
1188 return false;
1189
1190 // Check whether combining the stores is profitable.
1191 // FIXME: We could generate smaller stores if we can't produce a large one.
// Require a legal integer type of the combined width and a fast (possibly
// misaligned) store at the first part's address space and alignment.
1192 const PartStore &First = Parts.front();
1193 LLVMContext &Ctx = First.Store->getContext();
1194 Type *NewTy = Type::getIntNTy(Ctx, Width);
1195 unsigned Fast = 0;
1196 if (!TTI.isTypeLegal(NewTy) ||
1197 !TTI.allowsMisalignedMemoryAccesses(Ctx, Width,
1198 First.Store->getPointerAddressSpace(),
1199 First.Store->getAlign(), &Fast) ||
1200 !Fast)
1201 return false;
1202
1203 // Generate the combined store.
// Re-slice the source value starting at the first part's bit offset, then
// narrow/widen it to the combined width and store at the first address.
1204 IRBuilder<> Builder(First.Store);
1205 Value *Val = First.Val;
1206 if (First.ValOffset != 0)
1207 Val = Builder.CreateLShr(Val, First.ValOffset);
1208 Val = Builder.CreateZExtOrTrunc(Val, NewTy);
1209 StoreInst *Store = Builder.CreateAlignedStore(
1210 Val, First.Store->getPointerOperand(), First.Store->getAlign());
1211
1212 // Merge various metadata onto the new store.
1213 AAMDNodes AATags = First.Store->getAAMetadata();
1214 SmallVector<Instruction *> Stores = {First.Store};
1215 Stores.reserve(Parts.size());
1216 SmallVector<DebugLoc> DbgLocs = {First.Store->getDebugLoc()};
1217 DbgLocs.reserve(Parts.size());
1218 for (const PartStore &Part : drop_begin(Parts)) {
1219 AATags = AATags.concat(Part.Store->getAAMetadata());
1220 Stores.push_back(Part.Store);
1221 DbgLocs.push_back(Part.Store->getDebugLoc());
1222 }
1223 Store->setAAMetadata(AATags);
1224 Store->mergeDIAssignID(Stores);
1225 Store->setDebugLoc(DebugLoc::getMergedLocations(DbgLocs));
1226
1227 // Remove the old stores.
1228 for (const PartStore &Part : Parts)
1229 Part.Store->eraseFromParent();
1230
1231 return true;
1232 }
1233
// Overload driving the per-group merge: sorts the collected part-stores by
// address and merges each maximal run whose pointer offsets and value
// offsets advance in lock-step (no gaps, no overlap).
// NOTE(review): the signature line(s) (original 1234-1235) are missing from
// this extracted listing, as is the start of the two inner calls into the
// Width-taking mergePartStores overload (originals 1252 and 1262).
1236 if (Parts.size() < 2)
1237 return false;
1238
1239 // We now have multiple parts of the same value stored to the same pointer.
1240 // Sort the parts by pointer offset, and make sure they are consistent with
1241 // the value offsets. Also check that the value is fully covered without
1242 // overlaps.
1243 bool Changed = false;
1244 llvm::sort(Parts);
1245 int64_t LastEndOffsetFromFirst = 0;
1246 const PartStore *First = &Parts[0];
1247 for (const PartStore &Part : Parts) {
1248 APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
1249 int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
// A byte of address distance must equal 8 bits of value distance, and the
// new part must start exactly where the previous one ended; otherwise flush
// the current run and start a new one at this part.
1250 if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
1251 LastEndOffsetFromFirst != ValOffsetFromFirst) {
1253 LastEndOffsetFromFirst, DL, TTI);
1254 First = &Part;
1255 LastEndOffsetFromFirst = Part.ValWidth;
1256 continue;
1257 }
1258
1259 LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
1260 }
1261
// Flush the final run.
1263 LastEndOffsetFromFirst, DL, TTI);
1264 return Changed;
1265 }
1266
// Scan a basic block for runs of part-stores of the same value and merge
// them via mergePartStores(). Little-endian only.
// NOTE(review): the signature line(s) (original 1267-1268) and the
// declaration of the Parts vector (original 1274) are missing from this
// extracted listing, as is part of the clobber check (original 1294).
1269 // FIXME: Add big endian support.
1270 if (DL.isBigEndian())
1271 return false;
1272
1273 BatchAAResults BatchAA(AA);
1275 bool MadeChange = false;
1276 for (Instruction &I : make_early_inc_range(BB)) {
1277 if (std::optional<PartStore> Part = matchPartStore(I, DL)) {
// Keep accumulating while the new part stores the same value to the same
// base; otherwise flush the group and start a new one with this part.
1278 if (Parts.empty() || Part->isCompatibleWith(Parts[0])) {
1279 Parts.push_back(std::move(*Part));
1280 continue;
1281 }
1282
1283 MadeChange |= mergePartStores(Parts, DL, TTI);
1284 Parts.clear();
1285 Parts.push_back(std::move(*Part));
1286 continue;
1287 }
1288
1289 if (Parts.empty())
1290 continue;
1291
// Any instruction that may throw or may touch the stored-to memory ends the
// current group: flush what we have before it.
1292 if (I.mayThrow() ||
1293 (I.mayReadOrWriteMemory() &&
1295 &I, MemoryLocation::getBeforeOrAfter(Parts[0].PtrBase))))) {
1296 MadeChange |= mergePartStores(Parts, DL, TTI);
1297 Parts.clear();
1298 continue;
1299 }
1300 }
1301
// Flush any group still pending at the end of the block.
1302 MadeChange |= mergePartStores(Parts, DL, TTI);
1303 return MadeChange;
1304 }
1305
1306 /// Combine away instructions providing they are still equivalent when compared
1307 /// against 0. i.e. do they have any bits set.
/// Returns a replacement or-chain with nsw/nuw shifts stripped (valid only
/// under an eq/ne-zero comparison), or nullptr if nothing changed.
// NOTE(review): the signature line (original 1308) is missing from this
// extracted listing; per the recursive calls below it takes (Value *V,
// IRBuilder<> &Builder).
1309 auto *I = dyn_cast<Instruction>(V);
1310 if (!I || I->getOpcode() != Instruction::Or || !I->hasOneUse())
1311 return nullptr;
1312
1313 Value *A;
1314
1315 // Look deeper into the chain of or's, combining away shl (so long as they are
1316 // nuw or nsw).
// nsw/nuw guarantee no bits are shifted out, so (a << k) is zero iff a is
// zero — safe to drop the shift under an eq/ne-0 compare.
1317 Value *Op0 = I->getOperand(0);
1318 if (match(Op0, m_CombineOr(m_NSWShl(m_Value(A), m_Value()),
1319 m_NUWShl(m_Value(A), m_Value()))))
1320 Op0 = A;
1321 else if (auto *NOp = optimizeShiftInOrChain(Op0, Builder))
1322 Op0 = NOp;
1323
1324 Value *Op1 = I->getOperand(1);
1325 if (match(Op1, m_CombineOr(m_NSWShl(m_Value(A), m_Value()),
1326 m_NUWShl(m_Value(A), m_Value()))))
1327 Op1 = A;
1328 else if (auto *NOp = optimizeShiftInOrChain(Op1, Builder))
1329 Op1 = NOp;
1330
// Only build a new or if at least one operand was simplified.
1331 if (Op0 != I->getOperand(0) || Op1 != I->getOperand(1))
1332 return Builder.CreateOr(Op0, Op1);
1333 return nullptr;
1334 }
1335
// Fold `icmp eq/ne (or-chain), 0`: first try to merge a chain of or'd loads
// into one wide load, then try to strip nsw/nuw shifts out of the chain.
// NOTE(review): the first signature line(s) (original 1336-1337) are missing
// from this extracted listing; the body uses I, DL, TTI, AA and DT.
1338 const DominatorTree &DT) {
1339 CmpPredicate Pred;
1340 Value *Op0;
// Only equality comparisons against zero qualify for these rewrites.
1341 if (!match(&I, m_ICmp(Pred, m_Value(Op0), m_Zero())) ||
1342 !ICmpInst::isEquality(Pred))
1343 return false;
1344
1345 // If the chain of or's matches a load, combine to that before attempting to
1346 // remove shifts.
1347 if (auto OpI = dyn_cast<Instruction>(Op0))
1348 if (OpI->getOpcode() == Instruction::Or)
1349 if (foldConsecutiveLoads(*OpI, DL, TTI, AA, DT))
1350 return true;
1351
1352 IRBuilder<> Builder(&I);
1353 // icmp eq/ne or(shl(a), b), 0 -> icmp eq/ne or(a, b), 0
1354 if (auto *Res = optimizeShiftInOrChain(Op0, Builder)) {
1355 I.replaceAllUsesWith(Builder.CreateICmp(Pred, Res, I.getOperand(1)));
1356 return true;
1357 }
1358
1359 return false;
1360 }
1361
1362 // Calculate GEP Stride and accumulated const ModOffset. Return Stride and
1363 // ModOffset
// NOTE(review): the second signature line (original 1365, the parameter list
// `(Value *PtrOp, const DataLayout &DL)`) and the declaration of VarOffsets
// (original 1372) are missing from this extracted listing.
1364 static std::pair<APInt, APInt>
1366 unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
1367 std::optional<APInt> Stride;
1368 APInt ModOffset(BW, 0);
1369 // Return a minimum gep stride, greatest common divisor of consecutive gep
1370 // index scales (cf. Bézout's identity).
1371 while (auto *GEP = dyn_cast<GEPOperator>(PtrOp)) {
1373 if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset))
1374 break;
1375
1376 for (auto [V, Scale] : VarOffsets) {
1377 // Only keep a power of two factor for non-inbounds
1378 if (!GEP->hasNoUnsignedSignedWrap())
1379 Scale = APInt::getOneBitSet(Scale.getBitWidth(), Scale.countr_zero());
1380
1381 if (!Stride)
1382 Stride = Scale;
1383 else
1384 Stride = APIntOps::GreatestCommonDivisor(*Stride, Scale);
1385 }
1386
// Walk further up the GEP chain toward the underlying object.
1387 PtrOp = GEP->getPointerOperand();
1388 }
1389
1390 // Check whether pointer arrives back at Global Variable via at least one GEP.
1391 // Even if it doesn't, we can check by alignment.
// Conservative fallback: stride 1, offset 0 means "no structure known".
1392 if (!isa<GlobalVariable>(PtrOp) || !Stride)
1393 return {APInt(BW, 1), APInt(BW, 0)};
1394
1395 // In consideration of signed GEP indices, non-negligible offset become
1396 // remainder of division by minimum GEP stride.
// Canonicalize the remainder into [0, Stride).
1397 ModOffset = ModOffset.srem(*Stride);
1398 if (ModOffset.isNegative())
1399 ModOffset += *Stride;
1400
1401 return {*Stride, ModOffset};
1402 }
1403
1404 /// If C is a constant patterned array and all valid loaded results for given
1405 /// alignment are same to a constant, return that constant.
// NOTE(review): the signature line (original 1406) and the line assigning GV
// (original 1414, presumably a dyn_cast of the stripped pointer operand to
// GlobalVariable) are missing from this extracted listing.
1407 auto *LI = dyn_cast<LoadInst>(&I);
1408 if (!LI || LI->isVolatile())
1409 return false;
1410
1411 // We can only fold the load if it is from a constant global with definitive
1412 // initializer. Skip expensive logic if this is not the case.
1413 auto *PtrOp = LI->getPointerOperand();
1415 if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
1416 return false;
1417
1418 // Bail for large initializers in excess of 4K to avoid too many scans.
1419 Constant *C = GV->getInitializer();
1420 uint64_t GVSize = DL.getTypeAllocSize(C->getType());
1421 if (!GVSize || 4096 < GVSize)
1422 return false;
1423
1424 Type *LoadTy = LI->getType();
1425 unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
1426 auto [Stride, ConstOffset] = getStrideAndModOffsetOfGEP(PtrOp, DL);
1427
1428 // Any possible offset could be multiple of GEP stride. And any valid
1429 // offset is multiple of load alignment, so checking only multiples of bigger
1430 // one is sufficient to say results' equality.
1431 if (auto LA = LI->getAlign();
1432 LA <= GV->getAlign().valueOrOne() && Stride.getZExtValue() < LA.value()) {
1433 ConstOffset = APInt(BW, 0);
1434 Stride = APInt(BW, LA.value());
1435 }
1436
// Fold the load at the first candidate offset, then verify every other
// reachable offset yields the same constant; any mismatch aborts the fold.
1437 Constant *Ca = ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL);
1438 if (!Ca)
1439 return false;
1440
1441 unsigned E = GVSize - DL.getTypeStoreSize(LoadTy);
1442 for (; ConstOffset.getZExtValue() <= E; ConstOffset += Stride)
1443 if (Ca != ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL))
1444 return false;
1445
1446 I.replaceAllUsesWith(Ca);
1447
1448 return true;
1449 }
1450
1451namespace {
1452class StrNCmpInliner {
1453public:
1454 StrNCmpInliner(CallInst *CI, LibFunc Func, DomTreeUpdater *DTU,
1455 const DataLayout &DL)
1456 : CI(CI), Func(Func), DTU(DTU), DL(DL) {}
1457
1458 bool optimizeStrNCmp();
1459
1460private:
1461 void inlineCompare(Value *LHS, StringRef RHS, uint64_t N, bool Swapped);
1462
1463 CallInst *CI;
1464 LibFunc Func;
1465 DomTreeUpdater *DTU;
1466 const DataLayout &DL;
1467};
1468
1469} // namespace
1470
1471/// First we normalize calls to strncmp/strcmp to the form of
1472/// compare(s1, s2, N), which means comparing first N bytes of s1 and s2
1473/// (without considering '\0').
1474///
1475/// Examples:
1476///
1477/// \code
1478/// strncmp(s, "a", 3) -> compare(s, "a", 2)
1479/// strncmp(s, "abc", 3) -> compare(s, "abc", 3)
1480/// strncmp(s, "a\0b", 3) -> compare(s, "a\0b", 2)
1481/// strcmp(s, "a") -> compare(s, "a", 2)
1482///
1483/// char s2[] = {'a'}
1484/// strncmp(s, s2, 3) -> compare(s, s2, 3)
1485///
1486/// char s2[] = {'a', 'b', 'c', 'd'}
1487/// strncmp(s, s2, 3) -> compare(s, s2, 3)
1488/// \endcode
1489///
1490/// We only handle cases where N and exactly one of s1 and s2 are constant.
1491/// Cases that s1 and s2 are both constant are already handled by the
1492/// instcombine pass.
1493///
1494/// We do not handle cases where N > StrNCmpInlineThreshold.
1495///
1496 /// We also do not handle cases where N < 2, which are already
1497/// handled by the instcombine pass.
1498///
// Decide whether the strcmp/strncmp call CI can be profitably inlined and,
// if so, delegate the expansion to inlineCompare(). Returns true on success.
// NOTE(review): one guard line (original 1503, the condition for the early
// return at 1504) is missing from this extracted listing.
1499 bool StrNCmpInliner::optimizeStrNCmp() {
1500 if (StrNCmpInlineThreshold < 2)
1501 return false;
1502
1504 return false;
1505
1506 Value *Str1P = CI->getArgOperand(0);
1507 Value *Str2P = CI->getArgOperand(1);
1508 // Should be handled elsewhere.
1509 if (Str1P == Str2P)
1510 return false;
1511
// Exactly one of the two operands must be a constant string.
1512 StringRef Str1, Str2;
1513 bool HasStr1 = getConstantStringInfo(Str1P, Str1, /*TrimAtNul=*/false);
1514 bool HasStr2 = getConstantStringInfo(Str2P, Str2, /*TrimAtNul=*/false);
1515 if (HasStr1 == HasStr2)
1516 return false;
1517
1518 // Note that '\0' and characters after it are not trimmed.
1519 StringRef Str = HasStr1 ? Str1 : Str2;
1520 Value *StrP = HasStr1 ? Str2P : Str1P;
1521
// Comparison stops after the NUL in the constant string, or after the
// explicit strncmp length, whichever comes first.
1522 size_t Idx = Str.find('\0');
1523 uint64_t N = Idx == StringRef::npos ? UINT64_MAX : Idx + 1;
1524 if (Func == LibFunc_strncmp) {
1525 if (auto *ConstInt = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
1526 N = std::min(N, ConstInt->getZExtValue());
1527 else
1528 return false;
1529 }
1530 // Now N means how many bytes we need to compare at most.
1531 if (N > Str.size() || N < 2 || N > StrNCmpInlineThreshold)
1532 return false;
1533
1534 // Cases where StrP has two or more dereferenceable bytes might be better
1535 // optimized elsewhere.
1536 bool CanBeNull = false, CanBeFreed = false;
1537 if (StrP->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed) > 1)
1538 return false;
// Swapped == HasStr1 flips the subtraction order so the result sign matches
// the original argument order.
1539 inlineCompare(StrP, Str, N, HasStr1);
1540 return true;
1541 }
1542
1543/// Convert
1544///
1545/// \code
1546/// ret = compare(s1, s2, N)
1547/// \endcode
1548///
1549/// into
1550///
1551/// \code
1552/// ret = (int)s1[0] - (int)s2[0]
1553/// if (ret != 0)
1554/// goto NE
1555/// ...
1556/// ret = (int)s1[N-2] - (int)s2[N-2]
1557/// if (ret != 0)
1558/// goto NE
1559/// ret = (int)s1[N-1] - (int)s2[N-1]
1560/// NE:
1561/// \endcode
1562///
1563/// CFG before and after the transformation:
1564///
1565/// (before)
1566/// BBCI
1567///
1568/// (after)
1569/// BBCI -> BBSubs[0] (sub,icmp) --NE-> BBNE -> BBTail
1570/// | ^
1571/// E |
1572/// | |
1573/// BBSubs[1] (sub,icmp) --NE-----+
1574/// ... |
1575/// BBSubs[N-1] (sub) ---------+
1576///
// Expand compare(LHS, RHS, N) into N single-byte subtract blocks chained by
// "result != 0" branches (see the CFG diagram above). `Swapped` indicates the
// constant string was the first argument, so the subtraction is reversed.
// NOTE(review): three lines are missing from this extracted listing: the
// declaration of BBSubs (original 1593), the profile-weight call guarded at
// original 1624, and the declaration of Updates (original 1636).
1577 void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
1578 bool Swapped) {
1579 auto &Ctx = CI->getContext();
1580 IRBuilder<> B(Ctx);
1581 // We want these instructions to be recognized as inlined instructions for the
1582 // compare call, but we don't have a source location for the definition of
1583 // that function, since we're generating that code now. Because the generated
1584 // code is a viable point for a memory access error, we make the pragmatic
1585 // choice here to directly use CI's location so that we have useful
1586 // attribution for the generated code.
1587 B.SetCurrentDebugLocation(CI->getDebugLoc());
1588
// Split the block at the call: everything from CI onward moves to BBTail,
// and the per-byte compare blocks are inserted in between.
1589 BasicBlock *BBCI = CI->getParent();
1590 BasicBlock *BBTail =
1591 SplitBlock(BBCI, CI, DTU, nullptr, nullptr, BBCI->getName() + ".tail");
1592
1594 for (uint64_t I = 0; I < N; ++I)
1595 BBSubs.push_back(
1596 BasicBlock::Create(Ctx, "sub_" + Twine(I), BBCI->getParent(), BBTail));
1597 BasicBlock *BBNE = BasicBlock::Create(Ctx, "ne", BBCI->getParent(), BBTail);
1598
// Redirect the fallthrough created by SplitBlock into the first compare.
1599 cast<UncondBrInst>(BBCI->getTerminator())->setSuccessor(BBSubs[0]);
1600
// BBNE collects the first nonzero byte difference via a phi.
1601 B.SetInsertPoint(BBNE);
1602 PHINode *Phi = B.CreatePHI(CI->getType(), N);
1603 B.CreateBr(BBTail);
1604
1605 Value *Base = LHS;
1606 for (uint64_t i = 0; i < N; ++i) {
1607 B.SetInsertPoint(BBSubs[i]);
1608 Value *VL =
1609 B.CreateZExt(B.CreateLoad(B.getInt8Ty(),
1610 B.CreateInBoundsPtrAdd(Base, B.getInt64(i))),
1611 CI->getType());
1612 Value *VR =
1613 ConstantInt::get(CI->getType(), static_cast<unsigned char>(RHS[i]));
1614 Value *Sub = Swapped ? B.CreateSub(VR, VL) : B.CreateSub(VL, VR);
// All but the last byte branch to BBNE on mismatch; the last byte's result
// flows to BBNE unconditionally.
1615 if (i < N - 1) {
1616 CondBrInst *CondBrInst = B.CreateCondBr(
1617 B.CreateICmpNE(Sub, ConstantInt::get(CI->getType(), 0)), BBNE,
1618 BBSubs[i + 1]);
1619
// Attach branch weights only when the function has a nonzero entry count;
// the weight-setting call itself (original line 1624) is missing here.
1620 Function *F = CI->getFunction();
1621 assert(F && "Instruction does not belong to a function!");
1622 std::optional<Function::ProfileCount> EC = F->getEntryCount();
1623 if (EC && EC->getCount() > 0)
1625 } else {
1626 B.CreateBr(BBNE);
1627 }
1628
1629 Phi->addIncoming(Sub, BBSubs[i]);
1630 }
1631
1632 CI->replaceAllUsesWith(Phi);
1633 CI->eraseFromParent();
1634
// Report every inserted/removed edge to the DomTreeUpdater.
1635 if (DTU) {
1637 Updates.push_back({DominatorTree::Insert, BBCI, BBSubs[0]});
1638 for (uint64_t i = 0; i < N; ++i) {
1639 if (i < N - 1)
1640 Updates.push_back({DominatorTree::Insert, BBSubs[i], BBSubs[i + 1]});
1641 Updates.push_back({DominatorTree::Insert, BBSubs[i], BBNE});
1642 }
1643 Updates.push_back({DominatorTree::Insert, BBNE, BBTail});
1644 Updates.push_back({DominatorTree::Delete, BBCI, BBTail});
1645 DTU->applyUpdates(Updates);
1646 }
1647 }
1648
1649 /// Convert memchr with a small constant string into a switch
// Returns true when the call was replaced. The needle (arg 1) must be
// non-constant (the constant case is handled elsewhere) and the haystack
// (arg 0) a constant string of at most MemChrInlineThreshold bytes.
// NOTE(review): several lines are missing from this extracted listing: the
// first signature line (original 1650), the threshold check guarding the
// return at 1671 (original 1670), the removal/creation around the terminator
// (original 1678), a branch-weights call (original 1683), and the
// declarations of Updates and Cases (originals 1685 and 1696).
1651 const DataLayout &DL) {
1652 if (isa<Constant>(Call->getArgOperand(1)))
1653 return false;
1654
1655 StringRef Str;
1656 Value *Base = Call->getArgOperand(0);
1657 if (!getConstantStringInfo(Base, Str, /*TrimAtNul=*/false))
1658 return false;
1659
// Clamp the search length to the constant n argument; bail if n exceeds the
// known string size or is not a constant.
1660 uint64_t N = Str.size();
1661 if (auto *ConstInt = dyn_cast<ConstantInt>(Call->getArgOperand(2))) {
1662 uint64_t Val = ConstInt->getZExtValue();
1663 // Ignore the case that n is larger than the size of string.
1664 if (Val > N)
1665 return false;
1666 N = Val;
1667 } else
1668 return false;
1669
1671 return false;
1672
// Build: switch on the (truncated) needle byte; one case per distinct
// haystack byte jumps to a block that records that byte's first index.
1673 BasicBlock *BB = Call->getParent();
1674 BasicBlock *BBNext = SplitBlock(BB, Call, DTU);
1675 IRBuilder<> IRB(BB);
1676 IRB.SetCurrentDebugLocation(Call->getDebugLoc());
1677 IntegerType *ByteTy = IRB.getInt8Ty();
1679 SwitchInst *SI = IRB.CreateSwitch(
1680 IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
1681 // We can't know the precise weights here, as they would depend on the value
1682 // distribution of Call->getArgOperand(1). So we just mark it as "unknown".
1684 Type *IndexTy = DL.getIndexType(Call->getType());
1686
// BBSuccess converts the matched index into a pointer into the haystack.
1687 BasicBlock *BBSuccess = BasicBlock::Create(
1688 Call->getContext(), "memchr.success", BB->getParent(), BBNext);
1689 IRB.SetInsertPoint(BBSuccess);
1690 PHINode *IndexPHI = IRB.CreatePHI(IndexTy, N, "memchr.idx");
1691 Value *FirstOccursLocation = IRB.CreateInBoundsPtrAdd(Base, IndexPHI);
1692 IRB.CreateBr(BBNext);
1693 if (DTU)
1694 Updates.push_back({DominatorTree::Insert, BBSuccess, BBNext});
1695
// Duplicate bytes keep only their first (lowest-index) occurrence, matching
// memchr semantics — hence the insert-check before adding a case.
1697 for (uint64_t I = 0; I < N; ++I) {
1698 ConstantInt *CaseVal =
1699 ConstantInt::get(ByteTy, static_cast<unsigned char>(Str[I]));
1700 if (!Cases.insert(CaseVal).second)
1701 continue;
1702
1703 BasicBlock *BBCase = BasicBlock::Create(Call->getContext(), "memchr.case",
1704 BB->getParent(), BBSuccess);
1705 SI->addCase(CaseVal, BBCase);
1706 IRB.SetInsertPoint(BBCase);
1707 IndexPHI->addIncoming(ConstantInt::get(IndexTy, I), BBCase);
1708 IRB.CreateBr(BBSuccess);
1709 if (DTU) {
1710 Updates.push_back({DominatorTree::Insert, BB, BBCase});
1711 Updates.push_back({DominatorTree::Insert, BBCase, BBSuccess});
1712 }
1713 }
1714
// The default (not found) path yields null; the success path yields the
// computed pointer.
1715 PHINode *PHI =
1716 PHINode::Create(Call->getType(), 2, Call->getName(), BBNext->begin());
1717 PHI->addIncoming(Constant::getNullValue(Call->getType()), BB);
1718 PHI->addIncoming(FirstOccursLocation, BBSuccess);
1719
1720 Call->replaceAllUsesWith(PHI);
1721 Call->eraseFromParent();
1722
1723 if (DTU)
1724 DTU->applyUpdates(Updates);
1725
1726 return true;
1727 }
1728
// Dispatch recognized library calls (sqrt family, strcmp/strncmp, memchr) to
// their dedicated folders. Sets MadeCFGChange when an inliner split blocks.
// NOTE(review): the first signature line(s) (original 1729-1730, the
// instruction / TTI / TLI / AssumptionCache parameters) are missing from
// this extracted listing.
1731 DominatorTree &DT, const DataLayout &DL,
1732 bool &MadeCFGChange) {
1733
// Only direct, builtin-eligible calls to known, emittable lib functions.
1734 auto *CI = dyn_cast<CallInst>(&I);
1735 if (!CI || CI->isNoBuiltin())
1736 return false;
1737
1738 Function *CalledFunc = CI->getCalledFunction();
1739 if (!CalledFunc)
1740 return false;
1741
1742 LibFunc LF;
1743 if (!TLI.getLibFunc(*CalledFunc, LF) ||
1744 !isLibFuncEmittable(CI->getModule(), &TLI, LF))
1745 return false;
1746
// Lazy updates: DT edges are applied in bulk by the called folders.
1747 DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
1748
1749 switch (LF) {
1750 case LibFunc_sqrt:
1751 case LibFunc_sqrtf:
1752 case LibFunc_sqrtl:
1753 return foldSqrt(CI, LF, TTI, TLI, AC, DT);
1754 case LibFunc_strcmp:
1755 case LibFunc_strncmp:
1756 if (StrNCmpInliner(CI, LF, &DTU, DL).optimizeStrNCmp()) {
1757 MadeCFGChange = true;
1758 return true;
1759 }
1760 break;
1761 case LibFunc_memchr:
1762 if (foldMemChr(CI, &DTU, DL)) {
1763 MadeCFGChange = true;
1764 return true;
1765 }
1766 break;
1767 default:;
1768 }
1769 return false;
1770 }
1771
1772/// Match high part of long multiplication.
1773///
1774/// Considering a multiply made up of high and low parts, we can split the
1775/// multiply into:
1776/// x * y == (xh*T + xl) * (yh*T + yl)
1777/// where xh == x>>32 and xl == x & 0xffffffff. T = 2^32.
1778/// This expands to
1779/// xh*yh*T*T + xh*yl*T + xl*yh*T + xl*yl
1780/// which can be drawn as
1781/// [ xh*yh ]
1782/// [ xh*yl ]
1783/// [ xl*yh ]
1784/// [ xl*yl ]
1785/// We are looking for the "high" half, which is xh*yh + xh*yl>>32 + xl*yh>>32 +
1786/// some carrys. The carry makes this difficult and there are multiple ways of
1787/// representing it. The ones we attempt to support here are:
1788/// Carry: xh*yh + carry + lowsum
1789 /// carry = lowsum < xh*yl ? 0x100000000 : 0
1790/// lowsum = xh*yl + xl*yh + (xl*yl>>32)
1791/// Ladder: xh*yh + c2>>32 + c3>>32
1792/// c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
1793/// or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh
1794/// Carry4: xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
1795/// crosssum = xh*yl + xl*yh
1796 /// carry = crosssum < xh*yl ? 0x100000000 : 0
1797/// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
1798/// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
1799///
1800/// They all start by matching xh*yh + 2 or 3 other operands. The bottom of the
1801/// tree is xh*yh, xh*yl, xl*yh and xl*yl.
1803 Type *Ty = I.getType();
1804 if (!Ty->isIntOrIntVectorTy())
1805 return false;
1806
1807 unsigned BitWidth = Ty->getScalarSizeInBits();
1809 if (BitWidth % 2 != 0)
1810 return false;
1811
1812 auto CreateMulHigh = [&](Value *X, Value *Y) {
1813 IRBuilder<> Builder(&I);
1814 Type *NTy = Ty->getWithNewBitWidth(BitWidth * 2);
1815 Value *XExt = Builder.CreateZExt(X, NTy);
1816 Value *YExt = Builder.CreateZExt(Y, NTy);
1817 Value *Mul = Builder.CreateMul(XExt, YExt, "", /*HasNUW=*/true);
1818 Value *High = Builder.CreateLShr(Mul, BitWidth);
1819 Value *Res = Builder.CreateTrunc(High, Ty, "", /*HasNUW=*/true);
1820 Res->takeName(&I);
1821 I.replaceAllUsesWith(Res);
1822 LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and "
1823 << *Y << "\n");
1824 return true;
1825 };
1826
1827 // Common check routines for X_lo*Y_lo and X_hi*Y_lo
1828 auto CheckLoLo = [&](Value *XlYl, Value *X, Value *Y) {
1829 return match(XlYl, m_c_Mul(m_And(m_Specific(X), m_SpecificInt(LowMask)),
1830 m_And(m_Specific(Y), m_SpecificInt(LowMask))));
1831 };
1832 auto CheckHiLo = [&](Value *XhYl, Value *X, Value *Y) {
1833 return match(XhYl,
1835 m_And(m_Specific(Y), m_SpecificInt(LowMask))));
1836 };
1837
1838 auto FoldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry,
1839 Instruction *B) {
1840 // Looking for LowSum >> 32 and carry (select)
1841 if (Carry->getOpcode() != Instruction::Select)
1842 std::swap(Carry, B);
1843
1844 // Carry = LowSum < XhYl ? 0x100000000 : 0
1845 Value *LowSum, *XhYl;
1846 if (!match(Carry,
1849 m_Value(XhYl))),
1851 m_Zero()))))
1852 return false;
1853
1854 // XhYl can be Xh*Yl or Xl*Yh
1855 if (!CheckHiLo(XhYl, X, Y)) {
1856 if (CheckHiLo(XhYl, Y, X))
1857 std::swap(X, Y);
1858 else
1859 return false;
1860 }
1861 if (XhYl->hasNUsesOrMore(3))
1862 return false;
1863
1864 // B = LowSum >> 32
1865 if (!match(B, m_OneUse(m_LShr(m_Specific(LowSum),
1866 m_SpecificInt(BitWidth / 2)))) ||
1867 LowSum->hasNUsesOrMore(3))
1868 return false;
1869
1870 // LowSum = XhYl + XlYh + XlYl>>32
1871 Value *XlYh, *XlYl;
1872 auto XlYlHi = m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2));
1873 if (!match(LowSum,
1874 m_c_Add(m_Specific(XhYl),
1875 m_OneUse(m_c_Add(m_OneUse(m_Value(XlYh)), XlYlHi)))) &&
1876 !match(LowSum, m_c_Add(m_OneUse(m_Value(XlYh)),
1877 m_OneUse(m_c_Add(m_Specific(XhYl), XlYlHi)))) &&
1878 !match(LowSum,
1879 m_c_Add(XlYlHi, m_OneUse(m_c_Add(m_Specific(XhYl),
1880 m_OneUse(m_Value(XlYh)))))))
1881 return false;
1882
1883 // Check XlYl and XlYh
1884 if (!CheckLoLo(XlYl, X, Y))
1885 return false;
1886 if (!CheckHiLo(XlYh, Y, X))
1887 return false;
1888
1889 return CreateMulHigh(X, Y);
1890 };
1891
1892 auto FoldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
1893 Instruction *B) {
1894 // xh*yh + c2>>32 + c3>>32
1895 // c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
1896 // or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xh*yl
1897 Value *XlYh, *XhYl, *XlYl, *C2, *C3;
1898 // Strip off the two expected shifts.
1899 if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BitWidth / 2))) ||
1901 return false;
1902
1903 if (match(C3, m_c_Add(m_Add(m_Value(), m_Value()), m_Value())))
1904 std::swap(C2, C3);
1905 // Try to match c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32)
1906 if (match(C2,
1908 m_Value(XlYh)),
1909 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2)))) ||
1911 m_LShr(m_Value(XlYl),
1912 m_SpecificInt(BitWidth / 2))),
1913 m_Value(XlYh))) ||
1915 m_SpecificInt(BitWidth / 2)),
1916 m_Value(XlYh)),
1917 m_And(m_Specific(C3), m_SpecificInt(LowMask))))) {
1918 XhYl = C3;
1919 } else {
1920 // Match c3 = c2&0xffffffff + xl*yh
1921 if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)),
1922 m_Value(XlYh))))
1923 std::swap(C2, C3);
1924 if (!match(C3, m_c_Add(m_OneUse(
1925 m_And(m_Specific(C2), m_SpecificInt(LowMask))),
1926 m_Value(XlYh))) ||
1927 !C3->hasOneUse() || C2->hasNUsesOrMore(3))
1928 return false;
1929
1930 // Match c2 = xh*yl + (xl*yl >> 32)
1931 if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2)),
1932 m_Value(XhYl))))
1933 return false;
1934 }
1935
1936 // Match XhYl and XlYh - they can appear either way around.
1937 if (!CheckHiLo(XlYh, Y, X))
1938 std::swap(XlYh, XhYl);
1939 if (!CheckHiLo(XlYh, Y, X))
1940 return false;
1941 if (!CheckHiLo(XhYl, X, Y))
1942 return false;
1943 if (!CheckLoLo(XlYl, X, Y))
1944 return false;
1945
1946 return CreateMulHigh(X, Y);
1947 };
1948
1949 auto FoldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A,
1951 /// Ladder4: xh*yh + (xl*yh)>>32 + (xh+yl)>>32 + low>>32;
1952 /// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
1953
1954 // Find A = Low >> 32 and B/C = XhYl>>32, XlYh>>32.
1955 auto ShiftAdd =
1957 if (!match(A, ShiftAdd))
1958 std::swap(A, B);
1959 if (!match(A, ShiftAdd))
1960 std::swap(A, C);
1961 Value *Low;
1963 return false;
1964
1965 // Match B == XhYl>>32 and C == XlYh>>32
1966 Value *XhYl, *XlYh;
1967 if (!match(B, m_LShr(m_Value(XhYl), m_SpecificInt(BitWidth / 2))) ||
1968 !match(C, m_LShr(m_Value(XlYh), m_SpecificInt(BitWidth / 2))))
1969 return false;
1970 if (!CheckHiLo(XhYl, X, Y))
1971 std::swap(XhYl, XlYh);
1972 if (!CheckHiLo(XhYl, X, Y) || XhYl->hasNUsesOrMore(3))
1973 return false;
1974 if (!CheckHiLo(XlYh, Y, X) || XlYh->hasNUsesOrMore(3))
1975 return false;
1976
1977 // Match Low as XlYl>>32 + XhYl&0xffffffff + XlYh&0xffffffff
1978 Value *XlYl;
1979 if (!match(
1980 Low,
1981 m_c_Add(
1983 m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
1984 m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))),
1985 m_OneUse(
1986 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))) &&
1987 !match(
1988 Low,
1989 m_c_Add(
1991 m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
1992 m_OneUse(
1993 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))),
1994 m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))) &&
1995 !match(
1996 Low,
1997 m_c_Add(
1999 m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))),
2000 m_OneUse(
2001 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))),
2002 m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))))))
2003 return false;
2004 if (!CheckLoLo(XlYl, X, Y))
2005 return false;
2006
2007 return CreateMulHigh(X, Y);
2008 };
2009
2010 auto FoldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry,
2012 // xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
2013 // crosssum = xh*yl+xl*yh
2014 // carry = crosssum < xh*yl ? 0x1000000 : 0
2015 if (Carry->getOpcode() != Instruction::Select)
2016 std::swap(Carry, B);
2017 if (Carry->getOpcode() != Instruction::Select)
2018 std::swap(Carry, C);
2019
2020 // Carry = CrossSum < XhYl ? 0x100000000 : 0
2021 Value *CrossSum, *XhYl;
2022 if (!match(Carry,
2025 m_Value(CrossSum), m_Value(XhYl))),
2027 m_Zero()))))
2028 return false;
2029
2030 if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BitWidth / 2))))
2031 std::swap(B, C);
2032 if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BitWidth / 2))))
2033 return false;
2034
2035 Value *XlYl, *LowAccum;
2036 if (!match(C, m_LShr(m_Value(LowAccum), m_SpecificInt(BitWidth / 2))) ||
2037 !match(LowAccum, m_c_Add(m_OneUse(m_LShr(m_Value(XlYl),
2038 m_SpecificInt(BitWidth / 2))),
2039 m_OneUse(m_And(m_Specific(CrossSum),
2040 m_SpecificInt(LowMask))))) ||
2041 LowAccum->hasNUsesOrMore(3))
2042 return false;
2043 if (!CheckLoLo(XlYl, X, Y))
2044 return false;
2045
2046 if (!CheckHiLo(XhYl, X, Y))
2047 std::swap(X, Y);
2048 if (!CheckHiLo(XhYl, X, Y))
2049 return false;
2050 Value *XlYh;
2051 if (!match(CrossSum, m_c_Add(m_Specific(XhYl), m_OneUse(m_Value(XlYh)))) ||
2052 !CheckHiLo(XlYh, Y, X) || CrossSum->hasNUsesOrMore(4) ||
2053 XhYl->hasNUsesOrMore(3))
2054 return false;
2055
2056 return CreateMulHigh(X, Y);
2057 };
2058
2059 // X and Y are the two inputs, A, B and C are other parts of the pattern
2060 // (crosssum>>32, carry, etc).
2061 Value *X, *Y;
2062 Instruction *A, *B, *C;
2063 auto HiHi = m_OneUse(m_Mul(m_LShr(m_Value(X), m_SpecificInt(BitWidth / 2)),
2065 if ((match(&I, m_c_Add(HiHi, m_OneUse(m_Add(m_Instruction(A),
2066 m_Instruction(B))))) ||
2068 m_OneUse(m_c_Add(HiHi, m_Instruction(B)))))) &&
2069 A->hasOneUse() && B->hasOneUse())
2070 if (FoldMulHighCarry(X, Y, A, B) || FoldMulHighLadder(X, Y, A, B))
2071 return true;
2072
2073 if ((match(&I, m_c_Add(HiHi, m_OneUse(m_c_Add(
2076 m_Instruction(C))))))) ||
2080 m_Instruction(C))))))) ||
2084 m_OneUse(m_c_Add(HiHi, m_Instruction(C))))))) ||
2085 match(&I,
2088 A->hasOneUse() && B->hasOneUse() && C->hasOneUse())
2089 return FoldMulHighCarry4(X, Y, A, B, C) ||
2090 FoldMulHighLadder4(X, Y, A, B, C);
2091
2092 return false;
2093}
2094
2095/// This is the entry point for folds that could be implemented in regular
2096/// InstCombine, but they are separated because they are not expected to
2097/// occur frequently and/or have more than a constant-length pattern match.
2101 AssumptionCache &AC, bool &MadeCFGChange) {
2102 bool MadeChange = false;
2103 for (BasicBlock &BB : F) {
2104 // Ignore unreachable basic blocks.
2105 if (!DT.isReachableFromEntry(&BB))
2106 continue;
2107
2108 const DataLayout &DL = F.getDataLayout();
2109
2110 // Walk the block backwards for efficiency. We're matching a chain of
2111 // use->defs, so we're more likely to succeed by starting from the bottom.
2112 // Also, we want to avoid matching partial patterns.
2113 // TODO: It would be more efficient if we removed dead instructions
2114 // iteratively in this loop rather than waiting until the end.
2116 MadeChange |= foldAnyOrAllBitsSet(I);
2117 MadeChange |= foldGuardedFunnelShift(I, DT);
2118 MadeChange |= tryToRecognizePopCount(I);
2119 MadeChange |= tryToRecognizePopCount2n3(I);
2120 MadeChange |= tryToFPToSat(I, TTI);
2121 MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
2122 MadeChange |= tryToRecognizeTableBasedLog2(I, DL, TTI);
2123 MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
2124 MadeChange |= foldPatternedLoads(I, DL);
2125 MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
2126 MadeChange |= foldMulHigh(I);
2127 // NOTE: This function introduces erasing of the instruction `I`, so it
2128 // needs to be called at the end of this sequence, otherwise we may make
2129 // bugs.
2130 MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
2131 }
2132
2133 // Do this separately to avoid redundantly scanning stores multiple times.
2134 MadeChange |= foldConsecutiveStores(BB, DL, TTI, AA);
2135 }
2136
2137 // We're done with transforms, so remove dead instructions.
2138 if (MadeChange)
2139 for (BasicBlock &BB : F)
2141
2142 return MadeChange;
2143}
2144
2145/// This is the entry point for all transforms. Pass manager differences are
2146/// handled in the callers of this function.
2149 AliasAnalysis &AA, bool &MadeCFGChange) {
2150 bool MadeChange = false;
2151 const DataLayout &DL = F.getDataLayout();
2152 TruncInstCombine TIC(AC, TLI, DL, DT);
2153 MadeChange |= TIC.run(F);
2154 MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC, MadeCFGChange);
2155 return MadeChange;
2156}
2157
2160 auto &AC = AM.getResult<AssumptionAnalysis>(F);
2161 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
2162 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
2163 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
2164 auto &AA = AM.getResult<AAManager>(F);
2165 bool MadeCFGChange = false;
2166 if (!runImpl(F, AC, TTI, TLI, DT, AA, MadeCFGChange)) {
2167 // No changes, all analyses are preserved.
2168 return PreservedAnalyses::all();
2169 }
2170 // Mark all the analyses that instcombine updates as preserved.
2172 if (MadeCFGChange)
2174 else
2176 return PA;
2177}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool tryToRecognizePopCount(Instruction &I)
static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT)
Try to replace a mathlib call to sqrt with the LLVM intrinsic.
static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift, Type *AccessTy, unsigned InputBits, const APInt &GEPIdxFactor, const DataLayout &DL)
static bool foldAnyOrAllBitsSet(Instruction &I)
Match patterns that correspond to "any-bits-set" and "all-bits-set".
static cl::opt< unsigned > MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden, cl::desc("The maximum length of a constant string to " "inline a memchr call."))
static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI)
Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), providing C1 and C2 saturate the value of t...
static cl::opt< unsigned > StrNCmpInlineThreshold("strncmp-inline-threshold", cl::init(3), cl::Hidden, cl::desc("The maximum length of a constant string for a builtin string cmp " "call eligible for inlining. The default value is 3."))
static bool matchAndOrChain(Value *V, MaskOps &MOps)
This is a recursive helper for foldAnyOrAllBitsSet() that walks through a chain of 'and' or 'or' inst...
static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU, const DataLayout &DL)
Convert memchr with a small constant string into a switch.
static bool tryToRecognizePopCount2n3(Instruction &I)
static Value * optimizeShiftInOrChain(Value *V, IRBuilder<> &Builder)
Combine away instructions providing they are still equivalent when compared against 0.
static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL, TargetTransformInfo &TTI, AliasAnalysis &AA, const DominatorTree &DT)
static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT)
Match a pattern for a bitwise funnel/rotate operation that partially guards against undefined behavio...
static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL)
static bool mergePartStores(SmallVectorImpl< PartStore > &Parts, const DataLayout &DL, TargetTransformInfo &TTI)
static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, AliasAnalysis &AA, bool IsRoot=false)
static bool mergeConsecutivePartStores(ArrayRef< PartStore > Parts, unsigned Width, const DataLayout &DL, TargetTransformInfo &TTI)
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
static bool foldICmpOrChain(Instruction &I, const DataLayout &DL, TargetTransformInfo &TTI, AliasAnalysis &AA, const DominatorTree &DT)
static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift, const APInt &AndMask, Type *AccessTy, unsigned InputBits, const APInt &GEPIdxFactor, const DataLayout &DL)
static std::optional< PartStore > matchPartStore(Instruction &I, const DataLayout &DL)
static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL, TargetTransformInfo &TTI, AliasAnalysis &AA)
static std::pair< APInt, APInt > getStrideAndModOffsetOfGEP(Value *PtrOp, const DataLayout &DL)
static bool foldPatternedLoads(Instruction &I, const DataLayout &DL)
If C is a constant patterned array and all valid loaded results for given alignment are same to a con...
static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL, TargetTransformInfo &TTI)
static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT, const DataLayout &DL, bool &MadeCFGChange)
static bool foldMulHigh(Instruction &I)
Match high part of long multiplication.
static bool foldUnusualPatterns(Function &F, DominatorTree &DT, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AliasAnalysis &AA, AssumptionCache &AC, bool &MadeCFGChange)
This is the entry point for folds that could be implemented in regular InstCombine,...
AggressiveInstCombiner - Combine expression patterns to form expressions with fewer,...
This is the interface for LLVM's primary stateless and local alias analysis.
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
static MaybeAlign getAlign(Value *Ptr)
static Instruction * matchFunnelShift(Instruction &Or, InstCombinerImpl &IC)
Match UB-safe variants of the funnel shift intrinsic.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t High
This file contains the declarations for profiling metadata utility functions.
static const MCExpr * MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
BinaryOperator * Mul
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1353
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1787
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1137
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static LLVM_ABI DebugLoc getMergedLocations(ArrayRef< DebugLoc > Locs)
Try to combine the vector of locations passed as input in a single one.
Definition DebugLoc.cpp:166
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
bool isConstant() const
If the value is a global constant, its value is immutable throughout the runtime execution of the pro...
static bool isEquality(Predicate P)
Return true if this predicate is either EQ or NE.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
UncondBrInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition IRBuilder.h:1218
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2507
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition IRBuilder.h:1247
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2074
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateInBoundsPtrAdd(Value *Ptr, Value *Offset, const Twine &Name="")
Definition IRBuilder.h:2064
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:569
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2822
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void setAAMetadata(const AAMDNodes &N)
Sets the AA metadata on this instruction from the AAMDNodes structure.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
bool isSimple() const
static LocationSize precise(uint64_t Value)
LLVM_ABI MDNode * createUnlikelyBranchWeights()
Return metadata containing two branch weights, with significant bias towards false destination.
Definition MDBuilder.cpp:48
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static constexpr size_t npos
Definition StringRef.h:57
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
bool getLibFunc(StringRef funcName, LibFunc &F) const
Searches for a particular function name.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
@ None
The insert/extract is not used with a load/store.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCC_Basic
The cost of a typical 'add' instruction.
bool run(Function &F)
Perform TruncInst pattern optimization on given function.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:154
LLVM_ABI const Value * stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset, bool AllowNonInbounds, bool AllowInvariantGroup=false, function_ref< bool(Value &Value, APInt &Offset)> ExternalAnalysis=nullptr, bool LookThroughIntToPtr=false) const
Accumulate the constant offset this value has compared to a base pointer.
LLVM_ABI uint64_t getPointerDereferenceableBytes(const DataLayout &DL, bool &CanBeNull, bool &CanBeFreed) const
Returns the number of bytes known to be dereferenceable for the pointer value.
Definition Value.cpp:890
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define UINT64_MAX
Definition DataTypes.h:77
Abstract Attribute helper functions.
Definition Attributor.h:165
LLVM_ABI APInt GreatestCommonDivisor(APInt A, APInt B)
Compute GCD of two unsigned APInt values.
Definition APInt.cpp:829
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
ShiftLike_match< LHS, Instruction::LShr > m_LShrOrSelf(const LHS &L, uint64_t &R)
Matches lshr L, ConstShAmt or L itself (R will be set to zero in this case).
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, CastInst >, OpTy > m_CastOrSelf(const OpTy &Op)
Matches any cast or self. Used to ignore casts.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
auto m_Value()
Match an arbitrary value and ignore it.
ShiftLike_match< LHS, Instruction::Shl > m_ShlOrSelf(const LHS &L, uint64_t &R)
Matches shl L, ConstShAmt or L itself (R will be set to zero in this case).
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
specific_bbval m_SpecificBB(BasicBlock *BB)
Match a specific basic block value.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoSignedWrap > m_NSWShl(const LHS &L, const RHS &R)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastInst_match< OpTy, FPToSIInst > m_FPToSI(const OpTy &Op)
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
brc_match< Cond_t, match_bind< BasicBlock >, match_bind< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
initializer< Ty > init(const Ty &Val)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
cl::opt< bool > ProfcheckDisableMetadataFixes
Definition LoopInfo.cpp:60
LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName, const Function *F=nullptr)
Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch weights in the new instruct...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool isOnlyUsedInZeroComparison(const Instruction *CxtI)
LLVM_ABI bool getConstantStringInfo(const Value *V, StringRef &Str, bool TrimAtNul=true)
This function computes the length of a null-terminated C string pointed to by V.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
LLVM_ABI bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr)
Scan the specified basic block and try to simplify any instructions in it and recursively delete dead...
Definition Local.cpp:723
LLVM_ABI void setExplicitlyUnknownBranchWeights(Instruction &I, StringRef PassName)
Specify that the branch weights for this terminator cannot be known at compile time.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
LLVM_ABI bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on target and also that it in the current Module is a...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
LLVM_ABI Constant * ConstantFoldLoadFromConst(Constant *C, Type *Ty, const APInt &Offset, const DataLayout &DL)
Extract value of C at the given Offset reinterpreted as Ty.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Sub
Subtraction of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI bool cannotBeOrderedLessThanZero(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if we can prove that the specified FP value is either NaN or never less than -0....
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
LoadInst * RootInsert
ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset.
bool operator<(const PartStore &Other) const
bool isCompatibleWith(const PartStore &Other) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
LLVM_ABI AAMDNodes concat(const AAMDNodes &Other) const
Determine the best AAMDNodes after concatenating two different locations together.
Matching combinators.
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276