LLVM 19.0.0git
AArch64PostLegalizerCombiner.cpp
Go to the documentation of this file.
1//=== AArch64PostLegalizerCombiner.cpp --------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// Post-legalization combines on generic MachineInstrs.
11///
12/// The combines here must preserve instruction legality.
13///
14/// Lowering combines (e.g. pseudo matching) should be handled by
15/// AArch64PostLegalizerLowering.
16///
17/// Combines which don't rely on instruction legality should go in the
18/// AArch64PreLegalizerCombiner.
19///
20//===----------------------------------------------------------------------===//
21
23#include "llvm/ADT/STLExtras.h"
41#include "llvm/Support/Debug.h"
42
43#define GET_GICOMBINER_DEPS
44#include "AArch64GenPostLegalizeGICombiner.inc"
45#undef GET_GICOMBINER_DEPS
46
47#define DEBUG_TYPE "aarch64-postlegalizer-combiner"
48
49using namespace llvm;
50using namespace MIPatternMatch;
51
52namespace {
53
54#define GET_GICOMBINER_TYPES
55#include "AArch64GenPostLegalizeGICombiner.inc"
56#undef GET_GICOMBINER_TYPES
57
58/// This combine tries do what performExtractVectorEltCombine does in SDAG.
59/// Rewrite for pairwise fadd pattern
60/// (s32 (g_extract_vector_elt
61/// (g_fadd (vXs32 Other)
62/// (g_vector_shuffle (vXs32 Other) undef <1,X,...> )) 0))
63/// ->
64/// (s32 (g_fadd (g_extract_vector_elt (vXs32 Other) 0)
65/// (g_extract_vector_elt (vXs32 Other) 1))
66bool matchExtractVecEltPairwiseAdd(
68 std::tuple<unsigned, LLT, Register> &MatchInfo) {
69 Register Src1 = MI.getOperand(1).getReg();
70 Register Src2 = MI.getOperand(2).getReg();
71 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
72
74 if (!Cst || Cst->Value != 0)
75 return false;
76 // SDAG also checks for FullFP16, but this looks to be beneficial anyway.
77
78 // Now check for an fadd operation. TODO: expand this for integer add?
79 auto *FAddMI = getOpcodeDef(TargetOpcode::G_FADD, Src1, MRI);
80 if (!FAddMI)
81 return false;
82
83 // If we add support for integer add, must restrict these types to just s64.
84 unsigned DstSize = DstTy.getSizeInBits();
85 if (DstSize != 16 && DstSize != 32 && DstSize != 64)
86 return false;
87
88 Register Src1Op1 = FAddMI->getOperand(1).getReg();
89 Register Src1Op2 = FAddMI->getOperand(2).getReg();
90 MachineInstr *Shuffle =
91 getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op2, MRI);
92 MachineInstr *Other = MRI.getVRegDef(Src1Op1);
93 if (!Shuffle) {
94 Shuffle = getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op1, MRI);
95 Other = MRI.getVRegDef(Src1Op2);
96 }
97
98 // We're looking for a shuffle that moves the second element to index 0.
99 if (Shuffle && Shuffle->getOperand(3).getShuffleMask()[0] == 1 &&
100 Other == MRI.getVRegDef(Shuffle->getOperand(1).getReg())) {
101 std::get<0>(MatchInfo) = TargetOpcode::G_FADD;
102 std::get<1>(MatchInfo) = DstTy;
103 std::get<2>(MatchInfo) = Other->getOperand(0).getReg();
104 return true;
105 }
106 return false;
107}
108
109void applyExtractVecEltPairwiseAdd(
111 std::tuple<unsigned, LLT, Register> &MatchInfo) {
112 unsigned Opc = std::get<0>(MatchInfo);
113 assert(Opc == TargetOpcode::G_FADD && "Unexpected opcode!");
114 // We want to generate two extracts of elements 0 and 1, and add them.
115 LLT Ty = std::get<1>(MatchInfo);
116 Register Src = std::get<2>(MatchInfo);
117 LLT s64 = LLT::scalar(64);
118 B.setInstrAndDebugLoc(MI);
119 auto Elt0 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 0));
120 auto Elt1 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 1));
121 B.buildInstr(Opc, {MI.getOperand(0).getReg()}, {Elt0, Elt1});
122 MI.eraseFromParent();
123}
124
126 // TODO: check if extended build vector as well.
127 unsigned Opc = MRI.getVRegDef(R)->getOpcode();
128 return Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG;
129}
130
132 // TODO: check if extended build vector as well.
133 return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT;
134}
135
136bool matchAArch64MulConstCombine(
138 std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
139 assert(MI.getOpcode() == TargetOpcode::G_MUL);
140 Register LHS = MI.getOperand(1).getReg();
141 Register RHS = MI.getOperand(2).getReg();
142 Register Dst = MI.getOperand(0).getReg();
143 const LLT Ty = MRI.getType(LHS);
144
145 // The below optimizations require a constant RHS.
147 if (!Const)
148 return false;
149
150 APInt ConstValue = Const->Value.sext(Ty.getSizeInBits());
151 // The following code is ported from AArch64ISelLowering.
152 // Multiplication of a power of two plus/minus one can be done more
153 // cheaply as shift+add/sub. For now, this is true unilaterally. If
154 // future CPUs have a cheaper MADD instruction, this may need to be
155 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
156 // 64-bit is 5 cycles, so this is always a win.
157 // More aggressively, some multiplications N0 * C can be lowered to
158 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
159 // e.g. 6=3*2=(2+1)*2.
160 // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
161 // which equals to (1+2)*16-(1+2).
162 // TrailingZeroes is used to test if the mul can be lowered to
163 // shift+add+shift.
164 unsigned TrailingZeroes = ConstValue.countr_zero();
165 if (TrailingZeroes) {
166 // Conservatively do not lower to shift+add+shift if the mul might be
167 // folded into smul or umul.
168 if (MRI.hasOneNonDBGUse(LHS) &&
169 (isSignExtended(LHS, MRI) || isZeroExtended(LHS, MRI)))
170 return false;
171 // Conservatively do not lower to shift+add+shift if the mul might be
172 // folded into madd or msub.
173 if (MRI.hasOneNonDBGUse(Dst)) {
174 MachineInstr &UseMI = *MRI.use_instr_begin(Dst);
175 unsigned UseOpc = UseMI.getOpcode();
176 if (UseOpc == TargetOpcode::G_ADD || UseOpc == TargetOpcode::G_PTR_ADD ||
177 UseOpc == TargetOpcode::G_SUB)
178 return false;
179 }
180 }
181 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
182 // and shift+add+shift.
183 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
184
185 unsigned ShiftAmt, AddSubOpc;
186 // Is the shifted value the LHS operand of the add/sub?
187 bool ShiftValUseIsLHS = true;
188 // Do we need to negate the result?
189 bool NegateResult = false;
190
191 if (ConstValue.isNonNegative()) {
192 // (mul x, 2^N + 1) => (add (shl x, N), x)
193 // (mul x, 2^N - 1) => (sub (shl x, N), x)
194 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
195 APInt SCVMinus1 = ShiftedConstValue - 1;
196 APInt CVPlus1 = ConstValue + 1;
197 if (SCVMinus1.isPowerOf2()) {
198 ShiftAmt = SCVMinus1.logBase2();
199 AddSubOpc = TargetOpcode::G_ADD;
200 } else if (CVPlus1.isPowerOf2()) {
201 ShiftAmt = CVPlus1.logBase2();
202 AddSubOpc = TargetOpcode::G_SUB;
203 } else
204 return false;
205 } else {
206 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
207 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
208 APInt CVNegPlus1 = -ConstValue + 1;
209 APInt CVNegMinus1 = -ConstValue - 1;
210 if (CVNegPlus1.isPowerOf2()) {
211 ShiftAmt = CVNegPlus1.logBase2();
212 AddSubOpc = TargetOpcode::G_SUB;
213 ShiftValUseIsLHS = false;
214 } else if (CVNegMinus1.isPowerOf2()) {
215 ShiftAmt = CVNegMinus1.logBase2();
216 AddSubOpc = TargetOpcode::G_ADD;
217 NegateResult = true;
218 } else
219 return false;
220 }
221
222 if (NegateResult && TrailingZeroes)
223 return false;
224
225 ApplyFn = [=](MachineIRBuilder &B, Register DstReg) {
226 auto Shift = B.buildConstant(LLT::scalar(64), ShiftAmt);
227 auto ShiftedVal = B.buildShl(Ty, LHS, Shift);
228
229 Register AddSubLHS = ShiftValUseIsLHS ? ShiftedVal.getReg(0) : LHS;
230 Register AddSubRHS = ShiftValUseIsLHS ? LHS : ShiftedVal.getReg(0);
231 auto Res = B.buildInstr(AddSubOpc, {Ty}, {AddSubLHS, AddSubRHS});
232 assert(!(NegateResult && TrailingZeroes) &&
233 "NegateResult and TrailingZeroes cannot both be true for now.");
234 // Negate the result.
235 if (NegateResult) {
236 B.buildSub(DstReg, B.buildConstant(Ty, 0), Res);
237 return;
238 }
239 // Shift the result.
240 if (TrailingZeroes) {
241 B.buildShl(DstReg, Res, B.buildConstant(LLT::scalar(64), TrailingZeroes));
242 return;
243 }
244 B.buildCopy(DstReg, Res.getReg(0));
245 };
246 return true;
247}
248
249void applyAArch64MulConstCombine(
251 std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
252 B.setInstrAndDebugLoc(MI);
253 ApplyFn(B, MI.getOperand(0).getReg());
254 MI.eraseFromParent();
255}
256
257/// Try to fold a G_MERGE_VALUES of 2 s32 sources, where the second source
258/// is a zero, into a G_ZEXT of the first.
259bool matchFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI) {
260 auto &Merge = cast<GMerge>(MI);
261 LLT SrcTy = MRI.getType(Merge.getSourceReg(0));
262 if (SrcTy != LLT::scalar(32) || Merge.getNumSources() != 2)
263 return false;
264 return mi_match(Merge.getSourceReg(1), MRI, m_SpecificICst(0));
265}
266
267void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI,
269 // Mutate %d(s64) = G_MERGE_VALUES %a(s32), 0(s32)
270 // ->
271 // %d(s64) = G_ZEXT %a(s32)
272 Observer.changingInstr(MI);
273 MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
274 MI.removeOperand(2);
275 Observer.changedInstr(MI);
276}
277
278/// \returns True if a G_ANYEXT instruction \p MI should be mutated to a G_ZEXT
279/// instruction.
280bool matchMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI) {
281 // If this is coming from a scalar compare then we can use a G_ZEXT instead of
282 // a G_ANYEXT:
283 //
284 // %cmp:_(s32) = G_[I|F]CMP ... <-- produces 0/1.
285 // %ext:_(s64) = G_ANYEXT %cmp(s32)
286 //
287 // By doing this, we can leverage more KnownBits combines.
288 assert(MI.getOpcode() == TargetOpcode::G_ANYEXT);
289 Register Dst = MI.getOperand(0).getReg();
290 Register Src = MI.getOperand(1).getReg();
291 return MRI.getType(Dst).isScalar() &&
292 mi_match(Src, MRI,
294 m_GFCmp(m_Pred(), m_Reg(), m_Reg())));
295}
296
297void applyMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI,
299 GISelChangeObserver &Observer) {
300 Observer.changingInstr(MI);
301 MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
302 Observer.changedInstr(MI);
303}
304
305/// Match a 128b store of zero and split it into two 64 bit stores, for
306/// size/performance reasons.
307bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
308 GStore &Store = cast<GStore>(MI);
309 if (!Store.isSimple())
310 return false;
311 LLT ValTy = MRI.getType(Store.getValueReg());
312 if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
313 return false;
314 if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
315 return false; // Don't split truncating stores.
316 if (!MRI.hasOneNonDBGUse(Store.getValueReg()))
317 return false;
318 auto MaybeCst = isConstantOrConstantSplatVector(
319 *MRI.getVRegDef(Store.getValueReg()), MRI);
320 return MaybeCst && MaybeCst->isZero();
321}
322
323void applySplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI,
325 GISelChangeObserver &Observer) {
326 B.setInstrAndDebugLoc(MI);
327 GStore &Store = cast<GStore>(MI);
328 assert(MRI.getType(Store.getValueReg()).isVector() &&
329 "Expected a vector store value");
330 LLT NewTy = LLT::scalar(64);
331 Register PtrReg = Store.getPointerReg();
332 auto Zero = B.buildConstant(NewTy, 0);
333 auto HighPtr = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg,
334 B.buildConstant(LLT::scalar(64), 8));
335 auto &MF = *MI.getMF();
336 auto *LowMMO = MF.getMachineMemOperand(&Store.getMMO(), 0, NewTy);
337 auto *HighMMO = MF.getMachineMemOperand(&Store.getMMO(), 8, NewTy);
338 B.buildStore(Zero, PtrReg, *LowMMO);
339 B.buildStore(Zero, HighPtr, *HighMMO);
340 Store.eraseFromParent();
341}
342
343bool matchOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
344 std::tuple<Register, Register, Register> &MatchInfo) {
345 const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
346 if (!DstTy.isVector())
347 return false;
348
349 Register AO1, AO2, BVO1, BVO2;
350 if (!mi_match(MI, MRI,
351 m_GOr(m_GAnd(m_Reg(AO1), m_Reg(BVO1)),
352 m_GAnd(m_Reg(AO2), m_Reg(BVO2)))))
353 return false;
354
355 auto *BV1 = getOpcodeDef<GBuildVector>(BVO1, MRI);
356 auto *BV2 = getOpcodeDef<GBuildVector>(BVO2, MRI);
357 if (!BV1 || !BV2)
358 return false;
359
360 for (int I = 0, E = DstTy.getNumElements(); I < E; I++) {
361 auto ValAndVReg1 =
362 getIConstantVRegValWithLookThrough(BV1->getSourceReg(I), MRI);
363 auto ValAndVReg2 =
364 getIConstantVRegValWithLookThrough(BV2->getSourceReg(I), MRI);
365 if (!ValAndVReg1 || !ValAndVReg2 ||
366 ValAndVReg1->Value != ~ValAndVReg2->Value)
367 return false;
368 }
369
370 MatchInfo = {AO1, AO2, BVO1};
371 return true;
372}
373
374void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
376 std::tuple<Register, Register, Register> &MatchInfo) {
377 B.setInstrAndDebugLoc(MI);
378 B.buildInstr(
379 AArch64::G_BSP, {MI.getOperand(0).getReg()},
380 {std::get<2>(MatchInfo), std::get<0>(MatchInfo), std::get<1>(MatchInfo)});
381 MI.eraseFromParent();
382}
383
/// Combiner implementation for the AArch64 post-legalizer combines. The bulk
/// of the match/apply logic is generated by TableGen into the included .inc
/// file; tryCombineAll dispatches over the generated rules.
class AArch64PostLegalizerCombinerImpl : public Combiner {
protected:
  // TODO: Make CombinerHelper methods const.
  mutable CombinerHelper Helper;
  const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PostLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

private:
  // TableGen-generated rule members (predicates, match data, etc.).
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};
408
409#define GET_GICOMBINER_IMPL
410#include "AArch64GenPostLegalizeGICombiner.inc"
411#undef GET_GICOMBINER_IMPL
412
413AArch64PostLegalizerCombinerImpl::AArch64PostLegalizerCombinerImpl(
414 MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
415 GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
416 const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
417 const AArch64Subtarget &STI, MachineDominatorTree *MDT,
418 const LegalizerInfo *LI)
419 : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
420 Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
421 RuleConfig(RuleConfig), STI(STI),
423#include "AArch64GenPostLegalizeGICombiner.inc"
425{
426}
427
/// Machine function pass wrapping AArch64PostLegalizerCombinerImpl, plus a
/// follow-on peephole that re-splits consecutive store addressing (see
/// optimizeConsecutiveMemOpAddressing below).
class AArch64PostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AArch64PostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  // True when built for an -O0-style pipeline; gates optional analyses.
  bool IsOptNone;
  AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;


  // Bookkeeping for one store in a consecutive-store sequence.
  struct StoreInfo {
    GStore *St = nullptr;
    // The G_PTR_ADD that's used by the store. We keep this to cache the
    // MachineInstr def.
    GPtrAdd *Ptr = nullptr;
    // The signed offset to the Ptr instruction.
    int64_t Offset = 0;
    // Type of the stored value; consecutive stores must agree on it.
    LLT StoredType;
  };
  bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
                               CSEMIRBuilder &MIB);

  bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
                                          CSEMIRBuilder &MIB);
};
461} // end anonymous namespace
462
463void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
465 AU.setPreservesCFG();
469 if (!IsOptNone) {
474 }
476}
477
478AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone)
479 : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
481
482 if (!RuleConfig.parseCommandLineOption())
483 report_fatal_error("Invalid rule identifier");
484}
485
486bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
487 if (MF.getProperties().hasProperty(
488 MachineFunctionProperties::Property::FailedISel))
489 return false;
491 MachineFunctionProperties::Property::Legalized) &&
492 "Expected a legalized function?");
493 auto *TPC = &getAnalysis<TargetPassConfig>();
494 const Function &F = MF.getFunction();
495 bool EnableOpt =
496 MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
497
499 const auto *LI = ST.getLegalizerInfo();
500
501 GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
503 IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
505 getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
506 auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
507
508 CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
509 /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
510 F.hasMinSize());
511 AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
512 RuleConfig, ST, MDT, LI);
513 bool Changed = Impl.combineMachineInstrs();
514
515 auto MIB = CSEMIRBuilder(MF);
516 MIB.setCSEInfo(CSEInfo);
517 Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
518 return Changed;
519}
520
521bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
523 if (Stores.size() <= 2)
524 return false;
525
526 // Profitabity checks:
527 int64_t BaseOffset = Stores[0].Offset;
528 unsigned NumPairsExpected = Stores.size() / 2;
529 unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
530 // Size savings will depend on whether we can fold the offset, as an
531 // immediate of an ADD.
532 auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
533 if (!TLI.isLegalAddImmediate(BaseOffset))
534 TotalInstsExpected++;
535 int SavingsExpected = Stores.size() - TotalInstsExpected;
536 if (SavingsExpected <= 0)
537 return false;
538
539 auto &MRI = MIB.getMF().getRegInfo();
540
541 // We have a series of consecutive stores. Factor out the common base
542 // pointer and rewrite the offsets.
543 Register NewBase = Stores[0].Ptr->getReg(0);
544 for (auto &SInfo : Stores) {
545 // Compute a new pointer with the new base ptr and adjusted offset.
546 MIB.setInstrAndDebugLoc(*SInfo.St);
547 auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
548 auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
549 NewBase, NewOff);
550 if (MIB.getObserver())
551 MIB.getObserver()->changingInstr(*SInfo.St);
552 SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
553 if (MIB.getObserver())
554 MIB.getObserver()->changedInstr(*SInfo.St);
555 }
556 LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
557 << " stores into a base pointer and offsets.\n");
558 return true;
559}
560
// Switch for the consecutive-store base-pointer factoring below; on by
// default, hidden from standard -help output.
static cl::opt<bool>
    EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
                              cl::init(true), cl::Hidden,
                              cl::desc("Enable consecutive memop optimization "
                                       "in AArch64PostLegalizerCombiner"));
566
567bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
568 MachineFunction &MF, CSEMIRBuilder &MIB) {
569 // This combine needs to run after all reassociations/folds on pointer
570 // addressing have been done, specifically those that combine two G_PTR_ADDs
571 // with constant offsets into a single G_PTR_ADD with a combined offset.
572 // The goal of this optimization is to undo that combine in the case where
573 // doing so has prevented the formation of pair stores due to illegal
574 // addressing modes of STP. The reason that we do it here is because
575 // it's much easier to undo the transformation of a series consecutive
576 // mem ops, than it is to detect when doing it would be a bad idea looking
577 // at a single G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
578 //
579 // An example:
580 // G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
581 // %off1:_(s64) = G_CONSTANT i64 4128
582 // %p1:_(p0) = G_PTR_ADD %0:_, %off1:_(s64)
583 // G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
584 // %off2:_(s64) = G_CONSTANT i64 4144
585 // %p2:_(p0) = G_PTR_ADD %0:_, %off2:_(s64)
586 // G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
587 // %off3:_(s64) = G_CONSTANT i64 4160
588 // %p3:_(p0) = G_PTR_ADD %0:_, %off3:_(s64)
589 // G_STORE %11:_(<2 x s64>), %17:_(p0) :: (store (<2 x s64>), align 1)
590 bool Changed = false;
591 auto &MRI = MF.getRegInfo();
592
594 return Changed;
595
597 // If we see a load, then we keep track of any values defined by it.
598 // In the following example, STP formation will fail anyway because
599 // the latter store is using a load result that appears after the
600 // the prior store. In this situation if we factor out the offset then
601 // we increase code size for no benefit.
602 // G_STORE %v1:_(s64), %base:_(p0) :: (store (s64))
603 // %v2:_(s64) = G_LOAD %ldptr:_(p0) :: (load (s64))
604 // G_STORE %v2:_(s64), %base:_(p0) :: (store (s64))
605 SmallVector<Register> LoadValsSinceLastStore;
606
607 auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
608 // Check if this store is consecutive to the last one.
609 if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
610 (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
611 New.Offset) ||
612 Last.StoredType != New.StoredType)
613 return false;
614
615 // Check if this store is using a load result that appears after the
616 // last store. If so, bail out.
617 if (any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
618 return New.St->getValueReg() == LoadVal;
619 }))
620 return false;
621
622 // Check if the current offset would be too large for STP.
623 // If not, then STP formation should be able to handle it, so we don't
624 // need to do anything.
625 int64_t MaxLegalOffset;
626 switch (New.StoredType.getSizeInBits()) {
627 case 32:
628 MaxLegalOffset = 252;
629 break;
630 case 64:
631 MaxLegalOffset = 504;
632 break;
633 case 128:
634 MaxLegalOffset = 1008;
635 break;
636 default:
637 llvm_unreachable("Unexpected stored type size");
638 }
639 if (New.Offset < MaxLegalOffset)
640 return false;
641
642 // If factoring it out still wouldn't help then don't bother.
643 return New.Offset - Stores[0].Offset <= MaxLegalOffset;
644 };
645
646 auto resetState = [&]() {
647 Stores.clear();
648 LoadValsSinceLastStore.clear();
649 };
650
651 for (auto &MBB : MF) {
652 // We're looking inside a single BB at a time since the memset pattern
653 // should only be in a single block.
654 resetState();
655 for (auto &MI : MBB) {
656 if (auto *St = dyn_cast<GStore>(&MI)) {
657 Register PtrBaseReg;
659 LLT StoredValTy = MRI.getType(St->getValueReg());
660 unsigned ValSize = StoredValTy.getSizeInBits();
661 if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
662 continue;
663
664 Register PtrReg = St->getPointerReg();
665 if (mi_match(
666 PtrReg, MRI,
667 m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
668 GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(PtrReg));
669 StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};
670
671 if (Stores.empty()) {
672 Stores.push_back(New);
673 continue;
674 }
675
676 // Check if this store is a valid continuation of the sequence.
677 auto &Last = Stores.back();
678 if (storeIsValid(Last, New)) {
679 Stores.push_back(New);
680 LoadValsSinceLastStore.clear(); // Reset the load value tracking.
681 } else {
682 // The store isn't a valid to consider for the prior sequence,
683 // so try to optimize what we have so far and start a new sequence.
684 Changed |= tryOptimizeConsecStores(Stores, MIB);
685 resetState();
686 Stores.push_back(New);
687 }
688 }
689 } else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
690 LoadValsSinceLastStore.push_back(Ld->getDstReg());
691 }
692 }
693 Changed |= tryOptimizeConsecStores(Stores, MIB);
694 resetState();
695 }
696
697 return Changed;
698}
699
700char AArch64PostLegalizerCombiner::ID = 0;
701INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE,
702 "Combine AArch64 MachineInstrs after legalization", false,
703 false)
706INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE,
707 "Combine AArch64 MachineInstrs after legalization", false,
708 false)
709
710namespace llvm {
712 return new AArch64PostLegalizerCombiner(IsOptNone);
713}
714} // end namespace llvm
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
#define GET_GICOMBINER_CONSTRUCTOR_INITS
static cl::opt< bool > EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops", cl::init(true), cl::Hidden, cl::desc("Enable consecutive memop optimization " "in AArch64PostLegalizerCombiner"))
#define DEBUG_TYPE
Combine AArch64 MachineInstrs after legalization
MachineBasicBlock & MBB
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Provides analysis for continuously CSEing during GISel passes.
This file implements a version of MachineIRBuilder which CSEs insts within a MachineBasicBlock.
This contains common combine transformations that may be used in a combine pass,or by the target else...
Option class for Targets to specify which operations are combined how and when.
This contains the base class for all Combiners generated by TableGen.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1291
This contains common code to allow clients to notify changes to machine instr.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Hexagon Vector Combine
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
static StringRef getName(Value *V)
R600 Clause Merge
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
Target-Independent Code Generator Pass Configuration Options pass.
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:76
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:269
Defines a builder that does CSE of MachineInstructions using GISelCSEInfo.
Definition: CSEMIRBuilder.h:32
MachineInstrBuilder buildConstant(const DstOp &Res, const ConstantInt &Val) override
Build and insert Res = G_CONSTANT Val.
Combiner implementation.
Definition: Combiner.h:34
virtual bool tryCombineAll(MachineInstr &I) const =0
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
The actual analysis pass wrapper.
Definition: CSEInfo.h:222
Simple wrapper that does the following.
Definition: CSEInfo.h:204
The CSE Analysis object.
Definition: CSEInfo.h:69
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
To use KnownBitsInfo analysis in a pass, KnownBitsInfo &Info = getAnalysis<GISelKnownBitsInfoAnalysis...
Represents a G_PTR_ADD.
Represents a G_STORE.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelType.h:159
constexpr bool isVector() const
Definition: LowLevelType.h:148
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
bool hasProperty(Property P) const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineFunctionProperties & getProperties() const
Get the function properties.
Helper class to build MachineInstr.
GISelChangeObserver * getObserver()
MachineInstrBuilder buildPtrAdd(const DstOp &Res, const SrcOp &Op0, const SrcOp &Op1, std::optional< unsigned > Flags=std::nullopt)
Build and insert Res = G_PTR_ADD Op0, Op1.
MachineFunction & getMF()
Getter for the function we currently build.
void setInstrAndDebugLoc(MachineInstr &MI)
Set the insertion point to before MI, and set the debug loc to MI's loc.
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:556
ArrayRef< int > getShuffleMask() const
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Target-Independent Code Generator Pass Configuration Options.
virtual const TargetLowering * getTargetLowering() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
operand_type_match m_Reg()
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
operand_type_match m_Pred()
ConstantMatch< APInt > m_ICst(APInt &Cst)
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
OneNonDBGUse_match< SubPat > m_OneNonDBGUse(const SubPat &SP)
CompareOp_match< Pred, LHS, RHS, TargetOpcode::G_ICMP > m_GICmp(const Pred &P, const LHS &L, const RHS &R)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
CompareOp_match< Pred, LHS, RHS, TargetOpcode::G_FCMP > m_GFCmp(const Pred &P, const LHS &L, const RHS &R)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition: Utils.cpp:625
void initializeAArch64PostLegalizerCombinerPass(PassRegistry &)
std::optional< APInt > isConstantOrConstantSplatVector(MachineInstr &MI, const MachineRegisterInfo &MRI)
Determines if MI defines a constant integer or a splat vector of constant integers.
Definition: Utils.cpp:1495
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
FunctionPass * createAArch64PostLegalizerCombiner(bool IsOptNone)
void getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU)
Modify analysis usage so it preserves passes required for the SelectionDAG fallback.
Definition: Utils.cpp:1140
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition: Utils.cpp:413