LLVM 19.0.0git
AArch64MIPeepholeOpt.cpp
Go to the documentation of this file.
1//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass performs below peephole optimizations on MIR level.
10//
11// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
12// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
13//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
19//
20// The mov pseudo instruction could be expanded to multiple mov instructions
21// later. In this case, we could try to split the constant operand of mov
22// instruction into two immediates which can be directly encoded into
23// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
24// multiple `mov` + `and/add/sub` instructions.
25//
26// 4. Remove redundant ORRWrs which is generated by zero-extend.
27//
28// %3:gpr32 = ORRWrs $wzr, %2, 0
29// %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
30//
31// If AArch64's 32-bit form of instruction defines the source operand of
32// ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
33// operand are set to zero.
34//
35// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
36// ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
37//
38// 6. %intermediate:gpr32 = COPY %src:fpr128
39// %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
40// ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
41//
42// In cases where a source FPR is copied to a GPR in order to be copied
43// to a destination FPR, we can directly copy the values between the FPRs,
44// eliminating the use of the Integer unit. When we match a pattern of
45// INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR
46// source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
47// instructions.
48//
49// 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high
50// 64-bits. For example,
51//
52// %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
53// %2:fpr64 = MOVID 0
54// %4:fpr128 = IMPLICIT_DEF
55// %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
56// %6:fpr128 = IMPLICIT_DEF
57// %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
58// %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
59// ==>
60// %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
61// %6:fpr128 = IMPLICIT_DEF
62// %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
63//
64//===----------------------------------------------------------------------===//
65
66#include "AArch64ExpandImm.h"
67#include "AArch64InstrInfo.h"
71
72using namespace llvm;
73
74#define DEBUG_TYPE "aarch64-mi-peephole-opt"
75
namespace {

/// MIR-level peephole pass for AArch64: splits hard mov-immediates feeding
/// AND/ADD(S)/SUB(S) into pairs of immediate-form instructions, removes
/// redundant zero-extends and high-64-bit zeroing, and folds FPR->GPR->FPR
/// copy chains (see the file header comment for the full pattern list).
struct AArch64MIPeepholeOpt : public MachineFunctionPass {
  static char ID;

  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
    // NOTE(review): the pass-initialization call (presumably
    // initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry()))
    // appears to have been dropped by extraction here — confirm upstream.
  }

  // Cached per-function pointers, (re)assigned in runOnMachineFunction.
  const AArch64InstrInfo *TII;
  MachineLoopInfo *MLI;
  // NOTE(review): declarations of the TRI and MRI members appear to be
  // missing from this view (both are used throughout the pass) — confirm.

  // (first-instruction opcode, second-instruction opcode); the two may
  // differ for flag-setting splits where only the second one sets flags.
  using OpcodePair = std::pair<unsigned, unsigned>;
  // Decides whether an immediate of type T can be split, filling in the two
  // halves and returning the opcode pair to use, or nullopt to reject.
  template <typename T>
  using SplitAndOpcFunc =
      std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
  // Builds the two replacement instructions once a split is accepted.
  using BuildMIFunc =
      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
  // NOTE(review): the continuation of this alias (presumably
  // `Register, Register, Register)>;`) appears dropped by extraction.

  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use the splitTwoPartImm two handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  // Validates that MI's immediate operand comes from a single-use
  // MOVi32imm/MOVi64imm (optionally via SUBREG_TO_REG) and is loop invariant.
  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  // Per-pattern visitors; each returns true when it rewrote MI.
  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    // NOTE(review): the MachineLoopInfo requirement (AU.addRequired<...>())
    // and the chained MachineFunctionPass::getAnalysisUsage(AU) call appear
    // to have been dropped by extraction here — confirm upstream.
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace
147
// Register the pass with LLVM's pass registry under the command-line name
// "aarch64-mi-peephole-opt" (not CFG-only, not analysis-only).
INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)
150
// Try to split Imm (which is not encodable as a single logical immediate)
// into two logical immediates Imm1Enc and Imm2Enc whose AND reproduces Imm.
// Returns false when the split is impossible or not profitable.
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  // NOTE(review): the guard condition for this early return appears to have
  // been dropped by extraction — presumably
  // `if (AArch64_AM::isLogicalImmediate(UImm, RegSize))`, i.e. an immediate
  // already encodable as one logical immediate is not split. Confirm.
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  // NOTE(review): the computation of `Insn` (presumably a SmallVector filled
  // by AArch64_IMM::expandMOVImm) appears dropped by extraction — confirm.
  if (Insn.size() == 1)
    return false;

  // The bitmask immediate consists of consecutive ones. Let's say there is
  // constant 0b00000000001000000000010000000000 which does not consist of
  // consecutive ones. We can split it in to two bitmask immediate like
  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
  // If we do AND with these two bitmask immediate, we can see original one.
  // NOTE(review): the computation of `LowestBitSet` (presumably
  // countr_zero(UImm)) appears dropped by extraction — confirm.
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with one from the position of lowest bit set
  // to the position of highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with one outside the position of lowest bit
  // set and the position of highest bit set.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not valid bitmask immediate, do not split this
  // constant.
  // NOTE(review): the validity check (presumably
  // `if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))`) appears
  // dropped by extraction — confirm.
    return false;

  // NOTE(review): the assignments encoding NewImm1/NewImm2 into
  // Imm1Enc/Imm2Enc via AArch64_AM::encodeLogicalImmediate appear dropped
  // by extraction — confirm.
  return true;
}
188
189template <typename T>
190bool AArch64MIPeepholeOpt::visitAND(
191 unsigned Opc, MachineInstr &MI) {
192 // Try below transformation.
193 //
194 // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
195 // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
196 //
197 // The mov pseudo instruction could be expanded to multiple mov instructions
198 // later. Let's try to split the constant operand of mov instruction into two
199 // bitmask immediates. It makes only two AND instructions intead of multiple
200 // mov + and instructions.
201
202 return splitTwoPartImm<T>(
203 MI,
204 [Opc](T Imm, unsigned RegSize, T &Imm0,
205 T &Imm1) -> std::optional<OpcodePair> {
206 if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
207 return std::make_pair(Opc, Opc);
208 return std::nullopt;
209 },
210 [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
211 unsigned Imm1, Register SrcReg, Register NewTmpReg,
212 Register NewDstReg) {
213 DebugLoc DL = MI.getDebugLoc();
214 MachineBasicBlock *MBB = MI.getParent();
215 BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
216 .addReg(SrcReg)
217 .addImm(Imm0);
218 BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
219 .addReg(NewTmpReg)
220 .addImm(Imm1);
221 });
222}
223
224bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
225 // Check this ORR comes from below zero-extend pattern.
226 //
227 // def : Pat<(i64 (zext GPR32:$src)),
228 // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
229 if (MI.getOperand(3).getImm() != 0)
230 return false;
231
232 if (MI.getOperand(1).getReg() != AArch64::WZR)
233 return false;
234
235 MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
236 if (!SrcMI)
237 return false;
238
239 // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
240 //
241 // When you use the 32-bit form of an instruction, the upper 32 bits of the
242 // source registers are ignored and the upper 32 bits of the destination
243 // register are set to zero.
244 //
245 // If AArch64's 32-bit form of instruction defines the source operand of
246 // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
247 // real AArch64 instruction and if it is not, do not process the opcode
248 // conservatively.
249 if (SrcMI->getOpcode() == TargetOpcode::COPY &&
250 SrcMI->getOperand(1).getReg().isVirtual()) {
251 const TargetRegisterClass *RC =
252 MRI->getRegClass(SrcMI->getOperand(1).getReg());
253
254 // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
255 // that the upper bits are zero.
256 if (RC != &AArch64::FPR32RegClass &&
257 ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
258 SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
259 return false;
260 Register CpySrc = SrcMI->getOperand(1).getReg();
261 if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
262 CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
263 BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
264 TII->get(TargetOpcode::COPY), CpySrc)
265 .add(SrcMI->getOperand(1));
266 }
267 BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
268 TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
269 .addReg(CpySrc);
270 SrcMI->eraseFromParent();
271 }
272 else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
273 return false;
274
275 Register DefReg = MI.getOperand(0).getReg();
276 Register SrcReg = MI.getOperand(2).getReg();
277 MRI->replaceRegWith(DefReg, SrcReg);
278 MRI->clearKillFlags(SrcReg);
279 LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
280 MI.eraseFromParent();
281
282 return true;
283}
284
285bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
286 // Check this INSERT_SUBREG comes from below zero-extend pattern.
287 //
288 // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
289 // To %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
290 //
291 // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
292 // COPY would destroy the upper part of the register anyway
293 if (!MI.isRegTiedToDefOperand(1))
294 return false;
295
296 Register DstReg = MI.getOperand(0).getReg();
297 const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
298 MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
299 if (!SrcMI)
300 return false;
301
302 // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
303 //
304 // When you use the 32-bit form of an instruction, the upper 32 bits of the
305 // source registers are ignored and the upper 32 bits of the destination
306 // register are set to zero.
307 //
308 // If AArch64's 32-bit form of instruction defines the source operand of
309 // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
310 // real AArch64 instruction and if it is not, do not process the opcode
311 // conservatively.
312 if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
313 !AArch64::GPR64allRegClass.hasSubClassEq(RC))
314 return false;
315
316 // Build a SUBREG_TO_REG instruction
317 MachineInstr *SubregMI =
318 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
319 TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
320 .addImm(0)
321 .add(MI.getOperand(2))
322 .add(MI.getOperand(3));
323 LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
324 (void)SubregMI;
325 MI.eraseFromParent();
326
327 return true;
328}
329
// Try to split Imm into two 12-bit add/sub immediates: Imm == (Imm0 << 12) +
// Imm1 with both halves non-zero. Returns false when Imm does not have that
// shape or when a single instruction could materialize it anyway.
template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned int.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate can not be composed via a single instruction.
  // NOTE(review): the computation of `Insn` (presumably a SmallVector filled
  // by AArch64_IMM::expandMOVImm) appears to have been dropped by extraction
  // here — confirm against upstream.
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1;
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}
349
350template <typename T>
351bool AArch64MIPeepholeOpt::visitADDSUB(
352 unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
353 // Try below transformation.
354 //
355 // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
356 // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
357 //
358 // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
359 // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
360 //
361 // The mov pseudo instruction could be expanded to multiple mov instructions
362 // later. Let's try to split the constant operand of mov instruction into two
363 // legal add/sub immediates. It makes only two ADD/SUB instructions intead of
364 // multiple `mov` + `and/sub` instructions.
365
366 // We can sometimes have ADDWrr WZR, MULi32imm that have not been constant
367 // folded. Make sure that we don't generate invalid instructions that use XZR
368 // in those cases.
369 if (MI.getOperand(1).getReg() == AArch64::XZR ||
370 MI.getOperand(1).getReg() == AArch64::WZR)
371 return false;
372
373 return splitTwoPartImm<T>(
374 MI,
375 [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
376 T &Imm1) -> std::optional<OpcodePair> {
377 if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
378 return std::make_pair(PosOpc, PosOpc);
379 if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
380 return std::make_pair(NegOpc, NegOpc);
381 return std::nullopt;
382 },
383 [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
384 unsigned Imm1, Register SrcReg, Register NewTmpReg,
385 Register NewDstReg) {
386 DebugLoc DL = MI.getDebugLoc();
387 MachineBasicBlock *MBB = MI.getParent();
388 BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
389 .addReg(SrcReg)
390 .addImm(Imm0)
391 .addImm(12);
392 BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
393 .addReg(NewTmpReg)
394 .addImm(Imm1)
395 .addImm(0);
396 });
397}
398
399template <typename T>
400bool AArch64MIPeepholeOpt::visitADDSSUBS(
401 OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
402 // Try the same transformation as ADDSUB but with additional requirement
403 // that the condition code usages are only for Equal and Not Equal
404
405 if (MI.getOperand(1).getReg() == AArch64::XZR ||
406 MI.getOperand(1).getReg() == AArch64::WZR)
407 return false;
408
409 return splitTwoPartImm<T>(
410 MI,
411 [PosOpcs, NegOpcs, &MI, &TRI = TRI,
412 &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
413 T &Imm1) -> std::optional<OpcodePair> {
414 OpcodePair OP;
415 if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
416 OP = PosOpcs;
417 else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
418 OP = NegOpcs;
419 else
420 return std::nullopt;
421 // Check conditional uses last since it is expensive for scanning
422 // proceeding instructions
423 MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
424 std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
425 if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
426 return std::nullopt;
427 return OP;
428 },
429 [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
430 unsigned Imm1, Register SrcReg, Register NewTmpReg,
431 Register NewDstReg) {
432 DebugLoc DL = MI.getDebugLoc();
433 MachineBasicBlock *MBB = MI.getParent();
434 BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
435 .addReg(SrcReg)
436 .addImm(Imm0)
437 .addImm(12);
438 BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
439 .addReg(NewTmpReg)
440 .addImm(Imm1)
441 .addImm(0);
442 });
443}
444
445// Checks if the corresponding MOV immediate instruction is applicable for
446// this peephole optimization.
447bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
448 MachineInstr *&MovMI,
449 MachineInstr *&SubregToRegMI) {
450 // Check whether current MBB is in loop and the AND is loop invariant.
451 MachineBasicBlock *MBB = MI.getParent();
452 MachineLoop *L = MLI->getLoopFor(MBB);
453 if (L && !L->isLoopInvariant(MI))
454 return false;
455
456 // Check whether current MI's operand is MOV with immediate.
457 MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
458 if (!MovMI)
459 return false;
460
461 // If it is SUBREG_TO_REG, check its operand.
462 SubregToRegMI = nullptr;
463 if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
464 SubregToRegMI = MovMI;
465 MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
466 if (!MovMI)
467 return false;
468 }
469
470 if (MovMI->getOpcode() != AArch64::MOVi32imm &&
471 MovMI->getOpcode() != AArch64::MOVi64imm)
472 return false;
473
474 // If the MOV has multiple uses, do not split the immediate because it causes
475 // more instructions.
476 if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
477 return false;
478 if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
479 return false;
480
481 // It is OK to perform this peephole optimization.
482 return true;
483}
484
// Driver for the immediate-splitting peepholes: validates the mov-immediate,
// asks SplitAndOpc whether/how to split, builds the two replacement
// instructions via BuildInstr, and erases the old instructions.
template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    // NOTE(review): a `MachineInstr &MI,` parameter line appears to have
    // been dropped by extraction here (the body references MI) — confirm
    // against upstream.
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  // T selects the register width: uint32_t => W regs, uint64_t => X regs.
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32 bit form of instruction, the upper 32 bits of the destination
  // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
  // of Imm to zero. This is essential if the Immediate value was a negative
  // number since it was sign extended when we assign to the 64-bit Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. Opcodes might differ for
  // flag setting operations that should only set flags on second instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine register classes for destinations and register operands
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get old registers destinations and new register destinations
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to
  // reuse that same destination register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain registers based on their new uses
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the instruction
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form until
  // deleting MI. Only if we made a new destination register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Record the MIs need to be removed.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}
566
567bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
568 // Check if this INSvi[X]gpr comes from COPY of a source FPR128
569 //
570 // From
571 // %intermediate1:gpr64 = COPY %src:fpr128
572 // %intermediate2:gpr32 = COPY %intermediate1:gpr64
573 // %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
574 // To
575 // %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
576 // src_index
577 // where src_index = 0, X = [8|16|32|64]
578
579 MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
580
581 // For a chain of COPY instructions, find the initial source register
582 // and check if it's an FPR128
583 while (true) {
584 if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
585 return false;
586
587 if (!SrcMI->getOperand(1).getReg().isVirtual())
588 return false;
589
590 if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
591 &AArch64::FPR128RegClass) {
592 break;
593 }
594 SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
595 }
596
597 Register DstReg = MI.getOperand(0).getReg();
598 Register SrcReg = SrcMI->getOperand(1).getReg();
599 MachineInstr *INSvilaneMI =
600 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
601 .add(MI.getOperand(1))
602 .add(MI.getOperand(2))
603 .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
604 .addImm(0);
605
606 LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
607 (void)INSvilaneMI;
608 MI.eraseFromParent();
609 return true;
610}
611
// All instructions that set a FPR64 will implicitly zero the top bits of the
// register.
// NOTE(review): the function signature line appears to have been dropped by
// extraction here; per the declaration residue further below it is
//   static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
//                                           MachineRegisterInfo *MRI)
// — confirm against upstream.
  // Only a register def can carry the zeroing guarantee.
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  // Generic (pre-selection) opcodes make no such guarantee; trust only real
  // AArch64 instructions.
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}
623
624bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
625 // Check the MI for low 64-bits sets zero for high 64-bits implicitly.
626 // We are expecting below case.
627 //
628 // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
629 // %6:fpr128 = IMPLICIT_DEF
630 // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
631 // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
632 MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
633 if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
634 return false;
635 Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
636 if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
637 return false;
638
639 // Check there is `mov 0` MI for high 64-bits.
640 // We are expecting below cases.
641 //
642 // %2:fpr64 = MOVID 0
643 // %4:fpr128 = IMPLICIT_DEF
644 // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
645 // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
646 // or
647 // %5:fpr128 = MOVIv2d_ns 0
648 // %6:fpr64 = COPY %5.dsub:fpr128
649 // %8:fpr128 = IMPLICIT_DEF
650 // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
651 // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
652 MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
653 if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
654 return false;
655 High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
656 if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
657 High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
658 if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
659 High64MI->getOpcode() != AArch64::MOVIv2d_ns))
660 return false;
661 if (High64MI->getOperand(1).getImm() != 0)
662 return false;
663
664 // Let's remove MIs for high 64-bits.
665 Register OldDef = MI.getOperand(0).getReg();
666 Register NewDef = MI.getOperand(1).getReg();
667 MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
668 MRI->replaceRegWith(OldDef, NewDef);
669 MI.eraseFromParent();
670
671 return true;
672}
673
674bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
675 // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR.
676 MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
677 if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
678 return false;
679
680 // Let's remove MIs for high 64-bits.
681 Register OldDef = MI.getOperand(0).getReg();
682 Register NewDef = MI.getOperand(1).getReg();
683 LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
684 MRI->clearKillFlags(OldDef);
685 MRI->clearKillFlags(NewDef);
686 MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
687 MRI->replaceRegWith(OldDef, NewDef);
688 MI.eraseFromParent();
689
690 return true;
691}
692
// Pass entry point: caches subtarget/analysis pointers, then dispatches each
// instruction to the matching peephole visitor.
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
  // NOTE(review): the continuation of this statement (presumably
  // `MF.getSubtarget().getRegisterInfo());`) appears to have been dropped by
  // extraction — confirm against upstream.
  MLI = &getAnalysis<MachineLoopInfo>();
  MRI = &MF.getRegInfo();

  // The visitors rewrite vreg defs/uses in place; that is only sound in SSA.
  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    // NOTE(review): the inner instruction loop header (presumably
    // `for (MachineInstr &MI : make_early_inc_range(MBB))`, allowing erasure
    // during iteration) appears to have been dropped by extraction — confirm.
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      // Immediate splits: bitmask immediates for AND.
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      // Immediate splits: 12-bit pairs for ADD/SUB (negated immediates use
      // the opposite opcode).
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      // Flag-setting variants: only the second instruction of the pair sets
      // flags, hence the (non-S, S) opcode pairs.
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      // GPR-mediated vector inserts rewritten to lane-to-lane inserts.
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      }
    }
  }

  return Changed;
}
780
// NOTE(review): the enclosing factory-function signature (presumably
// `FunctionPass *llvm::createAArch64MIPeepholeOptPass() {`, matching the
// declaration residue further below) appears to have been dropped by
// extraction — confirm against upstream.
  return new AArch64MIPeepholeOpt();
}
unsigned const MachineRegisterInfo * MRI
unsigned HighestBitSet
unsigned T T & Imm2Enc
unsigned T & Imm1Enc
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
unsigned RegSize
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI, MachineRegisterInfo *MRI)
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1)
unsigned LowestBitSet
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define LLVM_DEBUG(X)
Definition: Debug.h:101
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:269
A debug info location.
Definition: DebugLoc.h:33
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:546
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:329
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:475
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:556
unsigned getSubReg() const
int64_t getImm() const
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
virtual const TargetInstrInfo * getInstrInfo() const
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
FunctionPass * createAArch64MIPeepholeOptPass()
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:319
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
void initializeAArch64MIPeepholeOptPass(PassRegistry &)
#define OP(n)
Definition: regex2.h:73