1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
15 #include "AArch64Subtarget.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
30 #include "llvm/CodeGen/StackMaps.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/MC/MCAsmInfo.h"
37 #include "llvm/MC/MCInst.h"
38 #include "llvm/MC/MCInstBuilder.h"
39 #include "llvm/MC/MCInstrDesc.h"
40 #include "llvm/Support/Casting.h"
41 #include "llvm/Support/CodeGen.h"
43 #include "llvm/Support/Compiler.h"
48 #include <cassert>
49 #include <cstdint>
50 #include <iterator>
51 #include <utility>
52 
53 using namespace llvm;
54 
55 #define GET_INSTRINFO_CTOR_DTOR
56 #include "AArch64GenInstrInfo.inc"
57 
58 static cl::opt<unsigned> TBZDisplacementBits(
59  "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
60  cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
61 
62 static cl::opt<unsigned> CBZDisplacementBits(
63  "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
64  cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
65 
66 static cl::opt<unsigned>
67  BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
68  cl::desc("Restrict range of Bcc instructions (DEBUG)"));
69 
70 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
71  : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
72  AArch64::CATCHRET),
73  RI(STI.getTargetTriple()), Subtarget(STI) {}
74 
75 /// GetInstSize - Return the number of bytes of code the specified
76 /// instruction may be. This returns the maximum number of bytes.
77 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
78  const MachineBasicBlock &MBB = *MI.getParent();
79  const MachineFunction *MF = MBB.getParent();
80  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
81 
82  {
83  auto Op = MI.getOpcode();
84  if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
85  return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
86  }
87 
88  // Meta-instructions emit no code.
89  if (MI.isMetaInstruction())
90  return 0;
91 
92  // FIXME: We currently only handle pseudoinstructions that don't get expanded
93  // before the assembly printer.
94  unsigned NumBytes = 0;
95  const MCInstrDesc &Desc = MI.getDesc();
96  switch (Desc.getOpcode()) {
97  default:
98  // Anything not explicitly designated otherwise is a normal 4-byte insn.
99  NumBytes = 4;
100  break;
101  case TargetOpcode::STACKMAP:
102  // The upper bound for a stackmap intrinsic is the full length of its shadow
103  NumBytes = StackMapOpers(&MI).getNumPatchBytes();
104  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
105  break;
106  case TargetOpcode::PATCHPOINT:
107  // The size of the patchpoint intrinsic is the number of bytes requested
108  NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
109  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
110  break;
111  case TargetOpcode::STATEPOINT:
112  NumBytes = StatepointOpers(&MI).getNumPatchBytes();
113  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
114  // No patch bytes means a normal call inst is emitted
115  if (NumBytes == 0)
116  NumBytes = 4;
117  break;
118  case AArch64::TLSDESC_CALLSEQ:
119  // This gets lowered to an instruction sequence which takes 16 bytes
120  NumBytes = 16;
121  break;
122  case AArch64::SpeculationBarrierISBDSBEndBB:
123  // This gets lowered to 2 4-byte instructions.
124  NumBytes = 8;
125  break;
126  case AArch64::SpeculationBarrierSBEndBB:
127  // This gets lowered to a single 4-byte instruction.
128  NumBytes = 4;
129  break;
130  case AArch64::JumpTableDest32:
131  case AArch64::JumpTableDest16:
132  case AArch64::JumpTableDest8:
133  NumBytes = 12;
134  break;
135  case AArch64::SPACE:
136  NumBytes = MI.getOperand(1).getImm();
137  break;
138  case AArch64::StoreSwiftAsyncContext:
139  NumBytes = 20;
140  break;
141  case TargetOpcode::BUNDLE:
142  NumBytes = getInstBundleLength(MI);
143  break;
144  }
145 
146  return NumBytes;
147 }
148 
149 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
150  unsigned Size = 0;
151  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
152  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
153  while (++I != E && I->isInsideBundle()) {
154  assert(!I->isBundle() && "No nested bundle!");
155  Size += getInstSizeInBytes(*I);
156  }
157  return Size;
158 }
159 
160 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
161  SmallVectorImpl<MachineOperand> &Cond) {
162  // Block ends with fall-through condbranch.
163  switch (LastInst->getOpcode()) {
164  default:
165  llvm_unreachable("Unknown branch instruction?");
166  case AArch64::Bcc:
167  Target = LastInst->getOperand(1).getMBB();
168  Cond.push_back(LastInst->getOperand(0));
169  break;
170  case AArch64::CBZW:
171  case AArch64::CBZX:
172  case AArch64::CBNZW:
173  case AArch64::CBNZX:
174  Target = LastInst->getOperand(1).getMBB();
175  Cond.push_back(MachineOperand::CreateImm(-1));
176  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
177  Cond.push_back(LastInst->getOperand(0));
178  break;
179  case AArch64::TBZW:
180  case AArch64::TBZX:
181  case AArch64::TBNZW:
182  case AArch64::TBNZX:
183  Target = LastInst->getOperand(2).getMBB();
184  Cond.push_back(MachineOperand::CreateImm(-1));
185  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
186  Cond.push_back(LastInst->getOperand(0));
187  Cond.push_back(LastInst->getOperand(1));
188  }
189 }
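// Illustrative example (not part of the original comments): for a terminator
// "tbnz w0, #3, %bb.2", parseCondBranch records Target = %bb.2 and
// Cond = { -1, TBNZW, w0, 3 }; for a plain "b.ne %bb.2" it records Cond = { NE }.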
190 
191 static unsigned getBranchDisplacementBits(unsigned Opc) {
192  switch (Opc) {
193  default:
194  llvm_unreachable("unexpected opcode!");
195  case AArch64::B:
196  return 64;
197  case AArch64::TBNZW:
198  case AArch64::TBZW:
199  case AArch64::TBNZX:
200  case AArch64::TBZX:
201  return TBZDisplacementBits;
202  case AArch64::CBNZW:
203  case AArch64::CBZW:
204  case AArch64::CBNZX:
205  case AArch64::CBZX:
206  return CBZDisplacementBits;
207  case AArch64::Bcc:
208  return BCCDisplacementBits;
209  }
210 }
211 
212 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
213  int64_t BrOffset) const {
214  unsigned Bits = getBranchDisplacementBits(BranchOp);
215  assert(Bits >= 3 && "max branch displacement must be enough to jump "
216  "over conditional branch expansion");
217  return isIntN(Bits, BrOffset / 4);
218 }
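// Worked example (added for clarity, numbers follow from the code above): with
// the default of 14 displacement bits, a TB(N)Z encodes a signed 14-bit count
// of 4-byte units, i.e. roughly -32768 to +32764 bytes from the branch.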
219 
220 MachineBasicBlock *AArch64InstrInfo::getBranchDestBlock(
221  const MachineInstr &MI) const {
222  switch (MI.getOpcode()) {
223  default:
224  llvm_unreachable("unexpected opcode!");
225  case AArch64::B:
226  return MI.getOperand(0).getMBB();
227  case AArch64::TBZW:
228  case AArch64::TBNZW:
229  case AArch64::TBZX:
230  case AArch64::TBNZX:
231  return MI.getOperand(2).getMBB();
232  case AArch64::CBZW:
233  case AArch64::CBNZW:
234  case AArch64::CBZX:
235  case AArch64::CBNZX:
236  case AArch64::Bcc:
237  return MI.getOperand(1).getMBB();
238  }
239 }
240 
241 // Branch analysis.
242 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
243  MachineBasicBlock *&TBB,
244  MachineBasicBlock *&FBB,
245  SmallVectorImpl<MachineOperand> &Cond,
246  bool AllowModify) const {
247  // If the block has no terminators, it just falls into the block after it.
248  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
249  if (I == MBB.end())
250  return false;
251 
252  // Skip over SpeculationBarrierEndBB terminators
253  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
254  I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
255  --I;
256  }
257 
258  if (!isUnpredicatedTerminator(*I))
259  return false;
260 
261  // Get the last instruction in the block.
262  MachineInstr *LastInst = &*I;
263 
264  // If there is only one terminator instruction, process it.
265  unsigned LastOpc = LastInst->getOpcode();
266  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
267  if (isUncondBranchOpcode(LastOpc)) {
268  TBB = LastInst->getOperand(0).getMBB();
269  return false;
270  }
271  if (isCondBranchOpcode(LastOpc)) {
272  // Block ends with fall-through condbranch.
273  parseCondBranch(LastInst, TBB, Cond);
274  return false;
275  }
276  return true; // Can't handle indirect branch.
277  }
278 
279  // Get the instruction before it if it is a terminator.
280  MachineInstr *SecondLastInst = &*I;
281  unsigned SecondLastOpc = SecondLastInst->getOpcode();
282 
283  // If AllowModify is true and the block ends with two or more unconditional
284  // branches, delete all but the first unconditional branch.
285  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
286  while (isUncondBranchOpcode(SecondLastOpc)) {
287  LastInst->eraseFromParent();
288  LastInst = SecondLastInst;
289  LastOpc = LastInst->getOpcode();
290  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
291  // Return now; the only terminator is an unconditional branch.
292  TBB = LastInst->getOperand(0).getMBB();
293  return false;
294  } else {
295  SecondLastInst = &*I;
296  SecondLastOpc = SecondLastInst->getOpcode();
297  }
298  }
299  }
300 
301  // If we're allowed to modify and the block ends in an unconditional branch
302  // which could simply fallthrough, remove the branch. (Note: This case only
303  // matters when we can't understand the whole sequence, otherwise it's also
304  // handled by BranchFolding.cpp.)
305  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
306  MBB.isLayoutSuccessor(LastInst->getOperand(0).getMBB())) {
307  LastInst->eraseFromParent();
308  LastInst = SecondLastInst;
309  LastOpc = LastInst->getOpcode();
310  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
311  assert(!isUncondBranchOpcode(LastOpc) &&
312  "unreachable unconditional branches removed above");
313 
314  if (isCondBranchOpcode(LastOpc)) {
315  // Block ends with fall-through condbranch.
316  parseCondBranch(LastInst, TBB, Cond);
317  return false;
318  }
319  return true; // Can't handle indirect branch.
320  } else {
321  SecondLastInst = &*I;
322  SecondLastOpc = SecondLastInst->getOpcode();
323  }
324  }
325 
326  // If there are three terminators, we don't know what sort of block this is.
327  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
328  return true;
329 
330  // If the block ends with a B and a Bcc, handle it.
331  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
332  parseCondBranch(SecondLastInst, TBB, Cond);
333  FBB = LastInst->getOperand(0).getMBB();
334  return false;
335  }
336 
337  // If the block ends with two unconditional branches, handle it. The second
338  // one is not executed, so remove it.
339  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
340  TBB = SecondLastInst->getOperand(0).getMBB();
341  I = LastInst;
342  if (AllowModify)
343  I->eraseFromParent();
344  return false;
345  }
346 
347  // ...likewise if it ends with an indirect branch followed by an unconditional
348  // branch.
349  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
350  I = LastInst;
351  if (AllowModify)
352  I->eraseFromParent();
353  return true;
354  }
355 
356  // Otherwise, can't handle this.
357  return true;
358 }
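// Summary (added note, derived from the cases above): analyzeBranch succeeds
// for a fallthrough, a lone B, a lone conditional branch, or a conditional
// branch followed by a B, optionally after deleting redundant trailing
// branches when AllowModify is set; anything else (e.g. an indirect branch)
// makes it return true, meaning "cannot analyze".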
359 
360 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
361  MachineBranchPredicate &MBP,
362  bool AllowModify) const {
363  // For the moment, handle only a block which ends with a cb(n)zx followed by
364  // a fallthrough. Why this? Because it is a common form.
365  // TODO: Should we handle b.cc?
366 
367  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
368  if (I == MBB.end())
369  return true;
370 
371  // Skip over SpeculationBarrierEndBB terminators
372  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
373  I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
374  --I;
375  }
376 
377  if (!isUnpredicatedTerminator(*I))
378  return true;
379 
380  // Get the last instruction in the block.
381  MachineInstr *LastInst = &*I;
382  unsigned LastOpc = LastInst->getOpcode();
383  if (!isCondBranchOpcode(LastOpc))
384  return true;
385 
386  switch (LastOpc) {
387  default:
388  return true;
389  case AArch64::CBZW:
390  case AArch64::CBZX:
391  case AArch64::CBNZW:
392  case AArch64::CBNZX:
393  break;
394  };
395 
396  MBP.TrueDest = LastInst->getOperand(1).getMBB();
397  assert(MBP.TrueDest && "expected!");
398  MBP.FalseDest = MBB.getNextNode();
399 
400  MBP.ConditionDef = nullptr;
401  MBP.SingleUseCondition = false;
402 
403  MBP.LHS = LastInst->getOperand(0);
404  MBP.RHS = MachineOperand::CreateImm(0);
405  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
406  : MachineBranchPredicate::PRED_EQ;
407  return false;
408 }
409 
410 bool AArch64InstrInfo::reverseBranchCondition(
411  SmallVectorImpl<MachineOperand> &Cond) const {
412  if (Cond[0].getImm() != -1) {
413  // Regular Bcc
414  AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
415  Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
416  } else {
417  // Folded compare-and-branch
418  switch (Cond[1].getImm()) {
419  default:
420  llvm_unreachable("Unknown conditional branch!");
421  case AArch64::CBZW:
422  Cond[1].setImm(AArch64::CBNZW);
423  break;
424  case AArch64::CBNZW:
425  Cond[1].setImm(AArch64::CBZW);
426  break;
427  case AArch64::CBZX:
428  Cond[1].setImm(AArch64::CBNZX);
429  break;
430  case AArch64::CBNZX:
431  Cond[1].setImm(AArch64::CBZX);
432  break;
433  case AArch64::TBZW:
434  Cond[1].setImm(AArch64::TBNZW);
435  break;
436  case AArch64::TBNZW:
437  Cond[1].setImm(AArch64::TBZW);
438  break;
439  case AArch64::TBZX:
440  Cond[1].setImm(AArch64::TBNZX);
441  break;
442  case AArch64::TBNZX:
443  Cond[1].setImm(AArch64::TBZX);
444  break;
445  }
446  }
447 
448  return false;
449 }
450 
451 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
452  int *BytesRemoved) const {
453  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
454  if (I == MBB.end())
455  return 0;
456 
457  if (!isUncondBranchOpcode(I->getOpcode()) &&
458  !isCondBranchOpcode(I->getOpcode()))
459  return 0;
460 
461  // Remove the branch.
462  I->eraseFromParent();
463 
464  I = MBB.end();
465 
466  if (I == MBB.begin()) {
467  if (BytesRemoved)
468  *BytesRemoved = 4;
469  return 1;
470  }
471  --I;
472  if (!isCondBranchOpcode(I->getOpcode())) {
473  if (BytesRemoved)
474  *BytesRemoved = 4;
475  return 1;
476  }
477 
478  // Remove the branch.
479  I->eraseFromParent();
480  if (BytesRemoved)
481  *BytesRemoved = 8;
482 
483  return 2;
484 }
485 
486 void AArch64InstrInfo::instantiateCondBranch(
487  MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
488  ArrayRef<MachineOperand> Cond) const {
489  if (Cond[0].getImm() != -1) {
490  // Regular Bcc
491  BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
492  } else {
493  // Folded compare-and-branch
494  // Note that we use addOperand instead of addReg to keep the flags.
495  const MachineInstrBuilder MIB =
496  BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
497  if (Cond.size() > 3)
498  MIB.addImm(Cond[3].getImm());
499  MIB.addMBB(TBB);
500  }
501 }
502 
503 unsigned AArch64InstrInfo::insertBranch(
504  MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
505  ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
506  // Shouldn't be a fall through.
507  assert(TBB && "insertBranch must not be told to insert a fallthrough");
508 
509  if (!FBB) {
510  if (Cond.empty()) // Unconditional branch?
511  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
512  else
513  instantiateCondBranch(MBB, DL, TBB, Cond);
514 
515  if (BytesAdded)
516  *BytesAdded = 4;
517 
518  return 1;
519  }
520 
521  // Two-way conditional branch.
522  instantiateCondBranch(MBB, DL, TBB, Cond);
523  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
524 
525  if (BytesAdded)
526  *BytesAdded = 8;
527 
528  return 2;
529 }
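// Illustrative usage (not from the original comments): insertBranch(MBB,
// %bb.1, %bb.2, {NE}, DL, &Bytes) emits "b.ne %bb.1; b %bb.2", returns 2 and
// reports Bytes == 8.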
530 
531 // Find the original register that VReg is copied from.
532 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
533  while (Register::isVirtualRegister(VReg)) {
534  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
535  if (!DefMI->isFullCopy())
536  return VReg;
537  VReg = DefMI->getOperand(1).getReg();
538  }
539  return VReg;
540 }
541 
542 // Determine if VReg is defined by an instruction that can be folded into a
543 // csel instruction. If so, return the folded opcode, and the replacement
544 // register.
545 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
546  unsigned *NewVReg = nullptr) {
547  VReg = removeCopies(MRI, VReg);
548  if (!Register::isVirtualRegister(VReg))
549  return 0;
550 
551  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
552  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
553  unsigned Opc = 0;
554  unsigned SrcOpNum = 0;
555  switch (DefMI->getOpcode()) {
556  case AArch64::ADDSXri:
557  case AArch64::ADDSWri:
558  // if NZCV is used, do not fold.
559  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
560  return 0;
561  // fall-through to ADDXri and ADDWri.
562  LLVM_FALLTHROUGH;
563  case AArch64::ADDXri:
564  case AArch64::ADDWri:
565  // add x, 1 -> csinc.
566  if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
567  DefMI->getOperand(3).getImm() != 0)
568  return 0;
569  SrcOpNum = 1;
570  Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
571  break;
572 
573  case AArch64::ORNXrr:
574  case AArch64::ORNWrr: {
575  // not x -> csinv, represented as orn dst, xzr, src.
576  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
577  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
578  return 0;
579  SrcOpNum = 2;
580  Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
581  break;
582  }
583 
584  case AArch64::SUBSXrr:
585  case AArch64::SUBSWrr:
586  // if NZCV is used, do not fold.
587  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
588  return 0;
589  // fall-through to SUBXrr and SUBWrr.
590  LLVM_FALLTHROUGH;
591  case AArch64::SUBXrr:
592  case AArch64::SUBWrr: {
593  // neg x -> csneg, represented as sub dst, xzr, src.
594  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
595  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
596  return 0;
597  SrcOpNum = 2;
598  Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
599  break;
600  }
601  default:
602  return 0;
603  }
604  assert(Opc && SrcOpNum && "Missing parameters");
605 
606  if (NewVReg)
607  *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
608  return Opc;
609 }
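// Illustrative example (added, register names are hypothetical): if TrueReg is
// defined by "%t = ADDWri %x, 1, 0", this returns CSINCWr with *NewVReg = %x;
// insertSelect then inverts the condition and places %x in the FalseReg slot,
// so the +1 is folded into the csinc instead of a separate add.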
610 
611 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
612  ArrayRef<MachineOperand> Cond,
613  Register DstReg, Register TrueReg,
614  Register FalseReg, int &CondCycles,
615  int &TrueCycles,
616  int &FalseCycles) const {
617  // Check register classes.
618  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
619  const TargetRegisterClass *RC =
620  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
621  if (!RC)
622  return false;
623 
624  // Also need to check the dest regclass, in case we're trying to optimize
625  // something like:
626  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
627  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
628  return false;
629 
630  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
631  unsigned ExtraCondLat = Cond.size() != 1;
632 
633  // GPRs are handled by csel.
634  // FIXME: Fold in x+1, -x, and ~x when applicable.
635  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
636  AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
637  // Single-cycle csel, csinc, csinv, and csneg.
638  CondCycles = 1 + ExtraCondLat;
639  TrueCycles = FalseCycles = 1;
640  if (canFoldIntoCSel(MRI, TrueReg))
641  TrueCycles = 0;
642  else if (canFoldIntoCSel(MRI, FalseReg))
643  FalseCycles = 0;
644  return true;
645  }
646 
647  // Scalar floating point is handled by fcsel.
648  // FIXME: Form fabs, fmin, and fmax when applicable.
649  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
650  AArch64::FPR32RegClass.hasSubClassEq(RC)) {
651  CondCycles = 5 + ExtraCondLat;
652  TrueCycles = FalseCycles = 2;
653  return true;
654  }
655 
656  // Can't do vectors.
657  return false;
658 }
659 
660 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
661  MachineBasicBlock::iterator I,
662  const DebugLoc &DL, Register DstReg,
663  ArrayRef<MachineOperand> Cond,
664  Register TrueReg, Register FalseReg) const {
665  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
666 
667  // Parse the condition code, see parseCondBranch() above.
668  AArch64CC::CondCode CC;
669  switch (Cond.size()) {
670  default:
671  llvm_unreachable("Unknown condition opcode in Cond");
672  case 1: // b.cc
673  CC = AArch64CC::CondCode(Cond[0].getImm());
674  break;
675  case 3: { // cbz/cbnz
676  // We must insert a compare against 0.
677  bool Is64Bit;
678  switch (Cond[1].getImm()) {
679  default:
680  llvm_unreachable("Unknown branch opcode in Cond");
681  case AArch64::CBZW:
682  Is64Bit = false;
683  CC = AArch64CC::EQ;
684  break;
685  case AArch64::CBZX:
686  Is64Bit = true;
687  CC = AArch64CC::EQ;
688  break;
689  case AArch64::CBNZW:
690  Is64Bit = false;
691  CC = AArch64CC::NE;
692  break;
693  case AArch64::CBNZX:
694  Is64Bit = true;
695  CC = AArch64CC::NE;
696  break;
697  }
698  Register SrcReg = Cond[2].getReg();
699  if (Is64Bit) {
700  // cmp reg, #0 is actually subs xzr, reg, #0.
701  MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
702  BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
703  .addReg(SrcReg)
704  .addImm(0)
705  .addImm(0);
706  } else {
707  MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
708  BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
709  .addReg(SrcReg)
710  .addImm(0)
711  .addImm(0);
712  }
713  break;
714  }
715  case 4: { // tbz/tbnz
716  // We must insert a tst instruction.
717  switch (Cond[1].getImm()) {
718  default:
719  llvm_unreachable("Unknown branch opcode in Cond");
720  case AArch64::TBZW:
721  case AArch64::TBZX:
722  CC = AArch64CC::EQ;
723  break;
724  case AArch64::TBNZW:
725  case AArch64::TBNZX:
726  CC = AArch64CC::NE;
727  break;
728  }
729  // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
730  if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
731  BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
732  .addReg(Cond[2].getReg())
733  .addImm(
734  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
735  else
736  BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
737  .addReg(Cond[2].getReg())
738  .addImm(
739  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
740  break;
741  }
742  }
743 
744  unsigned Opc = 0;
745  const TargetRegisterClass *RC = nullptr;
746  bool TryFold = false;
747  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
748  RC = &AArch64::GPR64RegClass;
749  Opc = AArch64::CSELXr;
750  TryFold = true;
751  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
752  RC = &AArch64::GPR32RegClass;
753  Opc = AArch64::CSELWr;
754  TryFold = true;
755  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
756  RC = &AArch64::FPR64RegClass;
757  Opc = AArch64::FCSELDrrr;
758  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
759  RC = &AArch64::FPR32RegClass;
760  Opc = AArch64::FCSELSrrr;
761  }
762  assert(RC && "Unsupported regclass");
763 
764  // Try folding simple instructions into the csel.
765  if (TryFold) {
766  unsigned NewVReg = 0;
767  unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
768  if (FoldedOpc) {
769  // The folded opcodes csinc, csinv and csneg apply the operation to
770  // FalseReg, so we need to invert the condition.
771  CC = AArch64CC::getInvertedCondCode(CC);
772  TrueReg = FalseReg;
773  } else
774  FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
775 
776  // Fold the operation. Leave any dead instructions for DCE to clean up.
777  if (FoldedOpc) {
778  FalseReg = NewVReg;
779  Opc = FoldedOpc;
780  // This extends the live range of NewVReg.
781  MRI.clearKillFlags(NewVReg);
782  }
783  }
784 
785  // Pull all virtual registers into the appropriate class.
786  MRI.constrainRegClass(TrueReg, RC);
787  MRI.constrainRegClass(FalseReg, RC);
788 
789  // Insert the csel.
790  BuildMI(MBB, I, DL, get(Opc), DstReg)
791  .addReg(TrueReg)
792  .addReg(FalseReg)
793  .addImm(CC);
794 }
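// Illustrative result (added for clarity): for a cbz-style
// Cond = { -1, CBZW, %x }, the code above first materializes the compare as
// "subs wzr, %x, #0" and then emits "csel %dst, %true, %false, eq".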
795 
796 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
797 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
798  uint64_t Imm = MI.getOperand(1).getImm();
799  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
800  uint64_t Encoding;
801  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
802 }
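// For example (illustrative value): MOVi32imm 0x00ff00ff passes this check
// because 0x00ff00ff is a valid logical immediate, so the move can later be
// emitted as a single ORR from the zero register instead of a movz/movk pair.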
803 
804 // FIXME: this implementation should be micro-architecture dependent, so a
805 // micro-architecture target hook should be introduced here in future.
806 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
807  if (!Subtarget.hasCustomCheapAsMoveHandling())
808  return MI.isAsCheapAsAMove();
809 
810  const unsigned Opcode = MI.getOpcode();
811 
812  // Firstly, check cases gated by features.
813 
814  if (Subtarget.hasZeroCycleZeroingFP()) {
815  if (Opcode == AArch64::FMOVH0 ||
816  Opcode == AArch64::FMOVS0 ||
817  Opcode == AArch64::FMOVD0)
818  return true;
819  }
820 
821  if (Subtarget.hasZeroCycleZeroingGP()) {
822  if (Opcode == TargetOpcode::COPY &&
823  (MI.getOperand(1).getReg() == AArch64::WZR ||
824  MI.getOperand(1).getReg() == AArch64::XZR))
825  return true;
826  }
827 
828  // Secondly, check cases specific to sub-targets.
829 
830  if (Subtarget.hasExynosCheapAsMoveHandling()) {
831  if (isExynosCheapAsMove(MI))
832  return true;
833 
834  return MI.isAsCheapAsAMove();
835  }
836 
837  // Finally, check generic cases.
838 
839  switch (Opcode) {
840  default:
841  return false;
842 
843  // add/sub on register without shift
844  case AArch64::ADDWri:
845  case AArch64::ADDXri:
846  case AArch64::SUBWri:
847  case AArch64::SUBXri:
848  return (MI.getOperand(3).getImm() == 0);
849 
850  // logical ops on immediate
851  case AArch64::ANDWri:
852  case AArch64::ANDXri:
853  case AArch64::EORWri:
854  case AArch64::EORXri:
855  case AArch64::ORRWri:
856  case AArch64::ORRXri:
857  return true;
858 
859  // logical ops on register without shift
860  case AArch64::ANDWrr:
861  case AArch64::ANDXrr:
862  case AArch64::BICWrr:
863  case AArch64::BICXrr:
864  case AArch64::EONWrr:
865  case AArch64::EONXrr:
866  case AArch64::EORWrr:
867  case AArch64::EORXrr:
868  case AArch64::ORNWrr:
869  case AArch64::ORNXrr:
870  case AArch64::ORRWrr:
871  case AArch64::ORRXrr:
872  return true;
873 
874  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
875  // ORRXri, it is as cheap as MOV
876  case AArch64::MOVi32imm:
877  return canBeExpandedToORR(MI, 32);
878  case AArch64::MOVi64imm:
879  return canBeExpandedToORR(MI, 64);
880  }
881 
882  llvm_unreachable("Unknown opcode to check as cheap as a move!");
883 }
884 
885 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
886  switch (MI.getOpcode()) {
887  default:
888  return false;
889 
890  case AArch64::ADDWrs:
891  case AArch64::ADDXrs:
892  case AArch64::ADDSWrs:
893  case AArch64::ADDSXrs: {
894  unsigned Imm = MI.getOperand(3).getImm();
895  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
896  if (ShiftVal == 0)
897  return true;
898  return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
899  }
900 
901  case AArch64::ADDWrx:
902  case AArch64::ADDXrx:
903  case AArch64::ADDXrx64:
904  case AArch64::ADDSWrx:
905  case AArch64::ADDSXrx:
906  case AArch64::ADDSXrx64: {
907  unsigned Imm = MI.getOperand(3).getImm();
908  switch (AArch64_AM::getArithExtendType(Imm)) {
909  default:
910  return false;
911  case AArch64_AM::UXTB:
912  case AArch64_AM::UXTH:
913  case AArch64_AM::UXTW:
914  case AArch64_AM::UXTX:
915  return AArch64_AM::getArithShiftValue(Imm) <= 4;
916  }
917  }
918 
919  case AArch64::SUBWrs:
920  case AArch64::SUBSWrs: {
921  unsigned Imm = MI.getOperand(3).getImm();
922  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
923  return ShiftVal == 0 ||
924  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
925  }
926 
927  case AArch64::SUBXrs:
928  case AArch64::SUBSXrs: {
929  unsigned Imm = MI.getOperand(3).getImm();
930  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
931  return ShiftVal == 0 ||
932  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
933  }
934 
935  case AArch64::SUBWrx:
936  case AArch64::SUBXrx:
937  case AArch64::SUBXrx64:
938  case AArch64::SUBSWrx:
939  case AArch64::SUBSXrx:
940  case AArch64::SUBSXrx64: {
941  unsigned Imm = MI.getOperand(3).getImm();
942  switch (AArch64_AM::getArithExtendType(Imm)) {
943  default:
944  return false;
945  case AArch64_AM::UXTB:
946  case AArch64_AM::UXTH:
947  case AArch64_AM::UXTW:
948  case AArch64_AM::UXTX:
949  return AArch64_AM::getArithShiftValue(Imm) == 0;
950  }
951  }
952 
953  case AArch64::LDRBBroW:
954  case AArch64::LDRBBroX:
955  case AArch64::LDRBroW:
956  case AArch64::LDRBroX:
957  case AArch64::LDRDroW:
958  case AArch64::LDRDroX:
959  case AArch64::LDRHHroW:
960  case AArch64::LDRHHroX:
961  case AArch64::LDRHroW:
962  case AArch64::LDRHroX:
963  case AArch64::LDRQroW:
964  case AArch64::LDRQroX:
965  case AArch64::LDRSBWroW:
966  case AArch64::LDRSBWroX:
967  case AArch64::LDRSBXroW:
968  case AArch64::LDRSBXroX:
969  case AArch64::LDRSHWroW:
970  case AArch64::LDRSHWroX:
971  case AArch64::LDRSHXroW:
972  case AArch64::LDRSHXroX:
973  case AArch64::LDRSWroW:
974  case AArch64::LDRSWroX:
975  case AArch64::LDRSroW:
976  case AArch64::LDRSroX:
977  case AArch64::LDRWroW:
978  case AArch64::LDRWroX:
979  case AArch64::LDRXroW:
980  case AArch64::LDRXroX:
981  case AArch64::PRFMroW:
982  case AArch64::PRFMroX:
983  case AArch64::STRBBroW:
984  case AArch64::STRBBroX:
985  case AArch64::STRBroW:
986  case AArch64::STRBroX:
987  case AArch64::STRDroW:
988  case AArch64::STRDroX:
989  case AArch64::STRHHroW:
990  case AArch64::STRHHroX:
991  case AArch64::STRHroW:
992  case AArch64::STRHroX:
993  case AArch64::STRQroW:
994  case AArch64::STRQroX:
995  case AArch64::STRSroW:
996  case AArch64::STRSroX:
997  case AArch64::STRWroW:
998  case AArch64::STRWroX:
999  case AArch64::STRXroW:
1000  case AArch64::STRXroX: {
1001  unsigned IsSigned = MI.getOperand(3).getImm();
1002  return !IsSigned;
1003  }
1004  }
1005 }
1006 
1007 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1008  unsigned Opc = MI.getOpcode();
1009  switch (Opc) {
1010  default:
1011  return false;
1012  case AArch64::SEH_StackAlloc:
1013  case AArch64::SEH_SaveFPLR:
1014  case AArch64::SEH_SaveFPLR_X:
1015  case AArch64::SEH_SaveReg:
1016  case AArch64::SEH_SaveReg_X:
1017  case AArch64::SEH_SaveRegP:
1018  case AArch64::SEH_SaveRegP_X:
1019  case AArch64::SEH_SaveFReg:
1020  case AArch64::SEH_SaveFReg_X:
1021  case AArch64::SEH_SaveFRegP:
1022  case AArch64::SEH_SaveFRegP_X:
1023  case AArch64::SEH_SetFP:
1024  case AArch64::SEH_AddFP:
1025  case AArch64::SEH_Nop:
1026  case AArch64::SEH_PrologEnd:
1027  case AArch64::SEH_EpilogStart:
1028  case AArch64::SEH_EpilogEnd:
1029  return true;
1030  }
1031 }
1032 
1033 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1034  Register &SrcReg, Register &DstReg,
1035  unsigned &SubIdx) const {
1036  switch (MI.getOpcode()) {
1037  default:
1038  return false;
1039  case AArch64::SBFMXri: // aka sxtw
1040  case AArch64::UBFMXri: // aka uxtw
1041  // Check for the 32 -> 64 bit extension case, these instructions can do
1042  // much more.
1043  if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1044  return false;
1045  // This is a signed or unsigned 32 -> 64 bit extension.
1046  SrcReg = MI.getOperand(1).getReg();
1047  DstReg = MI.getOperand(0).getReg();
1048  SubIdx = AArch64::sub_32;
1049  return true;
1050  }
1051 }
1052 
1053 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1054  const MachineInstr &MIa, const MachineInstr &MIb) const {
1055  const TargetRegisterInfo *TRI = &getRegisterInfo();
1056  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1057  int64_t OffsetA = 0, OffsetB = 0;
1058  unsigned WidthA = 0, WidthB = 0;
1059  bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1060 
1061  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1062  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1063 
1064  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1065  MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1066  return false;
1067 
1068  // Retrieve the base, offset from the base and width. Width
1069  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1070  // the bases are identical, and the offset of a lower memory access +
1071  // the width doesn't overlap the offset of a higher memory access,
1072  // then the memory accesses are different.
1073  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1074  // are assumed to have the same scale (vscale).
1075  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1076  WidthA, TRI) &&
1077  getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1078  WidthB, TRI)) {
1079  if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1080  OffsetAIsScalable == OffsetBIsScalable) {
1081  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1082  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1083  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1084  if (LowOffset + LowWidth <= HighOffset)
1085  return true;
1086  }
1087  }
1088  return false;
1089 }
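// Illustrative example (values invented for the comment): an 8-byte store to
// [x0, #0] and a 4-byte load from [x0, #8] share the same base and satisfy
// 0 + 8 <= 8, so the two accesses are reported as trivially disjoint.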
1090 
1091 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1092  const MachineBasicBlock *MBB,
1093  const MachineFunction &MF) const {
1094  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1095  return true;
1096  switch (MI.getOpcode()) {
1097  case AArch64::HINT:
1098  // CSDB hints are scheduling barriers.
1099  if (MI.getOperand(0).getImm() == 0x14)
1100  return true;
1101  break;
1102  case AArch64::DSB:
1103  case AArch64::ISB:
1104  // DSB and ISB also are scheduling barriers.
1105  return true;
1106  default:;
1107  }
1108  return isSEHInstruction(MI);
1109 }
1110 
1111 /// analyzeCompare - For a comparison instruction, return the source registers
1112 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1113 /// Return true if the comparison instruction can be analyzed.
1114 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1115  Register &SrcReg2, int64_t &CmpMask,
1116  int64_t &CmpValue) const {
1117  // The first operand can be a frame index where we'd normally expect a
1118  // register.
1119  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1120  if (!MI.getOperand(1).isReg())
1121  return false;
1122 
1123  switch (MI.getOpcode()) {
1124  default:
1125  break;
1126  case AArch64::PTEST_PP:
1127  SrcReg = MI.getOperand(0).getReg();
1128  SrcReg2 = MI.getOperand(1).getReg();
1129  // Not sure about the mask and value for now...
1130  CmpMask = ~0;
1131  CmpValue = 0;
1132  return true;
1133  case AArch64::SUBSWrr:
1134  case AArch64::SUBSWrs:
1135  case AArch64::SUBSWrx:
1136  case AArch64::SUBSXrr:
1137  case AArch64::SUBSXrs:
1138  case AArch64::SUBSXrx:
1139  case AArch64::ADDSWrr:
1140  case AArch64::ADDSWrs:
1141  case AArch64::ADDSWrx:
1142  case AArch64::ADDSXrr:
1143  case AArch64::ADDSXrs:
1144  case AArch64::ADDSXrx:
1145  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1146  SrcReg = MI.getOperand(1).getReg();
1147  SrcReg2 = MI.getOperand(2).getReg();
1148  CmpMask = ~0;
1149  CmpValue = 0;
1150  return true;
1151  case AArch64::SUBSWri:
1152  case AArch64::ADDSWri:
1153  case AArch64::SUBSXri:
1154  case AArch64::ADDSXri:
1155  SrcReg = MI.getOperand(1).getReg();
1156  SrcReg2 = 0;
1157  CmpMask = ~0;
1158  CmpValue = MI.getOperand(2).getImm();
1159  return true;
1160  case AArch64::ANDSWri:
1161  case AArch64::ANDSXri:
1162  // ANDS does not use the same encoding scheme as the others xxxS
1163  // instructions.
1164  SrcReg = MI.getOperand(1).getReg();
1165  SrcReg2 = 0;
1166  CmpMask = ~0;
1167  CmpValue = AArch64_AM::decodeLogicalImmediate(
1168  MI.getOperand(2).getImm(),
1169  MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1170  return true;
1171  }
1172 
1173  return false;
1174 }
1175 
1176 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1177  MachineBasicBlock *MBB = Instr.getParent();
1178  assert(MBB && "Can't get MachineBasicBlock here");
1179  MachineFunction *MF = MBB->getParent();
1180  assert(MF && "Can't get MachineFunction here");
1181  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1182  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1183  MachineRegisterInfo *MRI = &MF->getRegInfo();
1184 
1185  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1186  ++OpIdx) {
1187  MachineOperand &MO = Instr.getOperand(OpIdx);
1188  const TargetRegisterClass *OpRegCstraints =
1189  Instr.getRegClassConstraint(OpIdx, TII, TRI);
1190 
1191  // If there's no constraint, there's nothing to do.
1192  if (!OpRegCstraints)
1193  continue;
1194  // If the operand is a frame index, there's nothing to do here.
1195  // A frame index operand will resolve correctly during PEI.
1196  if (MO.isFI())
1197  continue;
1198 
1199  assert(MO.isReg() &&
1200  "Operand has register constraints without being a register!");
1201 
1202  Register Reg = MO.getReg();
1203  if (Register::isPhysicalRegister(Reg)) {
1204  if (!OpRegCstraints->contains(Reg))
1205  return false;
1206  } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1207  !MRI->constrainRegClass(Reg, OpRegCstraints))
1208  return false;
1209  }
1210 
1211  return true;
1212 }
1213 
1214 /// Return the opcode that does not set flags when possible - otherwise
1215 /// return the original opcode. The caller is responsible to do the actual
1216 /// substitution and legality checking.
1217 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1218  // Don't convert all compare instructions, because for some the zero register
1219  // encoding becomes the sp register.
1220  bool MIDefinesZeroReg = false;
1221  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1222  MIDefinesZeroReg = true;
1223 
1224  switch (MI.getOpcode()) {
1225  default:
1226  return MI.getOpcode();
1227  case AArch64::ADDSWrr:
1228  return AArch64::ADDWrr;
1229  case AArch64::ADDSWri:
1230  return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1231  case AArch64::ADDSWrs:
1232  return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1233  case AArch64::ADDSWrx:
1234  return AArch64::ADDWrx;
1235  case AArch64::ADDSXrr:
1236  return AArch64::ADDXrr;
1237  case AArch64::ADDSXri:
1238  return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1239  case AArch64::ADDSXrs:
1240  return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1241  case AArch64::ADDSXrx:
1242  return AArch64::ADDXrx;
1243  case AArch64::SUBSWrr:
1244  return AArch64::SUBWrr;
1245  case AArch64::SUBSWri:
1246  return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1247  case AArch64::SUBSWrs:
1248  return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1249  case AArch64::SUBSWrx:
1250  return AArch64::SUBWrx;
1251  case AArch64::SUBSXrr:
1252  return AArch64::SUBXrr;
1253  case AArch64::SUBSXri:
1254  return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1255  case AArch64::SUBSXrs:
1256  return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1257  case AArch64::SUBSXrx:
1258  return AArch64::SUBXrx;
1259  }
1260 }
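// For example (added note): a "SUBSWri %w1, %w0, 1, 0" whose NZCV result is
// dead becomes SUBWri, but a flag-setting compare that writes wzr keeps its S
// form, because dropping the S would turn the zero register into wsp.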
1261 
1262 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1263 
1264 /// True when condition flags are accessed (either by writing or reading)
1265 /// on the instruction trace starting at From and ending at To.
1266 ///
1267 /// Note: If From and To are from different blocks it's assumed CC are accessed
1268 /// on the path.
1269 static bool areCFlagsAccessedBetweenInstrs(
1270  MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1271  const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1272  // Early exit if To is at the beginning of the BB.
1273  if (To == To->getParent()->begin())
1274  return true;
1275 
1276  // Check whether the instructions are in the same basic block
1277  // If not, assume the condition flags might get modified somewhere.
1278  if (To->getParent() != From->getParent())
1279  return true;
1280 
1281  // From must be above To.
1282  assert(std::any_of(
1283  ++To.getReverse(), To->getParent()->rend(),
1284  [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1285 
1286  // We iterate backward starting at \p To until we hit \p From.
1287  for (const MachineInstr &Instr :
1288  instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1289  if (((AccessToCheck & AK_Write) &&
1290  Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1291  ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1292  return true;
1293  }
1294  return false;
1295 }
1296 
1297 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1298 /// operation which could set the flags in an identical manner
1299 bool AArch64InstrInfo::optimizePTestInstr(
1300  MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1301  const MachineRegisterInfo *MRI) const {
1302  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1303  auto *Pred = MRI->getUniqueVRegDef(PredReg);
1304  auto NewOp = Pred->getOpcode();
1305  bool OpChanged = false;
1306 
1307  unsigned MaskOpcode = Mask->getOpcode();
1308  unsigned PredOpcode = Pred->getOpcode();
1309  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1310  bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1311 
1312  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) {
1313  // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't
1314  // deactivate any lanes OTHER_INST might set.
1315  uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode);
1316  uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1317 
1318  // Must be an all active predicate of matching element size.
1319  if ((PredElementSize != MaskElementSize) ||
1320  (Mask->getOperand(1).getImm() != 31))
1321  return false;
1322 
1323  // Fallthough to simply remove the PTEST.
1324  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) {
1325  // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1326  // instruction that sets the flags as PTEST would.
1327 
1328  // Fallthough to simply remove the PTEST.
1329  } else if (PredIsPTestLike) {
1330  // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both
1331  // instructions use the same predicate.
1332  auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1333  if (Mask != PTestLikeMask)
1334  return false;
1335 
1336  // Fallthough to simply remove the PTEST.
1337  } else {
1338  switch (Pred->getOpcode()) {
1339  case AArch64::BRKB_PPzP:
1340  case AArch64::BRKPB_PPzPP: {
1341  // Op 0 is chain, 1 is the mask, 2 the previous predicate to
1342  // propagate, 3 the new predicate.
1343 
1344  // Check to see if our mask is the same as the brkpb's. If
1345  // not the resulting flag bits may be different and we
1346  // can't remove the ptest.
1347  auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1348  if (Mask != PredMask)
1349  return false;
1350 
1351  // Switch to the new opcode
1352  NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? AArch64::BRKBS_PPzP
1353  : AArch64::BRKPBS_PPzPP;
1354  OpChanged = true;
1355  break;
1356  }
1357  case AArch64::BRKN_PPzP: {
1358  auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1359  if (Mask != PredMask)
1360  return false;
1361 
1362  NewOp = AArch64::BRKNS_PPzP;
1363  OpChanged = true;
1364  break;
1365  }
1366  case AArch64::RDFFR_PPz: {
1367  // rdffr p1.b, PredMask=p0/z <--- Definition of Pred
1368  // ptest Mask=p0, Pred=p1.b <--- If equal masks, remove this and use
1369  // `rdffrs p1.b, p0/z` above.
1370  auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1371  if (Mask != PredMask)
1372  return false;
1373 
1374  NewOp = AArch64::RDFFRS_PPz;
1375  OpChanged = true;
1376  break;
1377  }
1378  default:
1379  // Bail out if we don't recognize the input
1380  return false;
1381  }
1382  }
1383 
1384  const TargetRegisterInfo *TRI = &getRegisterInfo();
1385 
1386  // If another instruction between Pred and PTest accesses flags, don't remove
1387  // the ptest or update the earlier instruction to modify them.
1388  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1389  return false;
1390 
1391  // If we pass all the checks, it's safe to remove the PTEST and use the flags
1392  // as they are prior to PTEST. Sometimes this requires the tested PTEST
1393  // operand to be replaced with an equivalent instruction that also sets the
1394  // flags.
1395  Pred->setDesc(get(NewOp));
1396  PTest->eraseFromParent();
1397  if (OpChanged) {
1398  bool succeeded = UpdateOperandRegClass(*Pred);
1399  (void)succeeded;
1400  assert(succeeded && "Operands have incompatible register classes!");
1401  Pred->addRegisterDefined(AArch64::NZCV, TRI);
1402  }
1403 
1404  // Ensure that the flags def is live.
1405  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1406  unsigned i = 0, e = Pred->getNumOperands();
1407  for (; i != e; ++i) {
1408  MachineOperand &MO = Pred->getOperand(i);
1409  if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1410  MO.setIsDead(false);
1411  break;
1412  }
1413  }
1414  }
1415  return true;
1416 }
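// Illustrative example (assembly sketch, not from the original comments):
//   whilelo p0.b, x0, x1
//   ptest   p0, p0.b
//   b.ne    ...
// hits the PTEST(PG, PG) case: the whilelo already set NZCV exactly as the
// ptest would, so the ptest is erased and b.ne uses the whilelo's flags.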
1417 
1418 /// Try to optimize a compare instruction. A compare instruction is an
1419 /// instruction which produces AArch64::NZCV. It is a true compare
1420 /// instruction only when there are no uses of its destination register.
1422 ///
1423 /// The following steps are tried in order:
1424 /// 1. Convert CmpInstr into an unconditional version.
1425 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1426 /// condition code or an instruction which can be converted into such an
1427 /// instruction.
1428 /// Only comparison with zero is supported.
1429 bool AArch64InstrInfo::optimizeCompareInstr(
1430  MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1431  int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1432  assert(CmpInstr.getParent());
1433  assert(MRI);
1434 
1435  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1436  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1437  if (DeadNZCVIdx != -1) {
1438  if (CmpInstr.definesRegister(AArch64::WZR) ||
1439  CmpInstr.definesRegister(AArch64::XZR)) {
1440  CmpInstr.eraseFromParent();
1441  return true;
1442  }
1443  unsigned Opc = CmpInstr.getOpcode();
1444  unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1445  if (NewOpc == Opc)
1446  return false;
1447  const MCInstrDesc &MCID = get(NewOpc);
1448  CmpInstr.setDesc(MCID);
1449  CmpInstr.RemoveOperand(DeadNZCVIdx);
1450  bool succeeded = UpdateOperandRegClass(CmpInstr);
1451  (void)succeeded;
1452  assert(succeeded && "Some operands reg class are incompatible!");
1453  return true;
1454  }
1455 
1456  if (CmpInstr.getOpcode() == AArch64::PTEST_PP)
1457  return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1458 
1459  if (SrcReg2 != 0)
1460  return false;
1461 
1462  // CmpInstr is a Compare instruction if destination register is not used.
1463  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1464  return false;
1465 
1466  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1467  return true;
1468  return (CmpValue == 0 || CmpValue == 1) &&
1469  removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1470 }
1471 
1472 /// Get opcode of S version of Instr.
1473 /// If Instr is S version its opcode is returned.
1474 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1475 /// or we are not interested in it.
1476 static unsigned sForm(MachineInstr &Instr) {
1477  switch (Instr.getOpcode()) {
1478  default:
1479  return AArch64::INSTRUCTION_LIST_END;
1480 
1481  case AArch64::ADDSWrr:
1482  case AArch64::ADDSWri:
1483  case AArch64::ADDSXrr:
1484  case AArch64::ADDSXri:
1485  case AArch64::SUBSWrr:
1486  case AArch64::SUBSWri:
1487  case AArch64::SUBSXrr:
1488  case AArch64::SUBSXri:
1489  return Instr.getOpcode();
1490 
1491  case AArch64::ADDWrr:
1492  return AArch64::ADDSWrr;
1493  case AArch64::ADDWri:
1494  return AArch64::ADDSWri;
1495  case AArch64::ADDXrr:
1496  return AArch64::ADDSXrr;
1497  case AArch64::ADDXri:
1498  return AArch64::ADDSXri;
1499  case AArch64::ADCWr:
1500  return AArch64::ADCSWr;
1501  case AArch64::ADCXr:
1502  return AArch64::ADCSXr;
1503  case AArch64::SUBWrr:
1504  return AArch64::SUBSWrr;
1505  case AArch64::SUBWri:
1506  return AArch64::SUBSWri;
1507  case AArch64::SUBXrr:
1508  return AArch64::SUBSXrr;
1509  case AArch64::SUBXri:
1510  return AArch64::SUBSXri;
1511  case AArch64::SBCWr:
1512  return AArch64::SBCSWr;
1513  case AArch64::SBCXr:
1514  return AArch64::SBCSXr;
1515  case AArch64::ANDWri:
1516  return AArch64::ANDSWri;
1517  case AArch64::ANDXri:
1518  return AArch64::ANDSXri;
1519  }
1520 }
1521 
1522 /// Check if AArch64::NZCV should be alive in successors of MBB.
1523 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1524  for (auto *BB : MBB->successors())
1525  if (BB->isLiveIn(AArch64::NZCV))
1526  return true;
1527  return false;
1528 }
1529 
1530 /// \returns The condition code operand index for \p Instr if it is a branch
1531 /// or select and -1 otherwise.
1532 static int
1533 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1534  switch (Instr.getOpcode()) {
1535  default:
1536  return -1;
1537 
1538  case AArch64::Bcc: {
1539  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1540  assert(Idx >= 2);
1541  return Idx - 2;
1542  }
1543 
1544  case AArch64::CSINVWr:
1545  case AArch64::CSINVXr:
1546  case AArch64::CSINCWr:
1547  case AArch64::CSINCXr:
1548  case AArch64::CSELWr:
1549  case AArch64::CSELXr:
1550  case AArch64::CSNEGWr:
1551  case AArch64::CSNEGXr:
1552  case AArch64::FCSELSrrr:
1553  case AArch64::FCSELDrrr: {
1554  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1555  assert(Idx >= 1);
1556  return Idx - 1;
1557  }
1558  }
1559 }
1560 
1561 namespace {
1562 
1563 struct UsedNZCV {
1564  bool N = false;
1565  bool Z = false;
1566  bool C = false;
1567  bool V = false;
1568 
1569  UsedNZCV() = default;
1570 
1571  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1572  this->N |= UsedFlags.N;
1573  this->Z |= UsedFlags.Z;
1574  this->C |= UsedFlags.C;
1575  this->V |= UsedFlags.V;
1576  return *this;
1577  }
1578 };
1579 
1580 } // end anonymous namespace
1581 
1582 /// Find a condition code used by the instruction.
1583 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1584 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1585 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1586  int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1587  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1588  Instr.getOperand(CCIdx).getImm())
1589  : AArch64CC::Invalid;
1590 }
1591 
1592 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1593  assert(CC != AArch64CC::Invalid);
1594  UsedNZCV UsedFlags;
1595  switch (CC) {
1596  default:
1597  break;
1598 
1599  case AArch64CC::EQ: // Z set
1600  case AArch64CC::NE: // Z clear
1601  UsedFlags.Z = true;
1602  break;
1603 
1604  case AArch64CC::HI: // Z clear and C set
1605  case AArch64CC::LS: // Z set or C clear
1606  UsedFlags.Z = true;
1607  LLVM_FALLTHROUGH;
1608  case AArch64CC::HS: // C set
1609  case AArch64CC::LO: // C clear
1610  UsedFlags.C = true;
1611  break;
1612 
1613  case AArch64CC::MI: // N set
1614  case AArch64CC::PL: // N clear
1615  UsedFlags.N = true;
1616  break;
1617 
1618  case AArch64CC::VS: // V set
1619  case AArch64CC::VC: // V clear
1620  UsedFlags.V = true;
1621  break;
1622 
1623  case AArch64CC::GT: // Z clear, N and V the same
1624  case AArch64CC::LE: // Z set, N and V differ
1625  UsedFlags.Z = true;
1626  LLVM_FALLTHROUGH;
1627  case AArch64CC::GE: // N and V the same
1628  case AArch64CC::LT: // N and V differ
1629  UsedFlags.N = true;
1630  UsedFlags.V = true;
1631  break;
1632  }
1633  return UsedFlags;
1634 }
1635 
1636 /// \returns Condition flags used after \p CmpInstr in its MachineBB if they
1637 /// do not contain the C or V flags and NZCV flags are not alive in successors
1638 /// of the same \p CmpInstr and \p MI parent. \returns None otherwise.
1639 ///
1640 /// Collect instructions using that flags in \p CCUseInstrs if provided.
1641 static Optional<UsedNZCV>
1642 examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1643  const TargetRegisterInfo &TRI,
1644  SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr) {
1645  MachineBasicBlock *CmpParent = CmpInstr.getParent();
1646  if (MI.getParent() != CmpParent)
1647  return None;
1648 
1649  if (areCFlagsAliveInSuccessors(CmpParent))
1650  return None;
1651 
1652  UsedNZCV NZCVUsedAfterCmp;
1653  for (MachineInstr &Instr : instructionsWithoutDebug(
1654  std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1655  if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1656  AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1657  if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1658  return None;
1659  NZCVUsedAfterCmp |= getUsedNZCV(CC);
1660  if (CCUseInstrs)
1661  CCUseInstrs->push_back(&Instr);
1662  }
1663  if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1664  break;
1665  }
1666  if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V)
1667  return None;
1668  return NZCVUsedAfterCmp;
1669 }
1670 
1671 static bool isADDSRegImm(unsigned Opcode) {
1672  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1673 }
1674 
1675 static bool isSUBSRegImm(unsigned Opcode) {
1676  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1677 }
1678 
1679 /// Check if CmpInstr can be substituted by MI.
1680 ///
1681 /// CmpInstr can be substituted:
1682 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1683 /// - and, MI and CmpInstr are from the same MachineBB
1684 /// - and, condition flags are not alive in successors of the CmpInstr parent
1685 /// - and, if MI opcode is the S form there must be no defs of flags between
1686 /// MI and CmpInstr
1687 /// or if MI opcode is not the S form there must be neither defs of flags
1688 /// nor uses of flags between MI and CmpInstr.
1689 /// - and C/V flags are not used after CmpInstr
1690 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1691  const TargetRegisterInfo &TRI) {
1692  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1693 
1694  const unsigned CmpOpcode = CmpInstr.getOpcode();
1695  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1696  return false;
1697 
1698  if (!examineCFlagsUse(MI, CmpInstr, TRI))
1699  return false;
1700 
1701  AccessKind AccessToCheck = AK_Write;
1702  if (sForm(MI) != MI.getOpcode())
1703  AccessToCheck = AK_All;
1704  return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1705 }
1706 
1707 /// Substitute an instruction comparing to zero with another instruction
1708 /// which produces needed condition flags.
1709 ///
1710 /// Return true on success.
1711 bool AArch64InstrInfo::substituteCmpToZero(
1712  MachineInstr &CmpInstr, unsigned SrcReg,
1713  const MachineRegisterInfo &MRI) const {
1714  // Get the unique definition of SrcReg.
1715  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1716  if (!MI)
1717  return false;
1718 
1719  const TargetRegisterInfo &TRI = getRegisterInfo();
1720 
1721  unsigned NewOpc = sForm(*MI);
1722  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1723  return false;
1724 
1725  if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1726  return false;
1727 
1728  // Update the instruction to set NZCV.
1729  MI->setDesc(get(NewOpc));
1730  CmpInstr.eraseFromParent();
1731  bool succeeded = UpdateOperandRegClass(*MI);
1732  (void)succeeded;
1733  assert(succeeded && "Some operands reg class are incompatible!");
1734  MI->addRegisterDefined(AArch64::NZCV, &TRI);
1735  return true;
1736 }
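// Illustrative transformation (added, MIR-style operands):
//   %w1 = SUBWri %w0, 1, 0
//   $wzr = SUBSWri %w1, 0, 0   ; compare %w1 against zero
//   Bcc eq, ...
// becomes
//   %w1 = SUBSWri %w0, 1, 0    ; the subtract now sets NZCV itself
//   Bcc eq, ...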
1737 
1738 /// \returns True if \p CmpInstr can be removed.
1739 ///
1740 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1741 /// codes used in \p CCUseInstrs must be inverted.
1742 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1743  int CmpValue, const TargetRegisterInfo &TRI,
1744  SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1745  bool &IsInvertCC) {
1746  assert((CmpValue == 0 || CmpValue == 1) &&
1747  "Only comparisons to 0 or 1 considered for removal!");
1748 
1749  // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1750  unsigned MIOpc = MI.getOpcode();
1751  if (MIOpc == AArch64::CSINCWr) {
1752  if (MI.getOperand(1).getReg() != AArch64::WZR ||
1753  MI.getOperand(2).getReg() != AArch64::WZR)
1754  return false;
1755  } else if (MIOpc == AArch64::CSINCXr) {
1756  if (MI.getOperand(1).getReg() != AArch64::XZR ||
1757  MI.getOperand(2).getReg() != AArch64::XZR)
1758  return false;
1759  } else {
1760  return false;
1761  }
1762  AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1763  if (MICC == AArch64CC::Invalid)
1764  return false;
1765 
1766  // NZCV needs to be defined
1767  if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
1768  return false;
1769 
1770  // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1771  const unsigned CmpOpcode = CmpInstr.getOpcode();
1772  bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1773  if (CmpValue && !IsSubsRegImm)
1774  return false;
1775  if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1776  return false;
1777 
1778  // MI conditions allowed: eq, ne, mi, pl
1779  UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1780  if (MIUsedNZCV.C || MIUsedNZCV.V)
1781  return false;
1782 
1783  Optional<UsedNZCV> NZCVUsedAfterCmp =
1784  examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1785  // Condition flags are not used in CmpInstr basic block successors and only
1786  // Z or N flags allowed to be used after CmpInstr within its basic block
1787  if (!NZCVUsedAfterCmp)
1788  return false;
1789  // Z or N flag used after CmpInstr must correspond to the flag used in MI
1790  if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1791  (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1792  return false;
1793  // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
1794  if (MIUsedNZCV.N && !CmpValue)
1795  return false;
1796 
1797  // There must be no defs of flags between MI and CmpInstr
1798  if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1799  return false;
1800 
1801  // Condition code is inverted in the following cases:
1802  // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1803  // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1804  IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1805  (!CmpValue && MICC == AArch64CC::NE);
1806  return true;
1807 }
1808 
1809 /// Remove comparison in csinc-cmp sequence
1810 ///
1811 /// Examples:
1812 /// 1. \code
1813 /// csinc w9, wzr, wzr, ne
1814 /// cmp w9, #0
1815 /// b.eq
1816 /// \endcode
1817 /// to
1818 /// \code
1819 /// csinc w9, wzr, wzr, ne
1820 /// b.ne
1821 /// \endcode
1822 ///
1823 /// 2. \code
1824 /// csinc x2, xzr, xzr, mi
1825 /// cmp x2, #1
1826 /// b.pl
1827 /// \endcode
1828 /// to
1829 /// \code
1830 /// csinc x2, xzr, xzr, mi
1831 /// b.pl
1832 /// \endcode
1833 ///
1834 /// \param CmpInstr comparison instruction
1835 /// \return True when comparison removed
1836 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1837  MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1838  const MachineRegisterInfo &MRI) const {
1839  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1840  if (!MI)
1841  return false;
1842  const TargetRegisterInfo &TRI = getRegisterInfo();
1843  SmallVector<MachineInstr *, 4> CCUseInstrs;
1844  bool IsInvertCC = false;
1845  if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1846  IsInvertCC))
1847  return false;
1848  // Make transformation
1849  CmpInstr.eraseFromParent();
1850  if (IsInvertCC) {
1851  // Invert condition codes in CmpInstr CC users
1852  for (MachineInstr *CCUseInstr : CCUseInstrs) {
1853  int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1854  assert(Idx >= 0 && "Unexpected instruction using CC.");
1855  MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1856  AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1857  static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1858  CCOperand.setImm(CCUse);
1859  }
1860  }
1861  return true;
1862 }
1863 
1864 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1865  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1866  MI.getOpcode() != AArch64::CATCHRET)
1867  return false;
1868 
1869  MachineBasicBlock &MBB = *MI.getParent();
1870  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1871  auto TRI = Subtarget.getRegisterInfo();
1872  DebugLoc DL = MI.getDebugLoc();
1873 
1874  if (MI.getOpcode() == AArch64::CATCHRET) {
1875  // Skip to the first instruction before the epilog.
1876  const TargetInstrInfo *TII =
1878  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1880  MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1881  while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1882  FirstEpilogSEH != MBB.begin())
1883  FirstEpilogSEH = std::prev(FirstEpilogSEH);
1884  if (FirstEpilogSEH != MBB.begin())
1885  FirstEpilogSEH = std::next(FirstEpilogSEH);
1886  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1887  .addReg(AArch64::X0, RegState::Define)
1888  .addMBB(TargetMBB);
1889  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1890  .addReg(AArch64::X0, RegState::Define)
1891  .addReg(AArch64::X0)
1892  .addMBB(TargetMBB)
1893  .addImm(0);
1894  return true;
1895  }
1896 
1897  Register Reg = MI.getOperand(0).getReg();
1899  if (M.getStackProtectorGuard() == "sysreg") {
1900  const AArch64SysReg::SysReg *SrcReg =
1901  AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
1902  if (!SrcReg)
1903  report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
1904 
1905  // mrs xN, sysreg
1908  .addImm(SrcReg->Encoding);
1909  int Offset = M.getStackProtectorGuardOffset();
1910  if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
1911  // ldr xN, [xN, #offset]
1912  BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1913  .addDef(Reg)
1915  .addImm(Offset / 8);
1916  } else if (Offset >= -256 && Offset <= 255) {
1917  // ldur xN, [xN, #offset]
1918  BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
1919  .addDef(Reg)
1921  .addImm(Offset);
1922  } else if (Offset >= -4095 && Offset <= 4095) {
1923  if (Offset > 0) {
1924  // add xN, xN, #offset
1925  BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
1926  .addDef(Reg)
1928  .addImm(Offset)
1929  .addImm(0);
1930  } else {
1931  // sub xN, xN, #offset
1932  BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
1933  .addDef(Reg)
1935  .addImm(-Offset)
1936  .addImm(0);
1937  }
1938  // ldr xN, [xN]
1939  BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1940  .addDef(Reg)
1942  .addImm(0);
1943  } else {
1944  // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
1945  // than 32760.
1946  // It might be nice to use AArch64::MOVi32imm here, which would get
1947  // expanded in PreSched2 after PostRA, but our lone scratch Reg already
1948  // contains the MRS result. findScratchNonCalleeSaveRegister() in
1949  // AArch64FrameLowering might help us find such a scratch register
1950  // though. If we failed to find a scratch register, we could emit a
1951  // stream of add instructions to build up the immediate. Or, we could try
1952  // to insert an AArch64::MOVi32imm before register allocation so that we
1953  // didn't need to scavenge for a scratch register.
1954  report_fatal_error("Unable to encode Stack Protector Guard Offset");
1955  }
1956  MBB.erase(MI);
1957  return true;
1958  }
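 // Worked example (illustrative offsets, not from the upstream source): with a
 // guard offset of 1032 (a multiple of 8, <= 32760) the expansion is
 //   mrs  xN, <guard sysreg>
 //   ldr  xN, [xN, #1032]      // LDRXui, immediate encoded as 1032 / 8 = 129
 // an offset of -40 fits the signed 9-bit LDUR range:
 //   ldur xN, [xN, #-40]
 // and an offset such as 3001 (in range but not a multiple of 8) is folded
 // into the base register first:
 //   add  xN, xN, #3001
 //   ldr  xN, [xN]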
1959 
1960  const GlobalValue *GV =
1961  cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1962  const TargetMachine &TM = MBB.getParent()->getTarget();
1963  unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1964  const unsigned char MO_NC = AArch64II::MO_NC;
1965 
1966  if ((OpFlags & AArch64II::MO_GOT) != 0) {
1968  .addGlobalAddress(GV, 0, OpFlags);
1969  if (Subtarget.isTargetILP32()) {
1970  unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1971  BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1972  .addDef(Reg32, RegState::Dead)
1974  .addImm(0)
1975  .addMemOperand(*MI.memoperands_begin())
1977  } else {
1978  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1980  .addImm(0)
1981  .addMemOperand(*MI.memoperands_begin());
1982  }
1983  } else if (TM.getCodeModel() == CodeModel::Large) {
1984  assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1985  BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1987  .addImm(0);
1988  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1991  .addImm(16);
1992  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1995  .addImm(32);
1996  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1999  .addImm(48);
2000  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2002  .addImm(0)
2003  .addMemOperand(*MI.memoperands_begin());
2004  } else if (TM.getCodeModel() == CodeModel::Tiny) {
2006  .addGlobalAddress(GV, 0, OpFlags);
2007  } else {
2009  .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2010  unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2011  if (Subtarget.isTargetILP32()) {
2012  unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2013  BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2014  .addDef(Reg32, RegState::Dead)
2016  .addGlobalAddress(GV, 0, LoFlags)
2017  .addMemOperand(*MI.memoperands_begin())
2019  } else {
2020  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2022  .addGlobalAddress(GV, 0, LoFlags)
2023  .addMemOperand(*MI.memoperands_begin());
2024  }
2025  }
2026 
2027  MBB.erase(MI);
2028 
2029  return true;
2030 }
2031 
2032 // Return true if this instruction simply sets its single destination register
2033 // to zero. This is equivalent to a register rename of the zero-register.
2035  switch (MI.getOpcode()) {
2036  default:
2037  break;
2038  case AArch64::MOVZWi:
2039  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2040  if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2041  assert(MI.getDesc().getNumOperands() == 3 &&
2042  MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2043  return true;
2044  }
2045  break;
2046  case AArch64::ANDWri: // and Rd, Rzr, #imm
2047  return MI.getOperand(1).getReg() == AArch64::WZR;
2048  case AArch64::ANDXri:
2049  return MI.getOperand(1).getReg() == AArch64::XZR;
2050  case TargetOpcode::COPY:
2051  return MI.getOperand(1).getReg() == AArch64::WZR;
2052  }
2053  return false;
2054 }
2055 
2056 // Return true if this instruction simply renames a general register without
2057 // modifying bits.
2059  switch (MI.getOpcode()) {
2060  default:
2061  break;
2062  case TargetOpcode::COPY: {
2063  // GPR32 copies will be lowered to ORRXrs
2064  Register DstReg = MI.getOperand(0).getReg();
2065  return (AArch64::GPR32RegClass.contains(DstReg) ||
2066  AArch64::GPR64RegClass.contains(DstReg));
2067  }
2068  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2069  if (MI.getOperand(1).getReg() == AArch64::XZR) {
2070  assert(MI.getDesc().getNumOperands() == 4 &&
2071  MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2072  return true;
2073  }
2074  break;
2075  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2076  if (MI.getOperand(2).getImm() == 0) {
2077  assert(MI.getDesc().getNumOperands() == 4 &&
2078  MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2079  return true;
2080  }
2081  break;
2082  }
2083  return false;
2084 }
2085 
2086 // Return true if this instruction simply renames a floating-point register
2087 // without modifying bits.
2089  switch (MI.getOpcode()) {
2090  default:
2091  break;
2092  case TargetOpcode::COPY: {
2093  Register DstReg = MI.getOperand(0).getReg();
2094  return AArch64::FPR128RegClass.contains(DstReg);
2095  }
2096  case AArch64::ORRv16i8:
2097  if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2098  assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2099  "invalid ORRv16i8 operands");
2100  return true;
2101  }
2102  break;
2103  }
2104  return false;
2105 }
2106 
2108  int &FrameIndex) const {
2109  switch (MI.getOpcode()) {
2110  default:
2111  break;
2112  case AArch64::LDRWui:
2113  case AArch64::LDRXui:
2114  case AArch64::LDRBui:
2115  case AArch64::LDRHui:
2116  case AArch64::LDRSui:
2117  case AArch64::LDRDui:
2118  case AArch64::LDRQui:
2119  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2120  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2121  FrameIndex = MI.getOperand(1).getIndex();
2122  return MI.getOperand(0).getReg();
2123  }
2124  break;
2125  }
2126 
2127  return 0;
2128 }
2129 
2131  int &FrameIndex) const {
2132  switch (MI.getOpcode()) {
2133  default:
2134  break;
2135  case AArch64::STRWui:
2136  case AArch64::STRXui:
2137  case AArch64::STRBui:
2138  case AArch64::STRHui:
2139  case AArch64::STRSui:
2140  case AArch64::STRDui:
2141  case AArch64::STRQui:
2142  case AArch64::LDR_PXI:
2143  case AArch64::STR_PXI:
2144  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2145  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2146  FrameIndex = MI.getOperand(1).getIndex();
2147  return MI.getOperand(0).getReg();
2148  }
2149  break;
2150  }
2151  return 0;
2152 }
2153 
2154 /// Check all MachineMemOperands for a hint to suppress pairing.
2156  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2157  return MMO->getFlags() & MOSuppressPair;
2158  });
2159 }
2160 
2161 /// Set a flag on the first MachineMemOperand to suppress pairing.
2163  if (MI.memoperands_empty())
2164  return;
2165  (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2166 }
2167 
2168 /// Check all MachineMemOperands for a hint that the load/store is strided.
2170  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2171  return MMO->getFlags() & MOStridedAccess;
2172  });
2173 }
2174 
2176  switch (Opc) {
2177  default:
2178  return false;
2179  case AArch64::STURSi:
2180  case AArch64::STRSpre:
2181  case AArch64::STURDi:
2182  case AArch64::STRDpre:
2183  case AArch64::STURQi:
2184  case AArch64::STRQpre:
2185  case AArch64::STURBBi:
2186  case AArch64::STURHHi:
2187  case AArch64::STURWi:
2188  case AArch64::STRWpre:
2189  case AArch64::STURXi:
2190  case AArch64::STRXpre:
2191  case AArch64::LDURSi:
2192  case AArch64::LDRSpre:
2193  case AArch64::LDURDi:
2194  case AArch64::LDRDpre:
2195  case AArch64::LDURQi:
2196  case AArch64::LDRQpre:
2197  case AArch64::LDURWi:
2198  case AArch64::LDRWpre:
2199  case AArch64::LDURXi:
2200  case AArch64::LDRXpre:
2201  case AArch64::LDURSWi:
2202  case AArch64::LDURHHi:
2203  case AArch64::LDURBBi:
2204  case AArch64::LDURSBWi:
2205  case AArch64::LDURSHWi:
2206  return true;
2207  }
2208 }
2209 
2211  switch (Opc) {
2212  default: return {};
2213  case AArch64::PRFMui: return AArch64::PRFUMi;
2214  case AArch64::LDRXui: return AArch64::LDURXi;
2215  case AArch64::LDRWui: return AArch64::LDURWi;
2216  case AArch64::LDRBui: return AArch64::LDURBi;
2217  case AArch64::LDRHui: return AArch64::LDURHi;
2218  case AArch64::LDRSui: return AArch64::LDURSi;
2219  case AArch64::LDRDui: return AArch64::LDURDi;
2220  case AArch64::LDRQui: return AArch64::LDURQi;
2221  case AArch64::LDRBBui: return AArch64::LDURBBi;
2222  case AArch64::LDRHHui: return AArch64::LDURHHi;
2223  case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2224  case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2225  case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2226  case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2227  case AArch64::LDRSWui: return AArch64::LDURSWi;
2228  case AArch64::STRXui: return AArch64::STURXi;
2229  case AArch64::STRWui: return AArch64::STURWi;
2230  case AArch64::STRBui: return AArch64::STURBi;
2231  case AArch64::STRHui: return AArch64::STURHi;
2232  case AArch64::STRSui: return AArch64::STURSi;
2233  case AArch64::STRDui: return AArch64::STURDi;
2234  case AArch64::STRQui: return AArch64::STURQi;
2235  case AArch64::STRBBui: return AArch64::STURBBi;
2236  case AArch64::STRHHui: return AArch64::STURHHi;
2237  }
2238 }
2239 
2241  switch (Opc) {
2242  default:
2243  return 2;
2244  case AArch64::LDPXi:
2245  case AArch64::LDPDi:
2246  case AArch64::STPXi:
2247  case AArch64::STPDi:
2248  case AArch64::LDNPXi:
2249  case AArch64::LDNPDi:
2250  case AArch64::STNPXi:
2251  case AArch64::STNPDi:
2252  case AArch64::LDPQi:
2253  case AArch64::STPQi:
2254  case AArch64::LDNPQi:
2255  case AArch64::STNPQi:
2256  case AArch64::LDPWi:
2257  case AArch64::LDPSi:
2258  case AArch64::STPWi:
2259  case AArch64::STPSi:
2260  case AArch64::LDNPWi:
2261  case AArch64::LDNPSi:
2262  case AArch64::STNPWi:
2263  case AArch64::STNPSi:
2264  case AArch64::LDG:
2265  case AArch64::STGPi:
2266 
2267  case AArch64::LD1B_IMM:
2268  case AArch64::LD1B_H_IMM:
2269  case AArch64::LD1B_S_IMM:
2270  case AArch64::LD1B_D_IMM:
2271  case AArch64::LD1SB_H_IMM:
2272  case AArch64::LD1SB_S_IMM:
2273  case AArch64::LD1SB_D_IMM:
2274  case AArch64::LD1H_IMM:
2275  case AArch64::LD1H_S_IMM:
2276  case AArch64::LD1H_D_IMM:
2277  case AArch64::LD1SH_S_IMM:
2278  case AArch64::LD1SH_D_IMM:
2279  case AArch64::LD1W_IMM:
2280  case AArch64::LD1W_D_IMM:
2281  case AArch64::LD1SW_D_IMM:
2282  case AArch64::LD1D_IMM:
2283 
2284  case AArch64::ST1B_IMM:
2285  case AArch64::ST1B_H_IMM:
2286  case AArch64::ST1B_S_IMM:
2287  case AArch64::ST1B_D_IMM:
2288  case AArch64::ST1H_IMM:
2289  case AArch64::ST1H_S_IMM:
2290  case AArch64::ST1H_D_IMM:
2291  case AArch64::ST1W_IMM:
2292  case AArch64::ST1W_D_IMM:
2293  case AArch64::ST1D_IMM:
2294 
2295  case AArch64::LD1RB_IMM:
2296  case AArch64::LD1RB_H_IMM:
2297  case AArch64::LD1RB_S_IMM:
2298  case AArch64::LD1RB_D_IMM:
2299  case AArch64::LD1RSB_H_IMM:
2300  case AArch64::LD1RSB_S_IMM:
2301  case AArch64::LD1RSB_D_IMM:
2302  case AArch64::LD1RH_IMM:
2303  case AArch64::LD1RH_S_IMM:
2304  case AArch64::LD1RH_D_IMM:
2305  case AArch64::LD1RSH_S_IMM:
2306  case AArch64::LD1RSH_D_IMM:
2307  case AArch64::LD1RW_IMM:
2308  case AArch64::LD1RW_D_IMM:
2309  case AArch64::LD1RSW_IMM:
2310  case AArch64::LD1RD_IMM:
2311 
2312  case AArch64::LDNT1B_ZRI:
2313  case AArch64::LDNT1H_ZRI:
2314  case AArch64::LDNT1W_ZRI:
2315  case AArch64::LDNT1D_ZRI:
2316  case AArch64::STNT1B_ZRI:
2317  case AArch64::STNT1H_ZRI:
2318  case AArch64::STNT1W_ZRI:
2319  case AArch64::STNT1D_ZRI:
2320 
2321  case AArch64::LDNF1B_IMM:
2322  case AArch64::LDNF1B_H_IMM:
2323  case AArch64::LDNF1B_S_IMM:
2324  case AArch64::LDNF1B_D_IMM:
2325  case AArch64::LDNF1SB_H_IMM:
2326  case AArch64::LDNF1SB_S_IMM:
2327  case AArch64::LDNF1SB_D_IMM:
2328  case AArch64::LDNF1H_IMM:
2329  case AArch64::LDNF1H_S_IMM:
2330  case AArch64::LDNF1H_D_IMM:
2331  case AArch64::LDNF1SH_S_IMM:
2332  case AArch64::LDNF1SH_D_IMM:
2333  case AArch64::LDNF1W_IMM:
2334  case AArch64::LDNF1W_D_IMM:
2335  case AArch64::LDNF1SW_D_IMM:
2336  case AArch64::LDNF1D_IMM:
2337  return 3;
2338  case AArch64::ADDG:
2339  case AArch64::STGOffset:
2340  case AArch64::LDR_PXI:
2341  case AArch64::STR_PXI:
2342  return 2;
2343  }
2344 }
2345 
2347  switch (MI.getOpcode()) {
2348  default:
2349  return false;
2350  // Scaled instructions.
2351  case AArch64::STRSui:
2352  case AArch64::STRDui:
2353  case AArch64::STRQui:
2354  case AArch64::STRXui:
2355  case AArch64::STRWui:
2356  case AArch64::LDRSui:
2357  case AArch64::LDRDui:
2358  case AArch64::LDRQui:
2359  case AArch64::LDRXui:
2360  case AArch64::LDRWui:
2361  case AArch64::LDRSWui:
2362  // Unscaled instructions.
2363  case AArch64::STURSi:
2364  case AArch64::STRSpre:
2365  case AArch64::STURDi:
2366  case AArch64::STRDpre:
2367  case AArch64::STURQi:
2368  case AArch64::STRQpre:
2369  case AArch64::STURWi:
2370  case AArch64::STRWpre:
2371  case AArch64::STURXi:
2372  case AArch64::STRXpre:
2373  case AArch64::LDURSi:
2374  case AArch64::LDRSpre:
2375  case AArch64::LDURDi:
2376  case AArch64::LDRDpre:
2377  case AArch64::LDURQi:
2378  case AArch64::LDRQpre:
2379  case AArch64::LDURWi:
2380  case AArch64::LDRWpre:
2381  case AArch64::LDURXi:
2382  case AArch64::LDRXpre:
2383  case AArch64::LDURSWi:
2384  return true;
2385  }
2386 }
2387 
2389  bool &Is64Bit) {
2390  switch (Opc) {
2391  default:
2392  llvm_unreachable("Opcode has no flag setting equivalent!");
2393  // 32-bit cases:
2394  case AArch64::ADDWri:
2395  Is64Bit = false;
2396  return AArch64::ADDSWri;
2397  case AArch64::ADDWrr:
2398  Is64Bit = false;
2399  return AArch64::ADDSWrr;
2400  case AArch64::ADDWrs:
2401  Is64Bit = false;
2402  return AArch64::ADDSWrs;
2403  case AArch64::ADDWrx:
2404  Is64Bit = false;
2405  return AArch64::ADDSWrx;
2406  case AArch64::ANDWri:
2407  Is64Bit = false;
2408  return AArch64::ANDSWri;
2409  case AArch64::ANDWrr:
2410  Is64Bit = false;
2411  return AArch64::ANDSWrr;
2412  case AArch64::ANDWrs:
2413  Is64Bit = false;
2414  return AArch64::ANDSWrs;
2415  case AArch64::BICWrr:
2416  Is64Bit = false;
2417  return AArch64::BICSWrr;
2418  case AArch64::BICWrs:
2419  Is64Bit = false;
2420  return AArch64::BICSWrs;
2421  case AArch64::SUBWri:
2422  Is64Bit = false;
2423  return AArch64::SUBSWri;
2424  case AArch64::SUBWrr:
2425  Is64Bit = false;
2426  return AArch64::SUBSWrr;
2427  case AArch64::SUBWrs:
2428  Is64Bit = false;
2429  return AArch64::SUBSWrs;
2430  case AArch64::SUBWrx:
2431  Is64Bit = false;
2432  return AArch64::SUBSWrx;
2433  // 64-bit cases:
2434  case AArch64::ADDXri:
2435  Is64Bit = true;
2436  return AArch64::ADDSXri;
2437  case AArch64::ADDXrr:
2438  Is64Bit = true;
2439  return AArch64::ADDSXrr;
2440  case AArch64::ADDXrs:
2441  Is64Bit = true;
2442  return AArch64::ADDSXrs;
2443  case AArch64::ADDXrx:
2444  Is64Bit = true;
2445  return AArch64::ADDSXrx;
2446  case AArch64::ANDXri:
2447  Is64Bit = true;
2448  return AArch64::ANDSXri;
2449  case AArch64::ANDXrr:
2450  Is64Bit = true;
2451  return AArch64::ANDSXrr;
2452  case AArch64::ANDXrs:
2453  Is64Bit = true;
2454  return AArch64::ANDSXrs;
2455  case AArch64::BICXrr:
2456  Is64Bit = true;
2457  return AArch64::BICSXrr;
2458  case AArch64::BICXrs:
2459  Is64Bit = true;
2460  return AArch64::BICSXrs;
2461  case AArch64::SUBXri:
2462  Is64Bit = true;
2463  return AArch64::SUBSXri;
2464  case AArch64::SUBXrr:
2465  Is64Bit = true;
2466  return AArch64::SUBSXrr;
2467  case AArch64::SUBXrs:
2468  Is64Bit = true;
2469  return AArch64::SUBSXrs;
2470  case AArch64::SUBXrx:
2471  Is64Bit = true;
2472  return AArch64::SUBSXrx;
2473  }
2474 }
2475 
2476 // Is this a candidate for ld/st merging or pairing? For example, we don't
2477 // touch volatiles or load/stores that have a hint to avoid pair formation.
2478 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2479 
2480  bool IsPreLdSt = isPreLdSt(MI);
2481 
2482  // If this is a volatile load/store, don't mess with it.
2483  if (MI.hasOrderedMemoryRef())
2484  return false;
2485 
2486  // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2487  // For Pre-inc LD/ST, the operand is shifted by one.
2488  assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2489  MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2490  "Expected a reg or frame index operand.");
2491 
2492  // For Pre-indexed addressing quadword instructions, the third operand is the
2493  // immediate value.
2494  bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2495 
2496  if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2497  return false;
2498 
2499  // Can't merge/pair if the instruction modifies the base register.
2500  // e.g., ldr x0, [x0]
2501  // This case will never occur with an FI base.
2502  // However, if the instruction is an LDR/STR<S,D,Q,W,X>pre, it can be merged.
2503  // For example:
2504  // ldr q0, [x11, #32]!
2505  // ldr q1, [x11, #16]
2506  // to
2507  // ldp q0, q1, [x11, #32]!
2508  if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2509  Register BaseReg = MI.getOperand(1).getReg();
2511  if (MI.modifiesRegister(BaseReg, TRI))
2512  return false;
2513  }
2514 
2515  // Check if this load/store has a hint to avoid pair formation.
2516  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2517  if (isLdStPairSuppressed(MI))
2518  return false;
2519 
2520  // Do not pair any callee-save store/reload instructions in the
2521  // prologue/epilogue if the CFI information encoded the operations as separate
2522  // instructions, as that would cause the actual prologue size to differ from
2523  // the prologue size recorded in the Windows CFI.
2524  const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2525  bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2526  MI.getMF()->getFunction().needsUnwindTableEntry();
2527  if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2528  MI.getFlag(MachineInstr::FrameDestroy)))
2529  return false;
2530 
2531  // On some CPUs quad load/store pairs are slower than two single load/stores.
2532  if (Subtarget.isPaired128Slow()) {
2533  switch (MI.getOpcode()) {
2534  default:
2535  break;
2536  case AArch64::LDURQi:
2537  case AArch64::STURQi:
2538  case AArch64::LDRQui:
2539  case AArch64::STRQui:
2540  return false;
2541  }
2542  }
2543 
2544  return true;
2545 }
2546 
2549  int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2550  const TargetRegisterInfo *TRI) const {
2551  if (!LdSt.mayLoadOrStore())
2552  return false;
2553 
2554  const MachineOperand *BaseOp;
2555  if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2556  Width, TRI))
2557  return false;
2558  BaseOps.push_back(BaseOp);
2559  return true;
2560 }
2561 
2564  const TargetRegisterInfo *TRI) const {
2565  const MachineOperand *Base; // Filled with the base operand of MI.
2566  int64_t Offset; // Filled with the offset of MI.
2567  bool OffsetIsScalable;
2568  if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2569  return None;
2570 
2571  if (!Base->isReg())
2572  return None;
2573  ExtAddrMode AM;
2574  AM.BaseReg = Base->getReg();
2575  AM.Displacement = Offset;
2576  AM.ScaledReg = 0;
2577  AM.Scale = 0;
2578  return AM;
2579 }
2580 
2582  const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
2583  bool &OffsetIsScalable, unsigned &Width,
2584  const TargetRegisterInfo *TRI) const {
2585  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2586  // Handle only loads/stores with base register followed by immediate offset.
2587  if (LdSt.getNumExplicitOperands() == 3) {
2588  // Non-paired instruction (e.g., ldr x1, [x0, #8]).
2589  if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
2590  !LdSt.getOperand(2).isImm())
2591  return false;
2592  } else if (LdSt.getNumExplicitOperands() == 4) {
2593  // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2594  if (!LdSt.getOperand(1).isReg() ||
2595  (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2596  !LdSt.getOperand(3).isImm())
2597  return false;
2598  } else
2599  return false;
2600 
2601  // Get the scaling factor for the instruction and set the width for the
2602  // instruction.
2603  TypeSize Scale(0U, false);
2604  int64_t Dummy1, Dummy2;
2605 
2606  // If this returns false, then it's an instruction we don't want to handle.
2607  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2608  return false;
2609 
2610  // Compute the offset. Offset is calculated as the immediate operand
2611  // multiplied by the scaling factor. Unscaled instructions have scaling factor
2612  // set to 1.
2613  if (LdSt.getNumExplicitOperands() == 3) {
2614  BaseOp = &LdSt.getOperand(1);
2615  Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
2616  } else {
2617  assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2618  BaseOp = &LdSt.getOperand(2);
2619  Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
2620  }
2621  OffsetIsScalable = Scale.isScalable();
2622 
2623  if (!BaseOp->isReg() && !BaseOp->isFI())
2624  return false;
2625 
2626  return true;
2627 }
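 // For example (illustrative, not from the upstream source): 'ldr x1, [x0, #16]'
 // is LDRXui with immediate operand 2 and Scale 8, so the function reports
 // BaseOp = x0, Offset = 16 and Width = 8; the unscaled form 'ldur x1, [x0, #3]'
 // is LDURXi with Scale 1 and reports Offset = 3. For SVE forms such as
 // LDR_ZXI the Scale is scalable, so OffsetIsScalable is set to true.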
2628 
2631  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2632  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2633  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2634  return OfsOp;
2635 }
2636 
2637 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
2638  unsigned &Width, int64_t &MinOffset,
2639  int64_t &MaxOffset) {
2640  const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
2641  switch (Opcode) {
2642  // Not a memory operation or something we want to handle.
2643  default:
2644  Scale = TypeSize::Fixed(0);
2645  Width = 0;
2646  MinOffset = MaxOffset = 0;
2647  return false;
2648  case AArch64::STRWpost:
2649  case AArch64::LDRWpost:
2650  Width = 32;
2651  Scale = TypeSize::Fixed(4);
2652  MinOffset = -256;
2653  MaxOffset = 255;
2654  break;
2655  case AArch64::LDURQi:
2656  case AArch64::STURQi:
2657  Width = 16;
2658  Scale = TypeSize::Fixed(1);
2659  MinOffset = -256;
2660  MaxOffset = 255;
2661  break;
2662  case AArch64::PRFUMi:
2663  case AArch64::LDURXi:
2664  case AArch64::LDURDi:
2665  case AArch64::STURXi:
2666  case AArch64::STURDi:
2667  Width = 8;
2668  Scale = TypeSize::Fixed(1);
2669  MinOffset = -256;
2670  MaxOffset = 255;
2671  break;
2672  case AArch64::LDURWi:
2673  case AArch64::LDURSi:
2674  case AArch64::LDURSWi:
2675  case AArch64::STURWi:
2676  case AArch64::STURSi:
2677  Width = 4;
2678  Scale = TypeSize::Fixed(1);
2679  MinOffset = -256;
2680  MaxOffset = 255;
2681  break;
2682  case AArch64::LDURHi:
2683  case AArch64::LDURHHi:
2684  case AArch64::LDURSHXi:
2685  case AArch64::LDURSHWi:
2686  case AArch64::STURHi:
2687  case AArch64::STURHHi:
2688  Width = 2;
2689  Scale = TypeSize::Fixed(1);
2690  MinOffset = -256;
2691  MaxOffset = 255;
2692  break;
2693  case AArch64::LDURBi:
2694  case AArch64::LDURBBi:
2695  case AArch64::LDURSBXi:
2696  case AArch64::LDURSBWi:
2697  case AArch64::STURBi:
2698  case AArch64::STURBBi:
2699  Width = 1;
2700  Scale = TypeSize::Fixed(1);
2701  MinOffset = -256;
2702  MaxOffset = 255;
2703  break;
2704  case AArch64::LDPQi:
2705  case AArch64::LDNPQi:
2706  case AArch64::STPQi:
2707  case AArch64::STNPQi:
2708  Scale = TypeSize::Fixed(16);
2709  Width = 32;
2710  MinOffset = -64;
2711  MaxOffset = 63;
2712  break;
2713  case AArch64::LDRQui:
2714  case AArch64::STRQui:
2715  Scale = TypeSize::Fixed(16);
2716  Width = 16;
2717  MinOffset = 0;
2718  MaxOffset = 4095;
2719  break;
2720  case AArch64::LDPXi:
2721  case AArch64::LDPDi:
2722  case AArch64::LDNPXi:
2723  case AArch64::LDNPDi:
2724  case AArch64::STPXi:
2725  case AArch64::STPDi:
2726  case AArch64::STNPXi:
2727  case AArch64::STNPDi:
2728  Scale = TypeSize::Fixed(8);
2729  Width = 16;
2730  MinOffset = -64;
2731  MaxOffset = 63;
2732  break;
2733  case AArch64::PRFMui:
2734  case AArch64::LDRXui:
2735  case AArch64::LDRDui:
2736  case AArch64::STRXui:
2737  case AArch64::STRDui:
2738  Scale = TypeSize::Fixed(8);
2739  Width = 8;
2740  MinOffset = 0;
2741  MaxOffset = 4095;
2742  break;
2743  case AArch64::StoreSwiftAsyncContext:
2744  // Store is an STRXui, but there might be an ADDXri in the expansion too.
2745  Scale = TypeSize::Fixed(1);
2746  Width = 8;
2747  MinOffset = 0;
2748  MaxOffset = 4095;
2749  break;
2750  case AArch64::LDPWi:
2751  case AArch64::LDPSi:
2752  case AArch64::LDNPWi:
2753  case AArch64::LDNPSi:
2754  case AArch64::STPWi:
2755  case AArch64::STPSi:
2756  case AArch64::STNPWi:
2757  case AArch64::STNPSi:
2758  Scale = TypeSize::Fixed(4);
2759  Width = 8;
2760  MinOffset = -64;
2761  MaxOffset = 63;
2762  break;
2763  case AArch64::LDRWui:
2764  case AArch64::LDRSui:
2765  case AArch64::LDRSWui:
2766  case AArch64::STRWui:
2767  case AArch64::STRSui:
2768  Scale = TypeSize::Fixed(4);
2769  Width = 4;
2770  MinOffset = 0;
2771  MaxOffset = 4095;
2772  break;
2773  case AArch64::LDRHui:
2774  case AArch64::LDRHHui:
2775  case AArch64::LDRSHWui:
2776  case AArch64::LDRSHXui:
2777  case AArch64::STRHui:
2778  case AArch64::STRHHui:
2779  Scale = TypeSize::Fixed(2);
2780  Width = 2;
2781  MinOffset = 0;
2782  MaxOffset = 4095;
2783  break;
2784  case AArch64::LDRBui:
2785  case AArch64::LDRBBui:
2786  case AArch64::LDRSBWui:
2787  case AArch64::LDRSBXui:
2788  case AArch64::STRBui:
2789  case AArch64::STRBBui:
2790  Scale = TypeSize::Fixed(1);
2791  Width = 1;
2792  MinOffset = 0;
2793  MaxOffset = 4095;
2794  break;
2795  case AArch64::STPXpre:
2796  case AArch64::LDPXpost:
2797  case AArch64::STPDpre:
2798  case AArch64::LDPDpost:
2799  Scale = TypeSize::Fixed(8);
2800  Width = 8;
2801  MinOffset = -512;
2802  MaxOffset = 504;
2803  break;
2804  case AArch64::STPQpre:
2805  case AArch64::LDPQpost:
2806  Scale = TypeSize::Fixed(16);
2807  Width = 16;
2808  MinOffset = -1024;
2809  MaxOffset = 1008;
2810  break;
2811  case AArch64::STRXpre:
2812  case AArch64::STRDpre:
2813  case AArch64::LDRXpost:
2814  case AArch64::LDRDpost:
2815  Scale = TypeSize::Fixed(1);
2816  Width = 8;
2817  MinOffset = -256;
2818  MaxOffset = 255;
2819  break;
2820  case AArch64::STRQpre:
2821  case AArch64::LDRQpost:
2822  Scale = TypeSize::Fixed(1);
2823  Width = 16;
2824  MinOffset = -256;
2825  MaxOffset = 255;
2826  break;
2827  case AArch64::ADDG:
2828  Scale = TypeSize::Fixed(16);
2829  Width = 0;
2830  MinOffset = 0;
2831  MaxOffset = 63;
2832  break;
2833  case AArch64::TAGPstack:
2834  Scale = TypeSize::Fixed(16);
2835  Width = 0;
2836  // TAGP with a negative offset turns into SUBP, which has a maximum offset
2837  // of 63 (not 64!).
2838  MinOffset = -63;
2839  MaxOffset = 63;
2840  break;
2841  case AArch64::LDG:
2842  case AArch64::STGOffset:
2843  case AArch64::STZGOffset:
2844  Scale = TypeSize::Fixed(16);
2845  Width = 16;
2846  MinOffset = -256;
2847  MaxOffset = 255;
2848  break;
2849  case AArch64::STR_ZZZZXI:
2850  case AArch64::LDR_ZZZZXI:
2851  Scale = TypeSize::Scalable(16);
2852  Width = SVEMaxBytesPerVector * 4;
2853  MinOffset = -256;
2854  MaxOffset = 252;
2855  break;
2856  case AArch64::STR_ZZZXI:
2857  case AArch64::LDR_ZZZXI:
2858  Scale = TypeSize::Scalable(16);
2859  Width = SVEMaxBytesPerVector * 3;
2860  MinOffset = -256;
2861  MaxOffset = 253;
2862  break;
2863  case AArch64::STR_ZZXI:
2864  case AArch64::LDR_ZZXI:
2865  Scale = TypeSize::Scalable(16);
2866  Width = SVEMaxBytesPerVector * 2;
2867  MinOffset = -256;
2868  MaxOffset = 254;
2869  break;
2870  case AArch64::LDR_PXI:
2871  case AArch64::STR_PXI:
2872  Scale = TypeSize::Scalable(2);
2873  Width = SVEMaxBytesPerVector / 8;
2874  MinOffset = -256;
2875  MaxOffset = 255;
2876  break;
2877  case AArch64::LDR_ZXI:
2878  case AArch64::STR_ZXI:
2879  Scale = TypeSize::Scalable(16);
2880  Width = SVEMaxBytesPerVector;
2881  MinOffset = -256;
2882  MaxOffset = 255;
2883  break;
2884  case AArch64::LD1B_IMM:
2885  case AArch64::LD1H_IMM:
2886  case AArch64::LD1W_IMM:
2887  case AArch64::LD1D_IMM:
2888  case AArch64::LDNT1B_ZRI:
2889  case AArch64::LDNT1H_ZRI:
2890  case AArch64::LDNT1W_ZRI:
2891  case AArch64::LDNT1D_ZRI:
2892  case AArch64::ST1B_IMM:
2893  case AArch64::ST1H_IMM:
2894  case AArch64::ST1W_IMM:
2895  case AArch64::ST1D_IMM:
2896  case AArch64::STNT1B_ZRI:
2897  case AArch64::STNT1H_ZRI:
2898  case AArch64::STNT1W_ZRI:
2899  case AArch64::STNT1D_ZRI:
2900  case AArch64::LDNF1B_IMM:
2901  case AArch64::LDNF1H_IMM:
2902  case AArch64::LDNF1W_IMM:
2903  case AArch64::LDNF1D_IMM:
2904  // A full vector's worth of data
2905  // Width = mbytes * elements
2906  Scale = TypeSize::Scalable(16);
2907  Width = SVEMaxBytesPerVector;
2908  MinOffset = -8;
2909  MaxOffset = 7;
2910  break;
2911  case AArch64::LD1B_H_IMM:
2912  case AArch64::LD1SB_H_IMM:
2913  case AArch64::LD1H_S_IMM:
2914  case AArch64::LD1SH_S_IMM:
2915  case AArch64::LD1W_D_IMM:
2916  case AArch64::LD1SW_D_IMM:
2917  case AArch64::ST1B_H_IMM:
2918  case AArch64::ST1H_S_IMM:
2919  case AArch64::ST1W_D_IMM:
2920  case AArch64::LDNF1B_H_IMM:
2921  case AArch64::LDNF1SB_H_IMM:
2922  case AArch64::LDNF1H_S_IMM:
2923  case AArch64::LDNF1SH_S_IMM:
2924  case AArch64::LDNF1W_D_IMM:
2925  case AArch64::LDNF1SW_D_IMM:
2926  // A half vector's worth of data
2927  // Width = mbytes * elements
2928  Scale = TypeSize::Scalable(8);
2929  Width = SVEMaxBytesPerVector / 2;
2930  MinOffset = -8;
2931  MaxOffset = 7;
2932  break;
2933  case AArch64::LD1B_S_IMM:
2934  case AArch64::LD1SB_S_IMM:
2935  case AArch64::LD1H_D_IMM:
2936  case AArch64::LD1SH_D_IMM:
2937  case AArch64::ST1B_S_IMM:
2938  case AArch64::ST1H_D_IMM:
2939  case AArch64::LDNF1B_S_IMM:
2940  case AArch64::LDNF1SB_S_IMM:
2941  case AArch64::LDNF1H_D_IMM:
2942  case AArch64::LDNF1SH_D_IMM:
2943  // A quarter vector's worth of data
2944  // Width = mbytes * elements
2945  Scale = TypeSize::Scalable(4);
2946  Width = SVEMaxBytesPerVector / 4;
2947  MinOffset = -8;
2948  MaxOffset = 7;
2949  break;
2950  case AArch64::LD1B_D_IMM:
2951  case AArch64::LD1SB_D_IMM:
2952  case AArch64::ST1B_D_IMM:
2953  case AArch64::LDNF1B_D_IMM:
2954  case AArch64::LDNF1SB_D_IMM:
2955  // An eighth vector's worth of data
2956  // Width = mbytes * elements
2957  Scale = TypeSize::Scalable(2);
2958  Width = SVEMaxBytesPerVector / 8;
2959  MinOffset = -8;
2960  MaxOffset = 7;
2961  break;
2962  case AArch64::ST2GOffset:
2963  case AArch64::STZ2GOffset:
2964  Scale = TypeSize::Fixed(16);
2965  Width = 32;
2966  MinOffset = -256;
2967  MaxOffset = 255;
2968  break;
2969  case AArch64::STGPi:
2970  Scale = TypeSize::Fixed(16);
2971  Width = 16;
2972  MinOffset = -64;
2973  MaxOffset = 63;
2974  break;
2975  case AArch64::LD1RB_IMM:
2976  case AArch64::LD1RB_H_IMM:
2977  case AArch64::LD1RB_S_IMM:
2978  case AArch64::LD1RB_D_IMM:
2979  case AArch64::LD1RSB_H_IMM:
2980  case AArch64::LD1RSB_S_IMM:
2981  case AArch64::LD1RSB_D_IMM:
2982  Scale = TypeSize::Fixed(1);
2983  Width = 1;
2984  MinOffset = 0;
2985  MaxOffset = 63;
2986  break;
2987  case AArch64::LD1RH_IMM:
2988  case AArch64::LD1RH_S_IMM:
2989  case AArch64::LD1RH_D_IMM:
2990  case AArch64::LD1RSH_S_IMM:
2991  case AArch64::LD1RSH_D_IMM:
2992  Scale = TypeSize::Fixed(2);
2993  Width = 2;
2994  MinOffset = 0;
2995  MaxOffset = 63;
2996  break;
2997  case AArch64::LD1RW_IMM:
2998  case AArch64::LD1RW_D_IMM:
2999  case AArch64::LD1RSW_IMM:
3000  Scale = TypeSize::Fixed(4);
3001  Width = 4;
3002  MinOffset = 0;
3003  MaxOffset = 63;
3004  break;
3005  case AArch64::LD1RD_IMM:
3006  Scale = TypeSize::Fixed(8);
3007  Width = 8;
3008  MinOffset = 0;
3009  MaxOffset = 63;
3010  break;
3011  }
3012 
3013  return true;
3014 }
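 // Reading note (illustrative, not from the upstream source): the
 // MinOffset/MaxOffset values above are in units of Scale, not bytes. For
 // example, LDRXui with Scale 8 and range [0, 4095] covers byte offsets
 // 0 .. 4095 * 8 = 32760, while LDPXi with Scale 8 and range [-64, 63] covers
 // byte offsets -512 .. 504.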
3015 
3016 // Scaling factor for unscaled load or store.
3018  switch (Opc) {
3019  default:
3020  llvm_unreachable("Opcode has unknown scale!");
3021  case AArch64::LDRBBui:
3022  case AArch64::LDURBBi:
3023  case AArch64::LDRSBWui:
3024  case AArch64::LDURSBWi:
3025  case AArch64::STRBBui:
3026  case AArch64::STURBBi:
3027  return 1;
3028  case AArch64::LDRHHui:
3029  case AArch64::LDURHHi:
3030  case AArch64::LDRSHWui:
3031  case AArch64::LDURSHWi:
3032  case AArch64::STRHHui:
3033  case AArch64::STURHHi:
3034  return 2;
3035  case AArch64::LDRSui:
3036  case AArch64::LDURSi:
3037  case AArch64::LDRSpre:
3038  case AArch64::LDRSWui:
3039  case AArch64::LDURSWi:
3040  case AArch64::LDRWpre:
3041  case AArch64::LDRWui:
3042  case AArch64::LDURWi:
3043  case AArch64::STRSui:
3044  case AArch64::STURSi:
3045  case AArch64::STRSpre:
3046  case AArch64::STRWui:
3047  case AArch64::STURWi:
3048  case AArch64::STRWpre:
3049  case AArch64::LDPSi:
3050  case AArch64::LDPSWi:
3051  case AArch64::LDPWi:
3052  case AArch64::STPSi:
3053  case AArch64::STPWi:
3054  return 4;
3055  case AArch64::LDRDui:
3056  case AArch64::LDURDi:
3057  case AArch64::LDRDpre:
3058  case AArch64::LDRXui:
3059  case AArch64::LDURXi:
3060  case AArch64::LDRXpre:
3061  case AArch64::STRDui:
3062  case AArch64::STURDi:
3063  case AArch64::STRDpre:
3064  case AArch64::STRXui:
3065  case AArch64::STURXi:
3066  case AArch64::STRXpre:
3067  case AArch64::LDPDi:
3068  case AArch64::LDPXi:
3069  case AArch64::STPDi:
3070  case AArch64::STPXi:
3071  return 8;
3072  case AArch64::LDRQui:
3073  case AArch64::LDURQi:
3074  case AArch64::STRQui:
3075  case AArch64::STURQi:
3076  case AArch64::STRQpre:
3077  case AArch64::LDPQi:
3078  case AArch64::LDRQpre:
3079  case AArch64::STPQi:
3080  case AArch64::STGOffset:
3081  case AArch64::STZGOffset:
3082  case AArch64::ST2GOffset:
3083  case AArch64::STZ2GOffset:
3084  case AArch64::STGPi:
3085  return 16;
3086  }
3087 }
3088 
3090  switch (MI.getOpcode()) {
3091  default:
3092  return false;
3093  case AArch64::LDRWpre:
3094  case AArch64::LDRXpre:
3095  case AArch64::LDRSpre:
3096  case AArch64::LDRDpre:
3097  case AArch64::LDRQpre:
3098  return true;
3099  }
3100 }
3101 
3103  switch (MI.getOpcode()) {
3104  default:
3105  return false;
3106  case AArch64::STRWpre:
3107  case AArch64::STRXpre:
3108  case AArch64::STRSpre:
3109  case AArch64::STRDpre:
3110  case AArch64::STRQpre:
3111  return true;
3112  }
3113 }
3114 
3116  return isPreLd(MI) || isPreSt(MI);
3117 }
3118 
3119 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
3120 // scaled.
3121 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
3122  int Scale = AArch64InstrInfo::getMemScale(Opc);
3123 
3124  // If the byte-offset isn't a multiple of the stride, we can't scale this
3125  // offset.
3126  if (Offset % Scale != 0)
3127  return false;
3128 
3129  // Convert the byte-offset used by unscaled into an "element" offset used
3130  // by the scaled pair load/store instructions.
3131  Offset /= Scale;
3132  return true;
3133 }
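 // For instance (illustrative): an LDURXi with byte offset 16 has
 // getMemScale() == 8 and 16 % 8 == 0, so it is rewritten to the element
 // offset 2, which a paired LDPXi can encode; a byte offset of 12 is rejected
 // because it is not a multiple of the 8-byte stride.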
3134 
3135 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
3136  if (FirstOpc == SecondOpc)
3137  return true;
3138  // We can also pair sign-ext and zero-ext instructions.
3139  switch (FirstOpc) {
3140  default:
3141  return false;
3142  case AArch64::LDRWui:
3143  case AArch64::LDURWi:
3144  return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
3145  case AArch64::LDRSWui:
3146  case AArch64::LDURSWi:
3147  return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
3148  }
3149  // These instructions can't be paired based on their opcodes.
3150  return false;
3151 }
3152 
3153 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
3154  int64_t Offset1, unsigned Opcode1, int FI2,
3155  int64_t Offset2, unsigned Opcode2) {
3156  // Accesses through fixed stack object frame indices may access a different
3157  // fixed stack slot. Check that the object offsets + offsets match.
3158  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
3159  int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
3160  int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
3161  assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
3162  // Convert to scaled object offsets.
3163  int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
3164  if (ObjectOffset1 % Scale1 != 0)
3165  return false;
3166  ObjectOffset1 /= Scale1;
3167  int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
3168  if (ObjectOffset2 % Scale2 != 0)
3169  return false;
3170  ObjectOffset2 /= Scale2;
3171  ObjectOffset1 += Offset1;
3172  ObjectOffset2 += Offset2;
3173  return ObjectOffset1 + 1 == ObjectOffset2;
3174  }
3175 
3176  return FI1 == FI2;
3177 }
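 // Worked example with hypothetical fixed objects (not from the upstream
 // source): FI1 at object offset -16 and FI2 at -8, both accessed by LDRXui
 // (scale 8) with instruction offsets of 0, give scaled object offsets of -2
 // and -1; since -2 + 0 + 1 == -1 + 0, the two slots are adjacent and the
 // accesses may be clustered.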
3178 
3179 /// Detect opportunities for ldp/stp formation.
3180 ///
3181 /// Only called for LdSt for which getMemOperandWithOffset returns true.
3184  ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
3185  unsigned NumBytes) const {
3186  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
3187  const MachineOperand &BaseOp1 = *BaseOps1.front();
3188  const MachineOperand &BaseOp2 = *BaseOps2.front();
3189  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
3190  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
3191  if (BaseOp1.getType() != BaseOp2.getType())
3192  return false;
3193 
3194  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
3195  "Only base registers and frame indices are supported.");
3196 
3197  // Check for both base regs and base FI.
3198  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
3199  return false;
3200 
3201  // Only cluster up to a single pair.
3202  if (NumLoads > 2)
3203  return false;
3204 
3205  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
3206  return false;
3207 
3208  // Can we pair these instructions based on their opcodes?
3209  unsigned FirstOpc = FirstLdSt.getOpcode();
3210  unsigned SecondOpc = SecondLdSt.getOpcode();
3211  if (!canPairLdStOpc(FirstOpc, SecondOpc))
3212  return false;
3213 
3214  // Can't merge volatiles or load/stores that have a hint to avoid pair
3215  // formation, for example.
3216  if (!isCandidateToMergeOrPair(FirstLdSt) ||
3217  !isCandidateToMergeOrPair(SecondLdSt))
3218  return false;
3219 
3220  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
3221  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
3222  if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
3223  return false;
3224 
3225  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
3226  if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
3227  return false;
3228 
3229  // Pairwise instructions have a 7-bit signed offset field.
3230  if (Offset1 > 63 || Offset1 < -64)
3231  return false;
3232 
3233  // The caller should already have ordered First/SecondLdSt by offset.
3234  // Note: except for non-equal frame index bases
3235  if (BaseOp1.isFI()) {
3236  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
3237  "Caller should have ordered offsets.");
3238 
3239  const MachineFrameInfo &MFI =
3240  FirstLdSt.getParent()->getParent()->getFrameInfo();
3241  return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
3242  BaseOp2.getIndex(), Offset2, SecondOpc);
3243  }
3244 
3245  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
3246 
3247  return Offset1 + 1 == Offset2;
3248 }
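 // Illustrative example (not from the upstream source): for
 //   ldr x2, [x10, #8]
 //   ldr x3, [x10, #16]
 // both are scaled LDRXui accesses with offsets 1 and 2 (in 8-byte units), the
 // bases match and the offsets are adjacent, so the hook returns true and the
 // pair may later be rewritten as 'ldp x2, x3, [x10, #8]' by the load/store
 // optimizer.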
3249 
3251  unsigned Reg, unsigned SubIdx,
3252  unsigned State,
3253  const TargetRegisterInfo *TRI) {
3254  if (!SubIdx)
3255  return MIB.addReg(Reg, State);
3256 
3258  return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
3259  return MIB.addReg(Reg, State, SubIdx);
3260 }
3261 
3262 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
3263  unsigned NumRegs) {
3264  // We really want the positive remainder mod 32 here, which happens to be
3265  // easily obtainable with a mask.
3266  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
3267 }
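 // Example (illustrative): copying the D-register triple d1_d2_d3 into
 // d2_d3_d4 gives (2 - 1) & 0x1f == 1 < 3, so a front-to-back sub-register
 // copy would overwrite d2 and d3 before they are read as sources;
 // copyPhysRegTuple therefore walks the sub-registers in reverse when this
 // predicate returns true.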
3268 
3271  const DebugLoc &DL, MCRegister DestReg,
3272  MCRegister SrcReg, bool KillSrc,
3273  unsigned Opcode,
3274  ArrayRef<unsigned> Indices) const {
3275  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
3277  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
3278  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
3279  unsigned NumRegs = Indices.size();
3280 
3281  int SubReg = 0, End = NumRegs, Incr = 1;
3282  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
3283  SubReg = NumRegs - 1;
3284  End = -1;
3285  Incr = -1;
3286  }
3287 
3288  for (; SubReg != End; SubReg += Incr) {
3289  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
3290  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
3291  AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
3292  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
3293  }
3294 }
3295 
3298  DebugLoc DL, unsigned DestReg,
3299  unsigned SrcReg, bool KillSrc,
3300  unsigned Opcode, unsigned ZeroReg,
3301  llvm::ArrayRef<unsigned> Indices) const {
3303  unsigned NumRegs = Indices.size();
3304 
3305 #ifndef NDEBUG
3306  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
3307  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
3308  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
3309  "GPR reg sequences should not be able to overlap");
3310 #endif
3311 
3312  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
3313  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
3314  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
3315  MIB.addReg(ZeroReg);
3316  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
3317  MIB.addImm(0);
3318  }
3319 }
3320 
3323  const DebugLoc &DL, MCRegister DestReg,
3324  MCRegister SrcReg, bool KillSrc) const {
3325  if (AArch64::GPR32spRegClass.contains(DestReg) &&
3326  (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
3328 
3329  if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
3330  // If either operand is WSP, expand to ADD #0.
3331  if (Subtarget.hasZeroCycleRegMove()) {
3332  // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
3333  MCRegister DestRegX = TRI->getMatchingSuperReg(
3334  DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3335  MCRegister SrcRegX = TRI->getMatchingSuperReg(
3336  SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3337  // This instruction is reading and writing X registers. This may upset
3338  // the register scavenger and machine verifier, so we need to indicate
3339  // that we are reading an undefined value from SrcRegX, but a proper
3340  // value from SrcReg.
3341  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
3342  .addReg(SrcRegX, RegState::Undef)
3343  .addImm(0)
3345  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
3346  } else {
3347  BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
3348  .addReg(SrcReg, getKillRegState(KillSrc))
3349  .addImm(0)
3351  }
3352  } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
3353  BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
3354  .addImm(0)
3356  } else {
3357  if (Subtarget.hasZeroCycleRegMove()) {
3358  // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
3359  MCRegister DestRegX = TRI->getMatchingSuperReg(
3360  DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3361  MCRegister SrcRegX = TRI->getMatchingSuperReg(
3362  SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3363  // This instruction is reading and writing X registers. This may upset
3364  // the register scavenger and machine verifier, so we need to indicate
3365  // that we are reading an undefined value from SrcRegX, but a proper
3366  // value from SrcReg.
3367  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
3368  .addReg(AArch64::XZR)
3369  .addReg(SrcRegX, RegState::Undef)
3370  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
3371  } else {
3372  // Otherwise, expand to ORR WZR.
3373  BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
3374  .addReg(AArch64::WZR)
3375  .addReg(SrcReg, getKillRegState(KillSrc));
3376  }
3377  }
3378  return;
3379  }
3380 
3381  // Copy a Predicate register by ORRing with itself.
3382  if (AArch64::PPRRegClass.contains(DestReg) &&
3383  AArch64::PPRRegClass.contains(SrcReg)) {
3384  assert(Subtarget.hasSVE() && "Unexpected SVE register.");
3385  BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
3386  .addReg(SrcReg) // Pg
3387  .addReg(SrcReg)
3388  .addReg(SrcReg, getKillRegState(KillSrc));
3389  return;
3390  }
3391 
3392  // Copy a Z register by ORRing with itself.
3393  if (AArch64::ZPRRegClass.contains(DestReg) &&
3394  AArch64::ZPRRegClass.contains(SrcReg)) {
3395  assert(Subtarget.hasSVE() && "Unexpected SVE register.");
3396  BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
3397  .addReg(SrcReg)
3398  .addReg(SrcReg, getKillRegState(KillSrc));
3399  return;
3400  }
3401 
3402  // Copy a Z register pair by copying the individual sub-registers.
3403  if (AArch64::ZPR2RegClass.contains(DestReg) &&
3404  AArch64::ZPR2RegClass.contains(SrcReg)) {
3405  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
3406  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3407  Indices);
3408  return;
3409  }
3410 
3411  // Copy a Z register triple by copying the individual sub-registers.
3412  if (AArch64::ZPR3RegClass.contains(DestReg) &&
3413  AArch64::ZPR3RegClass.contains(SrcReg)) {
3414  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
3415  AArch64::zsub2};
3416  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3417  Indices);
3418  return;
3419  }
3420 
3421  // Copy a Z register quad by copying the individual sub-registers.
3422  if (AArch64::ZPR4RegClass.contains(DestReg) &&
3423  AArch64::ZPR4RegClass.contains(SrcReg)) {
3424  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
3425  AArch64::zsub2, AArch64::zsub3};
3426  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3427  Indices);
3428  return;
3429  }
3430 
3431  if (AArch64::GPR64spRegClass.contains(DestReg) &&
3432  (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
3433  if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
3434  // If either operand is SP, expand to ADD #0.
3435  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
3436  .addReg(SrcReg, getKillRegState(KillSrc))
3437  .addImm(0)
3439  } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
3440  BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
3441  .addImm(0)
3443  } else {
3444  // Otherwise, expand to ORR XZR.
3445  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
3446  .addReg(AArch64::XZR)
3447  .addReg(SrcReg, getKillRegState(KillSrc));
3448  }
3449  return;
3450  }
3451 
3452  // Copy a DDDD register quad by copying the individual sub-registers.
3453  if (AArch64::DDDDRegClass.contains(DestReg) &&
3454  AArch64::DDDDRegClass.contains(SrcReg)) {
3455  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
3456  AArch64::dsub2, AArch64::dsub3};
3457  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3458  Indices);
3459  return;
3460  }
3461 
3462  // Copy a DDD register triple by copying the individual sub-registers.
3463  if (AArch64::DDDRegClass.contains(DestReg) &&
3464  AArch64::DDDRegClass.contains(SrcReg)) {
3465  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
3466  AArch64::dsub2};
3467  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3468  Indices);
3469  return;
3470  }
3471 
3472  // Copy a DD register pair by copying the individual sub-registers.
3473  if (AArch64::DDRegClass.contains(DestReg) &&
3474  AArch64::DDRegClass.contains(SrcReg)) {
3475  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
3476  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3477  Indices);
3478  return;
3479  }
3480 
3481  // Copy a QQQQ register quad by copying the individual sub-registers.
3482  if (AArch64::QQQQRegClass.contains(DestReg) &&
3483  AArch64::QQQQRegClass.contains(SrcReg)) {
3484  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
3485  AArch64::qsub2, AArch64::qsub3};
3486  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3487  Indices);
3488  return;
3489  }
3490 
3491  // Copy a QQQ register triple by copying the individual sub-registers.
3492  if (AArch64::QQQRegClass.contains(DestReg) &&
3493  AArch64::QQQRegClass.contains(SrcReg)) {
3494  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
3495  AArch64::qsub2};
3496  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3497  Indices);
3498  return;
3499  }
3500 
3501  // Copy a QQ register pair by copying the individual sub-registers.
3502  if (AArch64::QQRegClass.contains(DestReg) &&
3503  AArch64::QQRegClass.contains(SrcReg)) {
3504  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
3505  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3506  Indices);
3507  return;
3508  }
3509 
3510  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
3511  AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
3512  static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
3513  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
3514  AArch64::XZR, Indices);
3515  return;
3516  }
3517 
3518  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
3519  AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
3520  static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
3521  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
3522  AArch64::WZR, Indices);
3523  return;
3524  }
3525 
3526  if (AArch64::FPR128RegClass.contains(DestReg) &&
3527  AArch64::FPR128RegClass.contains(SrcReg)) {
3528  if (Subtarget.hasNEON()) {
3529  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3530  .addReg(SrcReg)
3531  .addReg(SrcReg, getKillRegState(KillSrc));
3532  } else {
3533  BuildMI(MBB, I, DL, get(AArch64::STRQpre))
3534  .addReg(AArch64::SP, RegState::Define)
3535  .addReg(SrcReg, getKillRegState(KillSrc))
3536  .addReg(AArch64::SP)
3537  .addImm(-16);
3538  BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
3539  .addReg(AArch64::SP, RegState::Define)
3540  .addReg(DestReg, RegState::Define)
3541  .addReg(AArch64::SP)
3542  .addImm(16);
3543  }
3544  return;
3545  }
3546 
3547  if (AArch64::FPR64RegClass.contains(DestReg) &&
3548  AArch64::FPR64RegClass.contains(SrcReg)) {
3549  BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
3550  .addReg(SrcReg, getKillRegState(KillSrc));
3551  return;
3552  }
3553 
3554  if (AArch64::FPR32RegClass.contains(DestReg) &&
3555  AArch64::FPR32RegClass.contains(SrcReg)) {
3556  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3557  .addReg(SrcReg, getKillRegState(KillSrc));
3558  return;
3559  }
3560 
3561  if (AArch64::FPR16RegClass.contains(DestReg) &&
3562  AArch64::FPR16RegClass.contains(SrcReg)) {
3563  DestReg =
3564  RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
3565  SrcReg =
3566  RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
3567  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3568  .addReg(SrcReg, getKillRegState(KillSrc));
3569  return;
3570  }
3571 
3572  if (AArch64::FPR8RegClass.contains(DestReg) &&
3573  AArch64::FPR8RegClass.contains(SrcReg)) {
3574  DestReg =
3575  RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
3576  SrcReg =
3577  RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
3578  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3579  .addReg(SrcReg, getKillRegState(KillSrc));
3580  return;
3581  }
3582 
3583  // Copies between GPR64 and FPR64.
3584  if (AArch64::FPR64RegClass.contains(DestReg) &&
3585  AArch64::GPR64RegClass.contains(SrcReg)) {
3586  BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
3587  .addReg(SrcReg, getKillRegState(KillSrc));
3588  return;
3589  }
3590  if (AArch64::GPR64RegClass.contains(DestReg) &&
3591  AArch64::FPR64RegClass.contains(SrcReg)) {
3592  BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
3593  .addReg(SrcReg, getKillRegState(KillSrc));
3594  return;
3595  }
3596  // Copies between GPR32 and FPR32.
3597  if (AArch64::FPR32RegClass.contains(DestReg) &&
3598  AArch64::GPR32RegClass.contains(SrcReg)) {
3599  BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
3600  .addReg(SrcReg, getKillRegState(KillSrc));
3601  return;
3602  }
3603  if (AArch64::GPR32RegClass.contains(DestReg) &&
3604  AArch64::FPR32RegClass.contains(SrcReg)) {
3605  BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
3606  .addReg(SrcReg, getKillRegState(KillSrc));
3607  return;
3608  }
3609 
3610  if (DestReg == AArch64::NZCV) {
3611  assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
3612  BuildMI(MBB, I, DL, get(AArch64::MSR))
3613  .addImm(AArch64SysReg::NZCV)
3614  .addReg(SrcReg, getKillRegState(KillSrc))
3615  .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
3616  return;
3617  }
3618 
3619  if (SrcReg == AArch64::NZCV) {
3620  assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
3621  BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
3622  .addImm(AArch64SysReg::NZCV)
3623  .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
3624  return;
3625  }
3626 
3627 #ifndef NDEBUG
3628  const TargetRegisterInfo &TRI = getRegisterInfo();
3629  errs() << TRI.getRegAsmName(DestReg) << " = COPY "
3630  << TRI.getRegAsmName(SrcReg) << "\n";
3631 #endif
3632  llvm_unreachable("unimplemented reg-to-reg copy");
3633 }
3634 
3635 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
3636  MachineBasicBlock &MBB,
3637  MachineBasicBlock::iterator InsertBefore,
3638  const MCInstrDesc &MCID,
3639  Register SrcReg, bool IsKill,
3640  unsigned SubIdx0, unsigned SubIdx1, int FI,
3641  MachineMemOperand *MMO) {
3642  Register SrcReg0 = SrcReg;
3643  Register SrcReg1 = SrcReg;
3644  if (Register::isPhysicalRegister(SrcReg)) {
3645  SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
3646  SubIdx0 = 0;
3647  SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
3648  SubIdx1 = 0;
3649  }
3650  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3651  .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
3652  .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
3653  .addFrameIndex(FI)
3654  .addImm(0)
3655  .addMemOperand(MMO);
3656 }
3657 
3658 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
3659  MachineBasicBlock::iterator MBBI, Register SrcReg,
3660  bool isKill, int FI, const TargetRegisterClass *RC,
3661  const TargetRegisterInfo *TRI) const {
3662  MachineFunction &MF = *MBB.getParent();
3663  MachineFrameInfo &MFI = MF.getFrameInfo();
3664 
3665  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3666  MachineMemOperand *MMO =
3667  MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
3668  MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3669  unsigned Opc = 0;
3670  bool Offset = true;
3671  unsigned StackID = TargetStackID::Default;
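  // The store opcode below is chosen from the register class's spill size.
  // SVE classes also switch the frame index to the ScalableVector stack ID,
  // and the NEON tuple stores (ST1*) take no immediate offset (Offset = false).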
3672  switch (TRI->getSpillSize(*RC)) {
3673  case 1:
3674  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3675  Opc = AArch64::STRBui;
3676  break;
3677  case 2:
3678  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3679  Opc = AArch64::STRHui;
3680  else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3681  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3682  Opc = AArch64::STR_PXI;
3683  StackID = TargetStackID::ScalableVector;
3684  }
3685  break;
3686  case 4:
3687  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3688  Opc = AArch64::STRWui;
3689  if (Register::isVirtualRegister(SrcReg))
3690  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
3691  else
3692  assert(SrcReg != AArch64::WSP);
3693  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3694  Opc = AArch64::STRSui;
3695  break;
3696  case 8:
3697  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3698  Opc = AArch64::STRXui;
3699  if (Register::isVirtualRegister(SrcReg))
3700  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3701  else
3702  assert(SrcReg != AArch64::SP);
3703  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3704  Opc = AArch64::STRDui;
3705  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3706  storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3707  get(AArch64::STPWi), SrcReg, isKill,
3708  AArch64::sube32, AArch64::subo32, FI, MMO);
3709  return;
3710  }
3711  break;
3712  case 16:
3713  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3714  Opc = AArch64::STRQui;
3715  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3716  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3717  Opc = AArch64::ST1Twov1d;
3718  Offset = false;
3719  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3720  storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3721  get(AArch64::STPXi), SrcReg, isKill,
3722  AArch64::sube64, AArch64::subo64, FI, MMO);
3723  return;
3724  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3725  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3726  Opc = AArch64::STR_ZXI;
3727  StackID = TargetStackID::ScalableVector;
3728  }
3729  break;
3730  case 24:
3731  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3732  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3733  Opc = AArch64::ST1Threev1d;
3734  Offset = false;
3735  }
3736  break;
3737  case 32:
3738  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3739  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3740  Opc = AArch64::ST1Fourv1d;
3741  Offset = false;
3742  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3743  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3744  Opc = AArch64::ST1Twov2d;
3745  Offset = false;
3746  } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3747  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3748  Opc = AArch64::STR_ZZXI;
3749  StackID = TargetStackID::ScalableVector;
3750  }
3751  break;
3752  case 48:
3753  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3754  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3755  Opc = AArch64::ST1Threev2d;
3756  Offset = false;
3757  } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3758  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3759  Opc = AArch64::STR_ZZZXI;
3760  StackID = TargetStackID::ScalableVector;
3761  }
3762  break;
3763  case 64:
3764  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3765  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3766  Opc = AArch64::ST1Fourv2d;
3767  Offset = false;
3768  } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3769  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3770  Opc = AArch64::STR_ZZZZXI;
3771  StackID = TargetStackID::ScalableVector;
3772  }
3773  break;
3774  }
3775  assert(Opc && "Unknown register class");
3776  MFI.setStackID(FI, StackID);
3777 
3778  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3779  .addReg(SrcReg, getKillRegState(isKill))
3780  .addFrameIndex(FI);
3781 
3782  if (Offset)
3783  MI.addImm(0);
3784  MI.addMemOperand(MMO);
3785 }
3786 
3787 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
3788  MachineBasicBlock &MBB,
3789  MachineBasicBlock::iterator InsertBefore,
3790  const MCInstrDesc &MCID,
3791  Register DestReg, unsigned SubIdx0,
3792  unsigned SubIdx1, int FI,
3793  MachineMemOperand *MMO) {
3794  Register DestReg0 = DestReg;
3795  Register DestReg1 = DestReg;
3796  bool IsUndef = true;
3797  if (Register::isPhysicalRegister(DestReg)) {
3798  DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
3799  SubIdx0 = 0;
3800  DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
3801  SubIdx1 = 0;
3802  IsUndef = false;
3803  }
3804  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3805  .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
3806  .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
3807  .addFrameIndex(FI)
3808  .addImm(0)
3809  .addMemOperand(MMO);
3810 }
3811 
3812 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
3813  MachineBasicBlock::iterator MBBI, Register DestReg,
3814  int FI, const TargetRegisterClass *RC,
3815  const TargetRegisterInfo *TRI) const {
3816  MachineFunction &MF = *MBB.getParent();
3817  MachineFrameInfo &MFI = MF.getFrameInfo();
3818  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3819  MachineMemOperand *MMO =
3820  MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
3821  MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3822 
3823  unsigned Opc = 0;
3824  bool Offset = true;
3825  unsigned StackID = TargetStackID::Default;
3826  switch (TRI->getSpillSize(*RC)) {
3827  case 1:
3828  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3829  Opc = AArch64::LDRBui;
3830  break;
3831  case 2:
3832  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3833  Opc = AArch64::LDRHui;
3834  else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3835  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3836  Opc = AArch64::LDR_PXI;
3837  StackID = TargetStackID::ScalableVector;
3838  }
3839  break;
3840  case 4:
3841  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3842  Opc = AArch64::LDRWui;
3843  if (Register::isVirtualRegister(DestReg))
3844  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
3845  else
3846  assert(DestReg != AArch64::WSP);
3847  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3848  Opc = AArch64::LDRSui;
3849  break;
3850  case 8:
3851  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3852  Opc = AArch64::LDRXui;
3853  if (Register::isVirtualRegister(DestReg))
3854  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
3855  else
3856  assert(DestReg != AArch64::SP);
3857  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3858  Opc = AArch64::LDRDui;
3859  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3860  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3861  get(AArch64::LDPWi), DestReg, AArch64::sube32,
3862  AArch64::subo32, FI, MMO);
3863  return;
3864  }
3865  break;
3866  case 16:
3867  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3868  Opc = AArch64::LDRQui;
3869  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3870  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3871  Opc = AArch64::LD1Twov1d;
3872  Offset = false;
3873  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3874  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3875  get(AArch64::LDPXi), DestReg, AArch64::sube64,
3876  AArch64::subo64, FI, MMO);
3877  return;
3878  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3879  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3880  Opc = AArch64::LDR_ZXI;
3881  StackID = TargetStackID::ScalableVector;
3882  }
3883  break;
3884  case 24:
3885  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3886  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3887  Opc = AArch64::LD1Threev1d;
3888  Offset = false;
3889  }
3890  break;
3891  case 32:
3892  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3893  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3894  Opc = AArch64::LD1Fourv1d;
3895  Offset = false;
3896  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3897  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3898  Opc = AArch64::LD1Twov2d;
3899  Offset = false;
3900  } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3901  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3902  Opc = AArch64::LDR_ZZXI;
3903  StackID = TargetStackID::ScalableVector;
3904  }
3905  break;
3906  case 48:
3907  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3908  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3909  Opc = AArch64::LD1Threev2d;
3910  Offset = false;
3911  } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3912  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3913  Opc = AArch64::LDR_ZZZXI;
3914  StackID = TargetStackID::ScalableVector;
3915  }
3916  break;
3917  case 64:
3918  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3919  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3920  Opc = AArch64::LD1Fourv2d;
3921  Offset = false;
3922  } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3923  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3924  Opc = AArch64::LDR_ZZZZXI;
3925  StackID = TargetStackID::ScalableVector;
3926  }
3927  break;
3928  }
3929 
3930  assert(Opc && "Unknown register class");
3931  MFI.setStackID(FI, StackID);
3932 
3933  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3934  .addReg(DestReg, getDefRegState(true))
3935  .addFrameIndex(FI);
3936  if (Offset)
3937  MI.addImm(0);
3938  MI.addMemOperand(MMO);
3939 }
3940 
3941 static bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
3942  const MachineInstr &UseMI,
3943  const TargetRegisterInfo *TRI) {
3944  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
3945  UseMI.getIterator()),
3946  [TRI](const MachineInstr &I) {
3947  return I.modifiesRegister(AArch64::NZCV, TRI) ||
3948  I.readsRegister(AArch64::NZCV, TRI);
3949  });
3950 }
3951 
3952 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
3953  const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
3954  // The smallest scalable elements supported by scaled SVE addressing
3955  // modes are predicates, which are 2 scalable bytes in size. So the scalable
3956  // byte offset must always be a multiple of 2.
3957  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3958 
3959  // VGSized offsets are divided by '2', because the VG register is the
3960  // number of 64bit granules as opposed to 128bit vector chunks,
3961  // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
3962  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
3963  // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
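  // For example, a StackOffset of 16 fixed bytes and 4 scalable bytes
  // decomposes into ByteSized = 16 and VGSized = 2.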
3964  ByteSized = Offset.getFixed();
3965  VGSized = Offset.getScalable() / 2;
3966 }
3967 
3968 /// Returns the offset in parts to which this frame offset can be
3969 /// decomposed for the purpose of describing a frame offset.
3970 /// For non-scalable offsets this is simply its byte size.
3971 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
3972  const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
3973  int64_t &NumDataVectors) {
3974  // The smallest scalable elements supported by scaled SVE addressing
3975  // modes are predicates, which are 2 scalable bytes in size. So the scalable
3976  // byte offset must always be a multiple of 2.
3977  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3978 
3979  NumBytes = Offset.getFixed();
3980  NumDataVectors = 0;
3981  NumPredicateVectors = Offset.getScalable() / 2;
3982  // This method is used to get the offsets to adjust the frame offset.
3983  // If the function requires ADDPL to be used and needs more than two ADDPL
3984  // instructions, part of the offset is folded into NumDataVectors so that it
3985  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
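  // For example, a scalable offset of 140 bytes gives NumPredicateVectors = 70;
  // since 70 > 62 it is split into NumDataVectors = 8 (ADDVL) with
  // NumPredicateVectors = 6 (ADDPL) left over.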
3986  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
3987  NumPredicateVectors > 62) {
3988  NumDataVectors = NumPredicateVectors / 8;
3989  NumPredicateVectors -= NumDataVectors * 8;
3990  }
3991 }
3992 
3993 // Helper function to emit a frame offset adjustment from a given
3994 // pointer (SrcReg), stored into DestReg. This function is explicit
3995 // in that it requires the opcode.
3996 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
3997  MachineBasicBlock::iterator MBBI,
3998  const DebugLoc &DL, unsigned DestReg,
3999  unsigned SrcReg, int64_t Offset, unsigned Opc,
4000  const TargetInstrInfo *TII,
4001  MachineInstr::MIFlag Flag, bool NeedsWinCFI,
4002  bool *HasWinCFI) {
4003  int Sign = 1;
4004  unsigned MaxEncoding, ShiftSize;
4005  switch (Opc) {
4006  case AArch64::ADDXri:
4007  case AArch64::ADDSXri:
4008  case AArch64::SUBXri:
4009  case AArch64::SUBSXri:
4010  MaxEncoding = 0xfff;
4011  ShiftSize = 12;
4012  break;
4013  case AArch64::ADDVL_XXI:
4014  case AArch64::ADDPL_XXI:
4015  MaxEncoding = 31;
4016  ShiftSize = 0;
4017  if (Offset < 0) {
4018  MaxEncoding = 32;
4019  Sign = -1;
4020  Offset = -Offset;
4021  }
4022  break;
4023  default:
4024  llvm_unreachable("Unsupported opcode");
4025  }
4026 
4027  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
4028  // scratch register. If DestReg is a virtual register, use it as the
4029  // scratch register; otherwise, create a new virtual register (to be
4030  // replaced by the scavenger at the end of PEI). That case can be optimized
4031  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
4032  // register can be loaded with offset%8 and the add/sub can use an extending
4033  // instruction with LSL#3.
4034  // Currently the function handles any offsets but generates a poor sequence
4035  // of code.
4036  // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
4037 
4038  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
4039  Register TmpReg = DestReg;
4040  if (TmpReg == AArch64::XZR)
4041  TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
4042  &AArch64::GPR64RegClass);
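  // For example, ADDXri with Offset = 0x12345 is emitted as
  //   ADD Tmp, Src, #0x12, LSL #12
  //   ADD Dst, Tmp, #0x345
  // i.e. the offset is consumed in encodable chunks, largest (shifted) first.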
4043  do {
4044  uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
4045  unsigned LocalShiftSize = 0;
4046  if (ThisVal > MaxEncoding) {
4047  ThisVal = ThisVal >> ShiftSize;
4048  LocalShiftSize = ShiftSize;
4049  }
4050  assert((ThisVal >> ShiftSize) <= MaxEncoding &&
4051  "Encoding cannot handle value that big");
4052 
4053  Offset -= ThisVal << LocalShiftSize;
4054  if (Offset == 0)
4055  TmpReg = DestReg;
4056  auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
4057  .addReg(SrcReg)
4058  .addImm(Sign * (int)ThisVal);
4059  if (ShiftSize)
4060  MBI = MBI.addImm(
4061  AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
4062  MBI = MBI.setMIFlag(Flag);
4063 
4064  if (NeedsWinCFI) {
4065  assert(Sign == 1 && "SEH directives should always have a positive sign");
4066  int Imm = (int)(ThisVal << LocalShiftSize);
4067  if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
4068  (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
4069  if (HasWinCFI)
4070  *HasWinCFI = true;
4071  if (Imm == 0)
4072  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
4073  else
4074  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
4075  .addImm(Imm)
4076  .setMIFlag(Flag);
4077  assert(Offset == 0 && "Expected remaining offset to be zero to "
4078  "emit a single SEH directive");
4079  } else if (DestReg == AArch64::SP) {
4080  if (HasWinCFI)
4081  *HasWinCFI = true;
4082  assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
4083  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
4084  .addImm(Imm)
4085  .setMIFlag(Flag);
4086  }
4087  if (HasWinCFI)
4088  *HasWinCFI = true;
4089  }
4090 
4091  SrcReg = TmpReg;
4092  } while (Offset);
4093 }
4094 
4095 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
4096  MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
4097  unsigned DestReg, unsigned SrcReg,
4098  StackOffset Offset, const TargetInstrInfo *TII,
4099  MachineInstr::MIFlag Flag, bool SetNZCV,
4100  bool NeedsWinCFI, bool *HasWinCFI) {
4101  int64_t Bytes, NumPredicateVectors, NumDataVectors;
4102  AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
4103  Offset, Bytes, NumPredicateVectors, NumDataVectors);
4104 
4105  // First emit non-scalable frame offsets, or a simple 'mov'.
4106  if (Bytes || (!Offset && SrcReg != DestReg)) {
4107  assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
4108  "SP increment/decrement not 8-byte aligned");
4109  unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
4110  if (Bytes < 0) {
4111  Bytes = -Bytes;
4112  Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
4113  }
4114  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
4115  NeedsWinCFI, HasWinCFI);
4116  SrcReg = DestReg;
4117  }
4118 
4119  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
4120  "SetNZCV not supported with SVE vectors");
4121  assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
4122  "WinCFI not supported with SVE vectors");
4123 
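  // Then handle the scalable part: whole SVE data vectors via ADDVL_XXI first,
  // then any remaining predicate-sized units via ADDPL_XXI.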
4124  if (NumDataVectors) {
4125  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
4126  AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
4127  SrcReg = DestReg;
4128  }
4129 
4130  if (NumPredicateVectors) {
4131  assert(DestReg != AArch64::SP && "Unaligned access to SP");
4132  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
4133  AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
4134  }
4135 }
4136 
4137 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
4138  MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
4139  MachineBasicBlock::iterator InsertPt, int FrameIndex,
4140  LiveIntervals *LIS, VirtRegMap *VRM) const {
4141  // This is a bit of a hack. Consider this instruction:
4142  //
4143  // %0 = COPY %sp; GPR64all:%0
4144  //
4145  // We explicitly chose GPR64all for the virtual register so such a copy might
4146  // be eliminated by RegisterCoalescer. However, that may not be possible, and
4147  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
4148  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
4149  //
4150  // To prevent that, we are going to constrain the %0 register class here.
4151  //
4152  // <rdar://problem/11522048>
4153  //
4154  if (MI.isFullCopy()) {
4155  Register DstReg = MI.getOperand(0).getReg();
4156  Register SrcReg = MI.getOperand(1).getReg();
4157  if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
4158  MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
4159  return nullptr;
4160  }
4161  if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
4162  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
4163  return nullptr;
4164  }
4165  }
4166 
4167  // Handle the case where a copy is being spilled or filled but the source
4168  // and destination register class don't match. For example:
4169  //
4170  // %0 = COPY %xzr; GPR64common:%0
4171  //
4172  // In this case we can still safely fold away the COPY and generate the
4173  // following spill code:
4174  //
4175  // STRXui %xzr, %stack.0
4176  //
4177  // This also eliminates spilled cross register class COPYs (e.g. between x and
4178  // d regs) of the same size. For example:
4179  //
4180  // %0 = COPY %1; GPR64:%0, FPR64:%1
4181  //
4182  // will be filled as
4183  //
4184  // LDRDui %0, fi<#0>
4185  //
4186  // instead of
4187  //
4188  // LDRXui %Temp, fi<#0>
4189  // %0 = FMOV %Temp
4190  //
4191  if (MI.isCopy() && Ops.size() == 1 &&
4192  // Make sure we're only folding the explicit COPY defs/uses.
4193  (Ops[0] == 0 || Ops[0] == 1)) {
4194  bool IsSpill = Ops[0] == 0;
4195  bool IsFill = !IsSpill;
4196  const TargetRegisterInfo &TRI = getRegisterInfo();
4197  const MachineRegisterInfo &MRI = MF.getRegInfo();
4198  MachineBasicBlock &MBB = *MI.getParent();
4199  const MachineOperand &DstMO = MI.getOperand(0);
4200  const MachineOperand &SrcMO = MI.getOperand(1);
4201  Register DstReg = DstMO.getReg();
4202  Register SrcReg = SrcMO.getReg();
4203  // This is slightly expensive to compute for physical regs since
4204  // getMinimalPhysRegClass is slow.
4205  auto getRegClass = [&](unsigned Reg) {
4206  return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
4207  : TRI.getMinimalPhysRegClass(Reg);
4208  };
4209 
4210  if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
4211  assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
4212  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
4213  "Mismatched register size in non subreg COPY");
4214  if (IsSpill)
4215  storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
4216  getRegClass(SrcReg), &TRI);
4217  else
4218  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
4219  getRegClass(DstReg), &TRI);
4220  return &*--InsertPt;
4221  }
4222 
4223  // Handle cases like spilling def of:
4224  //
4225  // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
4226  //
4227  // where the physical register source can be widened and stored to the full
4228  // virtual reg destination stack slot, in this case producing:
4229  //
4230  // STRXui %xzr, %stack.0
4231  //
4232  if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
4233  assert(SrcMO.getSubReg() == 0 &&
4234  "Unexpected subreg on physical register");
4235  const TargetRegisterClass *SpillRC;
4236  unsigned SpillSubreg;
4237  switch (DstMO.getSubReg()) {
4238  default:
4239  SpillRC = nullptr;
4240  break;
4241  case AArch64::sub_32:
4242  case AArch64::ssub:
4243  if (AArch64::GPR32RegClass.contains(SrcReg)) {
4244  SpillRC = &AArch64::GPR64RegClass;
4245  SpillSubreg = AArch64::sub_32;
4246  } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
4247  SpillRC = &AArch64::FPR64RegClass;
4248  SpillSubreg = AArch64::ssub;
4249  } else
4250  SpillRC = nullptr;
4251  break;
4252  case AArch64::dsub:
4253  if (AArch64::FPR64RegClass.contains(SrcReg)) {
4254  SpillRC = &AArch64::FPR128RegClass;
4255  SpillSubreg = AArch64::dsub;
4256  } else
4257  SpillRC = nullptr;
4258  break;
4259  }
4260 
4261  if (SpillRC)
4262  if (unsigned WidenedSrcReg =
4263  TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
4264  storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
4265  FrameIndex, SpillRC, &TRI);
4266  return &*--InsertPt;
4267  }
4268  }
4269 
4270  // Handle cases like filling use of:
4271  //
4272  // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
4273  //
4274  // where we can load the full virtual reg source stack slot, into the subreg
4275  // destination, in this case producing:
4276  //
4277  // LDRWui %0:sub_32<def,read-undef>, %stack.0
4278  //
4279  if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
4280  const TargetRegisterClass *FillRC;
4281  switch (DstMO.getSubReg()) {
4282  default:
4283  FillRC = nullptr;
4284  break;
4285  case AArch64::sub_32:
4286  FillRC = &AArch64::GPR32RegClass;
4287  break;
4288  case AArch64::ssub:
4289  FillRC = &AArch64::FPR32RegClass;
4290  break;
4291  case AArch64::dsub:
4292  FillRC = &AArch64::FPR64RegClass;
4293  break;
4294  }
4295 
4296  if (FillRC) {
4297  assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
4298  TRI.getRegSizeInBits(*FillRC) &&
4299  "Mismatched regclass size on folded subreg COPY");
4300  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
4301  MachineInstr &LoadMI = *--InsertPt;
4302  MachineOperand &LoadDst = LoadMI.getOperand(0);
4303  assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
4304  LoadDst.setSubReg(DstMO.getSubReg());
4305  LoadDst.setIsUndef();
4306  return &LoadMI;
4307  }
4308  }
4309  }
4310 
4311  // Cannot fold.
4312  return nullptr;
4313 }
4314 
4315 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
4316  StackOffset &SOffset,
4317  bool *OutUseUnscaledOp,
4318  unsigned *OutUnscaledOp,
4319  int64_t *EmittableOffset) {
4320  // Set output values in case of early exit.
4321  if (EmittableOffset)
4322  *EmittableOffset = 0;
4323  if (OutUseUnscaledOp)
4324  *OutUseUnscaledOp = false;
4325  if (OutUnscaledOp)
4326  *OutUnscaledOp = 0;
4327 
4328  // Exit early for structured vector spills/fills as they can't take an
4329  // immediate offset.
4330  switch (MI.getOpcode()) {
4331  default:
4332  break;
4333  case AArch64::LD1Twov2d:
4334  case AArch64::LD1Threev2d:
4335  case AArch64::LD1Fourv2d:
4336  case AArch64::LD1Twov1d:
4337  case AArch64::LD1Threev1d:
4338  case AArch64::LD1Fourv1d:
4339  case AArch64::ST1Twov2d:
4340  case AArch64::ST1Threev2d:
4341  case AArch64::ST1Fourv2d:
4342  case AArch64::ST1Twov1d:
4343  case AArch64::ST1Threev1d:
4344  case AArch64::ST1Fourv1d:
4345  case AArch64::ST1i8:
4346  case AArch64::ST1i16:
4347  case AArch64::ST1i32:
4348  case AArch64::ST1i64:
4349  case AArch64::IRG:
4350  case AArch64::IRGstack:
4351  case AArch64::STGloop:
4352  case AArch64::STZGloop:
4353  return AArch64FrameOffsetCannotUpdate;
4354  }
4355 
4356  // Get the min/max offset and the scale.
4357  TypeSize ScaleValue(0U, false);
4358  unsigned Width;
4359  int64_t MinOff, MaxOff;
4360  if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
4361  MaxOff))
4362  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
4363 
4364  // Construct the complete offset.
4365  bool IsMulVL = ScaleValue.isScalable();
4366  unsigned Scale = ScaleValue.getKnownMinSize();
4367  int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
4368 
4369  const MachineOperand &ImmOpnd =
4370  MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
4371  Offset += ImmOpnd.getImm() * Scale;
4372 
4373  // If the offset doesn't match the scale, we rewrite the instruction to
4374  // use the unscaled instruction instead. Likewise, if we have a negative
4375  // offset and there is an unscaled op to use.
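  // For example, an LDRXui (scale 8) whose accumulated byte offset is 12 cannot
  // use the scaled form, so it is rewritten to its unscaled counterpart
  // (LDURXi), which can encode the offset of 12 directly.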
4376  Optional<unsigned> UnscaledOp =
4377  AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
4378  bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
4379  if (useUnscaledOp &&
4380  !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
4381  MaxOff))
4382  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
4383 
4384  Scale = ScaleValue.getKnownMinSize();
4385  assert(IsMulVL == ScaleValue.isScalable() &&
4386  "Unscaled opcode has different value for scalable");
4387 
4388  int64_t Remainder = Offset % Scale;
4389  assert(!(Remainder && useUnscaledOp) &&
4390  "Cannot have remainder when using unscaled op");
4391 
4392  assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
4393  int64_t NewOffset = Offset / Scale;
4394  if (MinOff <= NewOffset && NewOffset <= MaxOff)
4395  Offset = Remainder;
4396  else {
4397  NewOffset = NewOffset < 0 ? MinOff : MaxOff;
4398  Offset = Offset - NewOffset * Scale + Remainder;
4399  }
4400 
4401  if (EmittableOffset)
4402  *EmittableOffset = NewOffset;
4403  if (OutUseUnscaledOp)
4404  *OutUseUnscaledOp = useUnscaledOp;
4405  if (OutUnscaledOp && UnscaledOp)
4406  *OutUnscaledOp = *UnscaledOp;
4407 
4408  if (IsMulVL)
4409  SOffset = StackOffset::get(SOffset.getFixed(), Offset);
4410  else
4411  SOffset = StackOffset::get(Offset, SOffset.getScalable());
4412  return AArch64FrameOffsetCanUpdate |
4413  (SOffset ? 0 : AArch64FrameOffsetIsLegal);
4414 }
4415 
4416 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
4417  unsigned FrameReg, StackOffset &Offset,
4418  const AArch64InstrInfo *TII) {
4419  unsigned Opcode = MI.getOpcode();
4420  unsigned ImmIdx = FrameRegIdx + 1;
4421 
4422  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
4423  Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
4424  emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
4425  MI.getOperand(0).getReg(), FrameReg, Offset, TII,
4426  MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
4427  MI.eraseFromParent();
4428  Offset = StackOffset();
4429  return true;
4430  }
4431 
4432  int64_t NewOffset;
4433  unsigned UnscaledOp;
4434  bool UseUnscaledOp;
4435  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
4436  &UnscaledOp, &NewOffset);
4437  if (Status & AArch64FrameOffsetCanUpdate) {
4438  if (Status & AArch64FrameOffsetIsLegal)
4439  // Replace the FrameIndex with FrameReg.
4440  MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
4441  if (UseUnscaledOp)
4442  MI.setDesc(TII->get(UnscaledOp));
4443 
4444  MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
4445  return !Offset;
4446  }
4447 
4448  return false;
4449 }
4450 
4451 MCInst AArch64InstrInfo::getNop() const {
4452  return MCInstBuilder(AArch64::HINT).addImm(0);
4453 }
4454 
4455 // AArch64 supports MachineCombiner.
4456 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
4457 
4458 // True when Opc sets flag
4459 static bool isCombineInstrSettingFlag(unsigned Opc) {
4460  switch (Opc) {
4461  case AArch64::ADDSWrr:
4462  case AArch64::ADDSWri:
4463  case AArch64::ADDSXrr:
4464  case AArch64::ADDSXri:
4465  case AArch64::SUBSWrr:
4466  case AArch64::SUBSXrr:
4467  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4468  case AArch64::SUBSWri:
4469  case AArch64::SUBSXri:
4470  return true;
4471  default:
4472  break;
4473  }
4474  return false;
4475 }
4476 
4477 // 32b Opcodes that can be combined with a MUL
4478 static bool isCombineInstrCandidate32(unsigned Opc) {
4479  switch (Opc) {
4480  case AArch64::ADDWrr:
4481  case AArch64::ADDWri:
4482  case AArch64::SUBWrr:
4483  case AArch64::ADDSWrr:
4484  case AArch64::ADDSWri:
4485  case AArch64::SUBSWrr:
4486  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4487  case AArch64::SUBWri:
4488  case AArch64::SUBSWri:
4489  return true;
4490  default:
4491  break;
4492  }
4493  return false;
4494 }
4495 
4496 // 64b Opcodes that can be combined with a MUL
4497 static bool isCombineInstrCandidate64(unsigned Opc) {
4498  switch (Opc) {
4499  case AArch64::ADDXrr:
4500  case AArch64::ADDXri:
4501  case AArch64::SUBXrr:
4502  case AArch64::ADDSXrr:
4503  case AArch64::ADDSXri:
4504  case AArch64::SUBSXrr:
4505  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4506  case AArch64::SUBXri:
4507  case AArch64::SUBSXri:
4508  case AArch64::ADDv8i8:
4509  case AArch64::ADDv16i8:
4510  case AArch64::ADDv4i16:
4511  case AArch64::ADDv8i16:
4512  case AArch64::ADDv2i32:
4513  case AArch64::ADDv4i32:
4514  case AArch64::SUBv8i8:
4515  case AArch64::SUBv16i8:
4516  case AArch64::SUBv4i16:
4517  case AArch64::SUBv8i16:
4518  case AArch64::SUBv2i32:
4519  case AArch64::SUBv4i32:
4520  return true;
4521  default:
4522  break;
4523  }
4524  return false;
4525 }
4526 
4527 // FP Opcodes that can be combined with a FMUL.
4528 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
4529  switch (Inst.getOpcode()) {
4530  default:
4531  break;
4532  case AArch64::FADDHrr:
4533  case AArch64::FADDSrr:
4534  case AArch64::FADDDrr:
4535  case AArch64::FADDv4f16:
4536  case AArch64::FADDv8f16:
4537  case AArch64::FADDv2f32:
4538  case AArch64::FADDv2f64:
4539  case AArch64::FADDv4f32:
4540  case AArch64::FSUBHrr:
4541  case AArch64::FSUBSrr:
4542  case AArch64::FSUBDrr:
4543  case AArch64::FSUBv4f16:
4544  case AArch64::FSUBv8f16:
4545  case AArch64::FSUBv2f32:
4546  case AArch64::FSUBv2f64:
4547  case AArch64::FSUBv4f32:
4548  TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
4549  // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
4550  // the target options or if FADD/FSUB has the contract fast-math flag.
4551  return Options.UnsafeFPMath ||
4552  Options.AllowFPOpFusion == FPOpFusion::Fast ||
4553  Inst.getFlag(MachineInstr::FmContract);
4554  return true;
4555  }
4556  return false;
4557 }
4558 
4559 // Opcodes that can be combined with a MUL
4560 static bool isCombineInstrCandidate(unsigned Opc) {
4561  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
4562 }
4563 
4564 //
4565 // Utility routine that checks if \param MO is defined by an
4566 // \param CombineOpc instruction in the basic block \param MBB
4567 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
4568  unsigned CombineOpc, unsigned ZeroReg = 0,
4569  bool CheckZeroReg = false) {
4570  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4571  MachineInstr *MI = nullptr;
4572 
4573  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
4574  MI = MRI.getUniqueVRegDef(MO.getReg());
4575  // And it needs to be in the trace (otherwise, it won't have a depth).
4576  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
4577  return false;
4578  // Must only be used by the user we combine with.
4579  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
4580  return false;
4581 
4582  if (CheckZeroReg) {
4583  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
4584  MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
4585  MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
4586  // The third input reg must be zero.
4587  if (MI->getOperand(3).getReg() != ZeroReg)
4588  return false;
4589  }
4590 
4591  return true;
4592 }
4593 
4594 //
4595 // Is \param MO defined by an integer multiply and can be combined?
4596 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4597  unsigned MulOpc, unsigned ZeroReg) {
4598  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
4599 }
4600 
4601 //
4602 // Is \param MO defined by a floating-point multiply and can be combined?
4603 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4604  unsigned MulOpc) {
4605  return canCombine(MBB, MO, MulOpc);
4606 }
4607 
4608 // TODO: There are many more machine instruction opcodes to match:
4609 // 1. Other data types (integer, vectors)
4610 // 2. Other math / logic operations (xor, or)
4611 // 3. Other forms of the same operation (intrinsics and other variants)
4612 bool AArch64InstrInfo::isAssociativeAndCommutative(
4613  const MachineInstr &Inst) const {
4614  switch (Inst.getOpcode()) {
4615  case AArch64::FADDDrr:
4616  case AArch64::FADDSrr:
4617  case AArch64::FADDv2f32:
4618  case AArch64::FADDv2f64:
4619  case AArch64::FADDv4f32:
4620  case AArch64::FMULDrr:
4621  case AArch64::FMULSrr:
4622  case AArch64::FMULX32:
4623  case AArch64::FMULX64:
4624  case AArch64::FMULXv2f32:
4625  case AArch64::FMULXv2f64:
4626  case AArch64::FMULXv4f32:
4627  case AArch64::FMULv2f32:
4628  case AArch64::FMULv2f64:
4629  case AArch64::FMULv4f32:
4630  return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
4631  default:
4632  return false;
4633  }
4634 }
4635 
4636 /// Find instructions that can be turned into madd.
4637 static bool getMaddPatterns(MachineInstr &Root,
4638  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4639  unsigned Opc = Root.getOpcode();
4640  MachineBasicBlock &MBB = *Root.getParent();
4641  bool Found = false;
4642 
4643  if (!isCombineInstrCandidate(Opc))
4644  return false;
4645  if (isCombineInstrSettingFlag(Opc)) {
4646  int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
4647  // When NZCV is live bail out.
4648  if (Cmp_NZCV == -1)
4649  return false;
4650  unsigned NewOpc = convertToNonFlagSettingOpc(Root);
4651  // When opcode can't change bail out.
4652  // CHECKME: do we miss any cases for opcode conversion?
4653  if (NewOpc == Opc)
4654  return false;
4655  Opc = NewOpc;
4656  }
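  // For example, an ADDSWrr whose NZCV definition is dead is treated as ADDWrr
  // below, so the add can still be combined into a MADDWrrr.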
4657 
4658  auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
4659  MachineCombinerPattern Pattern) {
4660  if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
4661  Patterns.push_back(Pattern);
4662  Found = true;
4663  }
4664  };
4665 
4666  auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
4667  if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
4668  Patterns.push_back(Pattern);
4669  Found = true;
4670  }
4671  };
4672 
4673  typedef MachineCombinerPattern MCP;
4674 
4675  switch (Opc) {
4676  default:
4677  break;
4678  case AArch64::ADDWrr:
4679  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4680  "ADDWrr does not have register operands");
4681  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
4682  setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
4683  break;
4684  case AArch64::ADDXrr:
4685  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
4686  setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
4687  break;
4688  case AArch64::SUBWrr:
4689  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
4690  setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
4691  break;
4692  case AArch64::SUBXrr:
4693  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
4694  setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
4695  break;
4696  case AArch64::ADDWri:
4697  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
4698  break;
4699  case AArch64::ADDXri:
4700  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
4701  break;
4702  case AArch64::SUBWri:
4703  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
4704  break;
4705  case AArch64::SUBXri:
4706  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
4707  break;
4708  case AArch64::ADDv8i8:
4709  setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
4710  setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
4711  break;
4712  case AArch64::ADDv16i8:
4713  setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
4714  setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
4715  break;
4716  case AArch64::ADDv4i16:
4717  setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
4718  setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
4719  setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
4720  setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
4721  break;
4722  case AArch64::ADDv8i16:
4723  setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
4724  setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
4725  setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
4726  setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
4727  break;
4728  case AArch64::ADDv2i32:
4729  setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
4730  setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
4731  setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
4732  setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
4733  break;
4734  case AArch64::ADDv4i32:
4735  setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
4736  setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
4737  setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
4738  setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
4739  break;
4740  case AArch64::SUBv8i8:
4741  setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
4742  setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
4743  break;
4744  case AArch64::SUBv16i8:
4745  setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
4746  setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
4747  break;
4748  case AArch64::SUBv4i16:
4749  setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
4750  setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
4751  setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
4752  setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
4753  break;
4754  case AArch64::SUBv8i16:
4755  setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
4756  setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
4757  setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
4758  setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
4759  break;
4760  case AArch64::SUBv2i32:
4761  setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
4762  setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
4763  setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
4764  setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
4765  break;
4766  case AArch64::SUBv4i32:
4767  setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
4768  setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
4769  setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
4770  setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
4771  break;
4772  }
4773  return Found;
4774 }
4775 /// Floating-Point Support
4776 
4777 /// Find instructions that can be turned into madd.
4778 static bool getFMAPatterns(MachineInstr &Root,
4779  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4780 
4781  if (!isCombineInstrCandidateFP(Root))
4782  return false;
4783 
4784  MachineBasicBlock &MBB = *Root.getParent();
4785  bool Found = false;
4786 
4787  auto Match = [&](int Opcode, int Operand,
4788  MachineCombinerPattern Pattern) -> bool {
4789  if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
4790  Patterns.push_back(Pattern);
4791  return true;
4792  }
4793  return false;
4794  };
4795 
4796  typedef MachineCombinerPattern MCP;
4797 
4798  switch (Root.getOpcode()) {
4799  default:
4800  assert(false && "Unsupported FP instruction in combiner\n");
4801  break;
4802  case AArch64::FADDHrr:
4803  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4804  "FADDHrr does not have register operands");
4805 
4806  Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
4807  Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
4808  break;
4809  case AArch64::FADDSrr:
4810  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4811  "FADDSrr does not have register operands");
4812 
4813  Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
4814  Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
4815 
4816  Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
4817  Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
4818  break;
4819  case AArch64::FADDDrr:
4820  Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
4821  Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
4822 
4823  Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
4824  Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
4825  break;
4826  case AArch64::FADDv4f16:
4827  Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
4828  Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
4829 
4830  Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
4831  Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
4832  break;
4833  case AArch64::FADDv8f16:
4834  Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
4835  Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
4836 
4837  Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
4838  Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
4839  break;
4840  case AArch64::FADDv2f32:
4841  Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
4842  Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
4843 
4844  Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
4845  Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
4846  break;
4847  case AArch64::FADDv2f64:
4848  Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
4849  Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
4850 
4851  Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
4852  Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
4853  break;
4854  case AArch64::FADDv4f32:
4855  Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
4856  Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
4857 
4858  Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
4859  Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
4860  break;
4861  case AArch64::FSUBHrr:
4862  Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
4863  Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
4864  Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
4865  break;
4866  case AArch64::FSUBSrr:
4867  Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
4868 
4869  Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
4870  Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
4871 
4872  Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
4873  break;
4874  case AArch64::FSUBDrr:
4875  Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
4876 
4877  Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
4878  Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
4879 
4880  Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
4881  break;
4882  case AArch64::FSUBv4f16:
4883  Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
4884  Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
4885 
4886  Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
4887  Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
4888  break;
4889  case AArch64::FSUBv8f16:
4890  Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
4891  Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
4892 
4893  Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
4894  Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
4895  break;
4896  case AArch64::FSUBv2f32:
4897  Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
4898  Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
4899 
4900  Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
4901  Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
4902  break;
4903  case AArch64::FSUBv2f64:
4904  Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
4905  Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
4906 
4907  Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
4908  Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
4909  break;
4910  case AArch64::FSUBv4f32:
4911  Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
4912  Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
4913 
4914  Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
4915  Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
4916  break;
4917  }
4918  return Found;
4919 }
4920 
4921 static bool getFMULPatterns(MachineInstr &Root,
4922  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4923  MachineBasicBlock &MBB = *Root.getParent();
4924  bool Found = false;
4925 
4926  auto Match = [&](unsigned Opcode, int Operand,
4927  MachineCombinerPattern Pattern) -> bool {
4928  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4929  MachineOperand &MO = Root.getOperand(Operand);
4930  MachineInstr *MI = nullptr;
4931  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
4932  MI = MRI.getUniqueVRegDef(MO.getReg());
4933  if (MI && MI->getOpcode() == Opcode) {
4934  Patterns.push_back(Pattern);
4935  return true;
4936  }
4937  return false;
4938  };
4939 
4940  typedef MachineCombinerPattern MCP;
4941 
4942  switch (Root.getOpcode()) {
4943  default:
4944  return false;
4945  case AArch64::FMULv2f32:
4946  Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
4947  Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
4948  break;
4949  case AArch64::FMULv2f64:
4950  Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
4951  Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
4952  break;
4953  case AArch64::FMULv4f16:
4954  Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
4955  Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
4956  break;
4957  case AArch64::FMULv4f32:
4958  Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
4959  Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
4960  break;
4961  case AArch64::FMULv8f16:
4962  Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
4963  Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
4964  break;
4965  }
4966 
4967  return Found;
4968 }
4969 
4970 /// Return true when a code sequence can improve throughput. It
4971 /// should be called only for instructions in loops.
4972 /// \param Pattern - combiner pattern
4973 bool AArch64InstrInfo::isThroughputPattern(
4974  MachineCombinerPattern Pattern) const {
4975  switch (Pattern) {
4976  default:
4977  break;
5083  return true;
5084  } // end switch (Pattern)
5085  return false;
5086 }
5087 /// Return true when there is potentially a faster code sequence for an
5088 /// instruction chain ending in \p Root. All potential patterns are listed in
5089 /// the \p Pattern vector. Pattern should be sorted in priority order since the
5090 /// pattern evaluator stops checking as soon as it finds a faster sequence.
5091 
5092 bool AArch64InstrInfo::getMachineCombinerPatterns(
5093  MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
5094  bool DoRegPressureReduce) const {
5095  // Integer patterns
5096  if (getMaddPatterns(Root, Patterns))
5097  return true;
5098  // Floating point patterns
5099  if (getFMULPatterns(Root, Patterns))
5100  return true;
5101  if (getFMAPatterns(Root, Patterns))
5102  return true;
5103 
5104  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
5105  DoRegPressureReduce);
5106 }
5107 
5108 enum class FMAInstKind { Default, Indexed, Accumulator };
5109 /// genFusedMultiply - Generate fused multiply instructions.
5110 /// This function supports both integer and floating point instructions.
5111 /// A typical example:
5112 /// F|MUL I=A,B,0
5113 /// F|ADD R,I,C
5114 /// ==> F|MADD R,A,B,C
5115 /// \param MF Containing MachineFunction
5116 /// \param MRI Register information
5117 /// \param TII Target information
5118 /// \param Root is the F|ADD instruction
5119 /// \param [out] InsInstrs is a vector of machine instructions and will
5120 /// contain the generated madd instruction
5121 /// \param IdxMulOpd is index of operand in Root that is the result of
5122 /// the F|MUL. In the example above IdxMulOpd is 1.
5123 /// \param MaddOpc the opcode of the f|madd instruction
5124 /// \param RC Register class of operands
5125 /// \param kind The kind of fma instruction (addressing mode) to be generated
5126 /// \param ReplacedAddend is the result register from the instruction
5127 /// replacing the non-combined operand, if any.
5128 static MachineInstr *
5129 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
5130  const TargetInstrInfo *TII, MachineInstr &Root,
5131  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
5132  unsigned MaddOpc, const TargetRegisterClass *RC,
5133  FMAInstKind kind = FMAInstKind::Default,
5134  const Register *ReplacedAddend = nullptr) {
5135  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
5136 
5137  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
5138  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
5139  Register ResultReg = Root.getOperand(0).getReg();
5140  Register SrcReg0 = MUL->getOperand(1).getReg();
5141  bool Src0IsKill = MUL->getOperand(1).isKill();
5142  Register SrcReg1 = MUL->getOperand(2).getReg();
5143  bool Src1IsKill = MUL->getOperand(2).isKill();
5144 
5145  unsigned SrcReg2;
5146  bool Src2IsKill;
5147  if (ReplacedAddend) {
5148  // If we just generated a new addend, we must be its only use.
5149  SrcReg2 = *ReplacedAddend;
5150  Src2IsKill = true;
5151  } else {
5152  SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
5153  Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
5154  }
5155 
5156  if (Register::isVirtualRegister(ResultReg))
5157  MRI.constrainRegClass(ResultReg, RC);
5158  if (Register::isVirtualRegister(SrcReg0))
5159  MRI.constrainRegClass(SrcReg0, RC);
5160  if (Register::isVirtualRegister(SrcReg1))
5161  MRI.constrainRegClass(SrcReg1, RC);
5162  if (Register::isVirtualRegister(SrcReg2))
5163  MRI.constrainRegClass(SrcReg2, RC);
5164 
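  // Operand order depends on the kind: Default emits (mul-op0, mul-op1, addend);
  // Indexed and Accumulator place the addend first, and Indexed also copies the
  // lane immediate from the original MUL.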
5165  MachineInstrBuilder MIB;
5166  if (kind == FMAInstKind::Default)
5167  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
5168  .addReg(SrcReg0, getKillRegState(Src0IsKill))
5169  .addReg(SrcReg1, getKillRegState(Src1IsKill))
5170  .addReg(SrcReg2, getKillRegState(Src2IsKill));
5171  else if (kind == FMAInstKind::Indexed)
5172  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
5173  .addReg(SrcReg2, getKillRegState(Src2IsKill))
5174  .addReg(SrcReg0, getKillRegState(Src0IsKill))
5175  .addReg(SrcReg1, getKillRegState(Src1IsKill))
5176  .addImm(MUL->getOperand(3).getImm());
5177  else if (kind == FMAInstKind::Accumulator)
5178  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
5179  .addReg(SrcReg2, getKillRegState(Src2IsKill))
5180  .addReg(SrcReg0, getKillRegState(Src0IsKill))
5181  .addReg(SrcReg1, getKillRegState(Src1IsKill));
5182  else
5183  assert(false && "Invalid FMA instruction kind \n");
5184  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
5185  InsInstrs.push_back(MIB);
5186  return MUL;
5187 }
5188 
5189 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
5190 static MachineInstr *
5191 genIndexedMultiply(MachineInstr &Root,
5192  SmallVectorImpl<MachineInstr *> &InsInstrs,
5193  unsigned IdxDupOp, unsigned MulOpc,
5194  const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
5195  assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
5196  "Invalid index of FMUL operand");
5197 
5198  MachineFunction &MF = *Root.getMF();
5199  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
5200 
5201  MachineInstr *Dup =
5202  MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
5203 
5204  Register DupSrcReg = Dup->getOperand(1).getReg();
5205  MRI.clearKillFlags(DupSrcReg);
5206  MRI.constrainRegClass(DupSrcReg, RC);
5207 
5208  unsigned DupSrcLane = Dup->getOperand(2).getImm();
5209 
5210  unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
5211  MachineOperand &MulOp = Root.getOperand(IdxMulOp);
5212 
5213  Register ResultReg = Root.getOperand(0).getReg();
5214 
5215  MachineInstrBuilder MIB;
5216  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MulOpc), ResultReg)
5217  .add(MulOp)
5218  .addReg(DupSrcReg)
5219  .addImm(DupSrcLane);
5220 
5221  InsInstrs.push_back(MIB);
5222  return &Root;
5223 }
5224 
5225 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
5226 /// instructions.
5227 ///
5228 /// \see genFusedMultiply
5229 static MachineInstr *genFusedMultiplyAcc(
5230  MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
5231  MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
5232  unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
5233  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
5234  FMAInstKind::Accumulator);
5235 }
5236 
5237 /// genNeg - Helper to generate an intermediate negation of the second operand
5238 /// of Root
5