1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
15 #include "AArch64Subtarget.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
30 #include "llvm/CodeGen/StackMaps.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/MC/MCAsmInfo.h"
37 #include "llvm/MC/MCInst.h"
38 #include "llvm/MC/MCInstBuilder.h"
39 #include "llvm/MC/MCInstrDesc.h"
40 #include "llvm/Support/Casting.h"
41 #include "llvm/Support/CodeGen.h"
43 #include "llvm/Support/Compiler.h"
48 #include <cassert>
49 #include <cstdint>
50 #include <iterator>
51 #include <utility>
52 
53 using namespace llvm;
54 
55 #define GET_INSTRINFO_CTOR_DTOR
56 #include "AArch64GenInstrInfo.inc"
57 
58 static cl::opt<unsigned> TBZDisplacementBits(
59  "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
60  cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
61 
62 static cl::opt<unsigned> CBZDisplacementBits(
63  "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
64  cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
65 
66 static cl::opt<unsigned>
67  BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
68  cl::desc("Restrict range of Bcc instructions (DEBUG)"));
69 
70 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
71  : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
72  AArch64::CATCHRET),
73  RI(STI.getTargetTriple()), Subtarget(STI) {}
74 
75 /// GetInstSize - Return the number of bytes of code the specified
76 /// instruction may be. This returns the maximum number of bytes.
77 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
78  const MachineBasicBlock &MBB = *MI.getParent();
79  const MachineFunction *MF = MBB.getParent();
80  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
81 
82  {
83  auto Op = MI.getOpcode();
84  if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
85  return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
86  }
87 
88  // Meta-instructions emit no code.
89  if (MI.isMetaInstruction())
90  return 0;
91 
92  // FIXME: We currently only handle pseudoinstructions that don't get expanded
93  // before the assembly printer.
94  unsigned NumBytes = 0;
95  const MCInstrDesc &Desc = MI.getDesc();
96  switch (Desc.getOpcode()) {
97  default:
98  // Anything not explicitly designated otherwise is a normal 4-byte insn.
99  NumBytes = 4;
100  break;
101  case TargetOpcode::STACKMAP:
102  // The upper bound for a stackmap intrinsic is the full length of its shadow
103  NumBytes = StackMapOpers(&MI).getNumPatchBytes();
104  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
105  break;
106  case TargetOpcode::PATCHPOINT:
107  // The size of the patchpoint intrinsic is the number of bytes requested
108  NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
109  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
110  break;
111  case TargetOpcode::STATEPOINT:
112  NumBytes = StatepointOpers(&MI).getNumPatchBytes();
113  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
114  // No patch bytes means a normal call inst is emitted
115  if (NumBytes == 0)
116  NumBytes = 4;
117  break;
118  case AArch64::TLSDESC_CALLSEQ:
119  // This gets lowered to an instruction sequence which takes 16 bytes
120  NumBytes = 16;
121  break;
122  case AArch64::SpeculationBarrierISBDSBEndBB:
123  // This gets lowered to 2 4-byte instructions.
124  NumBytes = 8;
125  break;
126  case AArch64::SpeculationBarrierSBEndBB:
127  // This gets lowered to 1 4-byte instruction.
128  NumBytes = 4;
129  break;
130  case AArch64::JumpTableDest32:
131  case AArch64::JumpTableDest16:
132  case AArch64::JumpTableDest8:
133  NumBytes = 12;
134  break;
135  case AArch64::SPACE:
136  NumBytes = MI.getOperand(1).getImm();
137  break;
138  case AArch64::StoreSwiftAsyncContext:
139  NumBytes = 20;
140  break;
141  case TargetOpcode::BUNDLE:
142  NumBytes = getInstBundleLength(MI);
143  break;
144  }
145 
146  return NumBytes;
147 }
148 
149 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
150  unsigned Size = 0;
151  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
152  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
153  while (++I != E && I->isInsideBundle()) {
154  assert(!I->isBundle() && "No nested bundle!");
155  Size += getInstSizeInBytes(*I);
156  }
157  return Size;
158 }
159 
160 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
161  SmallVectorImpl<MachineOperand> &Cond) {
162  // Block ends with fall-through condbranch.
163  switch (LastInst->getOpcode()) {
164  default:
165  llvm_unreachable("Unknown branch instruction?");
166  case AArch64::Bcc:
167  Target = LastInst->getOperand(1).getMBB();
168  Cond.push_back(LastInst->getOperand(0));
169  break;
170  case AArch64::CBZW:
171  case AArch64::CBZX:
172  case AArch64::CBNZW:
173  case AArch64::CBNZX:
174  Target = LastInst->getOperand(1).getMBB();
175  Cond.push_back(MachineOperand::CreateImm(-1));
176  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
177  Cond.push_back(LastInst->getOperand(0));
178  break;
179  case AArch64::TBZW:
180  case AArch64::TBZX:
181  case AArch64::TBNZW:
182  case AArch64::TBNZX:
183  Target = LastInst->getOperand(2).getMBB();
184  Cond.push_back(MachineOperand::CreateImm(-1));
185  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
186  Cond.push_back(LastInst->getOperand(0));
187  Cond.push_back(LastInst->getOperand(1));
188  }
189 }
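// Illustrative note on the Cond encoding produced above (derived from the
// cases in parseCondBranch, not a separate contract):
//   Bcc:          Cond = { <AArch64CC condition code> }
//   CB(N)Z[W|X]:  Cond = { -1, <opcode>, <register> }
//   TB(N)Z[W|X]:  Cond = { -1, <opcode>, <register>, <bit number> }
// The leading -1 is what lets reverseBranchCondition() and
// instantiateCondBranch() tell the folded compare-and-branch forms apart
// from a plain Bcc.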
190 
191 static unsigned getBranchDisplacementBits(unsigned Opc) {
192  switch (Opc) {
193  default:
194  llvm_unreachable("unexpected opcode!");
195  case AArch64::B:
196  return 64;
197  case AArch64::TBNZW:
198  case AArch64::TBZW:
199  case AArch64::TBNZX:
200  case AArch64::TBZX:
201  return TBZDisplacementBits;
202  case AArch64::CBNZW:
203  case AArch64::CBZW:
204  case AArch64::CBNZX:
205  case AArch64::CBZX:
206  return CBZDisplacementBits;
207  case AArch64::Bcc:
208  return BCCDisplacementBits;
209  }
210 }
211 
212 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
213  int64_t BrOffset) const {
214  unsigned Bits = getBranchDisplacementBits(BranchOp);
215  assert(Bits >= 3 && "max branch displacement must be enough to jump"
216  " over conditional branch expansion");
217  return isIntN(Bits, BrOffset / 4);
218 }
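// For example: with the default settings above, TB[N]Z has a 14-bit signed
// word displacement (about +/-32 KiB of byte offset), CB[N]Z and Bcc have
// 19 bits (about +/-1 MiB), and an unconditional B is treated as effectively
// unlimited (64 bits) for relaxation purposes.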
219 
220 MachineBasicBlock *AArch64InstrInfo::getBranchDestBlock(
221  const MachineInstr &MI) const {
222  switch (MI.getOpcode()) {
223  default:
224  llvm_unreachable("unexpected opcode!");
225  case AArch64::B:
226  return MI.getOperand(0).getMBB();
227  case AArch64::TBZW:
228  case AArch64::TBNZW:
229  case AArch64::TBZX:
230  case AArch64::TBNZX:
231  return MI.getOperand(2).getMBB();
232  case AArch64::CBZW:
233  case AArch64::CBNZW:
234  case AArch64::CBZX:
235  case AArch64::CBNZX:
236  case AArch64::Bcc:
237  return MI.getOperand(1).getMBB();
238  }
239 }
240 
241 // Branch analysis.
242 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
243  MachineBasicBlock *&TBB,
244  MachineBasicBlock *&FBB,
245  SmallVectorImpl<MachineOperand> &Cond,
246  bool AllowModify) const {
247  // If the block has no terminators, it just falls into the block after it.
248  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
249  if (I == MBB.end())
250  return false;
251 
252  // Skip over SpeculationBarrierEndBB terminators
253  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
254  I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
255  --I;
256  }
257 
258  if (!isUnpredicatedTerminator(*I))
259  return false;
260 
261  // Get the last instruction in the block.
262  MachineInstr *LastInst = &*I;
263 
264  // If there is only one terminator instruction, process it.
265  unsigned LastOpc = LastInst->getOpcode();
266  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
267  if (isUncondBranchOpcode(LastOpc)) {
268  TBB = LastInst->getOperand(0).getMBB();
269  return false;
270  }
271  if (isCondBranchOpcode(LastOpc)) {
272  // Block ends with fall-through condbranch.
273  parseCondBranch(LastInst, TBB, Cond);
274  return false;
275  }
276  return true; // Can't handle indirect branch.
277  }
278 
279  // Get the instruction before it if it is a terminator.
280  MachineInstr *SecondLastInst = &*I;
281  unsigned SecondLastOpc = SecondLastInst->getOpcode();
282 
283  // If AllowModify is true and the block ends with two or more unconditional
284  // branches, delete all but the first unconditional branch.
285  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
286  while (isUncondBranchOpcode(SecondLastOpc)) {
287  LastInst->eraseFromParent();
288  LastInst = SecondLastInst;
289  LastOpc = LastInst->getOpcode();
290  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
291  // Return now; the only terminator is an unconditional branch.
292  TBB = LastInst->getOperand(0).getMBB();
293  return false;
294  } else {
295  SecondLastInst = &*I;
296  SecondLastOpc = SecondLastInst->getOpcode();
297  }
298  }
299  }
300 
301  // If we're allowed to modify and the block ends in an unconditional branch
302  // which could simply fallthrough, remove the branch. (Note: This case only
303  // matters when we can't understand the whole sequence, otherwise it's also
304  // handled by BranchFolding.cpp.)
305  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
306  MBB.isLayoutSuccessor(LastInst->getOperand(0).getMBB())) {
307  LastInst->eraseFromParent();
308  LastInst = SecondLastInst;
309  LastOpc = LastInst->getOpcode();
310  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
311  assert(!isUncondBranchOpcode(LastOpc) &&
312  "unreachable unconditional branches removed above");
313 
314  if (isCondBranchOpcode(LastOpc)) {
315  // Block ends with fall-through condbranch.
316  parseCondBranch(LastInst, TBB, Cond);
317  return false;
318  }
319  return true; // Can't handle indirect branch.
320  } else {
321  SecondLastInst = &*I;
322  SecondLastOpc = SecondLastInst->getOpcode();
323  }
324  }
325 
326  // If there are three terminators, we don't know what sort of block this is.
327  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
328  return true;
329 
330  // If the block ends with a B and a Bcc, handle it.
331  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
332  parseCondBranch(SecondLastInst, TBB, Cond);
333  FBB = LastInst->getOperand(0).getMBB();
334  return false;
335  }
336 
337  // If the block ends with two unconditional branches, handle it. The second
338  // one is not executed, so remove it.
339  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
340  TBB = SecondLastInst->getOperand(0).getMBB();
341  I = LastInst;
342  if (AllowModify)
343  I->eraseFromParent();
344  return false;
345  }
346 
347  // ...likewise if it ends with an indirect branch followed by an unconditional
348  // branch.
349  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
350  I = LastInst;
351  if (AllowModify)
352  I->eraseFromParent();
353  return true;
354  }
355 
356  // Otherwise, can't handle this.
357  return true;
358 }
359 
360 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
361  MachineBranchPredicate &MBP,
362  bool AllowModify) const {
363  // For the moment, handle only a block which ends with a cb(n)zx followed by
364  // a fallthrough. Why this? Because it is a common form.
365  // TODO: Should we handle b.cc?
366 
367  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
368  if (I == MBB.end())
369  return true;
370 
371  // Skip over SpeculationBarrierEndBB terminators
372  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
373  I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
374  --I;
375  }
376 
377  if (!isUnpredicatedTerminator(*I))
378  return true;
379 
380  // Get the last instruction in the block.
381  MachineInstr *LastInst = &*I;
382  unsigned LastOpc = LastInst->getOpcode();
383  if (!isCondBranchOpcode(LastOpc))
384  return true;
385 
386  switch (LastOpc) {
387  default:
388  return true;
389  case AArch64::CBZW:
390  case AArch64::CBZX:
391  case AArch64::CBNZW:
392  case AArch64::CBNZX:
393  break;
394  };
395 
396  MBP.TrueDest = LastInst->getOperand(1).getMBB();
397  assert(MBP.TrueDest && "expected!");
398  MBP.FalseDest = MBB.getNextNode();
399 
400  MBP.ConditionDef = nullptr;
401  MBP.SingleUseCondition = false;
402 
403  MBP.LHS = LastInst->getOperand(0);
404  MBP.RHS = MachineOperand::CreateImm(0);
405  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
406  : MachineBranchPredicate::PRED_EQ;
407  return false;
408 }
409 
410 bool AArch64InstrInfo::reverseBranchCondition(
411  SmallVectorImpl<MachineOperand> &Cond) const {
412  if (Cond[0].getImm() != -1) {
413  // Regular Bcc
414  AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
415  Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
416  } else {
417  // Folded compare-and-branch
418  switch (Cond[1].getImm()) {
419  default:
420  llvm_unreachable("Unknown conditional branch!");
421  case AArch64::CBZW:
422  Cond[1].setImm(AArch64::CBNZW);
423  break;
424  case AArch64::CBNZW:
425  Cond[1].setImm(AArch64::CBZW);
426  break;
427  case AArch64::CBZX:
428  Cond[1].setImm(AArch64::CBNZX);
429  break;
430  case AArch64::CBNZX:
431  Cond[1].setImm(AArch64::CBZX);
432  break;
433  case AArch64::TBZW:
434  Cond[1].setImm(AArch64::TBNZW);
435  break;
436  case AArch64::TBNZW:
437  Cond[1].setImm(AArch64::TBZW);
438  break;
439  case AArch64::TBZX:
440  Cond[1].setImm(AArch64::TBNZX);
441  break;
442  case AArch64::TBNZX:
443  Cond[1].setImm(AArch64::TBZX);
444  break;
445  }
446  }
447 
448  return false;
449 }
450 
451 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
452  int *BytesRemoved) const {
453  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
454  if (I == MBB.end())
455  return 0;
456 
457  if (!isUncondBranchOpcode(I->getOpcode()) &&
458  !isCondBranchOpcode(I->getOpcode()))
459  return 0;
460 
461  // Remove the branch.
462  I->eraseFromParent();
463 
464  I = MBB.end();
465 
466  if (I == MBB.begin()) {
467  if (BytesRemoved)
468  *BytesRemoved = 4;
469  return 1;
470  }
471  --I;
472  if (!isCondBranchOpcode(I->getOpcode())) {
473  if (BytesRemoved)
474  *BytesRemoved = 4;
475  return 1;
476  }
477 
478  // Remove the branch.
479  I->eraseFromParent();
480  if (BytesRemoved)
481  *BytesRemoved = 8;
482 
483  return 2;
484 }
485 
486 void AArch64InstrInfo::instantiateCondBranch(
487  MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
488  ArrayRef<MachineOperand> Cond) const {
489  if (Cond[0].getImm() != -1) {
490  // Regular Bcc
491  BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
492  } else {
493  // Folded compare-and-branch
494  // Note that we use addOperand instead of addReg to keep the flags.
495  const MachineInstrBuilder MIB =
496  BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
497  if (Cond.size() > 3)
498  MIB.addImm(Cond[3].getImm());
499  MIB.addMBB(TBB);
500  }
501 }
502 
503 unsigned AArch64InstrInfo::insertBranch(MachineBasicBlock &MBB,
504  MachineBasicBlock *TBB, MachineBasicBlock *FBB,
505  ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
506  // Shouldn't be a fall through.
507  assert(TBB && "insertBranch must not be told to insert a fallthrough");
508 
509  if (!FBB) {
510  if (Cond.empty()) // Unconditional branch?
511  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
512  else
513  instantiateCondBranch(MBB, DL, TBB, Cond);
514 
515  if (BytesAdded)
516  *BytesAdded = 4;
517 
518  return 1;
519  }
520 
521  // Two-way conditional branch.
522  instantiateCondBranch(MBB, DL, TBB, Cond);
523  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
524 
525  if (BytesAdded)
526  *BytesAdded = 8;
527 
528  return 2;
529 }
530 
531 // Find the original register that VReg is copied from.
532 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
533  while (Register::isVirtualRegister(VReg)) {
534  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
535  if (!DefMI->isFullCopy())
536  return VReg;
537  VReg = DefMI->getOperand(1).getReg();
538  }
539  return VReg;
540 }
541 
542 // Determine if VReg is defined by an instruction that can be folded into a
543 // csel instruction. If so, return the folded opcode, and the replacement
544 // register.
545 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
546  unsigned *NewVReg = nullptr) {
547  VReg = removeCopies(MRI, VReg);
548  if (!Register::isVirtualRegister(VReg))
549  return 0;
550 
551  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
552  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
553  unsigned Opc = 0;
554  unsigned SrcOpNum = 0;
555  switch (DefMI->getOpcode()) {
556  case AArch64::ADDSXri:
557  case AArch64::ADDSWri:
558  // if NZCV is used, do not fold.
559  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
560  return 0;
561  // fall-through to ADDXri and ADDWri.
562  LLVM_FALLTHROUGH;
563  case AArch64::ADDXri:
564  case AArch64::ADDWri:
565  // add x, 1 -> csinc.
566  if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
567  DefMI->getOperand(3).getImm() != 0)
568  return 0;
569  SrcOpNum = 1;
570  Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
571  break;
572 
573  case AArch64::ORNXrr:
574  case AArch64::ORNWrr: {
575  // not x -> csinv, represented as orn dst, xzr, src.
576  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
577  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
578  return 0;
579  SrcOpNum = 2;
580  Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
581  break;
582  }
583 
584  case AArch64::SUBSXrr:
585  case AArch64::SUBSWrr:
586  // if NZCV is used, do not fold.
587  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
588  return 0;
589  // fall-through to SUBXrr and SUBWrr.
590  LLVM_FALLTHROUGH;
591  case AArch64::SUBXrr:
592  case AArch64::SUBWrr: {
593  // neg x -> csneg, represented as sub dst, xzr, src.
594  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
595  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
596  return 0;
597  SrcOpNum = 2;
598  Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
599  break;
600  }
601  default:
602  return 0;
603  }
604  assert(Opc && SrcOpNum && "Missing parameters");
605 
606  if (NewVReg)
607  *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
608  return Opc;
609 }
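// For example (an informal sketch of the fold this function enables):
//   %x = ADDWri %a, 1, 0            ; a + 1
//   %d = CSELWr %t, %x, <cc>        ; cc ? t : a + 1
// can become
//   %d = CSINCWr %t, %a, <cc>       ; csinc: cc ? t : a + 1
// and, similarly, ORN/SUB of the zero register map to CSINV/CSNEG.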
610 
611 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
612  ArrayRef<MachineOperand> Cond,
613  Register DstReg, Register TrueReg,
614  Register FalseReg, int &CondCycles,
615  int &TrueCycles,
616  int &FalseCycles) const {
617  // Check register classes.
618  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
619  const TargetRegisterClass *RC =
620  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
621  if (!RC)
622  return false;
623 
624  // Also need to check the dest regclass, in case we're trying to optimize
625  // something like:
626  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
627  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
628  return false;
629 
630  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
631  unsigned ExtraCondLat = Cond.size() != 1;
632 
633  // GPRs are handled by csel.
634  // FIXME: Fold in x+1, -x, and ~x when applicable.
635  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
636  AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
637  // Single-cycle csel, csinc, csinv, and csneg.
638  CondCycles = 1 + ExtraCondLat;
639  TrueCycles = FalseCycles = 1;
640  if (canFoldIntoCSel(MRI, TrueReg))
641  TrueCycles = 0;
642  else if (canFoldIntoCSel(MRI, FalseReg))
643  FalseCycles = 0;
644  return true;
645  }
646 
647  // Scalar floating point is handled by fcsel.
648  // FIXME: Form fabs, fmin, and fmax when applicable.
649  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
650  AArch64::FPR32RegClass.hasSubClassEq(RC)) {
651  CondCycles = 5 + ExtraCondLat;
652  TrueCycles = FalseCycles = 2;
653  return true;
654  }
655 
656  // Can't do vectors.
657  return false;
658 }
659 
660 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
661  MachineBasicBlock::iterator I,
662  const DebugLoc &DL, Register DstReg,
663  ArrayRef<MachineOperand> Cond,
664  Register TrueReg, Register FalseReg) const {
665  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
666 
667  // Parse the condition code, see parseCondBranch() above.
668  AArch64CC::CondCode CC;
669  switch (Cond.size()) {
670  default:
671  llvm_unreachable("Unknown condition opcode in Cond");
672  case 1: // b.cc
673  CC = AArch64CC::CondCode(Cond[0].getImm());
674  break;
675  case 3: { // cbz/cbnz
676  // We must insert a compare against 0.
677  bool Is64Bit;
678  switch (Cond[1].getImm()) {
679  default:
680  llvm_unreachable("Unknown branch opcode in Cond");
681  case AArch64::CBZW:
682  Is64Bit = false;
683  CC = AArch64CC::EQ;
684  break;
685  case AArch64::CBZX:
686  Is64Bit = true;
687  CC = AArch64CC::EQ;
688  break;
689  case AArch64::CBNZW:
690  Is64Bit = false;
691  CC = AArch64CC::NE;
692  break;
693  case AArch64::CBNZX:
694  Is64Bit = true;
695  CC = AArch64CC::NE;
696  break;
697  }
698  Register SrcReg = Cond[2].getReg();
699  if (Is64Bit) {
700  // cmp reg, #0 is actually subs xzr, reg, #0.
701  MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
702  BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
703  .addReg(SrcReg)
704  .addImm(0)
705  .addImm(0);
706  } else {
707  MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
708  BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
709  .addReg(SrcReg)
710  .addImm(0)
711  .addImm(0);
712  }
713  break;
714  }
715  case 4: { // tbz/tbnz
716  // We must insert a tst instruction.
717  switch (Cond[1].getImm()) {
718  default:
719  llvm_unreachable("Unknown branch opcode in Cond");
720  case AArch64::TBZW:
721  case AArch64::TBZX:
722  CC = AArch64CC::EQ;
723  break;
724  case AArch64::TBNZW:
725  case AArch64::TBNZX:
726  CC = AArch64CC::NE;
727  break;
728  }
729  // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
730  if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
731  BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
732  .addReg(Cond[2].getReg())
733  .addImm(
734  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
735  else
736  BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
737  .addReg(Cond[2].getReg())
738  .addImm(
739  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
740  break;
741  }
742  }
743 
744  unsigned Opc = 0;
745  const TargetRegisterClass *RC = nullptr;
746  bool TryFold = false;
747  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
748  RC = &AArch64::GPR64RegClass;
749  Opc = AArch64::CSELXr;
750  TryFold = true;
751  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
752  RC = &AArch64::GPR32RegClass;
753  Opc = AArch64::CSELWr;
754  TryFold = true;
755  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
756  RC = &AArch64::FPR64RegClass;
757  Opc = AArch64::FCSELDrrr;
758  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
759  RC = &AArch64::FPR32RegClass;
760  Opc = AArch64::FCSELSrrr;
761  }
762  assert(RC && "Unsupported regclass");
763 
764  // Try folding simple instructions into the csel.
765  if (TryFold) {
766  unsigned NewVReg = 0;
767  unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
768  if (FoldedOpc) {
769  // The folded opcodes csinc, csinv and csneg apply the operation to
770  // FalseReg, so we need to invert the condition.
771  CC = AArch64CC::getInvertedCondCode(CC);
772  TrueReg = FalseReg;
773  } else
774  FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
775 
776  // Fold the operation. Leave any dead instructions for DCE to clean up.
777  if (FoldedOpc) {
778  FalseReg = NewVReg;
779  Opc = FoldedOpc;
780  // This extends the live range of NewVReg.
781  MRI.clearKillFlags(NewVReg);
782  }
783  }
784 
785  // Pull all virtual registers into the appropriate class.
786  MRI.constrainRegClass(TrueReg, RC);
787  MRI.constrainRegClass(FalseReg, RC);
788 
789  // Insert the csel.
790  BuildMI(MBB, I, DL, get(Opc), DstReg)
791  .addReg(TrueReg)
792  .addReg(FalseReg)
793  .addImm(CC);
794 }
795 
796 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
797 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
798  uint64_t Imm = MI.getOperand(1).getImm();
799  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
800  uint64_t Encoding;
801  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
802 }
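// For example: MOVi32imm 0x00ff00ff passes processLogicalImmediate, so it can
// be materialized as a single "orr wN, wzr, #0x00ff00ff" instead of a
// movz/movk pair, which is why it is treated as cheap as a move below.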
803 
804 // FIXME: this implementation should be micro-architecture dependent, so a
805 // micro-architecture target hook should be introduced here in future.
806 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
807  if (!Subtarget.hasCustomCheapAsMoveHandling())
808  return MI.isAsCheapAsAMove();
809 
810  const unsigned Opcode = MI.getOpcode();
811 
812  // Firstly, check cases gated by features.
813 
814  if (Subtarget.hasZeroCycleZeroingFP()) {
815  if (Opcode == AArch64::FMOVH0 ||
816  Opcode == AArch64::FMOVS0 ||
817  Opcode == AArch64::FMOVD0)
818  return true;
819  }
820 
821  if (Subtarget.hasZeroCycleZeroingGP()) {
822  if (Opcode == TargetOpcode::COPY &&
823  (MI.getOperand(1).getReg() == AArch64::WZR ||
824  MI.getOperand(1).getReg() == AArch64::XZR))
825  return true;
826  }
827 
828  // Secondly, check cases specific to sub-targets.
829 
830  if (Subtarget.hasExynosCheapAsMoveHandling()) {
831  if (isExynosCheapAsMove(MI))
832  return true;
833 
834  return MI.isAsCheapAsAMove();
835  }
836 
837  // Finally, check generic cases.
838 
839  switch (Opcode) {
840  default:
841  return false;
842 
843  // add/sub on register without shift
844  case AArch64::ADDWri:
845  case AArch64::ADDXri:
846  case AArch64::SUBWri:
847  case AArch64::SUBXri:
848  return (MI.getOperand(3).getImm() == 0);
849 
850  // logical ops on immediate
851  case AArch64::ANDWri:
852  case AArch64::ANDXri:
853  case AArch64::EORWri:
854  case AArch64::EORXri:
855  case AArch64::ORRWri:
856  case AArch64::ORRXri:
857  return true;
858 
859  // logical ops on register without shift
860  case AArch64::ANDWrr:
861  case AArch64::ANDXrr:
862  case AArch64::BICWrr:
863  case AArch64::BICXrr:
864  case AArch64::EONWrr:
865  case AArch64::EONXrr:
866  case AArch64::EORWrr:
867  case AArch64::EORXrr:
868  case AArch64::ORNWrr:
869  case AArch64::ORNXrr:
870  case AArch64::ORRWrr:
871  case AArch64::ORRXrr:
872  return true;
873 
874  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
875  // ORRXri, it is as cheap as MOV
876  case AArch64::MOVi32imm:
877  return canBeExpandedToORR(MI, 32);
878  case AArch64::MOVi64imm:
879  return canBeExpandedToORR(MI, 64);
880  }
881 
882  llvm_unreachable("Unknown opcode to check as cheap as a move!");
883 }
884 
885 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
886  switch (MI.getOpcode()) {
887  default:
888  return false;
889 
890  case AArch64::ADDWrs:
891  case AArch64::ADDXrs:
892  case AArch64::ADDSWrs:
893  case AArch64::ADDSXrs: {
894  unsigned Imm = MI.getOperand(3).getImm();
895  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
896  if (ShiftVal == 0)
897  return true;
898  return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
899  }
900 
901  case AArch64::ADDWrx:
902  case AArch64::ADDXrx:
903  case AArch64::ADDXrx64:
904  case AArch64::ADDSWrx:
905  case AArch64::ADDSXrx:
906  case AArch64::ADDSXrx64: {
907  unsigned Imm = MI.getOperand(3).getImm();
908  switch (AArch64_AM::getArithExtendType(Imm)) {
909  default:
910  return false;
911  case AArch64_AM::UXTB:
912  case AArch64_AM::UXTH:
913  case AArch64_AM::UXTW:
914  case AArch64_AM::UXTX:
915  return AArch64_AM::getArithShiftValue(Imm) <= 4;
916  }
917  }
918 
919  case AArch64::SUBWrs:
920  case AArch64::SUBSWrs: {
921  unsigned Imm = MI.getOperand(3).getImm();
922  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
923  return ShiftVal == 0 ||
924  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
925  }
926 
927  case AArch64::SUBXrs:
928  case AArch64::SUBSXrs: {
929  unsigned Imm = MI.getOperand(3).getImm();
930  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
931  return ShiftVal == 0 ||
932  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
933  }
934 
935  case AArch64::SUBWrx:
936  case AArch64::SUBXrx:
937  case AArch64::SUBXrx64:
938  case AArch64::SUBSWrx:
939  case AArch64::SUBSXrx:
940  case AArch64::SUBSXrx64: {
941  unsigned Imm = MI.getOperand(3).getImm();
942  switch (AArch64_AM::getArithExtendType(Imm)) {
943  default:
944  return false;
945  case AArch64_AM::UXTB:
946  case AArch64_AM::UXTH:
947  case AArch64_AM::UXTW:
948  case AArch64_AM::UXTX:
949  return AArch64_AM::getArithShiftValue(Imm) == 0;
950  }
951  }
952 
953  case AArch64::LDRBBroW:
954  case AArch64::LDRBBroX:
955  case AArch64::LDRBroW:
956  case AArch64::LDRBroX:
957  case AArch64::LDRDroW:
958  case AArch64::LDRDroX:
959  case AArch64::LDRHHroW:
960  case AArch64::LDRHHroX:
961  case AArch64::LDRHroW:
962  case AArch64::LDRHroX:
963  case AArch64::LDRQroW:
964  case AArch64::LDRQroX:
965  case AArch64::LDRSBWroW:
966  case AArch64::LDRSBWroX:
967  case AArch64::LDRSBXroW:
968  case AArch64::LDRSBXroX:
969  case AArch64::LDRSHWroW:
970  case AArch64::LDRSHWroX:
971  case AArch64::LDRSHXroW:
972  case AArch64::LDRSHXroX:
973  case AArch64::LDRSWroW:
974  case AArch64::LDRSWroX:
975  case AArch64::LDRSroW:
976  case AArch64::LDRSroX:
977  case AArch64::LDRWroW:
978  case AArch64::LDRWroX:
979  case AArch64::LDRXroW:
980  case AArch64::LDRXroX:
981  case AArch64::PRFMroW:
982  case AArch64::PRFMroX:
983  case AArch64::STRBBroW:
984  case AArch64::STRBBroX:
985  case AArch64::STRBroW:
986  case AArch64::STRBroX:
987  case AArch64::STRDroW:
988  case AArch64::STRDroX:
989  case AArch64::STRHHroW:
990  case AArch64::STRHHroX:
991  case AArch64::STRHroW:
992  case AArch64::STRHroX:
993  case AArch64::STRQroW:
994  case AArch64::STRQroX:
995  case AArch64::STRSroW:
996  case AArch64::STRSroX:
997  case AArch64::STRWroW:
998  case AArch64::STRWroX:
999  case AArch64::STRXroW:
1000  case AArch64::STRXroX: {
1001  unsigned IsSigned = MI.getOperand(3).getImm();
1002  return !IsSigned;
1003  }
1004  }
1005 }
1006 
1007 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1008  unsigned Opc = MI.getOpcode();
1009  switch (Opc) {
1010  default:
1011  return false;
1012  case AArch64::SEH_StackAlloc:
1013  case AArch64::SEH_SaveFPLR:
1014  case AArch64::SEH_SaveFPLR_X:
1015  case AArch64::SEH_SaveReg:
1016  case AArch64::SEH_SaveReg_X:
1017  case AArch64::SEH_SaveRegP:
1018  case AArch64::SEH_SaveRegP_X:
1019  case AArch64::SEH_SaveFReg:
1020  case AArch64::SEH_SaveFReg_X:
1021  case AArch64::SEH_SaveFRegP:
1022  case AArch64::SEH_SaveFRegP_X:
1023  case AArch64::SEH_SetFP:
1024  case AArch64::SEH_AddFP:
1025  case AArch64::SEH_Nop:
1026  case AArch64::SEH_PrologEnd:
1027  case AArch64::SEH_EpilogStart:
1028  case AArch64::SEH_EpilogEnd:
1029  return true;
1030  }
1031 }
1032 
1033 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1034  Register &SrcReg, Register &DstReg,
1035  unsigned &SubIdx) const {
1036  switch (MI.getOpcode()) {
1037  default:
1038  return false;
1039  case AArch64::SBFMXri: // aka sxtw
1040  case AArch64::UBFMXri: // aka uxtw
1041  // Check for the 32 -> 64 bit extension case, these instructions can do
1042  // much more.
1043  if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1044  return false;
1045  // This is a signed or unsigned 32 -> 64 bit extension.
1046  SrcReg = MI.getOperand(1).getReg();
1047  DstReg = MI.getOperand(0).getReg();
1048  SubIdx = AArch64::sub_32;
1049  return true;
1050  }
1051 }
1052 
1053 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1054  const MachineInstr &MIa, const MachineInstr &MIb) const {
1055  const TargetRegisterInfo *TRI = &getRegisterInfo();
1056  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1057  int64_t OffsetA = 0, OffsetB = 0;
1058  unsigned WidthA = 0, WidthB = 0;
1059  bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1060 
1061  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1062  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1063 
1064  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1065  MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1066  return false;
1067 
1068  // Retrieve the base, the offset from the base, and the width. The width is
1069  // the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the
1070  // bases are identical, and the offset of the lower memory access plus its
1071  // width doesn't overlap the offset of the higher memory access,
1072  // then the memory accesses are different.
1073  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1074  // are assumed to have the same scale (vscale).
1075  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1076  WidthA, TRI) &&
1077  getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1078  WidthB, TRI)) {
1079  if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1080  OffsetAIsScalable == OffsetBIsScalable) {
1081  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1082  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1083  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1084  if (LowOffset + LowWidth <= HighOffset)
1085  return true;
1086  }
1087  }
1088  return false;
1089 }
1090 
1091 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1092  const MachineBasicBlock *MBB,
1093  const MachineFunction &MF) const {
1094  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1095  return true;
1096  switch (MI.getOpcode()) {
1097  case AArch64::HINT:
1098  // CSDB hints are scheduling barriers.
1099  if (MI.getOperand(0).getImm() == 0x14)
1100  return true;
1101  break;
1102  case AArch64::DSB:
1103  case AArch64::ISB:
1104  // DSB and ISB also are scheduling barriers.
1105  return true;
1106  default:;
1107  }
1108  return isSEHInstruction(MI);
1109 }
1110 
1111 /// analyzeCompare - For a comparison instruction, return the source registers
1112 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1113 /// Return true if the comparison instruction can be analyzed.
1114 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1115  Register &SrcReg2, int64_t &CmpMask,
1116  int64_t &CmpValue) const {
1117  // The first operand can be a frame index where we'd normally expect a
1118  // register.
1119  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1120  if (!MI.getOperand(1).isReg())
1121  return false;
1122 
1123  switch (MI.getOpcode()) {
1124  default:
1125  break;
1126  case AArch64::PTEST_PP:
1127  SrcReg = MI.getOperand(0).getReg();
1128  SrcReg2 = MI.getOperand(1).getReg();
1129  // Not sure about the mask and value for now...
1130  CmpMask = ~0;
1131  CmpValue = 0;
1132  return true;
1133  case AArch64::SUBSWrr:
1134  case AArch64::SUBSWrs:
1135  case AArch64::SUBSWrx:
1136  case AArch64::SUBSXrr:
1137  case AArch64::SUBSXrs:
1138  case AArch64::SUBSXrx:
1139  case AArch64::ADDSWrr:
1140  case AArch64::ADDSWrs:
1141  case AArch64::ADDSWrx:
1142  case AArch64::ADDSXrr:
1143  case AArch64::ADDSXrs:
1144  case AArch64::ADDSXrx:
1145  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1146  SrcReg = MI.getOperand(1).getReg();
1147  SrcReg2 = MI.getOperand(2).getReg();
1148  CmpMask = ~0;
1149  CmpValue = 0;
1150  return true;
1151  case AArch64::SUBSWri:
1152  case AArch64::ADDSWri:
1153  case AArch64::SUBSXri:
1154  case AArch64::ADDSXri:
1155  SrcReg = MI.getOperand(1).getReg();
1156  SrcReg2 = 0;
1157  CmpMask = ~0;
1158  CmpValue = MI.getOperand(2).getImm();
1159  return true;
1160  case AArch64::ANDSWri:
1161  case AArch64::ANDSXri:
1162  // ANDS does not use the same encoding scheme as the other xxxS
1163  // instructions.
1164  SrcReg = MI.getOperand(1).getReg();
1165  SrcReg2 = 0;
1166  CmpMask = ~0;
1167  CmpValue = AArch64_AM::decodeLogicalImmediate(
1168  MI.getOperand(2).getImm(),
1169  MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1170  return true;
1171  }
1172 
1173  return false;
1174 }
1175 
1176 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1177  MachineBasicBlock *MBB = Instr.getParent();
1178  assert(MBB && "Can't get MachineBasicBlock here");
1179  MachineFunction *MF = MBB->getParent();
1180  assert(MF && "Can't get MachineFunction here");
1181  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1182  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1183  MachineRegisterInfo *MRI = &MF->getRegInfo();
1184 
1185  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1186  ++OpIdx) {
1187  MachineOperand &MO = Instr.getOperand(OpIdx);
1188  const TargetRegisterClass *OpRegCstraints =
1189  Instr.getRegClassConstraint(OpIdx, TII, TRI);
1190 
1191  // If there's no constraint, there's nothing to do.
1192  if (!OpRegCstraints)
1193  continue;
1194  // If the operand is a frame index, there's nothing to do here.
1195  // A frame index operand will resolve correctly during PEI.
1196  if (MO.isFI())
1197  continue;
1198 
1199  assert(MO.isReg() &&
1200  "Operand has register constraints without being a register!");
1201 
1202  Register Reg = MO.getReg();
1203  if (Register::isPhysicalRegister(Reg)) {
1204  if (!OpRegCstraints->contains(Reg))
1205  return false;
1206  } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1207  !MRI->constrainRegClass(Reg, OpRegCstraints))
1208  return false;
1209  }
1210 
1211  return true;
1212 }
1213 
1214 /// Return the opcode that does not set flags when possible - otherwise
1215 /// return the original opcode. The caller is responsible to do the actual
1216 /// substitution and legality checking.
1217 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1218  // Don't convert all compare instructions, because for some the zero register
1219  // encoding becomes the sp register.
1220  bool MIDefinesZeroReg = false;
1221  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1222  MIDefinesZeroReg = true;
1223 
1224  switch (MI.getOpcode()) {
1225  default:
1226  return MI.getOpcode();
1227  case AArch64::ADDSWrr:
1228  return AArch64::ADDWrr;
1229  case AArch64::ADDSWri:
1230  return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1231  case AArch64::ADDSWrs:
1232  return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1233  case AArch64::ADDSWrx:
1234  return AArch64::ADDWrx;
1235  case AArch64::ADDSXrr:
1236  return AArch64::ADDXrr;
1237  case AArch64::ADDSXri:
1238  return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1239  case AArch64::ADDSXrs:
1240  return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1241  case AArch64::ADDSXrx:
1242  return AArch64::ADDXrx;
1243  case AArch64::SUBSWrr:
1244  return AArch64::SUBWrr;
1245  case AArch64::SUBSWri:
1246  return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1247  case AArch64::SUBSWrs:
1248  return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1249  case AArch64::SUBSWrx:
1250  return AArch64::SUBWrx;
1251  case AArch64::SUBSXrr:
1252  return AArch64::SUBXrr;
1253  case AArch64::SUBSXri:
1254  return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1255  case AArch64::SUBSXrs:
1256  return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1257  case AArch64::SUBSXrx:
1258  return AArch64::SUBXrx;
1259  }
1260 }
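// For example: "SUBSWri %w8, %w9, 1, 0" whose NZCV def is dead becomes
// "SUBWri %w8, %w9, 1, 0"; the S form is kept only where dropping it would
// turn a WZR/XZR destination encoding into SP (see MIDefinesZeroReg above).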
1261 
1262 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1263 
1264 /// True when condition flags are accessed (either by writing or reading)
1265 /// on the instruction trace starting at From and ending at To.
1266 ///
1267 /// Note: If From and To are from different blocks it's assumed CC are accessed
1268 /// on the path.
1269 static bool areCFlagsAccessedBetweenInstrs(
1270  MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1271  const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1272  // Early exit if To is at the beginning of the BB.
1273  if (To == To->getParent()->begin())
1274  return true;
1275 
1276  // Check whether the instructions are in the same basic block
1277  // If not, assume the condition flags might get modified somewhere.
1278  if (To->getParent() != From->getParent())
1279  return true;
1280 
1281  // From must be above To.
1282  assert(std::any_of(
1283  ++To.getReverse(), To->getParent()->rend(),
1284  [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1285 
1286  // We iterate backward starting at \p To until we hit \p From.
1287  for (const MachineInstr &Instr :
1288  instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1289  if (((AccessToCheck & AK_Write) &&
1290  Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1291  ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1292  return true;
1293  }
1294  return false;
1295 }
1296 
1297 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1298 /// operation which could set the flags in an identical manner
1299 bool AArch64InstrInfo::optimizePTestInstr(
1300  MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1301  const MachineRegisterInfo *MRI) const {
1302  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1303  auto *Pred = MRI->getUniqueVRegDef(PredReg);
1304  auto NewOp = Pred->getOpcode();
1305  bool OpChanged = false;
1306 
1307  unsigned MaskOpcode = Mask->getOpcode();
1308  unsigned PredOpcode = Pred->getOpcode();
1309  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1310  bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1311 
1312  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) {
1313  // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't
1314  // deactivate any lanes OTHER_INST might set.
1315  uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode);
1316  uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1317 
1318  // Must be an all active predicate of matching element size.
1319  if ((PredElementSize != MaskElementSize) ||
1320  (Mask->getOperand(1).getImm() != 31))
1321  return false;
1322 
1323  // Fallthough to simply remove the PTEST.
1324  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) {
1325  // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1326  // instruction that sets the flags as PTEST would.
1327 
1328  // Fallthough to simply remove the PTEST.
1329  } else if (PredIsPTestLike) {
1330  // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both
1331  // instructions use the same predicate.
1332  auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1333  if (Mask != PTestLikeMask)
1334  return false;
1335 
1336  // Fallthough to simply remove the PTEST.
1337  } else {
1338  switch (Pred->getOpcode()) {
1339  case AArch64::BRKB_PPzP:
1340  case AArch64::BRKPB_PPzPP: {
1341  // Op 0 is chain, 1 is the mask, 2 the previous predicate to
1342  // propagate, 3 the new predicate.
1343 
1344  // Check to see if our mask is the same as the brkpb's. If
1345  // not the resulting flag bits may be different and we
1346  // can't remove the ptest.
1347  auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1348  if (Mask != PredMask)
1349  return false;
1350 
1351  // Switch to the new opcode
1352  NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? AArch64::BRKBS_PPzP
1353  : AArch64::BRKPBS_PPzPP;
1354  OpChanged = true;
1355  break;
1356  }
1357  case AArch64::BRKN_PPzP: {
1358  auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1359  if (Mask != PredMask)
1360  return false;
1361 
1362  NewOp = AArch64::BRKNS_PPzP;
1363  OpChanged = true;
1364  break;
1365  }
1366  case AArch64::RDFFR_PPz: {
1367  // rdffr p1.b, PredMask=p0/z <--- Definition of Pred
1368  // ptest Mask=p0, Pred=p1.b <--- If equal masks, remove this and use
1369  // `rdffrs p1.b, p0/z` above.
1370  auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1371  if (Mask != PredMask)
1372  return false;
1373 
1374  NewOp = AArch64::RDFFRS_PPz;
1375  OpChanged = true;
1376  break;
1377  }
1378  default:
1379  // Bail out if we don't recognize the input
1380  return false;
1381  }
1382  }
1383 
1384  auto *TRI = MRI->getTargetRegisterInfo();
1385 
1386  // If another instruction between Pred and PTest accesses flags, don't remove
1387  // the ptest or update the earlier instruction to modify them.
1388  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1389  return false;
1390 
1391  // If we pass all the checks, it's safe to remove the PTEST and use the flags
1392  // as they are prior to PTEST. Sometimes this requires the tested PTEST
1393  // operand to be replaced with an equivalent instruction that also sets the
1394  // flags.
1395  Pred->setDesc(get(NewOp));
1396  PTest->eraseFromParent();
1397  if (OpChanged) {
1398  bool succeeded = UpdateOperandRegClass(*Pred);
1399  (void)succeeded;
1400  assert(succeeded && "Operands have incompatible register classes!");
1401  Pred->addRegisterDefined(AArch64::NZCV, TRI);
1402  }
1403 
1404  // Ensure that the flags def is live.
1405  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1406  unsigned i = 0, e = Pred->getNumOperands();
1407  for (; i != e; ++i) {
1408  MachineOperand &MO = Pred->getOperand(i);
1409  if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1410  MO.setIsDead(false);
1411  break;
1412  }
1413  }
1414  }
1415  return true;
1416 }
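// For example (informal): in a sequence such as
//   ptrue   p0.b                    ; all-active mask (pattern 31)
//   whilelo p1.b, w0, w1            ; while-like op, already sets NZCV
//   ptest   p0, p1.b
// the ptest is redundant and is erased, letting later branches use the flags
// produced by the whilelo directly.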
1417 
1418 /// Try to optimize a compare instruction. A compare instruction is an
1419 /// instruction which produces AArch64::NZCV. It can be truly a compare
1420 /// instruction
1421 /// when there are no uses of its destination register.
1422 ///
1423 /// The following steps are tried in order:
1424 /// 1. Convert CmpInstr into an unconditional version.
1425 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1426 /// condition code or an instruction which can be converted into such an
1427 /// instruction.
1428 /// Only comparison with zero is supported.
1429 bool AArch64InstrInfo::optimizeCompareInstr(
1430  MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1431  int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1432  assert(CmpInstr.getParent());
1433  assert(MRI);
1434 
1435  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1436  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1437  if (DeadNZCVIdx != -1) {
1438  if (CmpInstr.definesRegister(AArch64::WZR) ||
1439  CmpInstr.definesRegister(AArch64::XZR)) {
1440  CmpInstr.eraseFromParent();
1441  return true;
1442  }
1443  unsigned Opc = CmpInstr.getOpcode();
1444  unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1445  if (NewOpc == Opc)
1446  return false;
1447  const MCInstrDesc &MCID = get(NewOpc);
1448  CmpInstr.setDesc(MCID);
1449  CmpInstr.RemoveOperand(DeadNZCVIdx);
1450  bool succeeded = UpdateOperandRegClass(CmpInstr);
1451  (void)succeeded;
1452  assert(succeeded && "Some operands reg class are incompatible!");
1453  return true;
1454  }
1455 
1456  if (CmpInstr.getOpcode() == AArch64::PTEST_PP)
1457  return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1458 
1459  if (SrcReg2 != 0)
1460  return false;
1461 
1462  // CmpInstr is a Compare instruction if destination register is not used.
1463  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1464  return false;
1465 
1466  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1467  return true;
1468  return (CmpValue == 0 || CmpValue == 1) &&
1469  removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1470 }
1471 
1472 /// Get opcode of S version of Instr.
1473 /// If Instr is S version its opcode is returned.
1474 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S version
1475 /// or we are not interested in it.
1476 static unsigned sForm(MachineInstr &Instr) {
1477  switch (Instr.getOpcode()) {
1478  default:
1479  return AArch64::INSTRUCTION_LIST_END;
1480 
1481  case AArch64::ADDSWrr:
1482  case AArch64::ADDSWri:
1483  case AArch64::ADDSXrr:
1484  case AArch64::ADDSXri:
1485  case AArch64::SUBSWrr:
1486  case AArch64::SUBSWri:
1487  case AArch64::SUBSXrr:
1488  case AArch64::SUBSXri:
1489  return Instr.getOpcode();
1490 
1491  case AArch64::ADDWrr:
1492  return AArch64::ADDSWrr;
1493  case AArch64::ADDWri:
1494  return AArch64::ADDSWri;
1495  case AArch64::ADDXrr:
1496  return AArch64::ADDSXrr;
1497  case AArch64::ADDXri:
1498  return AArch64::ADDSXri;
1499  case AArch64::ADCWr:
1500  return AArch64::ADCSWr;
1501  case AArch64::ADCXr:
1502  return AArch64::ADCSXr;
1503  case AArch64::SUBWrr:
1504  return AArch64::SUBSWrr;
1505  case AArch64::SUBWri:
1506  return AArch64::SUBSWri;
1507  case AArch64::SUBXrr:
1508  return AArch64::SUBSXrr;
1509  case AArch64::SUBXri:
1510  return AArch64::SUBSXri;
1511  case AArch64::SBCWr:
1512  return AArch64::SBCSWr;
1513  case AArch64::SBCXr:
1514  return AArch64::SBCSXr;
1515  case AArch64::ANDWri:
1516  return AArch64::ANDSWri;
1517  case AArch64::ANDXri:
1518  return AArch64::ANDSXri;
1519  }
1520 }
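// For example: sForm(ADDWrr) == ADDSWrr, i.e. "add w0, w1, w2" has the
// flag-setting twin "adds w0, w1, w2"; opcodes without such a twin map to
// AArch64::INSTRUCTION_LIST_END.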
1521 
1522 /// Check if AArch64::NZCV should be alive in successors of MBB.
1523 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1524  for (auto *BB : MBB->successors())
1525  if (BB->isLiveIn(AArch64::NZCV))
1526  return true;
1527  return false;
1528 }
1529 
1530 /// \returns The condition code operand index for \p Instr if it is a branch
1531 /// or select and -1 otherwise.
1532 static int
1533 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1534  switch (Instr.getOpcode()) {
1535  default:
1536  return -1;
1537 
1538  case AArch64::Bcc: {
1539  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1540  assert(Idx >= 2);
1541  return Idx - 2;
1542  }
1543 
1544  case AArch64::CSINVWr:
1545  case AArch64::CSINVXr:
1546  case AArch64::CSINCWr:
1547  case AArch64::CSINCXr:
1548  case AArch64::CSELWr:
1549  case AArch64::CSELXr:
1550  case AArch64::CSNEGWr:
1551  case AArch64::CSNEGXr:
1552  case AArch64::FCSELSrrr:
1553  case AArch64::FCSELDrrr: {
1554  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1555  assert(Idx >= 1);
1556  return Idx - 1;
1557  }
1558  }
1559 }
1560 
1561 namespace {
1562 
1563 struct UsedNZCV {
1564  bool N = false;
1565  bool Z = false;
1566  bool C = false;
1567  bool V = false;
1568 
1569  UsedNZCV() = default;
1570 
1571  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1572  this->N |= UsedFlags.N;
1573  this->Z |= UsedFlags.Z;
1574  this->C |= UsedFlags.C;
1575  this->V |= UsedFlags.V;
1576  return *this;
1577  }
1578 };
1579 
1580 } // end anonymous namespace
1581 
1582 /// Find a condition code used by the instruction.
1583 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1584 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1585 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1586  int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1587  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1588  Instr.getOperand(CCIdx).getImm())
1589  : AArch64CC::Invalid;
1590 }
1591 
1592 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1593  assert(CC != AArch64CC::Invalid);
1594  UsedNZCV UsedFlags;
1595  switch (CC) {
1596  default:
1597  break;
1598 
1599  case AArch64CC::EQ: // Z set
1600  case AArch64CC::NE: // Z clear
1601  UsedFlags.Z = true;
1602  break;
1603 
1604  case AArch64CC::HI: // Z clear and C set
1605  case AArch64CC::LS: // Z set or C clear
1606  UsedFlags.Z = true;
1607  LLVM_FALLTHROUGH;
1608  case AArch64CC::HS: // C set
1609  case AArch64CC::LO: // C clear
1610  UsedFlags.C = true;
1611  break;
1612 
1613  case AArch64CC::MI: // N set
1614  case AArch64CC::PL: // N clear
1615  UsedFlags.N = true;
1616  break;
1617 
1618  case AArch64CC::VS: // V set
1619  case AArch64CC::VC: // V clear
1620  UsedFlags.V = true;
1621  break;
1622 
1623  case AArch64CC::GT: // Z clear, N and V the same
1624  case AArch64CC::LE: // Z set, N and V differ
1625  UsedFlags.Z = true;
1626  LLVM_FALLTHROUGH;
1627  case AArch64CC::GE: // N and V the same
1628  case AArch64CC::LT: // N and V differ
1629  UsedFlags.N = true;
1630  UsedFlags.V = true;
1631  break;
1632  }
1633  return UsedFlags;
1634 }
1635 
1636 /// \returns Condition flags used after \p CmpInstr in its MachineBB if they
1637 /// do not contain the C or V flags and NZCV flags are not alive in successors
1638 /// of the common \p CmpInstr and \p MI parent. \returns None otherwise.
1639 ///
1640 /// Collect instructions using that flags in \p CCUseInstrs if provided.
1641 static Optional<UsedNZCV>
1642 examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1643  const TargetRegisterInfo &TRI,
1644  SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr) {
1645  MachineBasicBlock *CmpParent = CmpInstr.getParent();
1646  if (MI.getParent() != CmpParent)
1647  return None;
1648 
1649  if (areCFlagsAliveInSuccessors(CmpParent))
1650  return None;
1651 
1652  UsedNZCV NZCVUsedAfterCmp;
1653  for (MachineInstr &Instr : instructionsWithoutDebug(
1654  std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1655  if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1656  AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1657  if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1658  return None;
1659  NZCVUsedAfterCmp |= getUsedNZCV(CC);
1660  if (CCUseInstrs)
1661  CCUseInstrs->push_back(&Instr);
1662  }
1663  if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1664  break;
1665  }
1666  if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V)
1667  return None;
1668  return NZCVUsedAfterCmp;
1669 }
1670 
1671 static bool isADDSRegImm(unsigned Opcode) {
1672  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1673 }
1674 
1675 static bool isSUBSRegImm(unsigned Opcode) {
1676  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1677 }
1678 
1679 /// Check if CmpInstr can be substituted by MI.
1680 ///
1681 /// CmpInstr can be substituted:
1682 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1683 /// - and, MI and CmpInstr are from the same MachineBB
1684 /// - and, condition flags are not alive in successors of the CmpInstr parent
1685 /// - and, if MI opcode is the S form there must be no defs of flags between
1686 /// MI and CmpInstr
1687 /// or if MI opcode is not the S form there must be neither defs of flags
1688 /// nor uses of flags between MI and CmpInstr.
1689 /// - and C/V flags are not used after CmpInstr
1690 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1691  const TargetRegisterInfo &TRI) {
1692  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1693 
1694  const unsigned CmpOpcode = CmpInstr.getOpcode();
1695  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1696  return false;
1697 
1698  if (!examineCFlagsUse(MI, CmpInstr, TRI))
1699  return false;
1700 
1701  AccessKind AccessToCheck = AK_Write;
1702  if (sForm(MI) != MI.getOpcode())
1703  AccessToCheck = AK_All;
1704  return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1705 }
1706 
1707 /// Substitute an instruction comparing to zero with another instruction
1708 /// which produces needed condition flags.
1709 ///
1710 /// Return true on success.
1711 bool AArch64InstrInfo::substituteCmpToZero(
1712  MachineInstr &CmpInstr, unsigned SrcReg,
1713  const MachineRegisterInfo &MRI) const {
1714  // Get the unique definition of SrcReg.
1715  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1716  if (!MI)
1717  return false;
1718 
1719  const TargetRegisterInfo &TRI = getRegisterInfo();
1720 
1721  unsigned NewOpc = sForm(*MI);
1722  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1723  return false;
1724 
1725  if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1726  return false;
1727 
1728  // Update the instruction to set NZCV.
1729  MI->setDesc(get(NewOpc));
1730  CmpInstr.eraseFromParent();
1731  bool succeeded = UpdateOperandRegClass(*MI);
1732  (void)succeeded;
1733  assert(succeeded && "Some operands reg class are incompatible!");
1734  MI->addRegisterDefined(AArch64::NZCV, &TRI);
1735  return true;
1736 }
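// For example (informal):
//   add  w8, w0, w1
//   cmp  w8, #0                     ; subs wzr, w8, #0
//   b.eq ...
// becomes
//   adds w8, w0, w1
//   b.eq ...
// because the adds already produces the Z flag the branch needs.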
1737 
1738 /// \returns True if \p CmpInstr can be removed.
1739 ///
1740 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1741 /// codes used in \p CCUseInstrs must be inverted.
1742 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1743  int CmpValue, const TargetRegisterInfo &TRI,
1744  SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1745  bool &IsInvertCC) {
1746  assert((CmpValue == 0 || CmpValue == 1) &&
1747  "Only comparisons to 0 or 1 considered for removal!");
1748 
1749  // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1750  unsigned MIOpc = MI.getOpcode();
1751  if (MIOpc == AArch64::CSINCWr) {
1752  if (MI.getOperand(1).getReg() != AArch64::WZR ||
1753  MI.getOperand(2).getReg() != AArch64::WZR)
1754  return false;
1755  } else if (MIOpc == AArch64::CSINCXr) {
1756  if (MI.getOperand(1).getReg() != AArch64::XZR ||
1757  MI.getOperand(2).getReg() != AArch64::XZR)
1758  return false;
1759  } else {
1760  return false;
1761  }
1762  AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1763  if (MICC == AArch64CC::Invalid)
1764  return false;
1765 
1766  // NZCV needs to be defined
1767  if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
1768  return false;
1769 
1770  // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1771  const unsigned CmpOpcode = CmpInstr.getOpcode();
1772  bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1773  if (CmpValue && !IsSubsRegImm)
1774  return false;
1775  if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1776  return false;
1777 
1778  // MI conditions allowed: eq, ne, mi, pl
1779  UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1780  if (MIUsedNZCV.C || MIUsedNZCV.V)
1781  return false;
1782 
1783  Optional<UsedNZCV> NZCVUsedAfterCmp =
1784  examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1785  // Condition flags are not used in CmpInstr basic block successors and only
1786  // Z or N flags are allowed to be used after CmpInstr within its basic block
1787  if (!NZCVUsedAfterCmp)
1788  return false;
1789  // Z or N flag used after CmpInstr must correspond to the flag used in MI
1790  if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1791  (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1792  return false;
1793  // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
1794  if (MIUsedNZCV.N && !CmpValue)
1795  return false;
1796 
1797  // There must be no defs of flags between MI and CmpInstr
1798  if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1799  return false;
1800 
1801  // Condition code is inverted in the following cases:
1802  // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1803  // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1804  IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1805  (!CmpValue && MICC == AArch64CC::NE);
1806  return true;
1807 }
1808 
1809 /// Remove comparison in csinc-cmp sequence
1810 ///
1811 /// Examples:
1812 /// 1. \code
1813 /// csinc w9, wzr, wzr, ne
1814 /// cmp w9, #0
1815 /// b.eq
1816 /// \endcode
1817 /// to
1818 /// \code
1819 /// csinc w9, wzr, wzr, ne
1820 /// b.ne
1821 /// \endcode
1822 ///
1823 /// 2. \code
1824 /// csinc x2, xzr, xzr, mi
1825 /// cmp x2, #1
1826 /// b.pl
1827 /// \endcode
1828 /// to
1829 /// \code
1830 /// csinc x2, xzr, xzr, mi
1831 /// b.pl
1832 /// \endcode
1833 ///
1834 /// \param CmpInstr comparison instruction
1835 /// \return True when comparison removed
1836 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1837  MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1838  const MachineRegisterInfo &MRI) const {
1839  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1840  if (!MI)
1841  return false;
1842  const TargetRegisterInfo &TRI = getRegisterInfo();
1843  SmallVector<MachineInstr *, 4> CCUseInstrs;
1844  bool IsInvertCC = false;
1845  if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1846  IsInvertCC))
1847  return false;
1848  // Make transformation
1849  CmpInstr.eraseFromParent();
1850  if (IsInvertCC) {
1851  // Invert condition codes in CmpInstr CC users
1852  for (MachineInstr *CCUseInstr : CCUseInstrs) {
1853  int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1854  assert(Idx >= 0 && "Unexpected instruction using CC.");
1855  MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1856  AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1857  static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1858  CCOperand.setImm(CCUse);
1859  }
1860  }
1861  return true;
1862 }
1863 
1864 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1865  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1866  MI.getOpcode() != AArch64::CATCHRET)
1867  return false;
1868 
1869  MachineBasicBlock &MBB = *MI.getParent();
1870  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1871  auto TRI = Subtarget.getRegisterInfo();
1872  DebugLoc DL = MI.getDebugLoc();
1873 
1874  if (MI.getOpcode() == AArch64::CATCHRET) {
1875  // Skip to the first instruction before the epilog.
1876  const TargetInstrInfo *TII =
1878  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1880  MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1881  while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1882  FirstEpilogSEH != MBB.begin())
1883  FirstEpilogSEH = std::prev(FirstEpilogSEH);
1884  if (FirstEpilogSEH != MBB.begin())
1885  FirstEpilogSEH = std::next(FirstEpilogSEH);
1886  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1887  .addReg(AArch64::X0, RegState::Define)
1888  .addMBB(TargetMBB);
1889  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1890  .addReg(AArch64::X0, RegState::Define)
1891  .addReg(AArch64::X0)
1892  .addMBB(TargetMBB)
1893  .addImm(0);
1894  return true;
1895  }
1896 
1897  Register Reg = MI.getOperand(0).getReg();
1899  if (M.getStackProtectorGuard() == "sysreg") {
1900  const AArch64SysReg::SysReg *SrcReg =
1901  AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
1902  if (!SrcReg)
1903  report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
1904 
1905  // mrs xN, sysreg
1908  .addImm(SrcReg->Encoding);
1909  int Offset = M.getStackProtectorGuardOffset();
1910  if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
1911  // ldr xN, [xN, #offset]
1912  BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1913  .addDef(Reg)
1915  .addImm(Offset / 8);
1916  } else if (Offset >= -256 && Offset <= 255) {
1917  // ldur xN, [xN, #offset]
1918  BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
1919  .addDef(Reg)
1921  .addImm(Offset);
1922  } else if (Offset >= -4095 && Offset <= 4095) {
1923  if (Offset > 0) {
1924  // add xN, xN, #offset
1925  BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
1926  .addDef(Reg)
1928  .addImm(Offset)
1929  .addImm(0);
1930  } else {
1931  // sub xN, xN, #offset
1932  BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
1933  .addDef(Reg)
1935  .addImm(-Offset)
1936  .addImm(0);
1937  }
1938  // ldr xN, [xN]
1939  BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1940  .addDef(Reg)
1942  .addImm(0);
1943  } else {
1944  // Cases whose offset is larger than +/- 4095 and not a multiple of 8, or
1945  // larger than 32760.
1946  // It might be nice to use AArch64::MOVi32imm here, which would get
1947  // expanded in PreSched2 after PostRA, but our lone scratch Reg already
1948  // contains the MRS result. findScratchNonCalleeSaveRegister() in
1949  // AArch64FrameLowering might help us find such a scratch register
1950  // though. If we failed to find a scratch register, we could emit a
1951  // stream of add instructions to build up the immediate. Or, we could try
1952  // to insert a AArch64::MOVi32imm before register allocation so that we
1953  // didn't need to scavenge for a scratch register.
1954  report_fatal_error("Unable to encode Stack Protector Guard Offset");
1955  }
1956  MBB.erase(MI);
1957  return true;
1958  }
1959 
1960  const GlobalValue *GV =
1961  cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1962  const TargetMachine &TM = MBB.getParent()->getTarget();
1963  unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1964  const unsigned char MO_NC = AArch64II::MO_NC;
1965 
1966  if ((OpFlags & AArch64II::MO_GOT) != 0) {
1968  .addGlobalAddress(GV, 0, OpFlags);
1969  if (Subtarget.isTargetILP32()) {
1970  unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1971  BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1972  .addDef(Reg32, RegState::Dead)
1974  .addImm(0)
1975  .addMemOperand(*MI.memoperands_begin())
1977  } else {
1978  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1980  .addImm(0)
1981  .addMemOperand(*MI.memoperands_begin());
1982  }
1983  } else if (TM.getCodeModel() == CodeModel::Large) {
1984  assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1985  BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1987  .addImm(0);
1988  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1991  .addImm(16);
1992  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1995  .addImm(32);
1996  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1999  .addImm(48);
2000  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2002  .addImm(0)
2003  .addMemOperand(*MI.memoperands_begin());
2004  } else if (TM.getCodeModel() == CodeModel::Tiny) {
2006  .addGlobalAddress(GV, 0, OpFlags);
2007  } else {
2009  .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2010  unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2011  if (Subtarget.isTargetILP32()) {
2012  unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2013  BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2014  .addDef(Reg32, RegState::Dead)
2016  .addGlobalAddress(GV, 0, LoFlags)
2017  .addMemOperand(*MI.memoperands_begin())
2019  } else {
2020  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2022  .addGlobalAddress(GV, 0, LoFlags)
2023  .addMemOperand(*MI.memoperands_begin());
2024  }
2025  }
2026 
2027  MBB.erase(MI);
2028 
2029  return true;
2030 }
2031 
2032 // Return true if this instruction simply sets its single destination register
2033 // to zero. This is equivalent to a register rename of the zero-register.
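// Illustrative forms recognized here (a sketch based on the cases handled
// below, not an exhaustive list):
//   movz w0, #0              ; MOVZWi/MOVZXi with a zero immediate and shift
//   and  w0, wzr, #0x1       ; ANDWri/ANDXri whose first source is the zero register
//   COPY from WZR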
2034 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2035  switch (MI.getOpcode()) {
2036  default:
2037  break;
2038  case AArch64::MOVZWi:
2039  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2040  if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2041  assert(MI.getDesc().getNumOperands() == 3 &&
2042  MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2043  return true;
2044  }
2045  break;
2046  case AArch64::ANDWri: // and Rd, Rzr, #imm
2047  return MI.getOperand(1).getReg() == AArch64::WZR;
2048  case AArch64::ANDXri:
2049  return MI.getOperand(1).getReg() == AArch64::XZR;
2050  case TargetOpcode::COPY:
2051  return MI.getOperand(1).getReg() == AArch64::WZR;
2052  }
2053  return false;
2054 }
2055 
2056 // Return true if this instruction simply renames a general register without
2057 // modifying bits.
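// Illustrative forms recognized here (a sketch based on the cases handled
// below, not an exhaustive list):
//   COPY of a GPR32/GPR64 destination register
//   orr x0, xzr, x1          ; ORRXrs with XZR as the first source and no shift
//   add x0, x1, #0           ; ADDXri with a zero immediate and no shift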
2058 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2059  switch (MI.getOpcode()) {
2060  default:
2061  break;
2062  case TargetOpcode::COPY: {
2063  // GPR32 copies will be lowered to ORRXrs
2064  Register DstReg = MI.getOperand(0).getReg();
2065  return (AArch64::GPR32RegClass.contains(DstReg) ||
2066  AArch64::GPR64RegClass.contains(DstReg));
2067  }
2068  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2069  if (MI.getOperand(1).getReg() == AArch64::XZR) {
2070  assert(MI.getDesc().getNumOperands() == 4 &&
2071  MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2072  return true;
2073  }
2074  break;
2075  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2076  if (MI.getOperand(2).getImm() == 0) {
2077  assert(MI.getDesc().getNumOperands() == 4 &&
2078  MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2079  return true;
2080  }
2081  break;
2082  }
2083  return false;
2084 }
2085 
2086 // Return true if this instruction simply renames an FP register without
2087 // modifying bits.
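// Illustrative forms recognized here (a sketch based on the cases handled
// below):
//   COPY of an FPR128 destination register
//   orr v0.16b, v1.16b, v1.16b   ; ORRv16i8 with identical source registers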
2088 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2089  switch (MI.getOpcode()) {
2090  default:
2091  break;
2092  case TargetOpcode::COPY: {
2093  Register DstReg = MI.getOperand(0).getReg();
2094  return AArch64::FPR128RegClass.contains(DstReg);
2095  }
2096  case AArch64::ORRv16i8:
2097  if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2098  assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2099  "invalid ORRv16i8 operands");
2100  return true;
2101  }
2102  break;
2103  }
2104  return false;
2105 }
2106 
2107 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2108  int &FrameIndex) const {
2109  switch (MI.getOpcode()) {
2110  default:
2111  break;
2112  case AArch64::LDRWui:
2113  case AArch64::LDRXui:
2114  case AArch64::LDRBui:
2115  case AArch64::LDRHui:
2116  case AArch64::LDRSui:
2117  case AArch64::LDRDui:
2118  case AArch64::LDRQui:
2119  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2120  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2121  FrameIndex = MI.getOperand(1).getIndex();
2122  return MI.getOperand(0).getReg();
2123  }
2124  break;
2125  }
2126 
2127  return 0;
2128 }
2129 
2130 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2131  int &FrameIndex) const {
2132  switch (MI.getOpcode()) {
2133  default:
2134  break;
2135  case AArch64::STRWui:
2136  case AArch64::STRXui:
2137  case AArch64::STRBui:
2138  case AArch64::STRHui:
2139  case AArch64::STRSui:
2140  case AArch64::STRDui:
2141  case AArch64::STRQui:
2142  case AArch64::LDR_PXI:
2143  case AArch64::STR_PXI:
2144  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2145  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2146  FrameIndex = MI.getOperand(1).getIndex();
2147  return MI.getOperand(0).getReg();
2148  }
2149  break;
2150  }
2151  return 0;
2152 }
2153 
2154 /// Check all MachineMemOperands for a hint to suppress pairing.
2155 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2156  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2157  return MMO->getFlags() & MOSuppressPair;
2158  });
2159 }
2160 
2161 /// Set a flag on the first MachineMemOperand to suppress pairing.
2162 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2163  if (MI.memoperands_empty())
2164  return;
2165  (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2166 }
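// Usage sketch (hypothetical caller, for illustration only): a pass that wants
// to keep a memory access out of pair formation could write
//   AArch64InstrInfo::suppressLdStPair(MI);
//   assert(AArch64InstrInfo::isLdStPairSuppressed(MI));
// after which isCandidateToMergeOrPair() below rejects the instruction.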
2167 
2168 /// Check all MachineMemOperands for a hint that the load/store is strided.
2169 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2170  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2171  return MMO->getFlags() & MOStridedAccess;
2172  });
2173 }
2174 
2175 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2176  switch (Opc) {
2177  default:
2178  return false;
2179  case AArch64::STURSi:
2180  case AArch64::STRSpre:
2181  case AArch64::STURDi:
2182  case AArch64::STRDpre:
2183  case AArch64::STURQi:
2184  case AArch64::STRQpre:
2185  case AArch64::STURBBi:
2186  case AArch64::STURHHi:
2187  case AArch64::STURWi:
2188  case AArch64::STRWpre:
2189  case AArch64::STURXi:
2190  case AArch64::STRXpre:
2191  case AArch64::LDURSi:
2192  case AArch64::LDRSpre:
2193  case AArch64::LDURDi:
2194  case AArch64::LDRDpre:
2195  case AArch64::LDURQi:
2196  case AArch64::LDRQpre:
2197  case AArch64::LDURWi:
2198  case AArch64::LDRWpre:
2199  case AArch64::LDURXi:
2200  case AArch64::LDRXpre:
2201  case AArch64::LDURSWi:
2202  case AArch64::LDURHHi:
2203  case AArch64::LDURBBi:
2204  case AArch64::LDURSBWi:
2205  case AArch64::LDURSHWi:
2206  return true;
2207  }
2208 }
2209 
2211  switch (Opc) {
2212  default: return {};
2213  case AArch64::PRFMui: return AArch64::PRFUMi;
2214  case AArch64::LDRXui: return AArch64::LDURXi;
2215  case AArch64::LDRWui: return AArch64::LDURWi;
2216  case AArch64::LDRBui: return AArch64::LDURBi;
2217  case AArch64::LDRHui: return AArch64::LDURHi;
2218  case AArch64::LDRSui: return AArch64::LDURSi;
2219  case AArch64::LDRDui: return AArch64::LDURDi;
2220  case AArch64::LDRQui: return AArch64::LDURQi;
2221  case AArch64::LDRBBui: return AArch64::LDURBBi;
2222  case AArch64::LDRHHui: return AArch64::LDURHHi;
2223  case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2224  case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2225  case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2226  case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2227  case AArch64::LDRSWui: return AArch64::LDURSWi;
2228  case AArch64::STRXui: return AArch64::STURXi;
2229  case AArch64::STRWui: return AArch64::STURWi;
2230  case AArch64::STRBui: return AArch64::STURBi;
2231  case AArch64::STRHui: return AArch64::STURHi;
2232  case AArch64::STRSui: return AArch64::STURSi;
2233  case AArch64::STRDui: return AArch64::STURDi;
2234  case AArch64::STRQui: return AArch64::STURQi;
2235  case AArch64::STRBBui: return AArch64::STURBBi;
2236  case AArch64::STRHHui: return AArch64::STURHHi;
2237  }
2238 }
2239 
2241  switch (Opc) {
2242  default:
2243  return 2;
2244  case AArch64::LDPXi:
2245  case AArch64::LDPDi:
2246  case AArch64::STPXi:
2247  case AArch64::STPDi:
2248  case AArch64::LDNPXi:
2249  case AArch64::LDNPDi:
2250  case AArch64::STNPXi:
2251  case AArch64::STNPDi:
2252  case AArch64::LDPQi:
2253  case AArch64::STPQi:
2254  case AArch64::LDNPQi:
2255  case AArch64::STNPQi:
2256  case AArch64::LDPWi:
2257  case AArch64::LDPSi:
2258  case AArch64::STPWi:
2259  case AArch64::STPSi:
2260  case AArch64::LDNPWi:
2261  case AArch64::LDNPSi:
2262  case AArch64::STNPWi:
2263  case AArch64::STNPSi:
2264  case AArch64::LDG:
2265  case AArch64::STGPi:
2266  case AArch64::LD1B_IMM:
2267  case AArch64::LD1H_IMM:
2268  case AArch64::LD1W_IMM:
2269  case AArch64::LD1D_IMM:
2270  case AArch64::ST1B_IMM:
2271  case AArch64::ST1H_IMM:
2272  case AArch64::ST1W_IMM:
2273  case AArch64::ST1D_IMM:
2274  case AArch64::LD1B_H_IMM:
2275  case AArch64::LD1SB_H_IMM:
2276  case AArch64::LD1H_S_IMM:
2277  case AArch64::LD1SH_S_IMM:
2278  case AArch64::LD1W_D_IMM:
2279  case AArch64::LD1SW_D_IMM:
2280  case AArch64::ST1B_H_IMM:
2281  case AArch64::ST1H_S_IMM:
2282  case AArch64::ST1W_D_IMM:
2283  case AArch64::LD1B_S_IMM:
2284  case AArch64::LD1SB_S_IMM:
2285  case AArch64::LD1H_D_IMM:
2286  case AArch64::LD1SH_D_IMM:
2287  case AArch64::ST1B_S_IMM:
2288  case AArch64::ST1H_D_IMM:
2289  case AArch64::LD1B_D_IMM:
2290  case AArch64::LD1SB_D_IMM:
2291  case AArch64::ST1B_D_IMM:
2292  case AArch64::LD1RB_IMM:
2293  case AArch64::LD1RB_H_IMM:
2294  case AArch64::LD1RB_S_IMM:
2295  case AArch64::LD1RB_D_IMM:
2296  case AArch64::LD1RSB_H_IMM:
2297  case AArch64::LD1RSB_S_IMM:
2298  case AArch64::LD1RSB_D_IMM:
2299  case AArch64::LD1RH_IMM:
2300  case AArch64::LD1RH_S_IMM:
2301  case AArch64::LD1RH_D_IMM:
2302  case AArch64::LD1RSH_S_IMM:
2303  case AArch64::LD1RSH_D_IMM:
2304  case AArch64::LD1RW_IMM:
2305  case AArch64::LD1RW_D_IMM:
2306  case AArch64::LD1RSW_IMM:
2307  case AArch64::LD1RD_IMM:
2308  return 3;
2309  case AArch64::ADDG:
2310  case AArch64::STGOffset:
2311  case AArch64::LDR_PXI:
2312  case AArch64::STR_PXI:
2313  return 2;
2314  }
2315 }
2316 
2317 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2318  switch (MI.getOpcode()) {
2319  default:
2320  return false;
2321  // Scaled instructions.
2322  case AArch64::STRSui:
2323  case AArch64::STRDui:
2324  case AArch64::STRQui:
2325  case AArch64::STRXui:
2326  case AArch64::STRWui:
2327  case AArch64::LDRSui:
2328  case AArch64::LDRDui:
2329  case AArch64::LDRQui:
2330  case AArch64::LDRXui:
2331  case AArch64::LDRWui:
2332  case AArch64::LDRSWui:
2333  // Unscaled instructions.
2334  case AArch64::STURSi:
2335  case AArch64::STRSpre:
2336  case AArch64::STURDi:
2337  case AArch64::STRDpre:
2338  case AArch64::STURQi:
2339  case AArch64::STRQpre:
2340  case AArch64::STURWi:
2341  case AArch64::STRWpre:
2342  case AArch64::STURXi:
2343  case AArch64::STRXpre:
2344  case AArch64::LDURSi:
2345  case AArch64::LDRSpre:
2346  case AArch64::LDURDi:
2347  case AArch64::LDRDpre:
2348  case AArch64::LDURQi:
2349  case AArch64::LDRQpre:
2350  case AArch64::LDURWi:
2351  case AArch64::LDRWpre:
2352  case AArch64::LDURXi:
2353  case AArch64::LDRXpre:
2354  case AArch64::LDURSWi:
2355  return true;
2356  }
2357 }
2358 
2359 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
2360  bool &Is64Bit) {
2361  switch (Opc) {
2362  default:
2363  llvm_unreachable("Opcode has no flag setting equivalent!");
2364  // 32-bit cases:
2365  case AArch64::ADDWri:
2366  Is64Bit = false;
2367  return AArch64::ADDSWri;
2368  case AArch64::ADDWrr:
2369  Is64Bit = false;
2370  return AArch64::ADDSWrr;
2371  case AArch64::ADDWrs:
2372  Is64Bit = false;
2373  return AArch64::ADDSWrs;
2374  case AArch64::ADDWrx:
2375  Is64Bit = false;
2376  return AArch64::ADDSWrx;
2377  case AArch64::ANDWri:
2378  Is64Bit = false;
2379  return AArch64::ANDSWri;
2380  case AArch64::ANDWrr:
2381  Is64Bit = false;
2382  return AArch64::ANDSWrr;
2383  case AArch64::ANDWrs:
2384  Is64Bit = false;
2385  return AArch64::ANDSWrs;
2386  case AArch64::BICWrr:
2387  Is64Bit = false;
2388  return AArch64::BICSWrr;
2389  case AArch64::BICWrs:
2390  Is64Bit = false;
2391  return AArch64::BICSWrs;
2392  case AArch64::SUBWri:
2393  Is64Bit = false;
2394  return AArch64::SUBSWri;
2395  case AArch64::SUBWrr:
2396  Is64Bit = false;
2397  return AArch64::SUBSWrr;
2398  case AArch64::SUBWrs:
2399  Is64Bit = false;
2400  return AArch64::SUBSWrs;
2401  case AArch64::SUBWrx:
2402  Is64Bit = false;
2403  return AArch64::SUBSWrx;
2404  // 64-bit cases:
2405  case AArch64::ADDXri:
2406  Is64Bit = true;
2407  return AArch64::ADDSXri;
2408  case AArch64::ADDXrr:
2409  Is64Bit = true;
2410  return AArch64::ADDSXrr;
2411  case AArch64::ADDXrs:
2412  Is64Bit = true;
2413  return AArch64::ADDSXrs;
2414  case AArch64::ADDXrx:
2415  Is64Bit = true;
2416  return AArch64::ADDSXrx;
2417  case AArch64::ANDXri:
2418  Is64Bit = true;
2419  return AArch64::ANDSXri;
2420  case AArch64::ANDXrr:
2421  Is64Bit = true;
2422  return AArch64::ANDSXrr;
2423  case AArch64::ANDXrs:
2424  Is64Bit = true;
2425  return AArch64::ANDSXrs;
2426  case AArch64::BICXrr:
2427  Is64Bit = true;
2428  return AArch64::BICSXrr;
2429  case AArch64::BICXrs:
2430  Is64Bit = true;
2431  return AArch64::BICSXrs;
2432  case AArch64::SUBXri:
2433  Is64Bit = true;
2434  return AArch64::SUBSXri;
2435  case AArch64::SUBXrr:
2436  Is64Bit = true;
2437  return AArch64::SUBSXrr;
2438  case AArch64::SUBXrs:
2439  Is64Bit = true;
2440  return AArch64::SUBSXrs;
2441  case AArch64::SUBXrx:
2442  Is64Bit = true;
2443  return AArch64::SUBSXrx;
2444  }
2445 }
2446 
2447 // Is this a candidate for ld/st merging or pairing? For example, we don't
2448 // touch volatiles or load/stores that have a hint to avoid pair formation.
2449 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2450 
2451  bool IsPreLdSt = isPreLdSt(MI);
2452 
2453  // If this is a volatile load/store, don't mess with it.
2454  if (MI.hasOrderedMemoryRef())
2455  return false;
2456 
2457  // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2458  // For Pre-inc LD/ST, the operand is shifted by one.
2459  assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2460  MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2461  "Expected a reg or frame index operand.");
2462 
2463  // For Pre-indexed addressing quadword instructions, the third operand is the
2464  // immediate value.
2465  bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2466 
2467  if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2468  return false;
2469 
2470  // Can't merge/pair if the instruction modifies the base register.
2471  // e.g., ldr x0, [x0]
2472  // This case will never occur with an FI base.
2473  // However, if the instruction is an LDR/STR<S,D,Q,W,X>pre, it can be merged.
2474  // For example:
2475  // ldr q0, [x11, #32]!
2476  // ldr q1, [x11, #16]
2477  // to
2478  // ldp q0, q1, [x11, #32]!
2479  if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2480  Register BaseReg = MI.getOperand(1).getReg();
2482  if (MI.modifiesRegister(BaseReg, TRI))
2483  return false;
2484  }
2485 
2486  // Check if this load/store has a hint to avoid pair formation.
2487  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2488  if (isLdStPairSuppressed(MI))
2489  return false;
2490 
2491  // Do not pair any callee-save store/reload instructions in the
2492  // prologue/epilogue if the CFI information encoded the operations as separate
2493  // instructions, as that will cause the size of the actual prologue to differ
2494  // from the prologue size recorded in the Windows CFI.
2495  const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2496  bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2497  MI.getMF()->getFunction().needsUnwindTableEntry();
2498  if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2499  MI.getFlag(MachineInstr::FrameDestroy)))
2500  return false;
2501 
2502  // On some CPUs quad load/store pairs are slower than two single load/stores.
2503  if (Subtarget.isPaired128Slow()) {
2504  switch (MI.getOpcode()) {
2505  default:
2506  break;
2507  case AArch64::LDURQi:
2508  case AArch64::STURQi:
2509  case AArch64::LDRQui:
2510  case AArch64::STRQui:
2511  return false;
2512  }
2513  }
2514 
2515  return true;
2516 }
2517 
2520  int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2521  const TargetRegisterInfo *TRI) const {
2522  if (!LdSt.mayLoadOrStore())
2523  return false;
2524 
2525  const MachineOperand *BaseOp;
2526  if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2527  Width, TRI))
2528  return false;
2529  BaseOps.push_back(BaseOp);
2530  return true;
2531 }
2532 
2535  const TargetRegisterInfo *TRI) const {
2536  const MachineOperand *Base; // Filled with the base operand of MI.
2537  int64_t Offset; // Filled with the offset of MI.
2538  bool OffsetIsScalable;
2539  if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2540  return None;
2541 
2542  if (!Base->isReg())
2543  return None;
2544  ExtAddrMode AM;
2545  AM.BaseReg = Base->getReg();
2546  AM.Displacement = Offset;
2547  AM.ScaledReg = 0;
2548  return AM;
2549 }
2550 
2552  const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
2553  bool &OffsetIsScalable, unsigned &Width,
2554  const TargetRegisterInfo *TRI) const {
2555  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2556  // Handle only loads/stores with base register followed by immediate offset.
2557  if (LdSt.getNumExplicitOperands() == 3) {
2558  // Non-paired instruction (e.g., ldr x1, [x0, #8]).
2559  if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
2560  !LdSt.getOperand(2).isImm())
2561  return false;
2562  } else if (LdSt.getNumExplicitOperands() == 4) {
2563  // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2564  if (!LdSt.getOperand(1).isReg() ||
2565  (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2566  !LdSt.getOperand(3).isImm())
2567  return false;
2568  } else
2569  return false;
2570 
2571  // Get the scaling factor for the instruction and set the width of the
2572  // memory access.
2573  TypeSize Scale(0U, false);
2574  int64_t Dummy1, Dummy2;
2575 
2576  // If this returns false, then it's an instruction we don't want to handle.
2577  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2578  return false;
2579 
2580  // Compute the offset. The offset is calculated as the immediate operand
2581  // multiplied by the scaling factor. Unscaled instructions have a scaling
2582  // factor of 1.
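  // Worked example (illustrative): for "ldr x1, [x0, #16]" (LDRXui) the
  // immediate operand is 2 and the scale is 8, so Offset = 2 * 8 = 16 bytes;
  // for the unscaled "ldur x1, [x0, #16]" (LDURXi) the scale is 1 and
  // Offset = 16.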
2583  if (LdSt.getNumExplicitOperands() == 3) {
2584  BaseOp = &LdSt.getOperand(1);
2585  Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
2586  } else {
2587  assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2588  BaseOp = &LdSt.getOperand(2);
2589  Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
2590  }
2591  OffsetIsScalable = Scale.isScalable();
2592 
2593  if (!BaseOp->isReg() && !BaseOp->isFI())
2594  return false;
2595 
2596  return true;
2597 }
2598 
2601  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2602  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2603  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2604  return OfsOp;
2605 }
2606 
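// Usage sketch (illustrative): querying the addressing properties of a plain
// 64-bit scaled load; the expected values follow from the LDRXui case below.
//   TypeSize Scale(0U, false);
//   unsigned Width;
//   int64_t MinOffset, MaxOffset;
//   AArch64InstrInfo::getMemOpInfo(AArch64::LDRXui, Scale, Width, MinOffset,
//                                  MaxOffset);
//   // Scale == 8 (fixed), Width == 8, MinOffset == 0, MaxOffset == 4095.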
2607 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
2608  unsigned &Width, int64_t &MinOffset,
2609  int64_t &MaxOffset) {
2610  const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
2611  switch (Opcode) {
2612  // Not a memory operation or something we want to handle.
2613  default:
2614  Scale = TypeSize::Fixed(0);
2615  Width = 0;
2616  MinOffset = MaxOffset = 0;
2617  return false;
2618  case AArch64::STRWpost:
2619  case AArch64::LDRWpost:
2620  Width = 32;
2621  Scale = TypeSize::Fixed(4);
2622  MinOffset = -256;
2623  MaxOffset = 255;
2624  break;
2625  case AArch64::LDURQi:
2626  case AArch64::STURQi:
2627  Width = 16;
2628  Scale = TypeSize::Fixed(1);
2629  MinOffset = -256;
2630  MaxOffset = 255;
2631  break;
2632  case AArch64::PRFUMi:
2633  case AArch64::LDURXi:
2634  case AArch64::LDURDi:
2635  case AArch64::STURXi:
2636  case AArch64::STURDi:
2637  Width = 8;
2638  Scale = TypeSize::Fixed(1);
2639  MinOffset = -256;
2640  MaxOffset = 255;
2641  break;
2642  case AArch64::LDURWi:
2643  case AArch64::LDURSi:
2644  case AArch64::LDURSWi:
2645  case AArch64::STURWi:
2646  case AArch64::STURSi:
2647  Width = 4;
2648  Scale = TypeSize::Fixed(1);
2649  MinOffset = -256;
2650  MaxOffset = 255;
2651  break;
2652  case AArch64::LDURHi:
2653  case AArch64::LDURHHi:
2654  case AArch64::LDURSHXi:
2655  case AArch64::LDURSHWi:
2656  case AArch64::STURHi:
2657  case AArch64::STURHHi:
2658  Width = 2;
2659  Scale = TypeSize::Fixed(1);
2660  MinOffset = -256;
2661  MaxOffset = 255;
2662  break;
2663  case AArch64::LDURBi:
2664  case AArch64::LDURBBi:
2665  case AArch64::LDURSBXi:
2666  case AArch64::LDURSBWi:
2667  case AArch64::STURBi:
2668  case AArch64::STURBBi:
2669  Width = 1;
2670  Scale = TypeSize::Fixed(1);
2671  MinOffset = -256;
2672  MaxOffset = 255;
2673  break;
2674  case AArch64::LDPQi:
2675  case AArch64::LDNPQi:
2676  case AArch64::STPQi:
2677  case AArch64::STNPQi:
2678  Scale = TypeSize::Fixed(16);
2679  Width = 32;
2680  MinOffset = -64;
2681  MaxOffset = 63;
2682  break;
2683  case AArch64::LDRQui:
2684  case AArch64::STRQui:
2685  Scale = TypeSize::Fixed(16);
2686  Width = 16;
2687  MinOffset = 0;
2688  MaxOffset = 4095;
2689  break;
2690  case AArch64::LDPXi:
2691  case AArch64::LDPDi:
2692  case AArch64::LDNPXi:
2693  case AArch64::LDNPDi:
2694  case AArch64::STPXi:
2695  case AArch64::STPDi:
2696  case AArch64::STNPXi:
2697  case AArch64::STNPDi:
2698  Scale = TypeSize::Fixed(8);
2699  Width = 16;
2700  MinOffset = -64;
2701  MaxOffset = 63;
2702  break;
2703  case AArch64::PRFMui:
2704  case AArch64::LDRXui:
2705  case AArch64::LDRDui:
2706  case AArch64::STRXui:
2707  case AArch64::STRDui:
2708  Scale = TypeSize::Fixed(8);
2709  Width = 8;
2710  MinOffset = 0;
2711  MaxOffset = 4095;
2712  break;
2713  case AArch64::StoreSwiftAsyncContext:
2714  // Store is an STRXui, but there might be an ADDXri in the expansion too.
2715  Scale = TypeSize::Fixed(1);
2716  Width = 8;
2717  MinOffset = 0;
2718  MaxOffset = 4095;
2719  break;
2720  case AArch64::LDPWi:
2721  case AArch64::LDPSi:
2722  case AArch64::LDNPWi:
2723  case AArch64::LDNPSi:
2724  case AArch64::STPWi:
2725  case AArch64::STPSi:
2726  case AArch64::STNPWi:
2727  case AArch64::STNPSi:
2728  Scale = TypeSize::Fixed(4);
2729  Width = 8;
2730  MinOffset = -64;
2731  MaxOffset = 63;
2732  break;
2733  case AArch64::LDRWui:
2734  case AArch64::LDRSui:
2735  case AArch64::LDRSWui:
2736  case AArch64::STRWui:
2737  case AArch64::STRSui:
2738  Scale = TypeSize::Fixed(4);
2739  Width = 4;
2740  MinOffset = 0;
2741  MaxOffset = 4095;
2742  break;
2743  case AArch64::LDRHui:
2744  case AArch64::LDRHHui:
2745  case AArch64::LDRSHWui:
2746  case AArch64::LDRSHXui:
2747  case AArch64::STRHui:
2748  case AArch64::STRHHui:
2749  Scale = TypeSize::Fixed(2);
2750  Width = 2;
2751  MinOffset = 0;
2752  MaxOffset = 4095;
2753  break;
2754  case AArch64::LDRBui:
2755  case AArch64::LDRBBui:
2756  case AArch64::LDRSBWui:
2757  case AArch64::LDRSBXui:
2758  case AArch64::STRBui:
2759  case AArch64::STRBBui:
2760  Scale = TypeSize::Fixed(1);
2761  Width = 1;
2762  MinOffset = 0;
2763  MaxOffset = 4095;
2764  break;
2765  case AArch64::STPXpre:
2766  case AArch64::LDPXpost:
2767  case AArch64::STPDpre:
2768  case AArch64::LDPDpost:
2769  Scale = TypeSize::Fixed(8);
2770  Width = 8;
2771  MinOffset = -512;
2772  MaxOffset = 504;
2773  break;
2774  case AArch64::STPQpre:
2775  case AArch64::LDPQpost:
2776  Scale = TypeSize::Fixed(16);
2777  Width = 16;
2778  MinOffset = -1024;
2779  MaxOffset = 1008;
2780  break;
2781  case AArch64::STRXpre:
2782  case AArch64::STRDpre:
2783  case AArch64::LDRXpost:
2784  case AArch64::LDRDpost:
2785  Scale = TypeSize::Fixed(1);
2786  Width = 8;
2787  MinOffset = -256;
2788  MaxOffset = 255;
2789  break;
2790  case AArch64::STRQpre:
2791  case AArch64::LDRQpost:
2792  Scale = TypeSize::Fixed(1);
2793  Width = 16;
2794  MinOffset = -256;
2795  MaxOffset = 255;
2796  break;
2797  case AArch64::ADDG:
2798  Scale = TypeSize::Fixed(16);
2799  Width = 0;
2800  MinOffset = 0;
2801  MaxOffset = 63;
2802  break;
2803  case AArch64::TAGPstack:
2804  Scale = TypeSize::Fixed(16);
2805  Width = 0;
2806  // TAGP with a negative offset turns into SUBP, which has a maximum offset
2807  // of 63 (not 64!).
2808  MinOffset = -63;
2809  MaxOffset = 63;
2810  break;
2811  case AArch64::LDG:
2812  case AArch64::STGOffset:
2813  case AArch64::STZGOffset:
2814  Scale = TypeSize::Fixed(16);
2815  Width = 16;
2816  MinOffset = -256;
2817  MaxOffset = 255;
2818  break;
2819  case AArch64::STR_ZZZZXI:
2820  case AArch64::LDR_ZZZZXI:
2821  Scale = TypeSize::Scalable(16);
2822  Width = SVEMaxBytesPerVector * 4;
2823  MinOffset = -256;
2824  MaxOffset = 252;
2825  break;
2826  case AArch64::STR_ZZZXI:
2827  case AArch64::LDR_ZZZXI:
2828  Scale = TypeSize::Scalable(16);
2829  Width = SVEMaxBytesPerVector * 3;
2830  MinOffset = -256;
2831  MaxOffset = 253;
2832  break;
2833  case AArch64::STR_ZZXI:
2834  case AArch64::LDR_ZZXI:
2835  Scale = TypeSize::Scalable(16);
2836  Width = SVEMaxBytesPerVector * 2;
2837  MinOffset = -256;
2838  MaxOffset = 254;
2839  break;
2840  case AArch64::LDR_PXI:
2841  case AArch64::STR_PXI:
2842  Scale = TypeSize::Scalable(2);
2843  Width = SVEMaxBytesPerVector / 8;
2844  MinOffset = -256;
2845  MaxOffset = 255;
2846  break;
2847  case AArch64::LDR_ZXI:
2848  case AArch64::STR_ZXI:
2849  Scale = TypeSize::Scalable(16);
2850  Width = SVEMaxBytesPerVector;
2851  MinOffset = -256;
2852  MaxOffset = 255;
2853  break;
2854  case AArch64::LD1B_IMM:
2855  case AArch64::LD1H_IMM:
2856  case AArch64::LD1W_IMM:
2857  case AArch64::LD1D_IMM:
2858  case AArch64::ST1B_IMM:
2859  case AArch64::ST1H_IMM:
2860  case AArch64::ST1W_IMM:
2861  case AArch64::ST1D_IMM:
2862  // A full vector's worth of data
2863  // Width = mbytes * elements
2864  Scale = TypeSize::Scalable(16);
2865  Width = SVEMaxBytesPerVector;
2866  MinOffset = -8;
2867  MaxOffset = 7;
2868  break;
2869  case AArch64::LD1B_H_IMM:
2870  case AArch64::LD1SB_H_IMM:
2871  case AArch64::LD1H_S_IMM:
2872  case AArch64::LD1SH_S_IMM:
2873  case AArch64::LD1W_D_IMM:
2874  case AArch64::LD1SW_D_IMM:
2875  case AArch64::ST1B_H_IMM:
2876  case AArch64::ST1H_S_IMM:
2877  case AArch64::ST1W_D_IMM:
2878  // A half vector's worth of data
2879  // Width = mbytes * elements
2880  Scale = TypeSize::Scalable(8);
2881  Width = SVEMaxBytesPerVector / 2;
2882  MinOffset = -8;
2883  MaxOffset = 7;
2884  break;
2885  case AArch64::LD1B_S_IMM:
2886  case AArch64::LD1SB_S_IMM:
2887  case AArch64::LD1H_D_IMM:
2888  case AArch64::LD1SH_D_IMM:
2889  case AArch64::ST1B_S_IMM:
2890  case AArch64::ST1H_D_IMM:
2891  // A quarter vector's worth of data
2892  // Width = mbytes * elements
2893  Scale = TypeSize::Scalable(4);
2894  Width = SVEMaxBytesPerVector / 4;
2895  MinOffset = -8;
2896  MaxOffset = 7;
2897  break;
2898  case AArch64::LD1B_D_IMM:
2899  case AArch64::LD1SB_D_IMM:
2900  case AArch64::ST1B_D_IMM:
2901  // An eighth vector's worth of data
2902  // Width = mbytes * elements
2903  Scale = TypeSize::Scalable(2);
2904  Width = SVEMaxBytesPerVector / 8;
2905  MinOffset = -8;
2906  MaxOffset = 7;
2907  break;
2908  case AArch64::ST2GOffset:
2909  case AArch64::STZ2GOffset:
2910  Scale = TypeSize::Fixed(16);
2911  Width = 32;
2912  MinOffset = -256;
2913  MaxOffset = 255;
2914  break;
2915  case AArch64::STGPi:
2916  Scale = TypeSize::Fixed(16);
2917  Width = 16;
2918  MinOffset = -64;
2919  MaxOffset = 63;
2920  break;
2921  case AArch64::LD1RB_IMM:
2922  case AArch64::LD1RB_H_IMM:
2923  case AArch64::LD1RB_S_IMM:
2924  case AArch64::LD1RB_D_IMM:
2925  case AArch64::LD1RSB_H_IMM:
2926  case AArch64::LD1RSB_S_IMM:
2927  case AArch64::LD1RSB_D_IMM:
2928  Scale = TypeSize::Fixed(1);
2929  Width = 1;
2930  MinOffset = 0;
2931  MaxOffset = 63;
2932  break;
2933  case AArch64::LD1RH_IMM:
2934  case AArch64::LD1RH_S_IMM:
2935  case AArch64::LD1RH_D_IMM:
2936  case AArch64::LD1RSH_S_IMM:
2937  case AArch64::LD1RSH_D_IMM:
2938  Scale = TypeSize::Fixed(2);
2939  Width = 2;
2940  MinOffset = 0;
2941  MaxOffset = 63;
2942  break;
2943  case AArch64::LD1RW_IMM:
2944  case AArch64::LD1RW_D_IMM:
2945  case AArch64::LD1RSW_IMM:
2946  Scale = TypeSize::Fixed(4);
2947  Width = 4;
2948  MinOffset = 0;
2949  MaxOffset = 63;
2950  break;
2951  case AArch64::LD1RD_IMM:
2952  Scale = TypeSize::Fixed(8);
2953  Width = 8;
2954  MinOffset = 0;
2955  MaxOffset = 63;
2956  break;
2957  }
2958 
2959  return true;
2960 }
2961 
2962 // Scaling factor for unscaled load or store.
2964  switch (Opc) {
2965  default:
2966  llvm_unreachable("Opcode has unknown scale!");
2967  case AArch64::LDRBBui:
2968  case AArch64::LDURBBi:
2969  case AArch64::LDRSBWui:
2970  case AArch64::LDURSBWi:
2971  case AArch64::STRBBui:
2972  case AArch64::STURBBi:
2973  return 1;
2974  case AArch64::LDRHHui:
2975  case AArch64::LDURHHi:
2976  case AArch64::LDRSHWui:
2977  case AArch64::LDURSHWi:
2978  case AArch64::STRHHui:
2979  case AArch64::STURHHi:
2980  return 2;
2981  case AArch64::LDRSui:
2982  case AArch64::LDURSi:
2983  case AArch64::LDRSpre:
2984  case AArch64::LDRSWui:
2985  case AArch64::LDURSWi:
2986  case AArch64::LDRWpre:
2987  case AArch64::LDRWui:
2988  case AArch64::LDURWi:
2989  case AArch64::STRSui:
2990  case AArch64::STURSi:
2991  case AArch64::STRSpre:
2992  case AArch64::STRWui:
2993  case AArch64::STURWi:
2994  case AArch64::STRWpre:
2995  case AArch64::LDPSi:
2996  case AArch64::LDPSWi:
2997  case AArch64::LDPWi:
2998  case AArch64::STPSi:
2999  case AArch64::STPWi:
3000  return 4;
3001  case AArch64::LDRDui:
3002  case AArch64::LDURDi:
3003  case AArch64::LDRDpre:
3004  case AArch64::LDRXui:
3005  case AArch64::LDURXi:
3006  case AArch64::LDRXpre:
3007  case AArch64::STRDui:
3008  case AArch64::STURDi:
3009  case AArch64::STRDpre:
3010  case AArch64::STRXui:
3011  case AArch64::STURXi:
3012  case AArch64::STRXpre:
3013  case AArch64::LDPDi:
3014  case AArch64::LDPXi:
3015  case AArch64::STPDi:
3016  case AArch64::STPXi:
3017  return 8;
3018  case AArch64::LDRQui:
3019  case AArch64::LDURQi:
3020  case AArch64::STRQui:
3021  case AArch64::STURQi:
3022  case AArch64::STRQpre:
3023  case AArch64::LDPQi:
3024  case AArch64::LDRQpre:
3025  case AArch64::STPQi:
3026  case AArch64::STGOffset:
3027  case AArch64::STZGOffset:
3028  case AArch64::ST2GOffset:
3029  case AArch64::STZ2GOffset:
3030  case AArch64::STGPi:
3031  return 16;
3032  }
3033 }
3034 
3035 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
3036  switch (MI.getOpcode()) {
3037  default:
3038  return false;
3039  case AArch64::LDRWpre:
3040  case AArch64::LDRXpre:
3041  case AArch64::LDRSpre:
3042  case AArch64::LDRDpre:
3043  case AArch64::LDRQpre:
3044  return true;
3045  }
3046 }
3047 
3048 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
3049  switch (MI.getOpcode()) {
3050  default:
3051  return false;
3052  case AArch64::STRWpre:
3053  case AArch64::STRXpre:
3054  case AArch64::STRSpre:
3055  case AArch64::STRDpre:
3056  case AArch64::STRQpre:
3057  return true;
3058  }
3059 }
3060 
3061 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
3062  return isPreLd(MI) || isPreSt(MI);
3063 }
3064 
3065 // Scale the unscaled offset. Returns false if the unscaled offset can't be
3066 // scaled.
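// Example (illustrative): an unscaled access at byte offset 16 with an 8-byte
// stride becomes element offset 2; a byte offset of 12 is not a multiple of 8,
// so it cannot be scaled and the function returns false.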
3067 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
3068  int Scale = AArch64InstrInfo::getMemScale(Opc);
3069 
3070  // If the byte-offset isn't a multiple of the stride, we can't scale this
3071  // offset.
3072  if (Offset % Scale != 0)
3073  return false;
3074 
3075  // Convert the byte-offset used by unscaled into an "element" offset used
3076  // by the scaled pair load/store instructions.
3077  Offset /= Scale;
3078  return true;
3079 }
3080 
3081 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
3082  if (FirstOpc == SecondOpc)
3083  return true;
3084  // We can also pair sign-ext and zero-ext instructions.
3085  switch (FirstOpc) {
3086  default:
3087  return false;
3088  case AArch64::LDRWui:
3089  case AArch64::LDURWi:
3090  return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
3091  case AArch64::LDRSWui:
3092  case AArch64::LDURSWi:
3093  return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
3094  }
3095  // These instructions can't be paired based on their opcodes.
3096  return false;
3097 }
3098 
3099 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
3100  int64_t Offset1, unsigned Opcode1, int FI2,
3101  int64_t Offset2, unsigned Opcode2) {
3102  // Accesses through fixed stack object frame indices may access a different
3103  // fixed stack slot. Check that the object offsets + offsets match.
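  // Worked example (illustrative): two 8-byte fixed objects at byte offsets
  // -16 and -8, each accessed with an instruction offset of 0, give scaled
  // object offsets -2 and -1; since -2 + 0 + 1 == -1 + 0, the accesses are
  // treated as adjacent and may be clustered.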
3104  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
3105  int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
3106  int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
3107  assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
3108  // Convert to scaled object offsets.
3109  int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
3110  if (ObjectOffset1 % Scale1 != 0)
3111  return false;
3112  ObjectOffset1 /= Scale1;
3113  int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
3114  if (ObjectOffset2 % Scale2 != 0)
3115  return false;
3116  ObjectOffset2 /= Scale2;
3117  ObjectOffset1 += Offset1;
3118  ObjectOffset2 += Offset2;
3119  return ObjectOffset1 + 1 == ObjectOffset2;
3120  }
3121 
3122  return FI1 == FI2;
3123 }
3124 
3125 /// Detect opportunities for ldp/stp formation.
3126 ///
3127 /// Only called for LdSt for which getMemOperandWithOffset returns true.
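/// Illustrative example (a sketch of the kind of sequence this enables): two
/// adjacent loads such as
/// \code
///   ldr x0, [x2, #8]
///   ldr x1, [x2, #16]
/// \endcode
/// are reported as clusterable so that a later pass can form
/// \code
///   ldp x0, x1, [x2, #8]
/// \endcode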
3130  ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
3131  unsigned NumBytes) const {
3132  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
3133  const MachineOperand &BaseOp1 = *BaseOps1.front();
3134  const MachineOperand &BaseOp2 = *BaseOps2.front();
3135  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
3136  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
3137  if (BaseOp1.getType() != BaseOp2.getType())
3138  return false;
3139 
3140  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
3141  "Only base registers and frame indices are supported.");
3142 
3143  // Check for both base regs and base FI.
3144  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
3145  return false;
3146 
3147  // Only cluster up to a single pair.
3148  if (NumLoads > 2)
3149  return false;
3150 
3151  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
3152  return false;
3153 
3154  // Can we pair these instructions based on their opcodes?
3155  unsigned FirstOpc = FirstLdSt.getOpcode();
3156  unsigned SecondOpc = SecondLdSt.getOpcode();
3157  if (!canPairLdStOpc(FirstOpc, SecondOpc))
3158  return false;
3159 
3160  // Can't merge volatiles or load/stores that have a hint to avoid pair
3161  // formation, for example.
3162  if (!isCandidateToMergeOrPair(FirstLdSt) ||
3163  !isCandidateToMergeOrPair(SecondLdSt))
3164  return false;
3165 
3166  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
3167  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
3168  if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
3169  return false;
3170 
3171  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
3172  if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
3173  return false;
3174 
3175  // Pairwise instructions have a 7-bit signed offset field.
3176  if (Offset1 > 63 || Offset1 < -64)
3177  return false;
3178 
3179  // The caller should already have ordered FirstLdSt/SecondLdSt by offset.
3180  // Note: this does not hold for non-equal frame index bases.
3181  if (BaseOp1.isFI()) {
3182  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
3183  "Caller should have ordered offsets.");
3184 
3185  const MachineFrameInfo &MFI =
3186  FirstLdSt.getParent()->getParent()->getFrameInfo();
3187  return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
3188  BaseOp2.getIndex(), Offset2, SecondOpc);
3189  }
3190 
3191  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
3192 
3193  return Offset1 + 1 == Offset2;
3194 }
3195 
3197  unsigned Reg, unsigned SubIdx,
3198  unsigned State,
3199  const TargetRegisterInfo *TRI) {
3200  if (!SubIdx)
3201  return MIB.addReg(Reg, State);
3202 
3204  return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
3205  return MIB.addReg(Reg, State, SubIdx);
3206 }
3207 
3208 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
3209  unsigned NumRegs) {
3210  // We really want the positive remainder mod 32 here, which happens to be
3211  // easily obtainable with a mask.
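  // Example (illustrative): for a 2-register tuple copy where the source
  // encodings are {1, 2} and the destination encodings are {2, 3}, we get
  // (2 - 1) & 0x1f == 1 < 2, so copying sub-register 0 first would overwrite
  // source sub-register 1; the caller then copies the tuple in reverse order.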
3212  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
3213 }
3214 
3217  const DebugLoc &DL, MCRegister DestReg,
3218  MCRegister SrcReg, bool KillSrc,
3219  unsigned Opcode,
3220  ArrayRef<unsigned> Indices) const {
3221  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
3223  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
3224  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
3225  unsigned NumRegs = Indices.size();
3226 
3227  int SubReg = 0, End = NumRegs, Incr = 1;
3228  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
3229  SubReg = NumRegs - 1;
3230  End = -1;
3231  Incr = -1;
3232  }
3233 
3234  for (; SubReg != End; SubReg += Incr) {
3235  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
3236  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
3237  AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
3238  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
3239  }
3240 }
3241 
3244  DebugLoc DL, unsigned DestReg,
3245  unsigned SrcReg, bool KillSrc,
3246  unsigned Opcode, unsigned ZeroReg,
3247  llvm::ArrayRef<unsigned> Indices) const {
3249  unsigned NumRegs = Indices.size();
3250 
3251 #ifndef NDEBUG
3252  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
3253  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
3254  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
3255  "GPR reg sequences should not be able to overlap");
3256 #endif
3257 
3258  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
3259  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
3260  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
3261  MIB.addReg(ZeroReg);
3262  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
3263  MIB.addImm(0);
3264  }
3265 }
3266 
3269  const DebugLoc &DL, MCRegister DestReg,
3270  MCRegister SrcReg, bool KillSrc) const {
3271  if (AArch64::GPR32spRegClass.contains(DestReg) &&
3272  (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
3274 
3275  if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
3276  // If either operand is WSP, expand to ADD #0.
3277  if (Subtarget.hasZeroCycleRegMove()) {
3278  // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
3279  MCRegister DestRegX = TRI->getMatchingSuperReg(
3280  DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3281  MCRegister SrcRegX = TRI->getMatchingSuperReg(
3282  SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3283  // This instruction is reading and writing X registers. This may upset
3284  // the register scavenger and machine verifier, so we need to indicate
3285  // that we are reading an undefined value from SrcRegX, but a proper
3286  // value from SrcReg.
3287  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
3288  .addReg(SrcRegX, RegState::Undef)
3289  .addImm(0)
3291  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
3292  } else {
3293  BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
3294  .addReg(SrcReg, getKillRegState(KillSrc))
3295  .addImm(0)
3297  }
3298  } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
3299  BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
3300  .addImm(0)
3302  } else {
3303  if (Subtarget.hasZeroCycleRegMove()) {
3304  // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
3305  MCRegister DestRegX = TRI->getMatchingSuperReg(
3306  DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3307  MCRegister SrcRegX = TRI->getMatchingSuperReg(
3308  SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3309  // This instruction is reading and writing X registers. This may upset
3310  // the register scavenger and machine verifier, so we need to indicate
3311  // that we are reading an undefined value from SrcRegX, but a proper
3312  // value from SrcReg.
3313  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
3314  .addReg(AArch64::XZR)
3315  .addReg(SrcRegX, RegState::Undef)
3316  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
3317  } else {
3318  // Otherwise, expand to ORR WZR.
3319  BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
3320  .addReg(AArch64::WZR)
3321  .addReg(SrcReg, getKillRegState(KillSrc));
3322  }
3323  }
3324  return;
3325  }
3326 
3327  // Copy a Predicate register by ORRing with itself.
3328  if (AArch64::PPRRegClass.contains(DestReg) &&
3329  AArch64::PPRRegClass.contains(SrcReg)) {
3330  assert(Subtarget.hasSVE() && "Unexpected SVE register.");
3331  BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
3332  .addReg(SrcReg) // Pg
3333  .addReg(SrcReg)
3334  .addReg(SrcReg, getKillRegState(KillSrc));
3335  return;
3336  }
3337 
3338  // Copy a Z register by ORRing with itself.
3339  if (AArch64::ZPRRegClass.contains(DestReg) &&
3340  AArch64::ZPRRegClass.contains(SrcReg)) {
3341  assert(Subtarget.hasSVE() && "Unexpected SVE register.");
3342  BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
3343  .addReg(SrcReg)
3344  .addReg(SrcReg, getKillRegState(KillSrc));
3345  return;
3346  }
3347 
3348  // Copy a Z register pair by copying the individual sub-registers.
3349  if (AArch64::ZPR2RegClass.contains(DestReg) &&
3350  AArch64::ZPR2RegClass.contains(SrcReg)) {
3351  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
3352  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3353  Indices);
3354  return;
3355  }
3356 
3357  // Copy a Z register triple by copying the individual sub-registers.
3358  if (AArch64::ZPR3RegClass.contains(DestReg) &&
3359  AArch64::ZPR3RegClass.contains(SrcReg)) {
3360  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
3361  AArch64::zsub2};
3362  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3363  Indices);
3364  return;
3365  }
3366 
3367  // Copy a Z register quad by copying the individual sub-registers.
3368  if (AArch64::ZPR4RegClass.contains(DestReg) &&
3369  AArch64::ZPR4RegClass.contains(SrcReg)) {
3370  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
3371  AArch64::zsub2, AArch64::zsub3};
3372  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3373  Indices);
3374  return;
3375  }
3376 
3377  if (AArch64::GPR64spRegClass.contains(DestReg) &&
3378  (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
3379  if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
3380  // If either operand is SP, expand to ADD #0.
3381  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
3382  .addReg(SrcReg, getKillRegState(KillSrc))
3383  .addImm(0)
3385  } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
3386  BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
3387  .addImm(0)
3389  } else {
3390  // Otherwise, expand to ORR XZR.
3391  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
3392  .addReg(AArch64::XZR)
3393  .addReg(SrcReg, getKillRegState(KillSrc));
3394  }
3395  return;
3396  }
3397 
3398  // Copy a DDDD register quad by copying the individual sub-registers.
3399  if (AArch64::DDDDRegClass.contains(DestReg) &&
3400  AArch64::DDDDRegClass.contains(SrcReg)) {
3401  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
3402  AArch64::dsub2, AArch64::dsub3};
3403  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3404  Indices);
3405  return;
3406  }
3407 
3408  // Copy a DDD register triple by copying the individual sub-registers.
3409  if (AArch64::DDDRegClass.contains(DestReg) &&
3410  AArch64::DDDRegClass.contains(SrcReg)) {
3411  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
3412  AArch64::dsub2};
3413  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3414  Indices);
3415  return;
3416  }
3417 
3418  // Copy a DD register pair by copying the individual sub-registers.
3419  if (AArch64::DDRegClass.contains(DestReg) &&
3420  AArch64::DDRegClass.contains(SrcReg)) {
3421  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
3422  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3423  Indices);
3424  return;
3425  }
3426 
3427  // Copy a QQQQ register quad by copying the individual sub-registers.
3428  if (AArch64::QQQQRegClass.contains(DestReg) &&
3429  AArch64::QQQQRegClass.contains(SrcReg)) {
3430  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
3431  AArch64::qsub2, AArch64::qsub3};
3432  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3433  Indices);
3434  return;
3435  }
3436 
3437  // Copy a QQQ register triple by copying the individual sub-registers.
3438  if (AArch64::QQQRegClass.contains(DestReg) &&
3439  AArch64::QQQRegClass.contains(SrcReg)) {
3440  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
3441  AArch64::qsub2};
3442  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3443  Indices);
3444  return;
3445  }
3446 
3447  // Copy a QQ register pair by copying the individual sub-registers.
3448  if (AArch64::QQRegClass.contains(DestReg) &&
3449  AArch64::QQRegClass.contains(SrcReg)) {
3450  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
3451  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3452  Indices);
3453  return;
3454  }
3455 
3456  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
3457  AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
3458  static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
3459  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
3460  AArch64::XZR, Indices);
3461  return;
3462  }
3463 
3464  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
3465  AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
3466  static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
3467  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
3468  AArch64::WZR, Indices);
3469  return;
3470  }
3471 
3472  if (AArch64::FPR128RegClass.contains(DestReg) &&
3473  AArch64::FPR128RegClass.contains(SrcReg)) {
3474  if (Subtarget.hasNEON()) {
3475  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3476  .addReg(SrcReg)
3477  .addReg(SrcReg, getKillRegState(KillSrc));
3478  } else {
3479  BuildMI(MBB, I, DL, get(AArch64::STRQpre))
3480  .addReg(AArch64::SP, RegState::Define)
3481  .addReg(SrcReg, getKillRegState(KillSrc))
3482  .addReg(AArch64::SP)
3483  .addImm(-16);
3484  BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
3485  .addReg(AArch64::SP, RegState::Define)
3486  .addReg(DestReg, RegState::Define)
3487  .addReg(AArch64::SP)
3488  .addImm(16);
3489  }
3490  return;
3491  }
3492 
3493  if (AArch64::FPR64RegClass.contains(DestReg) &&
3494  AArch64::FPR64RegClass.contains(SrcReg)) {
3495  BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
3496  .addReg(SrcReg, getKillRegState(KillSrc));
3497  return;
3498  }
3499 
3500  if (AArch64::FPR32RegClass.contains(DestReg) &&
3501  AArch64::FPR32RegClass.contains(SrcReg)) {
3502  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3503  .addReg(SrcReg, getKillRegState(KillSrc));
3504  return;
3505  }
3506 
3507  if (AArch64::FPR16RegClass.contains(DestReg) &&
3508  AArch64::FPR16RegClass.contains(SrcReg)) {
3509  DestReg =
3510  RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
3511  SrcReg =
3512  RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
3513  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3514  .addReg(SrcReg, getKillRegState(KillSrc));
3515  return;
3516  }
3517 
3518  if (AArch64::FPR8RegClass.contains(DestReg) &&
3519  AArch64::FPR8RegClass.contains(SrcReg)) {
3520  DestReg =
3521  RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
3522  SrcReg =
3523  RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
3524  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3525  .addReg(SrcReg, getKillRegState(KillSrc));
3526  return;
3527  }
3528 
3529  // Copies between GPR64 and FPR64.
3530  if (AArch64::FPR64RegClass.contains(DestReg) &&
3531  AArch64::GPR64RegClass.contains(SrcReg)) {
3532  BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
3533  .addReg(SrcReg, getKillRegState(KillSrc));
3534  return;
3535  }
3536  if (AArch64::GPR64RegClass.contains(DestReg) &&
3537  AArch64::FPR64RegClass.contains(SrcReg)) {
3538  BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
3539  .addReg(SrcReg, getKillRegState(KillSrc));
3540  return;
3541  }
3542  // Copies between GPR32 and FPR32.
3543  if (AArch64::FPR32RegClass.contains(DestReg) &&
3544  AArch64::GPR32RegClass.contains(SrcReg)) {
3545  BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
3546  .addReg(SrcReg, getKillRegState(KillSrc));
3547  return;
3548  }
3549  if (AArch64::GPR32RegClass.contains(DestReg) &&
3550  AArch64::FPR32RegClass.contains(SrcReg)) {
3551  BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
3552  .addReg(SrcReg, getKillRegState(KillSrc));
3553  return;
3554  }
3555 
3556  if (DestReg == AArch64::NZCV) {
3557  assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
3558  BuildMI(MBB, I, DL, get(AArch64::MSR))
3559  .addImm(AArch64SysReg::NZCV)
3560  .addReg(SrcReg, getKillRegState(KillSrc))
3561  .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
3562  return;
3563  }
3564 
3565  if (SrcReg == AArch64::NZCV) {
3566  assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
3567  BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
3568  .addImm(AArch64SysReg::NZCV)
3569  .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
3570  return;
3571  }
3572 
3573 #ifndef NDEBUG
3575  errs() << TRI.getRegAsmName(DestReg) << " = COPY "
3576  << TRI.getRegAsmName(SrcReg) << "\n";
3577 #endif
3578  llvm_unreachable("unimplemented reg-to-reg copy");
3579 }
3580 
3583  MachineBasicBlock::iterator InsertBefore,
3584  const MCInstrDesc &MCID,
3585  Register SrcReg, bool IsKill,
3586  unsigned SubIdx0, unsigned SubIdx1, int FI,
3587  MachineMemOperand *MMO) {
3588  Register SrcReg0 = SrcReg;
3589  Register SrcReg1 = SrcReg;
3590  if (Register::isPhysicalRegister(SrcReg)) {
3591  SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
3592  SubIdx0 = 0;
3593  SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
3594  SubIdx1 = 0;
3595  }
3596  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3597  .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
3598  .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
3599  .addFrameIndex(FI)
3600  .addImm(0)
3601  .addMemOperand(MMO);
3602 }
3603 
3606  bool isKill, int FI, const TargetRegisterClass *RC,
3607  const TargetRegisterInfo *TRI) const {
3608  MachineFunction &MF = *MBB.getParent();
3609  MachineFrameInfo &MFI = MF.getFrameInfo();
3610 
3611  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3612  MachineMemOperand *MMO =
3613  MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
3614  MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3615  unsigned Opc = 0;
3616  bool Offset = true;
3617  unsigned StackID = TargetStackID::Default;
3618  switch (TRI->getSpillSize(*RC)) {
3619  case 1:
3620  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3621  Opc = AArch64::STRBui;
3622  break;
3623  case 2:
3624  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3625  Opc = AArch64::STRHui;
3626  else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3627  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3628  Opc = AArch64::STR_PXI;
3629  StackID = TargetStackID::ScalableVector;
3630  }
3631  break;
3632  case 4:
3633  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3634  Opc = AArch64::STRWui;
3635  if (Register::isVirtualRegister(SrcReg))
3636  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
3637  else
3638  assert(SrcReg != AArch64::WSP);
3639  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3640  Opc = AArch64::STRSui;
3641  break;
3642  case 8:
3643  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3644  Opc = AArch64::STRXui;
3645  if (Register::isVirtualRegister(SrcReg))
3646  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3647  else
3648  assert(SrcReg != AArch64::SP);
3649  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3650  Opc = AArch64::STRDui;
3651  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3652  storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3653  get(AArch64::STPWi), SrcReg, isKill,
3654  AArch64::sube32, AArch64::subo32, FI, MMO);
3655  return;
3656  }
3657  break;
3658  case 16:
3659  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3660  Opc = AArch64::STRQui;
3661  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3662  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3663  Opc = AArch64::ST1Twov1d;
3664  Offset = false;
3665  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3666  storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3667  get(AArch64::STPXi), SrcReg, isKill,
3668  AArch64::sube64, AArch64::subo64, FI, MMO);
3669  return;
3670  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3671  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3672  Opc = AArch64::STR_ZXI;
3673  StackID = TargetStackID::ScalableVector;
3674  }
3675  break;
3676  case 24:
3677  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3678  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3679  Opc = AArch64::ST1Threev1d;
3680  Offset = false;
3681  }
3682  break;
3683  case 32:
3684  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3685  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3686  Opc = AArch64::ST1Fourv1d;
3687  Offset = false;
3688  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3689  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3690  Opc = AArch64::ST1Twov2d;
3691  Offset = false;
3692  } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3693  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3694  Opc = AArch64::STR_ZZXI;
3695  StackID = TargetStackID::ScalableVector;
3696  }
3697  break;
3698  case 48:
3699  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3700  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3701  Opc = AArch64::ST1Threev2d;
3702  Offset = false;
3703  } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3704  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3705  Opc = AArch64::STR_ZZZXI;
3706  StackID = TargetStackID::ScalableVector;
3707  }
3708  break;
3709  case 64:
3710  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3711  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3712  Opc = AArch64::ST1Fourv2d;
3713  Offset = false;
3714  } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3715  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3716  Opc = AArch64::STR_ZZZZXI;
3717  StackID = TargetStackID::ScalableVector;
3718  }
3719  break;
3720  }
3721  assert(Opc && "Unknown register class");
3722  MFI.setStackID(FI, StackID);
3723 
3724  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3725  .addReg(SrcReg, getKillRegState(isKill))
3726  .addFrameIndex(FI);
3727 
3728  if (Offset)
3729  MI.addImm(0);
3730  MI.addMemOperand(MMO);
3731 }
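// For illustration of the switch above: a 128-bit FPR spill of %q0 to %stack.0
// is emitted as
//   STRQui %q0, %stack.0, 0
// while an SVE Z-register spill selects STR_ZXI and additionally marks the
// slot as TargetStackID::ScalableVector, so that PEI later addresses it with a
// VL-scaled offset instead of a fixed byte offset.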
3732 
3733 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
3734  MachineBasicBlock &MBB,
3735  MachineBasicBlock::iterator InsertBefore,
3736  const MCInstrDesc &MCID,
3737  Register DestReg, unsigned SubIdx0,
3738  unsigned SubIdx1, int FI,
3739  MachineMemOperand *MMO) {
3740  Register DestReg0 = DestReg;
3741  Register DestReg1 = DestReg;
3742  bool IsUndef = true;
3743  if (Register::isPhysicalRegister(DestReg)) {
3744  DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
3745  SubIdx0 = 0;
3746  DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
3747  SubIdx1 = 0;
3748  IsUndef = false;
3749  }
3750  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3751  .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
3752  .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
3753  .addFrameIndex(FI)
3754  .addImm(0)
3755  .addMemOperand(MMO);
3756 }
3757 
3758 void AArch64InstrInfo::loadRegFromStackSlot(
3759  MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
3760  int FI, const TargetRegisterClass *RC,
3761  const TargetRegisterInfo *TRI) const {
3762  MachineFunction &MF = *MBB.getParent();
3763  MachineFrameInfo &MFI = MF.getFrameInfo();
3764  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3765  MachineMemOperand *MMO =
3766  MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
3767  MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3768 
3769  unsigned Opc = 0;
3770  bool Offset = true;
3771  unsigned StackID = TargetStackID::Default;
3772  switch (TRI->getSpillSize(*RC)) {
3773  case 1:
3774  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3775  Opc = AArch64::LDRBui;
3776  break;
3777  case 2:
3778  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3779  Opc = AArch64::LDRHui;
3780  else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3781  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3782  Opc = AArch64::LDR_PXI;
3783  StackID = TargetStackID::ScalableVector;
3784  }
3785  break;
3786  case 4:
3787  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3788  Opc = AArch64::LDRWui;
3789  if (Register::isVirtualRegister(DestReg))
3790  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
3791  else
3792  assert(DestReg != AArch64::WSP);
3793  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3794  Opc = AArch64::LDRSui;
3795  break;
3796  case 8:
3797  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3798  Opc = AArch64::LDRXui;
3799  if (Register::isVirtualRegister(DestReg))
3800  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
3801  else
3802  assert(DestReg != AArch64::SP);
3803  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3804  Opc = AArch64::LDRDui;
3805  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3806  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3807  get(AArch64::LDPWi), DestReg, AArch64::sube32,
3808  AArch64::subo32, FI, MMO);
3809  return;
3810  }
3811  break;
3812  case 16:
3813  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3814  Opc = AArch64::LDRQui;
3815  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3816  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3817  Opc = AArch64::LD1Twov1d;
3818  Offset = false;
3819  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3820  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3821  get(AArch64::LDPXi), DestReg, AArch64::sube64,
3822  AArch64::subo64, FI, MMO);
3823  return;
3824  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3825  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3826  Opc = AArch64::LDR_ZXI;
3827  StackID = TargetStackID::ScalableVector;
3828  }
3829  break;
3830  case 24:
3831  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3832  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3833  Opc = AArch64::LD1Threev1d;
3834  Offset = false;
3835  }
3836  break;
3837  case 32:
3838  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3839  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3840  Opc = AArch64::LD1Fourv1d;
3841  Offset = false;
3842  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3843  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3844  Opc = AArch64::LD1Twov2d;
3845  Offset = false;
3846  } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3847  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3848  Opc = AArch64::LDR_ZZXI;
3849  StackID = TargetStackID::ScalableVector;
3850  }
3851  break;
3852  case 48:
3853  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3854  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3855  Opc = AArch64::LD1Threev2d;
3856  Offset = false;
3857  } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3858  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3859  Opc = AArch64::LDR_ZZZXI;
3860  StackID = TargetStackID::ScalableVector;
3861  }
3862  break;
3863  case 64:
3864  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3865  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3866  Opc = AArch64::LD1Fourv2d;
3867  Offset = false;
3868  } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3869  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3870  Opc = AArch64::LDR_ZZZZXI;
3871  StackID = TargetStackID::ScalableVector;
3872  }
3873  break;
3874  }
3875 
3876  assert(Opc && "Unknown register class");
3877  MFI.setStackID(FI, StackID);
3878 
3879  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3880  .addReg(DestReg, getDefRegState(true))
3881  .addFrameIndex(FI);
3882  if (Offset)
3883  MI.addImm(0);
3884  MI.addMemOperand(MMO);
3885 }
3886 
3887 static bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
3888  const MachineInstr &UseMI,
3889  const TargetRegisterInfo *TRI) {
3890  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
3891  UseMI.getIterator()),
3892  [TRI](const MachineInstr &I) {
3893  return I.modifiesRegister(AArch64::NZCV, TRI) ||
3894  I.readsRegister(AArch64::NZCV, TRI);
3895  });
3896 }
3897 
3898 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
3899  const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
3900  // The smallest scalable element supported by scaled SVE addressing
3901  // modes is the predicate, which is 2 scalable bytes in size. So the scalable
3902  // byte offset must always be a multiple of 2.
3903  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3904 
3905  // VGSized offsets are divided by '2', because the VG register is the
3906  // number of 64bit granules as opposed to 128bit vector chunks,
3907  // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
3908  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
3909  // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
3910  ByteSized = Offset.getFixed();
3911  VGSized = Offset.getScalable() / 2;
3912 }
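// Worked example: a StackOffset of 16 fixed bytes plus 16 scalable bytes (one
// 128-bit SVE vector) decomposes into ByteSized = 16 and VGSized = 16 / 2 = 8,
// i.e. a DWARF expression of the form 16 + 8 * VG, since VG counts 64-bit
// granules (VG = 2 * n for n 128-bit vectors).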
3913 
3914 /// Returns the offset in parts to which this frame offset can be
3915 /// decomposed for the purpose of describing a frame offset.
3916 /// For non-scalable offsets this is simply its byte size.
3917 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
3918  const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
3919  int64_t &NumDataVectors) {
3920  // The smallest scalable element supported by scaled SVE addressing
3921  // modes is the predicate, which is 2 scalable bytes in size. So the scalable
3922  // byte offset must always be a multiple of 2.
3923  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3924 
3925  NumBytes = Offset.getFixed();
3926  NumDataVectors = 0;
3927  NumPredicateVectors = Offset.getScalable() / 2;
3928  // This method is used to get the offsets to adjust the frame offset.
3929  // If the function requires ADDPL to be used and needs more than two ADDPL
3930  // instructions, part of the offset is folded into NumDataVectors so that it
3931  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
3932  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
3933  NumPredicateVectors > 62) {
3934  NumDataVectors = NumPredicateVectors / 8;
3935  NumPredicateVectors -= NumDataVectors * 8;
3936  }
3937 }
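// For example, a scalable offset of 132 bytes corresponds to 66 predicate
// vectors; since 66 > 62, it is split into NumDataVectors = 66 / 8 = 8 and
// NumPredicateVectors = 66 - 8 * 8 = 2, so the adjustment can be emitted as a
// single ADDVL (by 8) plus a single ADDPL (by 2) rather than a chain of ADDPLs.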
3938 
3939 // Helper function to emit a frame offset adjustment from a given
3940 // pointer (SrcReg), stored into DestReg. This function is explicit
3941 // in that it requires the opcode.
3942 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
3943  MachineBasicBlock::iterator MBBI,
3944  const DebugLoc &DL, unsigned DestReg,
3945  unsigned SrcReg, int64_t Offset, unsigned Opc,
3946  const TargetInstrInfo *TII,
3947  MachineInstr::MIFlag Flag, bool NeedsWinCFI,
3948  bool *HasWinCFI) {
3949  int Sign = 1;
3950  unsigned MaxEncoding, ShiftSize;
3951  switch (Opc) {
3952  case AArch64::ADDXri:
3953  case AArch64::ADDSXri:
3954  case AArch64::SUBXri:
3955  case AArch64::SUBSXri:
3956  MaxEncoding = 0xfff;
3957  ShiftSize = 12;
3958  break;
3959  case AArch64::ADDVL_XXI:
3960  case AArch64::ADDPL_XXI:
3961  MaxEncoding = 31;
3962  ShiftSize = 0;
3963  if (Offset < 0) {
3964  MaxEncoding = 32;
3965  Sign = -1;
3966  Offset = -Offset;
3967  }
3968  break;
3969  default:
3970  llvm_unreachable("Unsupported opcode");
3971  }
3972 
3973  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
3974  // scratch register. If DestReg is a virtual register, use it as the
3975  // scratch register; otherwise, create a new virtual register (to be
3976  // replaced by the scavenger at the end of PEI). That case can be optimized
3977  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
3978  // register can be loaded with offset%8 and the add/sub can use an extending
3979  // instruction with LSL#3.
3980  // Currently the function handles any offsets but generates a poor sequence
3981  // of code.
3982  // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
3983 
3984  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
3985  Register TmpReg = DestReg;
3986  if (TmpReg == AArch64::XZR)
3987  TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
3988  &AArch64::GPR64RegClass);
3989  do {
3990  uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
3991  unsigned LocalShiftSize = 0;
3992  if (ThisVal > MaxEncoding) {
3993  ThisVal = ThisVal >> ShiftSize;
3994  LocalShiftSize = ShiftSize;
3995  }
3996  assert((ThisVal >> ShiftSize) <= MaxEncoding &&
3997  "Encoding cannot handle value that big");
3998 
3999  Offset -= ThisVal << LocalShiftSize;
4000  if (Offset == 0)
4001  TmpReg = DestReg;
4002  auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
4003  .addReg(SrcReg)
4004  .addImm(Sign * (int)ThisVal);
4005  if (ShiftSize)
4006  MBI = MBI.addImm(
4007  AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
4008  MBI = MBI.setMIFlag(Flag);
4009 
4010  if (NeedsWinCFI) {
4011  assert(Sign == 1 && "SEH directives should always have a positive sign");
4012  int Imm = (int)(ThisVal << LocalShiftSize);
4013  if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
4014  (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
4015  if (HasWinCFI)
4016  *HasWinCFI = true;
4017  if (Imm == 0)
4018  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
4019  else
4020  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
4021  .addImm(Imm)
4022  .setMIFlag(Flag);
4023  assert(Offset == 0 && "Expected remaining offset to be zero to "
4024  "emit a single SEH directive");
4025  } else if (DestReg == AArch64::SP) {
4026  if (HasWinCFI)
4027  *HasWinCFI = true;
4028  assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
4029  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
4030  .addImm(Imm)
4031  .setMIFlag(Flag);
4032  }
4033  if (HasWinCFI)
4034  *HasWinCFI = true;
4035  }
4036 
4037  SrcReg = TmpReg;
4038  } while (Offset);
4039 }
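// As an illustration of the chunking loop above, materializing SP + 0x1234
// with ADDXri (12-bit immediate, optional LSL #12) takes two steps:
//   add xD, sp, #1, lsl #12   ; consumes 0x1000
//   add xD, xD, #0x234        ; remaining 0x234
// Scalable adjustments via ADDVL_XXI/ADDPL_XXI are chunked the same way, but
// with an immediate range of -32..31 and no shift.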
4040 
4041 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
4042  MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
4043  unsigned DestReg, unsigned SrcReg,
4044  StackOffset Offset, const TargetInstrInfo *TII,
4045  MachineInstr::MIFlag Flag, bool SetNZCV,
4046  bool NeedsWinCFI, bool *HasWinCFI) {
4047  int64_t Bytes, NumPredicateVectors, NumDataVectors;
4048  AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
4049  Offset, Bytes, NumPredicateVectors, NumDataVectors);
4050 
4051  // First emit non-scalable frame offsets, or a simple 'mov'.
4052  if (Bytes || (!Offset && SrcReg != DestReg)) {
4053  assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
4054  "SP increment/decrement not 8-byte aligned");
4055  unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
4056  if (Bytes < 0) {
4057  Bytes = -Bytes;
4058  Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
4059  }
4060  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
4061  NeedsWinCFI, HasWinCFI);
4062  SrcReg = DestReg;
4063  }
4064 
4065  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
4066  "SetNZCV not supported with SVE vectors");
4067  assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
4068  "WinCFI not supported with SVE vectors");
4069 
4070  if (NumDataVectors) {
4071  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
4072  AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
4073  SrcReg = DestReg;
4074  }
4075 
4076  if (NumPredicateVectors) {
4077  assert(DestReg != AArch64::SP && "Unaligned access to SP");
4078  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
4079  AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
4080  }
4081 }
4082 
4083 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
4084  MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
4085  MachineBasicBlock::iterator InsertPt, int FrameIndex,
4086  LiveIntervals *LIS, VirtRegMap *VRM) const {
4087  // This is a bit of a hack. Consider this instruction:
4088  //
4089  // %0 = COPY %sp; GPR64all:%0
4090  //
4091  // We explicitly chose GPR64all for the virtual register so such a copy might
4092  // be eliminated by RegisterCoalescer. However, that may not be possible, and
4093  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
4094  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
4095  //
4096  // To prevent that, we are going to constrain the %0 register class here.
4097  //
4098  // <rdar://problem/11522048>
4099  //
4100  if (MI.isFullCopy()) {
4101  Register DstReg = MI.getOperand(0).getReg();
4102  Register SrcReg = MI.getOperand(1).getReg();
4103  if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
4104  MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
4105  return nullptr;
4106  }
4107  if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
4108  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
4109  return nullptr;
4110  }
4111  }
4112 
4113  // Handle the case where a copy is being spilled or filled but the source
4114  // and destination register class don't match. For example:
4115  //
4116  // %0 = COPY %xzr; GPR64common:%0
4117  //
4118  // In this case we can still safely fold away the COPY and generate the
4119  // following spill code:
4120  //
4121  // STRXui %xzr, %stack.0
4122  //
4123  // This also eliminates spilled cross register class COPYs (e.g. between x and
4124  // d regs) of the same size. For example:
4125  //
4126  // %0 = COPY %1; GPR64:%0, FPR64:%1
4127  //
4128  // will be filled as
4129  //
4130  // LDRDui %0, fi<#0>
4131  //
4132  // instead of
4133  //
4134  // LDRXui %Temp, fi<#0>
4135  // %0 = FMOV %Temp
4136  //
4137  if (MI.isCopy() && Ops.size() == 1 &&
4138  // Make sure we're only folding the explicit COPY defs/uses.
4139  (Ops[0] == 0 || Ops[0] == 1)) {
4140  bool IsSpill = Ops[0] == 0;
4141  bool IsFill = !IsSpill;
4142  const TargetRegisterInfo &TRI = getRegisterInfo();
4143  const MachineRegisterInfo &MRI = MF.getRegInfo();
4144  MachineBasicBlock &MBB = *MI.getParent();
4145  const MachineOperand &DstMO = MI.getOperand(0);
4146  const MachineOperand &SrcMO = MI.getOperand(1);
4147  Register DstReg = DstMO.getReg();
4148  Register SrcReg = SrcMO.getReg();
4149  // This is slightly expensive to compute for physical regs since
4150  // getMinimalPhysRegClass is slow.
4151  auto getRegClass = [&](unsigned Reg) {
4152  return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
4153  : TRI.getMinimalPhysRegClass(Reg);
4154  };
4155 
4156  if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
4157  assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
4158  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
4159  "Mismatched register size in non subreg COPY");
4160  if (IsSpill)
4161  storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
4162  getRegClass(SrcReg), &TRI);
4163  else
4164  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
4165  getRegClass(DstReg), &TRI);
4166  return &*--InsertPt;
4167  }
4168 
4169  // Handle cases like spilling def of:
4170  //
4171  // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
4172  //
4173  // where the physical register source can be widened and stored to the full
4174  // virtual reg destination stack slot, in this case producing:
4175  //
4176  // STRXui %xzr, %stack.0
4177  //
4178  if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
4179  assert(SrcMO.getSubReg() == 0 &&
4180  "Unexpected subreg on physical register");
4181  const TargetRegisterClass *SpillRC;
4182  unsigned SpillSubreg;
4183  switch (DstMO.getSubReg()) {
4184  default:
4185  SpillRC = nullptr;
4186  break;
4187  case AArch64::sub_32:
4188  case AArch64::ssub:
4189  if (AArch64::GPR32RegClass.contains(SrcReg)) {
4190  SpillRC = &AArch64::GPR64RegClass;
4191  SpillSubreg = AArch64::sub_32;
4192  } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
4193  SpillRC = &AArch64::FPR64RegClass;
4194  SpillSubreg = AArch64::ssub;
4195  } else
4196  SpillRC = nullptr;
4197  break;
4198  case AArch64::dsub:
4199  if (AArch64::FPR64RegClass.contains(SrcReg)) {
4200  SpillRC = &AArch64::FPR128RegClass;
4201  SpillSubreg = AArch64::dsub;
4202  } else
4203  SpillRC = nullptr;
4204  break;
4205  }
4206 
4207  if (SpillRC)
4208  if (unsigned WidenedSrcReg =
4209  TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
4210  storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
4211  FrameIndex, SpillRC, &TRI);
4212  return &*--InsertPt;
4213  }
4214  }
4215 
4216  // Handle cases like filling use of:
4217  //
4218  // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
4219  //
4220  // where we can load the full virtual reg source stack slot, into the subreg
4221  // destination, in this case producing:
4222  //
4223  // LDRWui %0:sub_32<def,read-undef>, %stack.0
4224  //
4225  if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
4226  const TargetRegisterClass *FillRC;
4227  switch (DstMO.getSubReg()) {
4228  default:
4229  FillRC = nullptr;
4230  break;
4231  case AArch64::sub_32:
4232  FillRC = &AArch64::GPR32RegClass;
4233  break;
4234  case AArch64::ssub:
4235  FillRC = &AArch64::FPR32RegClass;
4236  break;
4237  case AArch64::dsub:
4238  FillRC = &AArch64::FPR64RegClass;
4239  break;
4240  }
4241 
4242  if (FillRC) {
4243  assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
4244  TRI.getRegSizeInBits(*FillRC) &&
4245  "Mismatched regclass size on folded subreg COPY");
4246  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
4247  MachineInstr &LoadMI = *--InsertPt;
4248  MachineOperand &LoadDst = LoadMI.getOperand(0);
4249  assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
4250  LoadDst.setSubReg(DstMO.getSubReg());
4251  LoadDst.setIsUndef();
4252  return &LoadMI;
4253  }
4254  }
4255  }
4256 
4257  // Cannot fold.
4258  return nullptr;
4259 }
4260 
4261 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
4262  StackOffset &SOffset,
4263  bool *OutUseUnscaledOp,
4264  unsigned *OutUnscaledOp,
4265  int64_t *EmittableOffset) {
4266  // Set output values in case of early exit.
4267  if (EmittableOffset)
4268  *EmittableOffset = 0;
4269  if (OutUseUnscaledOp)
4270  *OutUseUnscaledOp = false;
4271  if (OutUnscaledOp)
4272  *OutUnscaledOp = 0;
4273 
4274  // Exit early for structured vector spills/fills as they can't take an
4275  // immediate offset.
4276  switch (MI.getOpcode()) {
4277  default:
4278  break;
4279  case AArch64::LD1Twov2d:
4280  case AArch64::LD1Threev2d:
4281  case AArch64::LD1Fourv2d:
4282  case AArch64::LD1Twov1d:
4283  case AArch64::LD1Threev1d:
4284  case AArch64::LD1Fourv1d:
4285  case AArch64::ST1Twov2d:
4286  case AArch64::ST1Threev2d:
4287  case AArch64::ST1Fourv2d:
4288  case AArch64::ST1Twov1d:
4289  case AArch64::ST1Threev1d:
4290  case AArch64::ST1Fourv1d:
4291  case AArch64::IRG:
4292  case AArch64::IRGstack:
4293  case AArch64::STGloop:
4294  case AArch64::STZGloop:
4295  return AArch64FrameOffsetCannotUpdate;
4296  }
4297 
4298  // Get the min/max offset and the scale.
4299  TypeSize ScaleValue(0U, false);
4300  unsigned Width;
4301  int64_t MinOff, MaxOff;
4302  if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
4303  MaxOff))
4304  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
4305 
4306  // Construct the complete offset.
4307  bool IsMulVL = ScaleValue.isScalable();
4308  unsigned Scale = ScaleValue.getKnownMinSize();
4309  int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
4310 
4311  const MachineOperand &ImmOpnd =
4312  MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
4313  Offset += ImmOpnd.getImm() * Scale;
4314 
4315  // If the offset doesn't match the scale, we rewrite the instruction to
4316  // use the unscaled instruction instead. Likewise, if we have a negative
4317  // offset and there is an unscaled op to use.
4318  Optional<unsigned> UnscaledOp =
4319  AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
4320  bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
4321  if (useUnscaledOp &&
4322  !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
4323  MaxOff))
4324  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
4325 
4326  Scale = ScaleValue.getKnownMinSize();
4327  assert(IsMulVL == ScaleValue.isScalable() &&
4328  "Unscaled opcode has different value for scalable");
4329 
4330  int64_t Remainder = Offset % Scale;
4331  assert(!(Remainder && useUnscaledOp) &&
4332  "Cannot have remainder when using unscaled op");
4333 
4334  assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
4335  int64_t NewOffset = Offset / Scale;
4336  if (MinOff <= NewOffset && NewOffset <= MaxOff)
4337  Offset = Remainder;
4338  else {
4339  NewOffset = NewOffset < 0 ? MinOff : MaxOff;
4340  Offset = Offset - NewOffset * Scale + Remainder;
4341  }
4342 
4343  if (EmittableOffset)
4344  *EmittableOffset = NewOffset;
4345  if (OutUseUnscaledOp)
4346  *OutUseUnscaledOp = useUnscaledOp;
4347  if (OutUnscaledOp && UnscaledOp)
4348  *OutUnscaledOp = *UnscaledOp;
4349 
4350  if (IsMulVL)
4351  SOffset = StackOffset::get(SOffset.getFixed(), Offset);
4352  else
4353  SOffset = StackOffset::get(Offset, SOffset.getScalable());
4354  return AArch64FrameOffsetCanUpdate |
4355  (SOffset ? 0 : AArch64FrameOffsetIsLegal);
4356 }
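// For instance, for a scaled load such as LDRXui (scale 8, unsigned offset), a
// combined byte offset of 20 is not a multiple of 8; because an unscaled twin
// (LDURXi) exists and 20 fits its signed range of -256..255, the routine
// reports UseUnscaledOp with an emittable offset of 20 and no remainder. When
// no unscaled twin exists, only the representable multiple is emitted and the
// leftover bytes are handed back to the caller in SOffset.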
4357 
4358 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
4359  unsigned FrameReg, StackOffset &Offset,
4360  const AArch64InstrInfo *TII) {
4361  unsigned Opcode = MI.getOpcode();
4362  unsigned ImmIdx = FrameRegIdx + 1;
4363 
4364  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
4365  Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
4366  emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
4367  MI.getOperand(0).getReg(), FrameReg, Offset, TII,
4368  MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
4369  MI.eraseFromParent();
4370  Offset = StackOffset();
4371  return true;
4372  }
4373 
4374  int64_t NewOffset;
4375  unsigned UnscaledOp;
4376  bool UseUnscaledOp;
4377  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
4378  &UnscaledOp, &NewOffset);
4379  if (Status & AArch64FrameOffsetCanUpdate) {
4380  if (Status & AArch64FrameOffsetIsLegal)
4381  // Replace the FrameIndex with FrameReg.
4382  MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
4383  if (UseUnscaledOp)
4384  MI.setDesc(TII->get(UnscaledOp));
4385 
4386  MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
4387  return !Offset;
4388  }
4389 
4390  return false;
4391 }
4392 
4393 MCInst AArch64InstrInfo::getNop() const {
4394  return MCInstBuilder(AArch64::HINT).addImm(0);
4395 }
4396 
4397 // AArch64 supports MachineCombiner.
4398 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
4399 
4400 // True when Opc sets flag
4401 static bool isCombineInstrSettingFlag(unsigned Opc) {
4402  switch (Opc) {
4403  case AArch64::ADDSWrr:
4404  case AArch64::ADDSWri:
4405  case AArch64::ADDSXrr:
4406  case AArch64::ADDSXri:
4407  case AArch64::SUBSWrr:
4408  case AArch64::SUBSXrr:
4409  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4410  case AArch64::SUBSWri:
4411  case AArch64::SUBSXri:
4412  return true;
4413  default:
4414  break;
4415  }
4416  return false;
4417 }
4418 
4419 // 32b Opcodes that can be combined with a MUL
4420 static bool isCombineInstrCandidate32(unsigned Opc) {
4421  switch (Opc) {
4422  case AArch64::ADDWrr:
4423  case AArch64::ADDWri:
4424  case AArch64::SUBWrr:
4425  case AArch64::ADDSWrr:
4426  case AArch64::ADDSWri:
4427  case AArch64::SUBSWrr:
4428  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4429  case AArch64::SUBWri:
4430  case AArch64::SUBSWri:
4431  return true;
4432  default:
4433  break;
4434  }
4435  return false;
4436 }
4437 
4438 // 64b Opcodes that can be combined with a MUL
4439 static bool isCombineInstrCandidate64(unsigned Opc) {
4440  switch (Opc) {
4441  case AArch64::ADDXrr:
4442  case AArch64::ADDXri:
4443  case AArch64::SUBXrr:
4444  case AArch64::ADDSXrr:
4445  case AArch64::ADDSXri:
4446  case AArch64::SUBSXrr:
4447  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4448  case AArch64::SUBXri:
4449  case AArch64::SUBSXri:
4450  case AArch64::ADDv8i8:
4451  case AArch64::ADDv16i8:
4452  case AArch64::ADDv4i16:
4453  case AArch64::ADDv8i16:
4454  case AArch64::ADDv2i32:
4455  case AArch64::ADDv4i32:
4456  case AArch64::SUBv8i8:
4457  case AArch64::SUBv16i8:
4458  case AArch64::SUBv4i16:
4459  case AArch64::SUBv8i16:
4460  case AArch64::SUBv2i32:
4461  case AArch64::SUBv4i32:
4462  return true;
4463  default:
4464  break;
4465  }
4466  return false;
4467 }
4468 
4469 // FP Opcodes that can be combined with a FMUL.
4470 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
4471  switch (Inst.getOpcode()) {
4472  default:
4473  break;
4474  case AArch64::FADDHrr:
4475  case AArch64::FADDSrr:
4476  case AArch64::FADDDrr:
4477  case AArch64::FADDv4f16:
4478  case AArch64::FADDv8f16:
4479  case AArch64::FADDv2f32:
4480  case AArch64::FADDv2f64:
4481  case AArch64::FADDv4f32:
4482  case AArch64::FSUBHrr:
4483  case AArch64::FSUBSrr:
4484  case AArch64::FSUBDrr:
4485  case AArch64::FSUBv4f16:
4486  case AArch64::FSUBv8f16:
4487  case AArch64::FSUBv2f32:
4488  case AArch64::FSUBv2f64:
4489  case AArch64::FSUBv4f32:
4490  TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
4491  // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
4492  // the target options or if FADD/FSUB has the contract fast-math flag.
4493  return Options.UnsafeFPMath ||
4494  Options.AllowFPOpFusion == FPOpFusion::Fast ||
4495  Inst.getFlag(MachineInstr::FmContract);
4496  return true;
4497  }
4498  return false;
4499 }
4500 
4501 // Opcodes that can be combined with a MUL
4502 static bool isCombineInstrCandidate(unsigned Opc) {
4503  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
4504 }
4505 
4506 //
4507 // Utility routine that checks if \param MO is defined by an
4508 // \param CombineOpc instruction in the basic block \param MBB
4509 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
4510  unsigned CombineOpc, unsigned ZeroReg = 0,
4511  bool CheckZeroReg = false) {
4512  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4513  MachineInstr *MI = nullptr;
4514 
4515  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
4516  MI = MRI.getUniqueVRegDef(MO.getReg());
4517  // And it needs to be in the trace (otherwise, it won't have a depth).
4518  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
4519  return false;
4520  // Must only be used by the user we combine with.
4521  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
4522  return false;
4523 
4524  if (CheckZeroReg) {
4525  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
4526  MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
4527  MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
4528  // The third input reg must be zero.
4529  if (MI->getOperand(3).getReg() != ZeroReg)
4530  return false;
4531  }
4532 
4533  return true;
4534 }
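// Note that on AArch64 a plain integer multiply is already emitted as MADD
// with the zero register as the addend, e.g.
//   %m:gpr32 = MADDWrrr %a, %b, $wzr
// which is why callers pass ZeroReg/CheckZeroReg: the fusion is only valid
// when the candidate's third operand really is WZR/XZR.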
4535 
4536 //
4537 // Is \param MO defined by an integer multiply and can be combined?
4538 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4539  unsigned MulOpc, unsigned ZeroReg) {
4540  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
4541 }
4542 
4543 //
4544 // Is \param MO defined by a floating-point multiply and can be combined?
4545 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4546  unsigned MulOpc) {
4547  return canCombine(MBB, MO, MulOpc);
4548 }
4549 
4550 // TODO: There are many more machine instruction opcodes to match:
4551 // 1. Other data types (integer, vectors)
4552 // 2. Other math / logic operations (xor, or)
4553 // 3. Other forms of the same operation (intrinsics and other variants)
4554 bool AArch64InstrInfo::isAssociativeAndCommutative(
4555  const MachineInstr &Inst) const {
4556  switch (Inst.getOpcode()) {
4557  case AArch64::FADDDrr:
4558  case AArch64::FADDSrr:
4559  case AArch64::FADDv2f32:
4560  case AArch64::FADDv2f64:
4561  case AArch64::FADDv4f32:
4562  case AArch64::FMULDrr:
4563  case AArch64::FMULSrr:
4564  case AArch64::FMULX32:
4565  case AArch64::FMULX64:
4566  case AArch64::FMULXv2f32:
4567  case AArch64::FMULXv2f64:
4568  case AArch64::FMULXv4f32:
4569  case AArch64::FMULv2f32:
4570  case AArch64::FMULv2f64:
4571  case AArch64::FMULv4f32:
4572  return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
4573  default:
4574  return false;
4575  }
4576 }
4577 
4578 /// Find instructions that can be turned into madd.
4579 static bool getMaddPatterns(MachineInstr &Root,
4580  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4581  unsigned Opc = Root.getOpcode();
4582  MachineBasicBlock &MBB = *Root.getParent();
4583  bool Found = false;
4584 
4585  if (!isCombineInstrCandidate(Opc))
4586  return false;
4587  if (isCombineInstrSettingFlag(Opc)) {
4588  int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
4589  // When NZCV is live bail out.
4590  if (Cmp_NZCV == -1)
4591  return false;
4592  unsigned NewOpc = convertToNonFlagSettingOpc(Root);
4593  // When opcode can't change bail out.
4594  // CHECKME: do we miss any cases for opcode conversion?
4595  if (NewOpc == Opc)
4596  return false;
4597  Opc = NewOpc;
4598  }
4599 
4600  auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
4601  MachineCombinerPattern Pattern) {
4602  if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
4603  Patterns.push_back(Pattern);
4604  Found = true;
4605  }
4606  };
4607 
4608  auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
4609  if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
4610  Patterns.push_back(Pattern);
4611  Found = true;
4612  }
4613  };
4614 
4615  typedef MachineCombinerPattern MCP;
4616 
4617  switch (Opc) {
4618  default:
4619  break;
4620  case AArch64::ADDWrr:
4621  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4622  "ADDWrr does not have register operands");
4623  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
4624  setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
4625  break;
4626  case AArch64::ADDXrr:
4627  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
4628  setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
4629  break;
4630  case AArch64::SUBWrr:
4631  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
4632  setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
4633  break;
4634  case AArch64::SUBXrr:
4635  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
4636  setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
4637  break;
4638  case AArch64::ADDWri:
4639  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
4640  break;
4641  case AArch64::ADDXri:
4642  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
4643  break;
4644  case AArch64::SUBWri:
4645  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
4646  break;
4647  case AArch64::SUBXri:
4648  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
4649  break;
4650  case AArch64::ADDv8i8:
4651  setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
4652  setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
4653  break;
4654  case AArch64::ADDv16i8:
4655  setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
4656  setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
4657  break;
4658  case AArch64::ADDv4i16:
4659  setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
4660  setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
4661  setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
4662  setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
4663  break;
4664  case AArch64::ADDv8i16:
4665  setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
4666  setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
4667  setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
4668  setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
4669  break;
4670  case AArch64::ADDv2i32:
4671  setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
4672  setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
4673  setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
4674  setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
4675  break;
4676  case AArch64::ADDv4i32:
4677  setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
4678  setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
4679  setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
4680  setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
4681  break;
4682  case AArch64::SUBv8i8:
4683  setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
4684  setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
4685  break;
4686  case AArch64::SUBv16i8:
4687  setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
4688  setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
4689  break;
4690  case AArch64::SUBv4i16:
4691  setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
4692  setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
4693  setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
4694  setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
4695  break;
4696  case AArch64::SUBv8i16:
4697  setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
4698  setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
4699  setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
4700  setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
4701  break;
4702  case AArch64::SUBv2i32:
4703  setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
4704  setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
4705  setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
4706  setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
4707  break;
4708  case AArch64::SUBv4i32:
4709  setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
4710  setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
4711  setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
4712  setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
4713  break;
4714  }
4715  return Found;
4716 }
4717 /// Floating-Point Support
4718 
4719 /// Find instructions that can be turned into madd.
4720 static bool getFMAPatterns(MachineInstr &Root,
4721  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4722 
4723  if (!isCombineInstrCandidateFP(Root))
4724  return false;
4725 
4726  MachineBasicBlock &MBB = *Root.getParent();
4727  bool Found = false;
4728 
4729  auto Match = [&](int Opcode, int Operand,
4730  MachineCombinerPattern Pattern) -> bool {
4731  if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
4732  Patterns.push_back(Pattern);
4733  return true;
4734  }
4735  return false;
4736  };
4737 
4738  typedef MachineCombinerPattern MCP;
4739 
4740  switch (Root.getOpcode()) {
4741  default:
4742  assert(false && "Unsupported FP instruction in combiner\n");
4743  break;
4744  case AArch64::FADDHrr:
4745  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4746  "FADDHrr does not have register operands");
4747 
4748  Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
4749  Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
4750  break;
4751  case AArch64::FADDSrr:
4752  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4753  "FADDSrr does not have register operands");
4754 
4755  Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
4756  Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
4757 
4758  Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
4759  Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
4760  break;
4761  case AArch64::FADDDrr:
4762  Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
4763  Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
4764 
4765  Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
4766  Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
4767  break;
4768  case AArch64::FADDv4f16:
4769  Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
4770  Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
4771 
4772  Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
4773  Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
4774  break;
4775  case AArch64::FADDv8f16:
4776  Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
4777  Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
4778 
4779  Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
4780  Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
4781  break;
4782  case AArch64::FADDv2f32:
4783  Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
4784  Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
4785 
4786  Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
4787  Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
4788  break;
4789  case AArch64::FADDv2f64:
4790  Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
4791  Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
4792 
4793  Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
4794  Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
4795  break;
4796  case AArch64::FADDv4f32:
4797  Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
4798  Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
4799 
4800  Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
4801  Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
4802  break;
4803  case AArch64::FSUBHrr:
4804  Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
4805  Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
4806  Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
4807  break;
4808  case AArch64::FSUBSrr:
4809  Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
4810 
4811  Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
4812  Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
4813 
4814  Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
4815  break;
4816  case AArch64::FSUBDrr:
4817  Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
4818 
4819  Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
4820  Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
4821 
4822  Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
4823  break;
4824  case AArch64::FSUBv4f16:
4825  Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
4826  Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
4827 
4828  Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
4829  Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
4830  break;
4831  case AArch64::FSUBv8f16:
4832  Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
4833  Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
4834 
4835  Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
4836  Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
4837  break;
4838  case AArch64::FSUBv2f32:
4839  Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
4840  Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
4841 
4842  Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
4843  Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
4844  break;
4845  case AArch64::FSUBv2f64:
4846  Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
4847  Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
4848 
4849  Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
4850  Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
4851  break;
4852  case AArch64::FSUBv4f32:
4853  Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
4854  Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
4855 
4856  Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
4857  Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
4858  break;
4859  }
4860  return Found;
4861 }
4862 
4863 /// Return true when a code sequence can improve throughput. It
4864 /// should be called only for instructions in loops.
4865 /// \param Pattern - combiner pattern
4866 bool AArch64InstrInfo::isThroughputPattern(
4867  MachineCombinerPattern Pattern) const {
4868  switch (Pattern) {
4869  default:
4870  break;
  // (case labels for the integer MULADD*/MULSUB* and floating-point
  //  FMLA*/FMLS* combiner patterns, elided from this listing, all fall
  //  through to the return below)
4966  return true;
4967  } // end switch (Pattern)
4968  return false;
4969 }
4970 /// Return true when there is potentially a faster code sequence for an
4971 /// instruction chain ending in \p Root. All potential patterns are listed in
4972 /// the \p Pattern vector. Pattern should be sorted in priority order since the
4973 /// pattern evaluator stops checking as soon as it finds a faster sequence.
4974 
4975 bool AArch64InstrInfo::getMachineCombinerPatterns(
4976  MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
4977  bool DoRegPressureReduce) const {
4978  // Integer patterns
4979  if (getMaddPatterns(Root, Patterns))
4980  return true;
4981  // Floating point patterns
4982  if (getFMAPatterns(Root, Patterns))
4983  return true;
4984 
4985  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
4986  DoRegPressureReduce);
4987 }
4988 
4989 enum class FMAInstKind { Default, Indexed, Accumulator };
4990 /// genFusedMultiply - Generate fused multiply instructions.
4991 /// This function supports both integer and floating point instructions.
4992 /// A typical example:
4993 /// F|MUL I=A,B,0
4994 /// F|ADD R,I,C
4995 /// ==> F|MADD R,A,B,C
4996 /// \param MF Containing MachineFunction
4997 /// \param MRI Register information
4998 /// \param TII Target information
4999 /// \param Root is the F|ADD instruction
5000 /// \param [out] InsInstrs is a vector of machine instructions and will
5001 /// contain the generated madd instruction
5002 /// \param IdxMulOpd is index of operand in Root that is the result of
5003 /// the F|MUL. In the example above IdxMulOpd is 1.
5004 /// \param MaddOpc the opcode of the f|madd instruction
5005 /// \param RC Register class of operands
5006 /// \param kind the kind of fma instruction (addressing mode) to be generated
5007 /// \param ReplacedAddend is the result register from the instruction
5008 /// replacing the non-combined operand, if any.
5009 static MachineInstr *
5010 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
5011  const TargetInstrInfo *TII, MachineInstr &Root,
5012  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
5013  unsigned MaddOpc, const TargetRegisterClass *RC,
5014  FMAInstKind kind = FMAInstKind::Default,
5015  const Register *ReplacedAddend = nullptr) {
5016  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
5017 
5018  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
5019  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
5020  Register ResultReg = Root.getOperand(0).getReg();
5021  Register SrcReg0 = MUL->getOperand(1).getReg();
5022  bool Src0IsKill = MUL->getOperand(1).isKill();
5023  Register SrcReg1 = MUL->getOperand(2).getReg();
5024  bool Src1IsKill = MUL->getOperand(2).isKill();
5025 
5026  unsigned SrcReg2;
5027  bool Src2IsKill;
5028  if (ReplacedAddend) {
5029  // If we just generated a new addend, we must be its only use.
5030  SrcReg2 = *ReplacedAddend;
5031  Src2IsKill = true;
5032  } else {
5033  SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
5034  Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
5035  }
5036 
5037  if (Register::isVirtualRegister(ResultReg))
5038  MRI.constrainRegClass(ResultReg, RC);
5039  if (Register::isVirtualRegister(SrcReg0))
5040  MRI.constrainRegClass(SrcReg0, RC);
5041  if (Register::isVirtualRegister(SrcReg1))
5042  MRI.constrainRegClass(SrcReg1, RC);
5043  if (Register::isVirtualRegister(SrcReg2))
5044  MRI.constrainRegClass(SrcReg2, RC);
5045 
5046  MachineInstrBuilder MIB;
5047  if (kind == FMAInstKind::Default)
5048  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
5049  .addReg(SrcReg0, getKillRegState(Src0IsKill))
5050  .addReg(SrcReg1, getKillRegState(Src1IsKill))
5051  .addReg(SrcReg2, getKillRegState(Src2IsKill));
5052  else if (kind == FMAInstKind::Indexed)
5053  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
5054  .addReg(SrcReg2, getKillRegState(Src2IsKill))
5055  .addReg(SrcReg0, getKillRegState(Src0IsKill))
5056  .addReg(SrcReg1, getKillRegState(Src1IsKill))
5057  .addImm(MUL->getOperand(3).getImm());
5058  else if (kind == FMAInstKind::Accumulator)
5059  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
5060  .addReg(SrcReg2, getKillRegState(Src2IsKill))
5061  .addReg(SrcReg0, getKillRegState(Src0IsKill))
5062  .addReg(SrcReg1, getKillRegState(Src1IsKill));
5063  else
5064  assert(false && "Invalid FMA instruction kind \n");
5065  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
5066  InsInstrs.push_back(MIB);
5067  return MUL;
5068 }
5069 
5070 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
5071 /// instructions.
5072 ///
5073 /// \see genFusedMultiply
5074 static MachineInstr *genFusedMultiplyAcc(
5075  MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
5076  MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
5077  unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
5078  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
5079  FMAInstKind::Accumulator);
5080 }
5081 
5082 /// genNeg - Helper to generate an intermediate negation of the second operand
5083 /// of Root
5084 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
5085  const TargetInstrInfo *TII, MachineInstr &Root,
5086  SmallVectorImpl<MachineInstr *> &InsInstrs,
5087  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
5088  unsigned MnegOpc, const TargetRegisterClass *RC) {
5089  Register NewVR = MRI.createVirtualRegister(RC);
5090  MachineInstrBuilder MIB =
5091  BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
5092  .add(Root.getOperand(2));
5093  InsInstrs.push_back(MIB);
5094 
5095  assert(InstrIdxForVirtReg.empty());
5096  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5097 
5098  return NewVR;
5099 }
5100 
5101 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
5102 /// instructions with an additional negation of the accumulator
5103 static MachineInstr *genFusedMultiplyAccNeg(
5104  MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
5105  MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
5106  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
5107  unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
5108  assert(IdxMulOpd == 1);
5109 
5110  Register NewVR =
5111  genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
5112  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
5113  FMAInstKind::Accumulator, &NewVR);
5114 }
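// As a rough illustration, for a MULSUBv8i8_OP1 pattern rooted at
//   %m:fpr64 = MULv8i8 %a, %b
//   %d:fpr64 = SUBv8i8 %m, %c
// the helpers above emit something along the lines of
//   %n:fpr64 = NEGv8i8 %c
//   %d:fpr64 = MLAv8i8 %n, %a, %b   ; %d = %n + %a * %b = %a * %b - %c
// with the NEG recorded in InstrIdxForVirtReg so the combiner can account for
// the extra instruction's depth.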
5115 
5116 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
5117 /// instructions.
5118 ///
5119 /// \see genFusedMultiply
5120 static MachineInstr *genFusedMultiplyIdx(
5121  MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
5122  MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
5123  unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
5124  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
5125  FMAInstKind::Indexed);
5126 }
5127 
5128 /// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
5129 /// instructions with an additional negation of the accumulator
5130 static MachineInstr *genFusedMultiplyIdxNeg(
5131  MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
5132  MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
5133  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
5134  unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
5135  assert(IdxMulOpd == 1);
5136 
5137  Register NewVR =
5138  genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
5139 
5140  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
5141  FMAInstKind::Indexed, &NewVR);
5142 }
5143 
5144 /// genMaddR - Generate madd instruction and combine mul and add using
5145 /// an extra virtual register
5146 /// Example - an ADD intermediate needs to be stored in a register:
5147 /// MUL I=A,B,0
5148 /// ADD R,I,Imm
5149 /// ==> ORR V, ZR, Imm
5150 /// ==> MADD R,A,B,V
5151 /// \param MF Containing MachineFunction
5152 /// \param MRI Register information
5153 /// \param TII Target information
5154 /// \param Root is the ADD instruction
5155 /// \param [out] InsInstrs is a vector of machine instructions and will
5156 /// contain the generated madd instruction
5157 /// \param IdxMulOpd is index of operand in Root that is the result of
5158 /// the MUL. In the example above IdxMulOpd is 1.
5159 /// \param MaddOpc the opcode of the madd instruction
5160 /// \param VR is a virtual register that holds the value of an ADD operand
5161 /// (V in the example above).
5162 /// \param RC Register class of operands
5163 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
5164  const TargetInstrInfo *TII, MachineInstr &Root,
5165  SmallVectorImpl<MachineInstr *> &InsInstrs,
5166  unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
5167  const TargetRegisterClass *RC) {
5168  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
5169 
5170  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
5171  Register ResultReg = Root.getOperand(0).getReg();
5172  Register SrcReg0 = MUL->getOperand(1).getReg();
5173  bool Src0IsKill = MUL->getOperand(1).isKill();
5174  Register SrcReg1 = MUL->getOperand(2).getReg();
5175  bool Src1IsKill = MUL->getOperand(2).isKill();
5176 
5177  if (Register::isVirtualRegister(ResultReg))
5178  MRI.constrainRegClass(ResultReg, RC);
5179  if (Register::isVirtualRegister(SrcReg0))
5180  MRI.constrainRegClass(SrcReg0, RC);
5181  if (Register::isVirtualRegister(SrcReg1))
5182  MRI.constrainRegClass(SrcReg1, RC);
5183  if (Register::isVirtualRegister(VR))
5184  MRI.constrainRegClass(VR, RC);
5185 
5186  MachineInstrBuilder MIB =
5187  BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
5188  .addReg(SrcReg0, getKillRegState(Src0IsKill))
5189  .addReg(SrcReg1, getKillRegState(Src1IsKill))
5190  .addReg(VR);
5191  // Insert the MADD
5192  InsInstrs.push_back(MIB);
5193  return MUL;
5194 }
5195 
5196 /// When getMachineCombinerPatterns() finds potential patterns,
5197 /// this function generates the instructions that could replace the
5198 /// original code sequence
5199 void AArch64InstrInfo::genAlternativeCodeSequence(
5200  MachineInstr &Root, MachineCombinerPattern Pattern,
5201  SmallVectorImpl<MachineInstr *> &InsInstrs,
5202  SmallVectorImpl<MachineInstr *> &DelInstrs,
5203  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
5204  MachineBasicBlock &MBB = *Root.getParent();
5205  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5206  MachineFunction &MF = *MBB.getParent();
5207  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();