1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
30 #include "llvm/CodeGen/StackMaps.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/MC/MCAsmInfo.h"
37 #include "llvm/MC/MCInst.h"
38 #include "llvm/MC/MCInstBuilder.h"
39 #include "llvm/MC/MCInstrDesc.h"
40 #include "llvm/Support/Casting.h"
41 #include "llvm/Support/CodeGen.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Compiler.h"
48 #include <cassert>
49 #include <cstdint>
50 #include <iterator>
51 #include <utility>
52 
53 using namespace llvm;
54 
55 #define GET_INSTRINFO_CTOR_DTOR
56 #include "AArch64GenInstrInfo.inc"
57 
58 static cl::opt<unsigned> TBZDisplacementBits(
59  "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
60  cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
61 
62 static cl::opt<unsigned> CBZDisplacementBits(
63  "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
64  cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
65 
66 static cl::opt<unsigned>
67  BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
68  cl::desc("Restrict range of Bcc instructions (DEBUG)"));
69 
70 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
71  : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
72  AArch64::CATCHRET),
73  RI(STI.getTargetTriple()), Subtarget(STI) {}
74 
75 /// GetInstSize - Return the number of bytes of code the specified
76 /// instruction may be. This returns the maximum number of bytes.
77 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
78  const MachineBasicBlock &MBB = *MI.getParent();
79  const MachineFunction *MF = MBB.getParent();
80  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
81 
82  {
83  auto Op = MI.getOpcode();
84  if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
85  return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
86  }
87 
88  // Meta-instructions emit no code.
89  if (MI.isMetaInstruction())
90  return 0;
91 
92  // FIXME: We currently only handle pseudoinstructions that don't get expanded
93  // before the assembly printer.
94  unsigned NumBytes = 0;
95  const MCInstrDesc &Desc = MI.getDesc();
96  switch (Desc.getOpcode()) {
97  default:
98  // Anything not explicitly designated otherwise is a normal 4-byte insn.
99  NumBytes = 4;
100  break;
101  case TargetOpcode::STACKMAP:
102  // The upper bound for a stackmap intrinsic is the full length of its shadow
103  NumBytes = StackMapOpers(&MI).getNumPatchBytes();
104  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
105  break;
106  case TargetOpcode::PATCHPOINT:
107  // The size of the patchpoint intrinsic is the number of bytes requested
108  NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
109  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
110  break;
111  case TargetOpcode::STATEPOINT:
112  NumBytes = StatepointOpers(&MI).getNumPatchBytes();
113  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
114  // No patch bytes means a normal call inst is emitted
115  if (NumBytes == 0)
116  NumBytes = 4;
117  break;
118  case AArch64::TLSDESC_CALLSEQ:
119  // This gets lowered to an instruction sequence which takes 16 bytes
120  NumBytes = 16;
121  break;
122  case AArch64::SpeculationBarrierISBDSBEndBB:
123  // This gets lowered to 2 4-byte instructions.
124  NumBytes = 8;
125  break;
126  case AArch64::SpeculationBarrierSBEndBB:
127  // This gets lowered to a single 4-byte instruction.
128  NumBytes = 4;
129  break;
130  case AArch64::JumpTableDest32:
131  case AArch64::JumpTableDest16:
132  case AArch64::JumpTableDest8:
133  NumBytes = 12;
134  break;
135  case AArch64::SPACE:
136  NumBytes = MI.getOperand(1).getImm();
137  break;
138  case TargetOpcode::BUNDLE:
139  NumBytes = getInstBundleLength(MI);
140  break;
141  }
142 
143  return NumBytes;
144 }
145 
146 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
147  unsigned Size = 0;
148  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
149  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
150  while (++I != E && I->isInsideBundle()) {
151  assert(!I->isBundle() && "No nested bundle!");
152  Size += getInstSizeInBytes(*I);
153  }
154  return Size;
155 }
156 
157 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
158  SmallVectorImpl<MachineOperand> &Cond) {
159  // Block ends with fall-through condbranch.
160  switch (LastInst->getOpcode()) {
161  default:
162  llvm_unreachable("Unknown branch instruction?");
163  case AArch64::Bcc:
164  Target = LastInst->getOperand(1).getMBB();
165  Cond.push_back(LastInst->getOperand(0));
166  break;
167  case AArch64::CBZW:
168  case AArch64::CBZX:
169  case AArch64::CBNZW:
170  case AArch64::CBNZX:
171  Target = LastInst->getOperand(1).getMBB();
172  Cond.push_back(MachineOperand::CreateImm(-1));
173  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
174  Cond.push_back(LastInst->getOperand(0));
175  break;
176  case AArch64::TBZW:
177  case AArch64::TBZX:
178  case AArch64::TBNZW:
179  case AArch64::TBNZX:
180  Target = LastInst->getOperand(2).getMBB();
181  Cond.push_back(MachineOperand::CreateImm(-1));
182  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
183  Cond.push_back(LastInst->getOperand(0));
184  Cond.push_back(LastInst->getOperand(1));
185  }
186 }
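// Note: the Cond vector built here uses a small encoding that the rest of
// this file (reverseBranchCondition, instantiateCondBranch, insertSelect)
// decodes as well:
//   Bcc:           Cond = { <condition code> }
//   CB(N)Z{W,X}:   Cond = { -1, <opcode>, <register> }
//   TB(N)Z{W,X}:   Cond = { -1, <opcode>, <register>, <bit number> }
// The leading -1 marks a folded compare-and-branch rather than a plain Bcc.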
187 
188 static unsigned getBranchDisplacementBits(unsigned Opc) {
189  switch (Opc) {
190  default:
191  llvm_unreachable("unexpected opcode!");
192  case AArch64::B:
193  return 64;
194  case AArch64::TBNZW:
195  case AArch64::TBZW:
196  case AArch64::TBNZX:
197  case AArch64::TBZX:
198  return TBZDisplacementBits;
199  case AArch64::CBNZW:
200  case AArch64::CBZW:
201  case AArch64::CBNZX:
202  case AArch64::CBZX:
203  return CBZDisplacementBits;
204  case AArch64::Bcc:
205  return BCCDisplacementBits;
206  }
207 }
208 
209 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
210  int64_t BrOffset) const {
211  unsigned Bits = getBranchDisplacementBits(BranchOp);
212  assert(Bits >= 3 && "max branch displacement must be enough to jump"
213  "over conditional branch expansion");
214  return isIntN(Bits, BrOffset / 4);
215 }
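// With the defaults above, and because branch offsets are byte offsets that
// are divided by the 4-byte instruction size before the isIntN check, the
// usable ranges are roughly: TB(N)Z 14 bits -> +/-32KiB, CB(N)Z and Bcc
// 19 bits -> +/-1MiB, and B is treated as effectively unlimited (64 bits).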
216 
217 MachineBasicBlock *AArch64InstrInfo::getBranchDestBlock(
218  const MachineInstr &MI) const {
219  switch (MI.getOpcode()) {
220  default:
221  llvm_unreachable("unexpected opcode!");
222  case AArch64::B:
223  return MI.getOperand(0).getMBB();
224  case AArch64::TBZW:
225  case AArch64::TBNZW:
226  case AArch64::TBZX:
227  case AArch64::TBNZX:
228  return MI.getOperand(2).getMBB();
229  case AArch64::CBZW:
230  case AArch64::CBNZW:
231  case AArch64::CBZX:
232  case AArch64::CBNZX:
233  case AArch64::Bcc:
234  return MI.getOperand(1).getMBB();
235  }
236 }
237 
238 // Branch analysis.
239 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
240  MachineBasicBlock *&TBB,
241  MachineBasicBlock *&FBB,
242  SmallVectorImpl<MachineOperand> &Cond,
243  bool AllowModify) const {
244  // If the block has no terminators, it just falls into the block after it.
245  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
246  if (I == MBB.end())
247  return false;
248 
249  // Skip over SpeculationBarrierEndBB terminators
250  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
251  I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
252  --I;
253  }
254 
255  if (!isUnpredicatedTerminator(*I))
256  return false;
257 
258  // Get the last instruction in the block.
259  MachineInstr *LastInst = &*I;
260 
261  // If there is only one terminator instruction, process it.
262  unsigned LastOpc = LastInst->getOpcode();
263  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
264  if (isUncondBranchOpcode(LastOpc)) {
265  TBB = LastInst->getOperand(0).getMBB();
266  return false;
267  }
268  if (isCondBranchOpcode(LastOpc)) {
269  // Block ends with fall-through condbranch.
270  parseCondBranch(LastInst, TBB, Cond);
271  return false;
272  }
273  return true; // Can't handle indirect branch.
274  }
275 
276  // Get the instruction before it if it is a terminator.
277  MachineInstr *SecondLastInst = &*I;
278  unsigned SecondLastOpc = SecondLastInst->getOpcode();
279 
280  // If AllowModify is true and the block ends with two or more unconditional
281  // branches, delete all but the first unconditional branch.
282  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
283  while (isUncondBranchOpcode(SecondLastOpc)) {
284  LastInst->eraseFromParent();
285  LastInst = SecondLastInst;
286  LastOpc = LastInst->getOpcode();
287  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
288  // Return now that the only terminator is an unconditional branch.
289  TBB = LastInst->getOperand(0).getMBB();
290  return false;
291  } else {
292  SecondLastInst = &*I;
293  SecondLastOpc = SecondLastInst->getOpcode();
294  }
295  }
296  }
297 
298  // If we're allowed to modify and the block ends in an unconditional branch
299  // which could simply fallthrough, remove the branch. (Note: This case only
300  // matters when we can't understand the whole sequence, otherwise it's also
301  // handled by BranchFolding.cpp.)
302  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
303  MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
304  LastInst->eraseFromParent();
305  LastInst = SecondLastInst;
306  LastOpc = LastInst->getOpcode();
307  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
308  assert(!isUncondBranchOpcode(LastOpc) &&
309  "unreachable unconditional branches removed above");
310 
311  if (isCondBranchOpcode(LastOpc)) {
312  // Block ends with fall-through condbranch.
313  parseCondBranch(LastInst, TBB, Cond);
314  return false;
315  }
316  return true; // Can't handle indirect branch.
317  } else {
318  SecondLastInst = &*I;
319  SecondLastOpc = SecondLastInst->getOpcode();
320  }
321  }
322 
323  // If there are three terminators, we don't know what sort of block this is.
324  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
325  return true;
326 
327  // If the block ends with a B and a Bcc, handle it.
328  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
329  parseCondBranch(SecondLastInst, TBB, Cond);
330  FBB = LastInst->getOperand(0).getMBB();
331  return false;
332  }
333 
334  // If the block ends with two unconditional branches, handle it. The second
335  // one is not executed, so remove it.
336  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
337  TBB = SecondLastInst->getOperand(0).getMBB();
338  I = LastInst;
339  if (AllowModify)
340  I->eraseFromParent();
341  return false;
342  }
343 
344  // ...likewise if it ends with an indirect branch followed by an unconditional
345  // branch.
346  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
347  I = LastInst;
348  if (AllowModify)
349  I->eraseFromParent();
350  return true;
351  }
352 
353  // Otherwise, can't handle this.
354  return true;
355 }
356 
357 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
358  MachineBranchPredicate &MBP,
359  bool AllowModify) const {
360  // For the moment, handle only a block which ends with a cb(n)zx followed by
361  // a fallthrough. Why this? Because it is a common form.
362  // TODO: Should we handle b.cc?
363 
364  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
365  if (I == MBB.end())
366  return true;
367 
368  // Skip over SpeculationBarrierEndBB terminators
369  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
370  I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
371  --I;
372  }
373 
374  if (!isUnpredicatedTerminator(*I))
375  return true;
376 
377  // Get the last instruction in the block.
378  MachineInstr *LastInst = &*I;
379  unsigned LastOpc = LastInst->getOpcode();
380  if (!isCondBranchOpcode(LastOpc))
381  return true;
382 
383  switch (LastOpc) {
384  default:
385  return true;
386  case AArch64::CBZW:
387  case AArch64::CBZX:
388  case AArch64::CBNZW:
389  case AArch64::CBNZX:
390  break;
391  };
392 
393  MBP.TrueDest = LastInst->getOperand(1).getMBB();
394  assert(MBP.TrueDest && "expected!");
395  MBP.FalseDest = MBB.getNextNode();
396 
397  MBP.ConditionDef = nullptr;
398  MBP.SingleUseCondition = false;
399 
400  MBP.LHS = LastInst->getOperand(0);
401  MBP.RHS = MachineOperand::CreateImm(0);
402  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
403  : MachineBranchPredicate::PRED_EQ;
404  return false;
405 }
406 
407 bool AArch64InstrInfo::reverseBranchCondition(
408  SmallVectorImpl<MachineOperand> &Cond) const {
409  if (Cond[0].getImm() != -1) {
410  // Regular Bcc
411  AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
412  Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
413  } else {
414  // Folded compare-and-branch
415  switch (Cond[1].getImm()) {
416  default:
417  llvm_unreachable("Unknown conditional branch!");
418  case AArch64::CBZW:
419  Cond[1].setImm(AArch64::CBNZW);
420  break;
421  case AArch64::CBNZW:
422  Cond[1].setImm(AArch64::CBZW);
423  break;
424  case AArch64::CBZX:
425  Cond[1].setImm(AArch64::CBNZX);
426  break;
427  case AArch64::CBNZX:
428  Cond[1].setImm(AArch64::CBZX);
429  break;
430  case AArch64::TBZW:
431  Cond[1].setImm(AArch64::TBNZW);
432  break;
433  case AArch64::TBNZW:
434  Cond[1].setImm(AArch64::TBZW);
435  break;
436  case AArch64::TBZX:
437  Cond[1].setImm(AArch64::TBNZX);
438  break;
439  case AArch64::TBNZX:
440  Cond[1].setImm(AArch64::TBZX);
441  break;
442  }
443  }
444 
445  return false;
446 }
447 
448 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
449  int *BytesRemoved) const {
450  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
451  if (I == MBB.end())
452  return 0;
453 
454  if (!isUncondBranchOpcode(I->getOpcode()) &&
455  !isCondBranchOpcode(I->getOpcode()))
456  return 0;
457 
458  // Remove the branch.
459  I->eraseFromParent();
460 
461  I = MBB.end();
462 
463  if (I == MBB.begin()) {
464  if (BytesRemoved)
465  *BytesRemoved = 4;
466  return 1;
467  }
468  --I;
469  if (!isCondBranchOpcode(I->getOpcode())) {
470  if (BytesRemoved)
471  *BytesRemoved = 4;
472  return 1;
473  }
474 
475  // Remove the branch.
476  I->eraseFromParent();
477  if (BytesRemoved)
478  *BytesRemoved = 8;
479 
480  return 2;
481 }
482 
483 void AArch64InstrInfo::instantiateCondBranch(
484  MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
485  ArrayRef<MachineOperand> Cond) const {
486  if (Cond[0].getImm() != -1) {
487  // Regular Bcc
488  BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
489  } else {
490  // Folded compare-and-branch
491  // Note that we use addOperand instead of addReg to keep the flags.
492  const MachineInstrBuilder MIB =
493  BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
494  if (Cond.size() > 3)
495  MIB.addImm(Cond[3].getImm());
496  MIB.addMBB(TBB);
497  }
498 }
499 
500 unsigned AArch64InstrInfo::insertBranch(
501  MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
502  ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
503  // Shouldn't be a fall through.
504  assert(TBB && "insertBranch must not be told to insert a fallthrough");
505 
506  if (!FBB) {
507  if (Cond.empty()) // Unconditional branch?
508  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
509  else
510  instantiateCondBranch(MBB, DL, TBB, Cond);
511 
512  if (BytesAdded)
513  *BytesAdded = 4;
514 
515  return 1;
516  }
517 
518  // Two-way conditional branch.
519  instantiateCondBranch(MBB, DL, TBB, Cond);
520  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
521 
522  if (BytesAdded)
523  *BytesAdded = 8;
524 
525  return 2;
526 }
527 
528 // Find the original register that VReg is copied from.
529 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
530  while (Register::isVirtualRegister(VReg)) {
531  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
532  if (!DefMI->isFullCopy())
533  return VReg;
534  VReg = DefMI->getOperand(1).getReg();
535  }
536  return VReg;
537 }
538 
539 // Determine if VReg is defined by an instruction that can be folded into a
540 // csel instruction. If so, return the folded opcode, and the replacement
541 // register.
542 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
543  unsigned *NewVReg = nullptr) {
544  VReg = removeCopies(MRI, VReg);
545  if (!Register::isVirtualRegister(VReg))
546  return 0;
547 
548  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
549  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
550  unsigned Opc = 0;
551  unsigned SrcOpNum = 0;
552  switch (DefMI->getOpcode()) {
553  case AArch64::ADDSXri:
554  case AArch64::ADDSWri:
555  // if NZCV is used, do not fold.
556  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
557  return 0;
558  // fall-through to ADDXri and ADDWri.
559  LLVM_FALLTHROUGH;
560  case AArch64::ADDXri:
561  case AArch64::ADDWri:
562  // add x, 1 -> csinc.
563  if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
564  DefMI->getOperand(3).getImm() != 0)
565  return 0;
566  SrcOpNum = 1;
567  Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
568  break;
569 
570  case AArch64::ORNXrr:
571  case AArch64::ORNWrr: {
572  // not x -> csinv, represented as orn dst, xzr, src.
573  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
574  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
575  return 0;
576  SrcOpNum = 2;
577  Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
578  break;
579  }
580 
581  case AArch64::SUBSXrr:
582  case AArch64::SUBSWrr:
583  // if NZCV is used, do not fold.
584  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
585  return 0;
586  // fall-through to SUBXrr and SUBWrr.
587  LLVM_FALLTHROUGH;
588  case AArch64::SUBXrr:
589  case AArch64::SUBWrr: {
590  // neg x -> csneg, represented as sub dst, xzr, src.
591  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
592  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
593  return 0;
594  SrcOpNum = 2;
595  Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
596  break;
597  }
598  default:
599  return 0;
600  }
601  assert(Opc && SrcOpNum && "Missing parameters");
602 
603  if (NewVReg)
604  *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
605  return Opc;
606 }
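// Informal examples of the patterns recognized above: "add w1, w0, #1"
// feeding a select can fold to csinc, "orn w1, wzr, w0" (bitwise NOT) to
// csinv, and "sub w1, wzr, w0" (negation) to csneg; insertSelect() below
// applies the folded opcode to the select's other operand and inverts the
// condition when the folded value sat on the "true" side of the select.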
607 
608 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
609  ArrayRef<MachineOperand> Cond,
610  Register DstReg, Register TrueReg,
611  Register FalseReg, int &CondCycles,
612  int &TrueCycles,
613  int &FalseCycles) const {
614  // Check register classes.
615  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
616  const TargetRegisterClass *RC =
617  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
618  if (!RC)
619  return false;
620 
621  // Also need to check the dest regclass, in case we're trying to optimize
622  // something like:
623  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
624  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
625  return false;
626 
627  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
628  unsigned ExtraCondLat = Cond.size() != 1;
629 
630  // GPRs are handled by csel.
631  // FIXME: Fold in x+1, -x, and ~x when applicable.
632  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
633  AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
634  // Single-cycle csel, csinc, csinv, and csneg.
635  CondCycles = 1 + ExtraCondLat;
636  TrueCycles = FalseCycles = 1;
637  if (canFoldIntoCSel(MRI, TrueReg))
638  TrueCycles = 0;
639  else if (canFoldIntoCSel(MRI, FalseReg))
640  FalseCycles = 0;
641  return true;
642  }
643 
644  // Scalar floating point is handled by fcsel.
645  // FIXME: Form fabs, fmin, and fmax when applicable.
646  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
647  AArch64::FPR32RegClass.hasSubClassEq(RC)) {
648  CondCycles = 5 + ExtraCondLat;
649  TrueCycles = FalseCycles = 2;
650  return true;
651  }
652 
653  // Can't do vectors.
654  return false;
655 }
656 
657 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
658  MachineBasicBlock::iterator I,
659  const DebugLoc &DL, Register DstReg,
660  ArrayRef<MachineOperand> Cond,
661  Register TrueReg, Register FalseReg) const {
662  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
663 
664  // Parse the condition code, see parseCondBranch() above.
665  AArch64CC::CondCode CC;
666  switch (Cond.size()) {
667  default:
668  llvm_unreachable("Unknown condition opcode in Cond");
669  case 1: // b.cc
670  CC = AArch64CC::CondCode(Cond[0].getImm());
671  break;
672  case 3: { // cbz/cbnz
673  // We must insert a compare against 0.
674  bool Is64Bit;
675  switch (Cond[1].getImm()) {
676  default:
677  llvm_unreachable("Unknown branch opcode in Cond");
678  case AArch64::CBZW:
679  Is64Bit = false;
680  CC = AArch64CC::EQ;
681  break;
682  case AArch64::CBZX:
683  Is64Bit = true;
684  CC = AArch64CC::EQ;
685  break;
686  case AArch64::CBNZW:
687  Is64Bit = false;
688  CC = AArch64CC::NE;
689  break;
690  case AArch64::CBNZX:
691  Is64Bit = true;
692  CC = AArch64CC::NE;
693  break;
694  }
695  Register SrcReg = Cond[2].getReg();
696  if (Is64Bit) {
697  // cmp reg, #0 is actually subs xzr, reg, #0.
698  MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
699  BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
700  .addReg(SrcReg)
701  .addImm(0)
702  .addImm(0);
703  } else {
704  MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
705  BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
706  .addReg(SrcReg)
707  .addImm(0)
708  .addImm(0);
709  }
710  break;
711  }
712  case 4: { // tbz/tbnz
713  // We must insert a tst instruction.
714  switch (Cond[1].getImm()) {
715  default:
716  llvm_unreachable("Unknown branch opcode in Cond");
717  case AArch64::TBZW:
718  case AArch64::TBZX:
719  CC = AArch64CC::EQ;
720  break;
721  case AArch64::TBNZW:
722  case AArch64::TBNZX:
723  CC = AArch64CC::NE;
724  break;
725  }
726  // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
727  if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
728  BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
729  .addReg(Cond[2].getReg())
730  .addImm(
731  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
732  else
733  BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
734  .addReg(Cond[2].getReg())
735  .addImm(
736  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
737  break;
738  }
739  }
740 
741  unsigned Opc = 0;
742  const TargetRegisterClass *RC = nullptr;
743  bool TryFold = false;
744  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
745  RC = &AArch64::GPR64RegClass;
746  Opc = AArch64::CSELXr;
747  TryFold = true;
748  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
749  RC = &AArch64::GPR32RegClass;
750  Opc = AArch64::CSELWr;
751  TryFold = true;
752  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
753  RC = &AArch64::FPR64RegClass;
754  Opc = AArch64::FCSELDrrr;
755  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
756  RC = &AArch64::FPR32RegClass;
757  Opc = AArch64::FCSELSrrr;
758  }
759  assert(RC && "Unsupported regclass");
760 
761  // Try folding simple instructions into the csel.
762  if (TryFold) {
763  unsigned NewVReg = 0;
764  unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
765  if (FoldedOpc) {
766  // The folded opcodes csinc, csinv and csneg apply the operation to
767  // FalseReg, so we need to invert the condition.
768  CC = AArch64CC::getInvertedCondCode(CC);
769  TrueReg = FalseReg;
770  } else
771  FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
772 
773  // Fold the operation. Leave any dead instructions for DCE to clean up.
774  if (FoldedOpc) {
775  FalseReg = NewVReg;
776  Opc = FoldedOpc;
777  // This extends the live range of NewVReg.
778  MRI.clearKillFlags(NewVReg);
779  }
780  }
781 
782  // Pull all virtual registers into the appropriate class.
783  MRI.constrainRegClass(TrueReg, RC);
784  MRI.constrainRegClass(FalseReg, RC);
785 
786  // Insert the csel.
787  BuildMI(MBB, I, DL, get(Opc), DstReg)
788  .addReg(TrueReg)
789  .addReg(FalseReg)
790  .addImm(CC);
791 }
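// For example, a select between "%t = ADDWri %a, 1, 0" and %f under
// condition cc becomes "CSINCWr %f, %a, invert(cc)" instead of a CSELWr,
// and the now-dead ADDWri is left for dead-code elimination (illustrative
// virtual-register names, not taken from a test).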
792 
793 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
794 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
795  uint64_t Imm = MI.getOperand(1).getImm();
796  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
797  uint64_t Encoding;
798  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
799 }
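// For example, MOVi64imm #0x00ff00ff00ff00ff can be expanded to
// "orr x0, xzr, #0x00ff00ff00ff00ff" because the value is a valid logical
// immediate (a repeating 16-bit pattern), whereas an arbitrary constant such
// as 0x123456789abcdef0 is not and would need a MOVZ/MOVK sequence instead.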
800 
801 // FIXME: this implementation should be micro-architecture dependent, so a
802 // micro-architecture target hook should be introduced here in future.
803 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
804  if (!Subtarget.hasCustomCheapAsMoveHandling())
805  return MI.isAsCheapAsAMove();
806 
807  const unsigned Opcode = MI.getOpcode();
808 
809  // Firstly, check cases gated by features.
810 
811  if (Subtarget.hasZeroCycleZeroingFP()) {
812  if (Opcode == AArch64::FMOVH0 ||
813  Opcode == AArch64::FMOVS0 ||
814  Opcode == AArch64::FMOVD0)
815  return true;
816  }
817 
818  if (Subtarget.hasZeroCycleZeroingGP()) {
819  if (Opcode == TargetOpcode::COPY &&
820  (MI.getOperand(1).getReg() == AArch64::WZR ||
821  MI.getOperand(1).getReg() == AArch64::XZR))
822  return true;
823  }
824 
825  // Secondly, check cases specific to sub-targets.
826 
827  if (Subtarget.hasExynosCheapAsMoveHandling()) {
828  if (isExynosCheapAsMove(MI))
829  return true;
830 
831  return MI.isAsCheapAsAMove();
832  }
833 
834  // Finally, check generic cases.
835 
836  switch (Opcode) {
837  default:
838  return false;
839 
840  // add/sub on register without shift
841  case AArch64::ADDWri:
842  case AArch64::ADDXri:
843  case AArch64::SUBWri:
844  case AArch64::SUBXri:
845  return (MI.getOperand(3).getImm() == 0);
846 
847  // logical ops on immediate
848  case AArch64::ANDWri:
849  case AArch64::ANDXri:
850  case AArch64::EORWri:
851  case AArch64::EORXri:
852  case AArch64::ORRWri:
853  case AArch64::ORRXri:
854  return true;
855 
856  // logical ops on register without shift
857  case AArch64::ANDWrr:
858  case AArch64::ANDXrr:
859  case AArch64::BICWrr:
860  case AArch64::BICXrr:
861  case AArch64::EONWrr:
862  case AArch64::EONXrr:
863  case AArch64::EORWrr:
864  case AArch64::EORXrr:
865  case AArch64::ORNWrr:
866  case AArch64::ORNXrr:
867  case AArch64::ORRWrr:
868  case AArch64::ORRXrr:
869  return true;
870 
871  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
872  // ORRXri, it is as cheap as MOV
873  case AArch64::MOVi32imm:
874  return canBeExpandedToORR(MI, 32);
875  case AArch64::MOVi64imm:
876  return canBeExpandedToORR(MI, 64);
877  }
878 
879  llvm_unreachable("Unknown opcode to check as cheap as a move!");
880 }
881 
882 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
883  switch (MI.getOpcode()) {
884  default:
885  return false;
886 
887  case AArch64::ADDWrs:
888  case AArch64::ADDXrs:
889  case AArch64::ADDSWrs:
890  case AArch64::ADDSXrs: {
891  unsigned Imm = MI.getOperand(3).getImm();
892  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
893  if (ShiftVal == 0)
894  return true;
895  return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
896  }
897 
898  case AArch64::ADDWrx:
899  case AArch64::ADDXrx:
900  case AArch64::ADDXrx64:
901  case AArch64::ADDSWrx:
902  case AArch64::ADDSXrx:
903  case AArch64::ADDSXrx64: {
904  unsigned Imm = MI.getOperand(3).getImm();
905  switch (AArch64_AM::getArithExtendType(Imm)) {
906  default:
907  return false;
908  case AArch64_AM::UXTB:
909  case AArch64_AM::UXTH:
910  case AArch64_AM::UXTW:
911  case AArch64_AM::UXTX:
912  return AArch64_AM::getArithShiftValue(Imm) <= 4;
913  }
914  }
915 
916  case AArch64::SUBWrs:
917  case AArch64::SUBSWrs: {
918  unsigned Imm = MI.getOperand(3).getImm();
919  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
920  return ShiftVal == 0 ||
921  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
922  }
923 
924  case AArch64::SUBXrs:
925  case AArch64::SUBSXrs: {
926  unsigned Imm = MI.getOperand(3).getImm();
927  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
928  return ShiftVal == 0 ||
929  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
930  }
931 
932  case AArch64::SUBWrx:
933  case AArch64::SUBXrx:
934  case AArch64::SUBXrx64:
935  case AArch64::SUBSWrx:
936  case AArch64::SUBSXrx:
937  case AArch64::SUBSXrx64: {
938  unsigned Imm = MI.getOperand(3).getImm();
939  switch (AArch64_AM::getArithExtendType(Imm)) {
940  default:
941  return false;
942  case AArch64_AM::UXTB:
943  case AArch64_AM::UXTH:
944  case AArch64_AM::UXTW:
945  case AArch64_AM::UXTX:
946  return AArch64_AM::getArithShiftValue(Imm) == 0;
947  }
948  }
949 
950  case AArch64::LDRBBroW:
951  case AArch64::LDRBBroX:
952  case AArch64::LDRBroW:
953  case AArch64::LDRBroX:
954  case AArch64::LDRDroW:
955  case AArch64::LDRDroX:
956  case AArch64::LDRHHroW:
957  case AArch64::LDRHHroX:
958  case AArch64::LDRHroW:
959  case AArch64::LDRHroX:
960  case AArch64::LDRQroW:
961  case AArch64::LDRQroX:
962  case AArch64::LDRSBWroW:
963  case AArch64::LDRSBWroX:
964  case AArch64::LDRSBXroW:
965  case AArch64::LDRSBXroX:
966  case AArch64::LDRSHWroW:
967  case AArch64::LDRSHWroX:
968  case AArch64::LDRSHXroW:
969  case AArch64::LDRSHXroX:
970  case AArch64::LDRSWroW:
971  case AArch64::LDRSWroX:
972  case AArch64::LDRSroW:
973  case AArch64::LDRSroX:
974  case AArch64::LDRWroW:
975  case AArch64::LDRWroX:
976  case AArch64::LDRXroW:
977  case AArch64::LDRXroX:
978  case AArch64::PRFMroW:
979  case AArch64::PRFMroX:
980  case AArch64::STRBBroW:
981  case AArch64::STRBBroX:
982  case AArch64::STRBroW:
983  case AArch64::STRBroX:
984  case AArch64::STRDroW:
985  case AArch64::STRDroX:
986  case AArch64::STRHHroW:
987  case AArch64::STRHHroX:
988  case AArch64::STRHroW:
989  case AArch64::STRHroX:
990  case AArch64::STRQroW:
991  case AArch64::STRQroX:
992  case AArch64::STRSroW:
993  case AArch64::STRSroX:
994  case AArch64::STRWroW:
995  case AArch64::STRWroX:
996  case AArch64::STRXroW:
997  case AArch64::STRXroX: {
998  unsigned IsSigned = MI.getOperand(3).getImm();
999  return !IsSigned;
1000  }
1001  }
1002 }
1003 
1004 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1005  unsigned Opc = MI.getOpcode();
1006  switch (Opc) {
1007  default:
1008  return false;
1009  case AArch64::SEH_StackAlloc:
1010  case AArch64::SEH_SaveFPLR:
1011  case AArch64::SEH_SaveFPLR_X:
1012  case AArch64::SEH_SaveReg:
1013  case AArch64::SEH_SaveReg_X:
1014  case AArch64::SEH_SaveRegP:
1015  case AArch64::SEH_SaveRegP_X:
1016  case AArch64::SEH_SaveFReg:
1017  case AArch64::SEH_SaveFReg_X:
1018  case AArch64::SEH_SaveFRegP:
1019  case AArch64::SEH_SaveFRegP_X:
1020  case AArch64::SEH_SetFP:
1021  case AArch64::SEH_AddFP:
1022  case AArch64::SEH_Nop:
1023  case AArch64::SEH_PrologEnd:
1024  case AArch64::SEH_EpilogStart:
1025  case AArch64::SEH_EpilogEnd:
1026  return true;
1027  }
1028 }
1029 
1030 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1031  Register &SrcReg, Register &DstReg,
1032  unsigned &SubIdx) const {
1033  switch (MI.getOpcode()) {
1034  default:
1035  return false;
1036  case AArch64::SBFMXri: // aka sxtw
1037  case AArch64::UBFMXri: // aka uxtw
1038  // Check for the 32 -> 64 bit extension case, these instructions can do
1039  // much more.
1040  if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1041  return false;
1042  // This is a signed or unsigned 32 -> 64 bit extension.
1043  SrcReg = MI.getOperand(1).getReg();
1044  DstReg = MI.getOperand(0).getReg();
1045  SubIdx = AArch64::sub_32;
1046  return true;
1047  }
1048 }
1049 
1050 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1051  const MachineInstr &MIa, const MachineInstr &MIb) const {
1052  const TargetRegisterInfo *TRI = &getRegisterInfo();
1053  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1054  int64_t OffsetA = 0, OffsetB = 0;
1055  unsigned WidthA = 0, WidthB = 0;
1056  bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1057 
1058  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1059  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1060 
1061  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1062  MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1063  return false;
1064 
1065  // Retrieve the base, offset from the base and width. Width
1066  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1067  // base registers are identical, and the offset of a lower memory access +
1068  // the width doesn't overlap the offset of a higher memory access,
1069  // then the memory accesses are different.
1070  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1071  // are assumed to have the same scale (vscale).
1072  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1073  WidthA, TRI) &&
1074  getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1075  WidthB, TRI)) {
1076  if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1077  OffsetAIsScalable == OffsetBIsScalable) {
1078  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1079  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1080  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1081  if (LowOffset + LowWidth <= HighOffset)
1082  return true;
1083  }
1084  }
1085  return false;
1086 }
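// For example, "str x1, [x0]" and "str x2, [x0, #8]" are trivially disjoint
// (same base, offsets 0 and 8, width 8 each), while "str x1, [x0]" and
// "ldr w2, [x0, #4]" are not, since the 8-byte store covers bytes [0, 8)
// and overlaps the 4-byte load at [4, 8).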
1087 
1088 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1089  const MachineBasicBlock *MBB,
1090  const MachineFunction &MF) const {
1091  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1092  return true;
1093  switch (MI.getOpcode()) {
1094  case AArch64::HINT:
1095  // CSDB hints are scheduling barriers.
1096  if (MI.getOperand(0).getImm() == 0x14)
1097  return true;
1098  break;
1099  case AArch64::DSB:
1100  case AArch64::ISB:
1101  // DSB and ISB also are scheduling barriers.
1102  return true;
1103  default:;
1104  }
1105  return isSEHInstruction(MI);
1106 }
1107 
1108 /// analyzeCompare - For a comparison instruction, return the source registers
1109 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1110 /// Return true if the comparison instruction can be analyzed.
1111 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1112  Register &SrcReg2, int &CmpMask,
1113  int &CmpValue) const {
1114  // The first operand can be a frame index where we'd normally expect a
1115  // register.
1116  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1117  if (!MI.getOperand(1).isReg())
1118  return false;
1119 
1120  switch (MI.getOpcode()) {
1121  default:
1122  break;
1123  case AArch64::PTEST_PP:
1124  SrcReg = MI.getOperand(0).getReg();
1125  SrcReg2 = MI.getOperand(1).getReg();
1126  // Not sure about the mask and value for now...
1127  CmpMask = ~0;
1128  CmpValue = 0;
1129  return true;
1130  case AArch64::SUBSWrr:
1131  case AArch64::SUBSWrs:
1132  case AArch64::SUBSWrx:
1133  case AArch64::SUBSXrr:
1134  case AArch64::SUBSXrs:
1135  case AArch64::SUBSXrx:
1136  case AArch64::ADDSWrr:
1137  case AArch64::ADDSWrs:
1138  case AArch64::ADDSWrx:
1139  case AArch64::ADDSXrr:
1140  case AArch64::ADDSXrs:
1141  case AArch64::ADDSXrx:
1142  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1143  SrcReg = MI.getOperand(1).getReg();
1144  SrcReg2 = MI.getOperand(2).getReg();
1145  CmpMask = ~0;
1146  CmpValue = 0;
1147  return true;
1148  case AArch64::SUBSWri:
1149  case AArch64::ADDSWri:
1150  case AArch64::SUBSXri:
1151  case AArch64::ADDSXri:
1152  SrcReg = MI.getOperand(1).getReg();
1153  SrcReg2 = 0;
1154  CmpMask = ~0;
1155  // FIXME: In order to convert CmpValue to 0 or 1
1156  CmpValue = MI.getOperand(2).getImm() != 0;
1157  return true;
1158  case AArch64::ANDSWri:
1159  case AArch64::ANDSXri:
1160  // ANDS does not use the same encoding scheme as the other xxxS
1161  // instructions.
1162  SrcReg = MI.getOperand(1).getReg();
1163  SrcReg2 = 0;
1164  CmpMask = ~0;
1165  // FIXME:The return val type of decodeLogicalImmediate is uint64_t,
1166  // while the type of CmpValue is int. When converting uint64_t to int,
1167  // the high 32 bits of uint64_t will be lost.
1168  // In fact it causes a bug in spec2006-483.xalancbmk
1169  // CmpValue is only used to compare with zero in OptimizeCompareInstr
1170  CmpValue = AArch64_AM::decodeLogicalImmediate(
1171  MI.getOperand(2).getImm(),
1172  MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1173  return true;
1174  }
1175 
1176  return false;
1177 }
1178 
1179 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1180  MachineBasicBlock *MBB = Instr.getParent();
1181  assert(MBB && "Can't get MachineBasicBlock here");
1182  MachineFunction *MF = MBB->getParent();
1183  assert(MF && "Can't get MachineFunction here");
1184  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1185  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1186  MachineRegisterInfo *MRI = &MF->getRegInfo();
1187 
1188  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1189  ++OpIdx) {
1190  MachineOperand &MO = Instr.getOperand(OpIdx);
1191  const TargetRegisterClass *OpRegCstraints =
1192  Instr.getRegClassConstraint(OpIdx, TII, TRI);
1193 
1194  // If there's no constraint, there's nothing to do.
1195  if (!OpRegCstraints)
1196  continue;
1197  // If the operand is a frame index, there's nothing to do here.
1198  // A frame index operand will resolve correctly during PEI.
1199  if (MO.isFI())
1200  continue;
1201 
1202  assert(MO.isReg() &&
1203  "Operand has register constraints without being a register!");
1204 
1205  Register Reg = MO.getReg();
1206  if (Register::isPhysicalRegister(Reg)) {
1207  if (!OpRegCstraints->contains(Reg))
1208  return false;
1209  } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1210  !MRI->constrainRegClass(Reg, OpRegCstraints))
1211  return false;
1212  }
1213 
1214  return true;
1215 }
1216 
1217 /// Return the opcode that does not set flags when possible - otherwise
1218 /// return the original opcode. The caller is responsible to do the actual
1219 /// substitution and legality checking.
1220 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1221  // Don't convert all compare instructions, because for some the zero register
1222  // encoding becomes the sp register.
1223  bool MIDefinesZeroReg = false;
1224  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1225  MIDefinesZeroReg = true;
1226 
1227  switch (MI.getOpcode()) {
1228  default:
1229  return MI.getOpcode();
1230  case AArch64::ADDSWrr:
1231  return AArch64::ADDWrr;
1232  case AArch64::ADDSWri:
1233  return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1234  case AArch64::ADDSWrs:
1235  return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1236  case AArch64::ADDSWrx:
1237  return AArch64::ADDWrx;
1238  case AArch64::ADDSXrr:
1239  return AArch64::ADDXrr;
1240  case AArch64::ADDSXri:
1241  return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1242  case AArch64::ADDSXrs:
1243  return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1244  case AArch64::ADDSXrx:
1245  return AArch64::ADDXrx;
1246  case AArch64::SUBSWrr:
1247  return AArch64::SUBWrr;
1248  case AArch64::SUBSWri:
1249  return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1250  case AArch64::SUBSWrs:
1251  return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1252  case AArch64::SUBSWrx:
1253  return AArch64::SUBWrx;
1254  case AArch64::SUBSXrr:
1255  return AArch64::SUBXrr;
1256  case AArch64::SUBSXri:
1257  return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1258  case AArch64::SUBSXrs:
1259  return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1260  case AArch64::SUBSXrx:
1261  return AArch64::SUBXrx;
1262  }
1263 }
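// For example, a SUBSWri whose NZCV def is dead becomes SUBWri, but a
// SUBSWri that writes wzr keeps its flag-setting form, since in the non-S
// encoding the zero-register destination would be re-interpreted as sp.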
1264 
1265 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1266 
1267 /// True when condition flags are accessed (either by writing or reading)
1268 /// on the instruction trace starting at From and ending at To.
1269 ///
1270 /// Note: If From and To are from different blocks it's assumed CC are accessed
1271 /// on the path.
1272 static bool areCFlagsAccessedBetweenInstrs(
1273  MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1274  const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1275  // Early exit if To is at the beginning of the BB.
1276  if (To == To->getParent()->begin())
1277  return true;
1278 
1279  // Check whether the instructions are in the same basic block
1280  // If not, assume the condition flags might get modified somewhere.
1281  if (To->getParent() != From->getParent())
1282  return true;
1283 
1284  // From must be above To.
1285  assert(std::any_of(
1286  ++To.getReverse(), To->getParent()->rend(),
1287  [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1288 
1289  // We iterate backward starting at \p To until we hit \p From.
1290  for (const MachineInstr &Instr :
1291  instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1292  if (((AccessToCheck & AK_Write) &&
1293  Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1294  ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1295  return true;
1296  }
1297  return false;
1298 }
1299 
1300 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1301 /// operation which could set the flags in an identical manner
1302 bool AArch64InstrInfo::optimizePTestInstr(
1303  MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1304  const MachineRegisterInfo *MRI) const {
1305  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1306  auto *Pred = MRI->getUniqueVRegDef(PredReg);
1307  auto NewOp = Pred->getOpcode();
1308  bool OpChanged = false;
1309 
1310  unsigned MaskOpcode = Mask->getOpcode();
1311  unsigned PredOpcode = Pred->getOpcode();
1312  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1313  bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1314 
1315  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) {
1316  // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't
1317  // deactivate any lanes OTHER_INST might set.
1318  uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode);
1319  uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1320 
1321  // Must be an all active predicate of matching element size.
1322  if ((PredElementSize != MaskElementSize) ||
1323  (Mask->getOperand(1).getImm() != 31))
1324  return false;
1325 
1326  // Fallthrough to simply remove the PTEST.
1327  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) {
1328  // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1329  // instruction that sets the flags as PTEST would.
1330 
1331  // Fallthrough to simply remove the PTEST.
1332  } else if (PredIsPTestLike) {
1333  // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both
1334  // instructions use the same predicate.
1335  auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1336  if (Mask != PTestLikeMask)
1337  return false;
1338 
1339  // Fallthrough to simply remove the PTEST.
1340  } else {
1341  switch (Pred->getOpcode()) {
1342  case AArch64::BRKB_PPzP:
1343  case AArch64::BRKPB_PPzPP: {
1344  // Op 0 is chain, 1 is the mask, 2 the previous predicate to
1345  // propagate, 3 the new predicate.
1346 
1347  // Check to see if our mask is the same as the brkpb's. If
1348  // not the resulting flag bits may be different and we
1349  // can't remove the ptest.
1350  auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1351  if (Mask != PredMask)
1352  return false;
1353 
1354  // Switch to the new opcode
1355  NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? AArch64::BRKBS_PPzP
1356  : AArch64::BRKPBS_PPzPP;
1357  OpChanged = true;
1358  break;
1359  }
1360  case AArch64::BRKN_PPzP: {
1361  auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1362  if (Mask != PredMask)
1363  return false;
1364 
1365  NewOp = AArch64::BRKNS_PPzP;
1366  OpChanged = true;
1367  break;
1368  }
1369  default:
1370  // Bail out if we don't recognize the input
1371  return false;
1372  }
1373  }
1374 
1376 
1377  // If the predicate is in a different block (possibly because it's been
1378  // hoisted out), then assume the flags are set in between statements.
1379  if (Pred->getParent() != PTest->getParent())
1380  return false;
1381 
1382  // If another instruction between the propagation and test sets the
1383  // flags, don't remove the ptest.
1384  MachineBasicBlock::iterator I = Pred, E = PTest;
1385  ++I; // Skip past the predicate op itself.
1386  for (; I != E; ++I) {
1387  const MachineInstr &Inst = *I;
1388 
1389  // TODO: If the ptest flags are unused, we could still remove it.
1390  if (Inst.modifiesRegister(AArch64::NZCV, TRI))
1391  return false;
1392  }
1393 
1394  // If we pass all the checks, it's safe to remove the PTEST and use the flags
1395  // as they are prior to PTEST. Sometimes this requires the tested PTEST
1396  // operand to be replaced with an equivalent instruction that also sets the
1397  // flags.
1398  Pred->setDesc(get(NewOp));
1399  PTest->eraseFromParent();
1400  if (OpChanged) {
1401  bool succeeded = UpdateOperandRegClass(*Pred);
1402  (void)succeeded;
1403  assert(succeeded && "Operands have incompatible register classes!");
1404  Pred->addRegisterDefined(AArch64::NZCV, TRI);
1405  }
1406 
1407  // Ensure that the flags def is live.
1408  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1409  unsigned i = 0, e = Pred->getNumOperands();
1410  for (; i != e; ++i) {
1411  MachineOperand &MO = Pred->getOperand(i);
1412  if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1413  MO.setIsDead(false);
1414  break;
1415  }
1416  }
1417  }
1418  return true;
1419 }
1420 
1421 /// Try to optimize a compare instruction. A compare instruction is an
1422 /// instruction which produces AArch64::NZCV. It can be treated as a plain
1423 /// compare instruction
1424 /// when there are no uses of its destination register.
1425 ///
1426 /// The following steps are tried in order:
1427 /// 1. Convert CmpInstr into an unconditional version.
1428 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1429 /// condition code or an instruction which can be converted into such an
1430 /// instruction.
1431 /// Only comparison with zero is supported.
1432 bool AArch64InstrInfo::optimizeCompareInstr(
1433  MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
1434  int CmpValue, const MachineRegisterInfo *MRI) const {
1435  assert(CmpInstr.getParent());
1436  assert(MRI);
1437 
1438  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1439  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1440  if (DeadNZCVIdx != -1) {
1441  if (CmpInstr.definesRegister(AArch64::WZR) ||
1442  CmpInstr.definesRegister(AArch64::XZR)) {
1443  CmpInstr.eraseFromParent();
1444  return true;
1445  }
1446  unsigned Opc = CmpInstr.getOpcode();
1447  unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1448  if (NewOpc == Opc)
1449  return false;
1450  const MCInstrDesc &MCID = get(NewOpc);
1451  CmpInstr.setDesc(MCID);
1452  CmpInstr.RemoveOperand(DeadNZCVIdx);
1453  bool succeeded = UpdateOperandRegClass(CmpInstr);
1454  (void)succeeded;
1455  assert(succeeded && "Some operands reg class are incompatible!");
1456  return true;
1457  }
1458 
1459  if (CmpInstr.getOpcode() == AArch64::PTEST_PP)
1460  return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1461 
1462  // Continue only if we have a "ri" where immediate is zero.
1463  // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare
1464  // function.
1465  assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1466  if (SrcReg2 != 0)
1467  return false;
1468 
1469  // CmpInstr is a Compare instruction if destination register is not used.
1470  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1471  return false;
1472 
1473  if (!CmpValue && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1474  return true;
1475  return removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1476 }
1477 
1478 /// Get opcode of S version of Instr.
1479 /// If Instr is S version its opcode is returned.
1480 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1481 /// or we are not interested in it.
1482 static unsigned sForm(MachineInstr &Instr) {
1483  switch (Instr.getOpcode()) {
1484  default:
1485  return AArch64::INSTRUCTION_LIST_END;
1486 
1487  case AArch64::ADDSWrr:
1488  case AArch64::ADDSWri:
1489  case AArch64::ADDSXrr:
1490  case AArch64::ADDSXri:
1491  case AArch64::SUBSWrr:
1492  case AArch64::SUBSWri:
1493  case AArch64::SUBSXrr:
1494  case AArch64::SUBSXri:
1495  return Instr.getOpcode();
1496 
1497  case AArch64::ADDWrr:
1498  return AArch64::ADDSWrr;
1499  case AArch64::ADDWri:
1500  return AArch64::ADDSWri;
1501  case AArch64::ADDXrr:
1502  return AArch64::ADDSXrr;
1503  case AArch64::ADDXri:
1504  return AArch64::ADDSXri;
1505  case AArch64::ADCWr:
1506  return AArch64::ADCSWr;
1507  case AArch64::ADCXr:
1508  return AArch64::ADCSXr;
1509  case AArch64::SUBWrr:
1510  return AArch64::SUBSWrr;
1511  case AArch64::SUBWri:
1512  return AArch64::SUBSWri;
1513  case AArch64::SUBXrr:
1514  return AArch64::SUBSXrr;
1515  case AArch64::SUBXri:
1516  return AArch64::SUBSXri;
1517  case AArch64::SBCWr:
1518  return AArch64::SBCSWr;
1519  case AArch64::SBCXr:
1520  return AArch64::SBCSXr;
1521  case AArch64::ANDWri:
1522  return AArch64::ANDSWri;
1523  case AArch64::ANDXri:
1524  return AArch64::ANDSXri;
1525  }
1526 }
1527 
1528 /// Check if AArch64::NZCV should be alive in successors of MBB.
1529 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1530  for (auto *BB : MBB->successors())
1531  if (BB->isLiveIn(AArch64::NZCV))
1532  return true;
1533  return false;
1534 }
1535 
1536 /// \returns The condition code operand index for \p Instr if it is a branch
1537 /// or select and -1 otherwise.
1538 static int
1539 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1540  switch (Instr.getOpcode()) {
1541  default:
1542  return -1;
1543 
1544  case AArch64::Bcc: {
1545  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1546  assert(Idx >= 2);
1547  return Idx - 2;
1548  }
1549 
1550  case AArch64::CSINVWr:
1551  case AArch64::CSINVXr:
1552  case AArch64::CSINCWr:
1553  case AArch64::CSINCXr:
1554  case AArch64::CSELWr:
1555  case AArch64::CSELXr:
1556  case AArch64::CSNEGWr:
1557  case AArch64::CSNEGXr:
1558  case AArch64::FCSELSrrr:
1559  case AArch64::FCSELDrrr: {
1560  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1561  assert(Idx >= 1);
1562  return Idx - 1;
1563  }
1564  }
1565 }
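// For example, in "Bcc 11, %bb.2, implicit $nzcv" (11 == AArch64CC::LT) the
// NZCV use is operand 2, so the condition-code operand index returned here
// is 0; for a CSELWr the condition code sits one operand before the NZCV use.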
1566 
1567 namespace {
1568 
1569 struct UsedNZCV {
1570  bool N = false;
1571  bool Z = false;
1572  bool C = false;
1573  bool V = false;
1574 
1575  UsedNZCV() = default;
1576 
1577  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1578  this->N |= UsedFlags.N;
1579  this->Z |= UsedFlags.Z;
1580  this->C |= UsedFlags.C;
1581  this->V |= UsedFlags.V;
1582  return *this;
1583  }
1584 };
1585 
1586 } // end anonymous namespace
1587 
1588 /// Find a condition code used by the instruction.
1589 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1590 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1591 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1592  int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1593  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1594  Instr.getOperand(CCIdx).getImm())
1595  : AArch64CC::Invalid;
1596 }
1597 
1598 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1599  assert(CC != AArch64CC::Invalid);
1600  UsedNZCV UsedFlags;
1601  switch (CC) {
1602  default:
1603  break;
1604 
1605  case AArch64CC::EQ: // Z set
1606  case AArch64CC::NE: // Z clear
1607  UsedFlags.Z = true;
1608  break;
1609 
1610  case AArch64CC::HI: // Z clear and C set
1611  case AArch64CC::LS: // Z set or C clear
1612  UsedFlags.Z = true;
1613  LLVM_FALLTHROUGH;
1614  case AArch64CC::HS: // C set
1615  case AArch64CC::LO: // C clear
1616  UsedFlags.C = true;
1617  break;
1618 
1619  case AArch64CC::MI: // N set
1620  case AArch64CC::PL: // N clear
1621  UsedFlags.N = true;
1622  break;
1623 
1624  case AArch64CC::VS: // V set
1625  case AArch64CC::VC: // V clear
1626  UsedFlags.V = true;
1627  break;
1628 
1629  case AArch64CC::GT: // Z clear, N and V the same
1630  case AArch64CC::LE: // Z set, N and V differ
1631  UsedFlags.Z = true;
1632  LLVM_FALLTHROUGH;
1633  case AArch64CC::GE: // N and V the same
1634  case AArch64CC::LT: // N and V differ
1635  UsedFlags.N = true;
1636  UsedFlags.V = true;
1637  break;
1638  }
1639  return UsedFlags;
1640 }
1641 
1642 /// \returns Conditions flags used after \p CmpInstr in its MachineBB if they
1643 /// are not containing C or V flags and NZCV flags are not alive in successors
1644 /// of the same \p CmpInstr and \p MI parent. \returns None otherwise.
1645 ///
1646 /// Collect instructions using that flags in \p CCUseInstrs if provided.
1647 static Optional<UsedNZCV>
1648 examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1649  const TargetRegisterInfo &TRI,
1650  SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr) {
1651  MachineBasicBlock *CmpParent = CmpInstr.getParent();
1652  if (MI.getParent() != CmpParent)
1653  return None;
1654 
1655  if (areCFlagsAliveInSuccessors(CmpParent))
1656  return None;
1657 
1658  UsedNZCV NZCVUsedAfterCmp;
1659  for (MachineInstr &Instr : instructionsWithoutDebug(
1660  std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1661  if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1662  AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1663  if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1664  return None;
1665  NZCVUsedAfterCmp |= getUsedNZCV(CC);
1666  if (CCUseInstrs)
1667  CCUseInstrs->push_back(&Instr);
1668  }
1669  if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1670  break;
1671  }
1672  if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V)
1673  return None;
1674  return NZCVUsedAfterCmp;
1675 }
1676 
1677 static bool isADDSRegImm(unsigned Opcode) {
1678  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1679 }
1680 
1681 static bool isSUBSRegImm(unsigned Opcode) {
1682  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1683 }
1684 
1685 /// Check if CmpInstr can be substituted by MI.
1686 ///
1687 /// CmpInstr can be substituted:
1688 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1689 /// - and, MI and CmpInstr are from the same MachineBB
1690 /// - and, condition flags are not alive in successors of the CmpInstr parent
1691 /// - and, if MI opcode is the S form there must be no defs of flags between
1692 /// MI and CmpInstr
1693 /// or if MI opcode is not the S form there must be neither defs of flags
1694 /// nor uses of flags between MI and CmpInstr.
1695 /// - and C/V flags are not used after CmpInstr
1696 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1697  const TargetRegisterInfo &TRI) {
1698  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1699 
1700  const unsigned CmpOpcode = CmpInstr.getOpcode();
1701  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1702  return false;
1703 
1704  if (!examineCFlagsUse(MI, CmpInstr, TRI))
1705  return false;
1706 
1707  AccessKind AccessToCheck = AK_Write;
1708  if (sForm(MI) != MI.getOpcode())
1709  AccessToCheck = AK_All;
1710  return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1711 }
1712 
1713 /// Substitute an instruction comparing to zero with another instruction
1714 /// which produces needed condition flags.
1715 ///
1716 /// Return true on success.
1717 bool AArch64InstrInfo::substituteCmpToZero(
1718  MachineInstr &CmpInstr, unsigned SrcReg,
1719  const MachineRegisterInfo &MRI) const {
1720  // Get the unique definition of SrcReg.
1721  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1722  if (!MI)
1723  return false;
1724 
1725  const TargetRegisterInfo &TRI = getRegisterInfo();
1726 
1727  unsigned NewOpc = sForm(*MI);
1728  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1729  return false;
1730 
1731  if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1732  return false;
1733 
1734  // Update the instruction to set NZCV.
1735  MI->setDesc(get(NewOpc));
1736  CmpInstr.eraseFromParent();
1737  bool succeeded = UpdateOperandRegClass(*MI);
1738  (void)succeeded;
1739  assert(succeeded && "Some operands reg class are incompatible!");
1740  MI->addRegisterDefined(AArch64::NZCV, &TRI);
1741  return true;
1742 }
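// Illustrative MIR-level sketch of the rewrite (register classes and operand
// flags elided, not taken from an actual test):
//   %2 = SUBWrr %0, %1
//   %3 = SUBSWri %2, 0, 0, implicit-def $nzcv   (%3 otherwise unused)
//   Bcc 0, %bb.1, implicit $nzcv                (0 == AArch64CC::EQ)
// becomes
//   %2 = SUBSWrr %0, %1, implicit-def $nzcv
//   Bcc 0, %bb.1, implicit $nzcv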
1743 
1744 /// \returns True if \p CmpInstr can be removed.
1745 ///
1746 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1747 /// codes used in \p CCUseInstrs must be inverted.
1748 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1749  int CmpValue, const TargetRegisterInfo &TRI,
1750  SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1751  bool &IsInvertCC) {
1752  assert((CmpValue == 0 || CmpValue == 1) &&
1753  "Only comparisons to 0 or 1 considered for removal!");
1754 
1755  // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1756  unsigned MIOpc = MI.getOpcode();
1757  if (MIOpc == AArch64::CSINCWr) {
1758  if (MI.getOperand(1).getReg() != AArch64::WZR ||
1759  MI.getOperand(2).getReg() != AArch64::WZR)
1760  return false;
1761  } else if (MIOpc == AArch64::CSINCXr) {
1762  if (MI.getOperand(1).getReg() != AArch64::XZR ||
1763  MI.getOperand(2).getReg() != AArch64::XZR)
1764  return false;
1765  } else {
1766  return false;
1767  }
1768  AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1769  if (MICC == AArch64CC::Invalid)
1770  return false;
1771 
1772  // NZCV needs to be defined
1773  if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
1774  return false;
1775 
1776  // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1777  const unsigned CmpOpcode = CmpInstr.getOpcode();
1778  bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1779  if (CmpValue && !IsSubsRegImm)
1780  return false;
1781  if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1782  return false;
1783 
1784  // MI conditions allowed: eq, ne, mi, pl
1785  UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1786  if (MIUsedNZCV.C || MIUsedNZCV.V)
1787  return false;
1788 
1789  Optional<UsedNZCV> NZCVUsedAfterCmp =
1790  examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1791  // Condition flags are not used in CmpInstr basic block successors and only
1792  // Z or N flags allowed to be used after CmpInstr within its basic block
1793  if (!NZCVUsedAfterCmp)
1794  return false;
1795  // Z or N flag used after CmpInstr must correspond to the flag used in MI
1796  if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1797  (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1798  return false;
1799  // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
1800  if (MIUsedNZCV.N && !CmpValue)
1801  return false;
1802 
1803  // There must be no defs of flags between MI and CmpInstr
1804  if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1805  return false;
1806 
1807  // Condition code is inverted in the following cases:
1808  // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1809  // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1810  IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1811  (!CmpValue && MICC == AArch64CC::NE);
1812  return true;
1813 }
1814 
 1815 /// Remove comparison in csinc-cmp sequence
1816 ///
1817 /// Examples:
1818 /// 1. \code
1819 /// csinc w9, wzr, wzr, ne
1820 /// cmp w9, #0
1821 /// b.eq
1822 /// \endcode
1823 /// to
1824 /// \code
1825 /// csinc w9, wzr, wzr, ne
1826 /// b.ne
1827 /// \endcode
1828 ///
1829 /// 2. \code
1830 /// csinc x2, xzr, xzr, mi
1831 /// cmp x2, #1
1832 /// b.pl
1833 /// \endcode
1834 /// to
1835 /// \code
1836 /// csinc x2, xzr, xzr, mi
1837 /// b.pl
1838 /// \endcode
1839 ///
1840 /// \param CmpInstr comparison instruction
1841 /// \return True when comparison removed
1842 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1843  MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1844  const MachineRegisterInfo &MRI) const {
1845  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1846  if (!MI)
1847  return false;
 1848  const TargetRegisterInfo &TRI = getRegisterInfo();
 1849  SmallVector<MachineInstr *, 4> CCUseInstrs;
1850  bool IsInvertCC = false;
1851  if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1852  IsInvertCC))
1853  return false;
1854  // Make transformation
1855  CmpInstr.eraseFromParent();
1856  if (IsInvertCC) {
1857  // Invert condition codes in CmpInstr CC users
1858  for (MachineInstr *CCUseInstr : CCUseInstrs) {
1859  int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1860  assert(Idx >= 0 && "Unexpected instruction using CC.");
1861  MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
 1862  AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
 1863  static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1864  CCOperand.setImm(CCUse);
1865  }
1866  }
1867  return true;
1868 }
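
 // Editor's illustration (not part of the original source; register numbers
 // and the label are hypothetical). When the csinc condition is eq and the
 // comparison is against #1, the flag users must be inverted:
 //   csinc w8, wzr, wzr, eq   // w8 = eq ? 0 : 1
 //   cmp   w8, #1
 //   b.eq  .Ltarget           // taken when w8 == 1, i.e. the original 'ne'
 // becomes
 //   csinc w8, wzr, wzr, eq
 //   b.ne  .Ltarget           // condition code inverted (IsInvertCC)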
1869 
 1870 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
 1871  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1872  MI.getOpcode() != AArch64::CATCHRET)
1873  return false;
1874 
1875  MachineBasicBlock &MBB = *MI.getParent();
1876  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1877  auto TRI = Subtarget.getRegisterInfo();
1878  DebugLoc DL = MI.getDebugLoc();
1879 
1880  if (MI.getOpcode() == AArch64::CATCHRET) {
1881  // Skip to the first instruction before the epilog.
1882  const TargetInstrInfo *TII =
 1883  MBB.getParent()->getSubtarget().getInstrInfo();
 1884  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
 1885  MachineBasicBlock::iterator MBBI(MI);
 1886  MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1887  while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1888  FirstEpilogSEH != MBB.begin())
1889  FirstEpilogSEH = std::prev(FirstEpilogSEH);
1890  if (FirstEpilogSEH != MBB.begin())
1891  FirstEpilogSEH = std::next(FirstEpilogSEH);
1892  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1893  .addReg(AArch64::X0, RegState::Define)
1894  .addMBB(TargetMBB);
1895  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1896  .addReg(AArch64::X0, RegState::Define)
1897  .addReg(AArch64::X0)
1898  .addMBB(TargetMBB)
1899  .addImm(0);
1900  return true;
1901  }
1902 
1903  Register Reg = MI.getOperand(0).getReg();
1904  const GlobalValue *GV =
1905  cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1906  const TargetMachine &TM = MBB.getParent()->getTarget();
1907  unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1908  const unsigned char MO_NC = AArch64II::MO_NC;
1909 
1910  if ((OpFlags & AArch64II::MO_GOT) != 0) {
1912  .addGlobalAddress(GV, 0, OpFlags);
1913  if (Subtarget.isTargetILP32()) {
1914  unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1915  BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1916  .addDef(Reg32, RegState::Dead)
1918  .addImm(0)
1919  .addMemOperand(*MI.memoperands_begin())
1921  } else {
1922  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1924  .addImm(0)
1925  .addMemOperand(*MI.memoperands_begin());
1926  }
1927  } else if (TM.getCodeModel() == CodeModel::Large) {
1928  assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1929  BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1931  .addImm(0);
1932  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1935  .addImm(16);
1936  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1939  .addImm(32);
1940  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1943  .addImm(48);
1944  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1946  .addImm(0)
1947  .addMemOperand(*MI.memoperands_begin());
1948  } else if (TM.getCodeModel() == CodeModel::Tiny) {
1950  .addGlobalAddress(GV, 0, OpFlags);
1951  } else {
1953  .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1954  unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1955  if (Subtarget.isTargetILP32()) {
1956  unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1957  BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1958  .addDef(Reg32, RegState::Dead)
1960  .addGlobalAddress(GV, 0, LoFlags)
1961  .addMemOperand(*MI.memoperands_begin())
1963  } else {
1964  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1966  .addGlobalAddress(GV, 0, LoFlags)
1967  .addMemOperand(*MI.memoperands_begin());
1968  }
1969  }
1970 
1971  MBB.erase(MI);
1972 
1973  return true;
1974 }
1975 
1976 // Return true if this instruction simply sets its single destination register
1977 // to zero. This is equivalent to a register rename of the zero-register.
 1978 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
 1979  switch (MI.getOpcode()) {
1980  default:
1981  break;
1982  case AArch64::MOVZWi:
1983  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1984  if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1985  assert(MI.getDesc().getNumOperands() == 3 &&
1986  MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1987  return true;
1988  }
1989  break;
1990  case AArch64::ANDWri: // and Rd, Rzr, #imm
1991  return MI.getOperand(1).getReg() == AArch64::WZR;
1992  case AArch64::ANDXri:
1993  return MI.getOperand(1).getReg() == AArch64::XZR;
1994  case TargetOpcode::COPY:
1995  return MI.getOperand(1).getReg() == AArch64::WZR;
1996  }
1997  return false;
1998 }
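
 // Editor's illustration (not part of the original source): forms matched
 // above include, for example:
 //   movz w0, #0           // MOVZWi with immediate 0 and shift 0
 //   and  w1, wzr, #0xff   // ANDWri reading WZR always produces zero
 //   mov  w2, wzr          // COPY from WZR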
1999 
2000 // Return true if this instruction simply renames a general register without
2001 // modifying bits.
 2002 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
 2003  switch (MI.getOpcode()) {
2004  default:
2005  break;
2006  case TargetOpcode::COPY: {
 2007  // GPR32 copies will be lowered to ORRXrs
2008  Register DstReg = MI.getOperand(0).getReg();
2009  return (AArch64::GPR32RegClass.contains(DstReg) ||
2010  AArch64::GPR64RegClass.contains(DstReg));
2011  }
2012  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2013  if (MI.getOperand(1).getReg() == AArch64::XZR) {
2014  assert(MI.getDesc().getNumOperands() == 4 &&
2015  MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2016  return true;
2017  }
2018  break;
2019  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2020  if (MI.getOperand(2).getImm() == 0) {
2021  assert(MI.getDesc().getNumOperands() == 4 &&
2022  MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2023  return true;
2024  }
2025  break;
2026  }
2027  return false;
2028 }
2029 
 2030 // Return true if this instruction simply renames a floating-point register
 2031 // without modifying bits.
 2032 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
 2033  switch (MI.getOpcode()) {
2034  default:
2035  break;
2036  case TargetOpcode::COPY: {
 2037  // FPR64 copies will be lowered to ORR.16b
2038  Register DstReg = MI.getOperand(0).getReg();
2039  return (AArch64::FPR64RegClass.contains(DstReg) ||
2040  AArch64::FPR128RegClass.contains(DstReg));
2041  }
2042  case AArch64::ORRv16i8:
2043  if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2044  assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2045  "invalid ORRv16i8 operands");
2046  return true;
2047  }
2048  break;
2049  }
2050  return false;
2051 }
2052 
 2053 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
 2054  int &FrameIndex) const {
2055  switch (MI.getOpcode()) {
2056  default:
2057  break;
2058  case AArch64::LDRWui:
2059  case AArch64::LDRXui:
2060  case AArch64::LDRBui:
2061  case AArch64::LDRHui:
2062  case AArch64::LDRSui:
2063  case AArch64::LDRDui:
2064  case AArch64::LDRQui:
2065  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2066  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2067  FrameIndex = MI.getOperand(1).getIndex();
2068  return MI.getOperand(0).getReg();
2069  }
2070  break;
2071  }
2072 
2073  return 0;
2074 }
2075 
 2076 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
 2077  int &FrameIndex) const {
2078  switch (MI.getOpcode()) {
2079  default:
2080  break;
2081  case AArch64::STRWui:
2082  case AArch64::STRXui:
2083  case AArch64::STRBui:
2084  case AArch64::STRHui:
2085  case AArch64::STRSui:
2086  case AArch64::STRDui:
2087  case AArch64::STRQui:
2088  case AArch64::LDR_PXI:
2089  case AArch64::STR_PXI:
2090  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2091  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2092  FrameIndex = MI.getOperand(1).getIndex();
2093  return MI.getOperand(0).getReg();
2094  }
2095  break;
2096  }
2097  return 0;
2098 }
2099 
2100 /// Check all MachineMemOperands for a hint to suppress pairing.
 2101 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
 2102  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2103  return MMO->getFlags() & MOSuppressPair;
2104  });
2105 }
2106 
2107 /// Set a flag on the first MachineMemOperand to suppress pairing.
 2108 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
 2109  if (MI.memoperands_empty())
2110  return;
2111  (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2112 }
2113 
2114 /// Check all MachineMemOperands for a hint that the load/store is strided.
 2115 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
 2116  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2117  return MMO->getFlags() & MOStridedAccess;
2118  });
2119 }
2120 
 2121 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
 2122  switch (Opc) {
2123  default:
2124  return false;
2125  case AArch64::STURSi:
2126  case AArch64::STURDi:
2127  case AArch64::STURQi:
2128  case AArch64::STURBBi:
2129  case AArch64::STURHHi:
2130  case AArch64::STURWi:
2131  case AArch64::STURXi:
2132  case AArch64::LDURSi:
2133  case AArch64::LDURDi:
2134  case AArch64::LDURQi:
2135  case AArch64::LDURWi:
2136  case AArch64::LDURXi:
2137  case AArch64::LDURSWi:
2138  case AArch64::LDURHHi:
2139  case AArch64::LDURBBi:
2140  case AArch64::LDURSBWi:
2141  case AArch64::LDURSHWi:
2142  return true;
2143  }
2144 }
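
 // Editor's note (illustrative, not part of the original source): the
 // "unscaled" LDUR/STUR forms take a signed 9-bit byte offset, whereas the
 // corresponding "scaled" LDR/STR forms take an unsigned 12-bit offset in
 // units of the access size, e.g. for an 8-byte access:
 //   ldur x0, [x1, #-8]    // LDURXi: byte offset in [-256, 255]
 //   ldr  x0, [x1, #8]     // LDRXui: encoded immediate 1, scaled by 8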
2145 
 2146 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
 2147  switch (Opc) {
2148  default: return {};
2149  case AArch64::PRFMui: return AArch64::PRFUMi;
2150  case AArch64::LDRXui: return AArch64::LDURXi;
2151  case AArch64::LDRWui: return AArch64::LDURWi;
2152  case AArch64::LDRBui: return AArch64::LDURBi;
2153  case AArch64::LDRHui: return AArch64::LDURHi;
2154  case AArch64::LDRSui: return AArch64::LDURSi;
2155  case AArch64::LDRDui: return AArch64::LDURDi;
2156  case AArch64::LDRQui: return AArch64::LDURQi;
2157  case AArch64::LDRBBui: return AArch64::LDURBBi;
2158  case AArch64::LDRHHui: return AArch64::LDURHHi;
2159  case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2160  case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2161  case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2162  case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2163  case AArch64::LDRSWui: return AArch64::LDURSWi;
2164  case AArch64::STRXui: return AArch64::STURXi;
2165  case AArch64::STRWui: return AArch64::STURWi;
2166  case AArch64::STRBui: return AArch64::STURBi;
2167  case AArch64::STRHui: return AArch64::STURHi;
2168  case AArch64::STRSui: return AArch64::STURSi;
2169  case AArch64::STRDui: return AArch64::STURDi;
2170  case AArch64::STRQui: return AArch64::STURQi;
2171  case AArch64::STRBBui: return AArch64::STURBBi;
2172  case AArch64::STRHHui: return AArch64::STURHHi;
2173  }
2174 }
2175 
 2176 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
 2177  switch (Opc) {
2178  default:
2179  return 2;
2180  case AArch64::LDPXi:
2181  case AArch64::LDPDi:
2182  case AArch64::STPXi:
2183  case AArch64::STPDi:
2184  case AArch64::LDNPXi:
2185  case AArch64::LDNPDi:
2186  case AArch64::STNPXi:
2187  case AArch64::STNPDi:
2188  case AArch64::LDPQi:
2189  case AArch64::STPQi:
2190  case AArch64::LDNPQi:
2191  case AArch64::STNPQi:
2192  case AArch64::LDPWi:
2193  case AArch64::LDPSi:
2194  case AArch64::STPWi:
2195  case AArch64::STPSi:
2196  case AArch64::LDNPWi:
2197  case AArch64::LDNPSi:
2198  case AArch64::STNPWi:
2199  case AArch64::STNPSi:
2200  case AArch64::LDG:
2201  case AArch64::STGPi:
2202  case AArch64::LD1B_IMM:
2203  case AArch64::LD1H_IMM:
2204  case AArch64::LD1W_IMM:
2205  case AArch64::LD1D_IMM:
2206  case AArch64::ST1B_IMM:
2207  case AArch64::ST1H_IMM:
2208  case AArch64::ST1W_IMM:
2209  case AArch64::ST1D_IMM:
2210  case AArch64::LD1B_H_IMM:
2211  case AArch64::LD1SB_H_IMM:
2212  case AArch64::LD1H_S_IMM:
2213  case AArch64::LD1SH_S_IMM:
2214  case AArch64::LD1W_D_IMM:
2215  case AArch64::LD1SW_D_IMM:
2216  case AArch64::ST1B_H_IMM:
2217  case AArch64::ST1H_S_IMM:
2218  case AArch64::ST1W_D_IMM:
2219  case AArch64::LD1B_S_IMM:
2220  case AArch64::LD1SB_S_IMM:
2221  case AArch64::LD1H_D_IMM:
2222  case AArch64::LD1SH_D_IMM:
2223  case AArch64::ST1B_S_IMM:
2224  case AArch64::ST1H_D_IMM:
2225  case AArch64::LD1B_D_IMM:
2226  case AArch64::LD1SB_D_IMM:
2227  case AArch64::ST1B_D_IMM:
2228  return 3;
2229  case AArch64::ADDG:
2230  case AArch64::STGOffset:
2231  case AArch64::LDR_PXI:
2232  case AArch64::STR_PXI:
2233  return 2;
2234  }
2235 }
2236 
 2237 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
 2238  switch (MI.getOpcode()) {
2239  default:
2240  return false;
2241  // Scaled instructions.
2242  case AArch64::STRSui:
2243  case AArch64::STRDui:
2244  case AArch64::STRQui:
2245  case AArch64::STRXui:
2246  case AArch64::STRWui:
2247  case AArch64::LDRSui:
2248  case AArch64::LDRDui:
2249  case AArch64::LDRQui:
2250  case AArch64::LDRXui:
2251  case AArch64::LDRWui:
2252  case AArch64::LDRSWui:
2253  // Unscaled instructions.
2254  case AArch64::STURSi:
2255  case AArch64::STURDi:
2256  case AArch64::STURQi:
2257  case AArch64::STURWi:
2258  case AArch64::STURXi:
2259  case AArch64::LDURSi:
2260  case AArch64::LDURDi:
2261  case AArch64::LDURQi:
2262  case AArch64::LDURWi:
2263  case AArch64::LDURXi:
2264  case AArch64::LDURSWi:
2265  return true;
2266  }
2267 }
2268 
 2269 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
 2270  bool &Is64Bit) {
2271  switch (Opc) {
2272  default:
2273  llvm_unreachable("Opcode has no flag setting equivalent!");
2274  // 32-bit cases:
2275  case AArch64::ADDWri:
2276  Is64Bit = false;
2277  return AArch64::ADDSWri;
2278  case AArch64::ADDWrr:
2279  Is64Bit = false;
2280  return AArch64::ADDSWrr;
2281  case AArch64::ADDWrs:
2282  Is64Bit = false;
2283  return AArch64::ADDSWrs;
2284  case AArch64::ADDWrx:
2285  Is64Bit = false;
2286  return AArch64::ADDSWrx;
2287  case AArch64::ANDWri:
2288  Is64Bit = false;
2289  return AArch64::ANDSWri;
2290  case AArch64::ANDWrr:
2291  Is64Bit = false;
2292  return AArch64::ANDSWrr;
2293  case AArch64::ANDWrs:
2294  Is64Bit = false;
2295  return AArch64::ANDSWrs;
2296  case AArch64::BICWrr:
2297  Is64Bit = false;
2298  return AArch64::BICSWrr;
2299  case AArch64::BICWrs:
2300  Is64Bit = false;
2301  return AArch64::BICSWrs;
2302  case AArch64::SUBWri:
2303  Is64Bit = false;
2304  return AArch64::SUBSWri;
2305  case AArch64::SUBWrr:
2306  Is64Bit = false;
2307  return AArch64::SUBSWrr;
2308  case AArch64::SUBWrs:
2309  Is64Bit = false;
2310  return AArch64::SUBSWrs;
2311  case AArch64::SUBWrx:
2312  Is64Bit = false;
2313  return AArch64::SUBSWrx;
2314  // 64-bit cases:
2315  case AArch64::ADDXri:
2316  Is64Bit = true;
2317  return AArch64::ADDSXri;
2318  case AArch64::ADDXrr:
2319  Is64Bit = true;
2320  return AArch64::ADDSXrr;
2321  case AArch64::ADDXrs:
2322  Is64Bit = true;
2323  return AArch64::ADDSXrs;
2324  case AArch64::ADDXrx:
2325  Is64Bit = true;
2326  return AArch64::ADDSXrx;
2327  case AArch64::ANDXri:
2328  Is64Bit = true;
2329  return AArch64::ANDSXri;
2330  case AArch64::ANDXrr:
2331  Is64Bit = true;
2332  return AArch64::ANDSXrr;
2333  case AArch64::ANDXrs:
2334  Is64Bit = true;
2335  return AArch64::ANDSXrs;
2336  case AArch64::BICXrr:
2337  Is64Bit = true;
2338  return AArch64::BICSXrr;
2339  case AArch64::BICXrs:
2340  Is64Bit = true;
2341  return AArch64::BICSXrs;
2342  case AArch64::SUBXri:
2343  Is64Bit = true;
2344  return AArch64::SUBSXri;
2345  case AArch64::SUBXrr:
2346  Is64Bit = true;
2347  return AArch64::SUBSXrr;
2348  case AArch64::SUBXrs:
2349  Is64Bit = true;
2350  return AArch64::SUBSXrs;
2351  case AArch64::SUBXrx:
2352  Is64Bit = true;
2353  return AArch64::SUBSXrx;
2354  }
2355 }
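
 // Editor's note (illustrative, not part of the original source): the switch
 // above simply selects the NZCV-setting twin of an opcode, e.g.:
 //   convertToFlagSettingOpc(AArch64::ADDWri, Is64Bit) -> ADDSWri, Is64Bit = false
 //   convertToFlagSettingOpc(AArch64::SUBXrs, Is64Bit) -> SUBSXrs, Is64Bit = true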
2356 
2357 // Is this a candidate for ld/st merging or pairing? For example, we don't
2358 // touch volatiles or load/stores that have a hint to avoid pair formation.
 2359 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
 2360  // If this is a volatile load/store, don't mess with it.
2361  if (MI.hasOrderedMemoryRef())
2362  return false;
2363 
2364  // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2365  assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
2366  "Expected a reg or frame index operand.");
2367  if (!MI.getOperand(2).isImm())
2368  return false;
2369 
2370  // Can't merge/pair if the instruction modifies the base register.
2371  // e.g., ldr x0, [x0]
2372  // This case will never occur with an FI base.
2373  if (MI.getOperand(1).isReg()) {
2374  Register BaseReg = MI.getOperand(1).getReg();
 2375  const TargetRegisterInfo *TRI = &getRegisterInfo();
 2376  if (MI.modifiesRegister(BaseReg, TRI))
2377  return false;
2378  }
2379 
2380  // Check if this load/store has a hint to avoid pair formation.
2381  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2382  if (isLdStPairSuppressed(MI))
2383  return false;
2384 
2385  // Do not pair any callee-save store/reload instructions in the
2386  // prologue/epilogue if the CFI information encoded the operations as separate
 2387  // instructions, as that will cause the size of the actual prologue to differ
 2388  // from the prologue size recorded in the Windows CFI.
2389  const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2390  bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2391  MI.getMF()->getFunction().needsUnwindTableEntry();
2392  if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2393  MI.getFlag(MachineInstr::FrameDestroy)))
2394  return false;
2395 
2396  // On some CPUs quad load/store pairs are slower than two single load/stores.
2397  if (Subtarget.isPaired128Slow()) {
2398  switch (MI.getOpcode()) {
2399  default:
2400  break;
2401  case AArch64::LDURQi:
2402  case AArch64::STURQi:
2403  case AArch64::LDRQui:
2404  case AArch64::STRQui:
2405  return false;
2406  }
2407  }
2408 
2409  return true;
2410 }
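
 // Editor's illustration (not part of the original source; registers are
 // hypothetical): a load such as
 //   ldr x0, [x0, #8]
 // is rejected because it modifies its own base register, just as volatile
 // accesses and accesses carrying the MOSuppressPair hint are rejected.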
2411 
 2412 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
 2413  const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
 2414  int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2415  const TargetRegisterInfo *TRI) const {
2416  if (!LdSt.mayLoadOrStore())
2417  return false;
2418 
2419  const MachineOperand *BaseOp;
2420  if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2421  Width, TRI))
2422  return false;
2423  BaseOps.push_back(BaseOp);
2424  return true;
2425 }
2426 
 2427 Optional<ExtAddrMode>
 2428 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
 2429  const TargetRegisterInfo *TRI) const {
2430  const MachineOperand *Base; // Filled with the base operand of MI.
2431  int64_t Offset; // Filled with the offset of MI.
2432  bool OffsetIsScalable;
2433  if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2434  return None;
2435 
2436  if (!Base->isReg())
2437  return None;
2438  ExtAddrMode AM;
2439  AM.BaseReg = Base->getReg();
2440  AM.Displacement = Offset;
2441  AM.ScaledReg = 0;
2442  return AM;
2443 }
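
 // Editor's worked example (not part of the original source; registers are
 // hypothetical): for "ldr x1, [x2, #16]" (LDRXui with encoded immediate 2,
 // scale 8) the returned ExtAddrMode has BaseReg = X2, Displacement = 16 and
 // ScaledReg = 0.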
2444 
 2445 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
 2446  const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
2447  bool &OffsetIsScalable, unsigned &Width,
2448  const TargetRegisterInfo *TRI) const {
2449  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2450  // Handle only loads/stores with base register followed by immediate offset.
2451  if (LdSt.getNumExplicitOperands() == 3) {
2452  // Non-paired instruction (e.g., ldr x1, [x0, #8]).
2453  if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
2454  !LdSt.getOperand(2).isImm())
2455  return false;
2456  } else if (LdSt.getNumExplicitOperands() == 4) {
2457  // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2458  if (!LdSt.getOperand(1).isReg() ||
2459  (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2460  !LdSt.getOperand(3).isImm())
2461  return false;
2462  } else
2463  return false;
2464 
2465  // Get the scaling factor for the instruction and set the width for the
2466  // instruction.
2467  TypeSize Scale(0U, false);
2468  int64_t Dummy1, Dummy2;
2469 
2470  // If this returns false, then it's an instruction we don't want to handle.
2471  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2472  return false;
2473 
2474  // Compute the offset. Offset is calculated as the immediate operand
2475  // multiplied by the scaling factor. Unscaled instructions have scaling factor
2476  // set to 1.
2477  if (LdSt.getNumExplicitOperands() == 3) {
2478  BaseOp = &LdSt.getOperand(1);
2479  Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
2480  } else {
2481  assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2482  BaseOp = &LdSt.getOperand(2);
2483  Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
2484  }
2485  OffsetIsScalable = Scale.isScalable();
2486 
2487  if (!BaseOp->isReg() && !BaseOp->isFI())
2488  return false;
2489 
2490  return true;
2491 }
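
 // Editor's worked example (not part of the original source): for an LDRXui
 // with encoded immediate 3, getMemOpInfo gives Scale = 8, so BaseOp is
 // operand 1, Offset = 3 * 8 = 24 bytes, Width = 8 and OffsetIsScalable is
 // false; for SVE forms such as LD1D_IMM the scale is scalable and
 // OffsetIsScalable is true.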
2492 
2495  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2496  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2497  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2498  return OfsOp;
2499 }
2500 
2501 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
2502  unsigned &Width, int64_t &MinOffset,
2503  int64_t &MaxOffset) {
2504  const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
2505  switch (Opcode) {
2506  // Not a memory operation or something we want to handle.
2507  default:
2508  Scale = TypeSize::Fixed(0);
2509  Width = 0;
2510  MinOffset = MaxOffset = 0;
2511  return false;
2512  case AArch64::STRWpost:
2513  case AArch64::LDRWpost:
2514  Width = 32;
2515  Scale = TypeSize::Fixed(4);
2516  MinOffset = -256;
2517  MaxOffset = 255;
2518  break;
2519  case AArch64::LDURQi:
2520  case AArch64::STURQi:
2521  Width = 16;
2522  Scale = TypeSize::Fixed(1);
2523  MinOffset = -256;
2524  MaxOffset = 255;
2525  break;
2526  case AArch64::PRFUMi:
2527  case AArch64::LDURXi:
2528  case AArch64::LDURDi:
2529  case AArch64::STURXi:
2530  case AArch64::STURDi:
2531  Width = 8;
2532  Scale = TypeSize::Fixed(1);
2533  MinOffset = -256;
2534  MaxOffset = 255;
2535  break;
2536  case AArch64::LDURWi:
2537  case AArch64::LDURSi:
2538  case AArch64::LDURSWi:
2539  case AArch64::STURWi:
2540  case AArch64::STURSi:
2541  Width = 4;
2542  Scale = TypeSize::Fixed(1);
2543  MinOffset = -256;
2544  MaxOffset = 255;
2545  break;
2546  case AArch64::LDURHi:
2547  case AArch64::LDURHHi:
2548  case AArch64::LDURSHXi:
2549  case AArch64::LDURSHWi:
2550  case AArch64::STURHi:
2551  case AArch64::STURHHi:
2552  Width = 2;
2553  Scale = TypeSize::Fixed(1);
2554  MinOffset = -256;
2555  MaxOffset = 255;
2556  break;
2557  case AArch64::LDURBi:
2558  case AArch64::LDURBBi:
2559  case AArch64::LDURSBXi:
2560  case AArch64::LDURSBWi:
2561  case AArch64::STURBi:
2562  case AArch64::STURBBi:
2563  Width = 1;
2564  Scale = TypeSize::Fixed(1);
2565  MinOffset = -256;
2566  MaxOffset = 255;
2567  break;
2568  case AArch64::LDPQi:
2569  case AArch64::LDNPQi:
2570  case AArch64::STPQi:
2571  case AArch64::STNPQi:
2572  Scale = TypeSize::Fixed(16);
2573  Width = 32;
2574  MinOffset = -64;
2575  MaxOffset = 63;
2576  break;
2577  case AArch64::LDRQui:
2578  case AArch64::STRQui:
2579  Scale = TypeSize::Fixed(16);
2580  Width = 16;
2581  MinOffset = 0;
2582  MaxOffset = 4095;
2583  break;
2584  case AArch64::LDPXi:
2585  case AArch64::LDPDi:
2586  case AArch64::LDNPXi:
2587  case AArch64::LDNPDi:
2588  case AArch64::STPXi:
2589  case AArch64::STPDi:
2590  case AArch64::STNPXi:
2591  case AArch64::STNPDi:
2592  Scale = TypeSize::Fixed(8);
2593  Width = 16;
2594  MinOffset = -64;
2595  MaxOffset = 63;
2596  break;
2597  case AArch64::PRFMui:
2598  case AArch64::LDRXui:
2599  case AArch64::LDRDui:
2600  case AArch64::STRXui:
2601  case AArch64::STRDui:
2602  Scale = TypeSize::Fixed(8);
2603  Width = 8;
2604  MinOffset = 0;
2605  MaxOffset = 4095;
2606  break;
2607  case AArch64::LDPWi:
2608  case AArch64::LDPSi:
2609  case AArch64::LDNPWi:
2610  case AArch64::LDNPSi:
2611  case AArch64::STPWi:
2612  case AArch64::STPSi:
2613  case AArch64::STNPWi:
2614  case AArch64::STNPSi:
2615  Scale = TypeSize::Fixed(4);
2616  Width = 8;
2617  MinOffset = -64;
2618  MaxOffset = 63;
2619  break;
2620  case AArch64::LDRWui:
2621  case AArch64::LDRSui:
2622  case AArch64::LDRSWui:
2623  case AArch64::STRWui:
2624  case AArch64::STRSui:
2625  Scale = TypeSize::Fixed(4);
2626  Width = 4;
2627  MinOffset = 0;
2628  MaxOffset = 4095;
2629  break;
2630  case AArch64::LDRHui:
2631  case AArch64::LDRHHui:
2632  case AArch64::LDRSHWui:
2633  case AArch64::LDRSHXui:
2634  case AArch64::STRHui:
2635  case AArch64::STRHHui:
2636  Scale = TypeSize::Fixed(2);
2637  Width = 2;
2638  MinOffset = 0;
2639  MaxOffset = 4095;
2640  break;
2641  case AArch64::LDRBui:
2642  case AArch64::LDRBBui:
2643  case AArch64::LDRSBWui:
2644  case AArch64::LDRSBXui:
2645  case AArch64::STRBui:
2646  case AArch64::STRBBui:
2647  Scale = TypeSize::Fixed(1);
2648  Width = 1;
2649  MinOffset = 0;
2650  MaxOffset = 4095;
2651  break;
2652  case AArch64::ADDG:
2653  Scale = TypeSize::Fixed(16);
2654  Width = 0;
2655  MinOffset = 0;
2656  MaxOffset = 63;
2657  break;
2658  case AArch64::TAGPstack:
2659  Scale = TypeSize::Fixed(16);
2660  Width = 0;
2661  // TAGP with a negative offset turns into SUBP, which has a maximum offset
2662  // of 63 (not 64!).
2663  MinOffset = -63;
2664  MaxOffset = 63;
2665  break;
2666  case AArch64::LDG:
2667  case AArch64::STGOffset:
2668  case AArch64::STZGOffset:
2669  Scale = TypeSize::Fixed(16);
2670  Width = 16;
2671  MinOffset = -256;
2672  MaxOffset = 255;
2673  break;
2674  case AArch64::STR_ZZZZXI:
2675  case AArch64::LDR_ZZZZXI:
2676  Scale = TypeSize::Scalable(16);
2677  Width = SVEMaxBytesPerVector * 4;
2678  MinOffset = -256;
2679  MaxOffset = 252;
2680  break;
2681  case AArch64::STR_ZZZXI:
2682  case AArch64::LDR_ZZZXI:
2683  Scale = TypeSize::Scalable(16);
2684  Width = SVEMaxBytesPerVector * 3;
2685  MinOffset = -256;
2686  MaxOffset = 253;
2687  break;
2688  case AArch64::STR_ZZXI:
2689  case AArch64::LDR_ZZXI:
2690  Scale = TypeSize::Scalable(16);
2691  Width = SVEMaxBytesPerVector * 2;
2692  MinOffset = -256;
2693  MaxOffset = 254;
2694  break;
2695  case AArch64::LDR_PXI:
2696  case AArch64::STR_PXI:
2697  Scale = TypeSize::Scalable(2);
2698  Width = SVEMaxBytesPerVector / 8;
2699  MinOffset = -256;
2700  MaxOffset = 255;
2701  break;
2702  case AArch64::LDR_ZXI:
2703  case AArch64::STR_ZXI:
2704  Scale = TypeSize::Scalable(16);
2705  Width = SVEMaxBytesPerVector;
2706  MinOffset = -256;
2707  MaxOffset = 255;
2708  break;
2709  case AArch64::LD1B_IMM:
2710  case AArch64::LD1H_IMM:
2711  case AArch64::LD1W_IMM:
2712  case AArch64::LD1D_IMM:
2713  case AArch64::ST1B_IMM:
2714  case AArch64::ST1H_IMM:
2715  case AArch64::ST1W_IMM:
2716  case AArch64::ST1D_IMM:
 2717  // A full vector's worth of data
2718  // Width = mbytes * elements
2719  Scale = TypeSize::Scalable(16);
2720  Width = SVEMaxBytesPerVector;
2721  MinOffset = -8;
2722  MaxOffset = 7;
2723  break;
2724  case AArch64::LD1B_H_IMM:
2725  case AArch64::LD1SB_H_IMM:
2726  case AArch64::LD1H_S_IMM:
2727  case AArch64::LD1SH_S_IMM:
2728  case AArch64::LD1W_D_IMM:
2729  case AArch64::LD1SW_D_IMM:
2730  case AArch64::ST1B_H_IMM:
2731  case AArch64::ST1H_S_IMM:
2732  case AArch64::ST1W_D_IMM:
 2733  // A half vector's worth of data
2734  // Width = mbytes * elements
2735  Scale = TypeSize::Scalable(8);
2736  Width = SVEMaxBytesPerVector / 2;
2737  MinOffset = -8;
2738  MaxOffset = 7;
2739  break;
2740  case AArch64::LD1B_S_IMM:
2741  case AArch64::LD1SB_S_IMM:
2742  case AArch64::LD1H_D_IMM:
2743  case AArch64::LD1SH_D_IMM:
2744  case AArch64::ST1B_S_IMM:
2745  case AArch64::ST1H_D_IMM:
 2746  // A quarter vector's worth of data
2747  // Width = mbytes * elements
2748  Scale = TypeSize::Scalable(4);
2749  Width = SVEMaxBytesPerVector / 4;
2750  MinOffset = -8;
2751  MaxOffset = 7;
2752  break;
2753  case AArch64::LD1B_D_IMM:
2754  case AArch64::LD1SB_D_IMM:
2755  case AArch64::ST1B_D_IMM:
 2756  // An eighth vector's worth of data
2757  // Width = mbytes * elements
2758  Scale = TypeSize::Scalable(2);
2759  Width = SVEMaxBytesPerVector / 8;
2760  MinOffset = -8;
2761  MaxOffset = 7;
2762  break;
2763  case AArch64::ST2GOffset:
2764  case AArch64::STZ2GOffset:
2765  Scale = TypeSize::Fixed(16);
2766  Width = 32;
2767  MinOffset = -256;
2768  MaxOffset = 255;
2769  break;
2770  case AArch64::STGPi:
2771  Scale = TypeSize::Fixed(16);
2772  Width = 16;
2773  MinOffset = -64;
2774  MaxOffset = 63;
2775  break;
2776  }
2777 
2778  return true;
2779 }
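
 // Editor's worked example (not part of the original source): for LDRWui the
 // table above gives Scale = 4 and MinOffset/MaxOffset = 0/4095, i.e. byte
 // offsets 0..16380 in steps of 4; for LDURXi it gives Scale = 1 and a byte
 // offset range of [-256, 255].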
2780 
2781 // Scaling factor for unscaled load or store.
 2782 int AArch64InstrInfo::getMemScale(unsigned Opc) {
 2783  switch (Opc) {
2784  default:
2785  llvm_unreachable("Opcode has unknown scale!");
2786  case AArch64::LDRBBui:
2787  case AArch64::LDURBBi:
2788  case AArch64::LDRSBWui:
2789  case AArch64::LDURSBWi:
2790  case AArch64::STRBBui:
2791  case AArch64::STURBBi:
2792  return 1;
2793  case AArch64::LDRHHui:
2794  case AArch64::LDURHHi:
2795  case AArch64::LDRSHWui:
2796  case AArch64::LDURSHWi:
2797  case AArch64::STRHHui:
2798  case AArch64::STURHHi:
2799  return 2;
2800  case AArch64::LDRSui:
2801  case AArch64::LDURSi:
2802  case AArch64::LDRSWui:
2803  case AArch64::LDURSWi:
2804  case AArch64::LDRWui:
2805  case AArch64::LDURWi:
2806  case AArch64::STRSui:
2807  case AArch64::STURSi:
2808  case AArch64::STRWui:
2809  case AArch64::STURWi:
2810  case AArch64::LDPSi:
2811  case AArch64::LDPSWi:
2812  case AArch64::LDPWi:
2813  case AArch64::STPSi:
2814  case AArch64::STPWi:
2815  return 4;
2816  case AArch64::LDRDui:
2817  case AArch64::LDURDi:
2818  case AArch64::LDRXui:
2819  case AArch64::LDURXi:
2820  case AArch64::STRDui:
2821  case AArch64::STURDi:
2822  case AArch64::STRXui:
2823  case AArch64::STURXi:
2824  case AArch64::LDPDi:
2825  case AArch64::LDPXi:
2826  case AArch64::STPDi:
2827  case AArch64::STPXi:
2828  return 8;
2829  case AArch64::LDRQui:
2830  case AArch64::LDURQi:
2831  case AArch64::STRQui:
2832  case AArch64::STURQi:
2833  case AArch64::LDPQi:
2834  case AArch64::STPQi:
2835  case AArch64::STGOffset:
2836  case AArch64::STZGOffset:
2837  case AArch64::ST2GOffset:
2838  case AArch64::STZ2GOffset:
2839  case AArch64::STGPi:
2840  return 16;
2841  }
2842 }
2843 
2844 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
2845 // scaled.
2846 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2847  int Scale = AArch64InstrInfo::getMemScale(Opc);
2848 
2849  // If the byte-offset isn't a multiple of the stride, we can't scale this
2850  // offset.
2851  if (Offset % Scale != 0)
2852  return false;
2853 
2854  // Convert the byte-offset used by unscaled into an "element" offset used
2855  // by the scaled pair load/store instructions.
2856  Offset /= Scale;
2857  return true;
2858 }
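
 // Editor's worked example (not part of the original source): for LDURXi
 // (getMemScale(LDURXi) == 8) a byte offset of 24 scales to 3, while a byte
 // offset of 20 is rejected because 20 % 8 != 0.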
2859 
2860 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2861  if (FirstOpc == SecondOpc)
2862  return true;
2863  // We can also pair sign-ext and zero-ext instructions.
2864  switch (FirstOpc) {
2865  default:
2866  return false;
2867  case AArch64::LDRWui:
2868  case AArch64::LDURWi:
2869  return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2870  case AArch64::LDRSWui:
2871  case AArch64::LDURSWi:
2872  return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2873  }
2874  // These instructions can't be paired based on their opcodes.
2875  return false;
2876 }
2877 
2878 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2879  int64_t Offset1, unsigned Opcode1, int FI2,
2880  int64_t Offset2, unsigned Opcode2) {
2881  // Accesses through fixed stack object frame indices may access a different
2882  // fixed stack slot. Check that the object offsets + offsets match.
2883  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2884  int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2885  int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2886  assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2887  // Convert to scaled object offsets.
2888  int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
2889  if (ObjectOffset1 % Scale1 != 0)
2890  return false;
2891  ObjectOffset1 /= Scale1;
2892  int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
2893  if (ObjectOffset2 % Scale2 != 0)
2894  return false;
2895  ObjectOffset2 /= Scale2;
2896  ObjectOffset1 += Offset1;
2897  ObjectOffset2 += Offset2;
2898  return ObjectOffset1 + 1 == ObjectOffset2;
2899  }
2900 
2901  return FI1 == FI2;
2902 }
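
 // Editor's worked example (not part of the original source): two fixed
 // objects at object offsets 16 and 24, both accessed by LDRXui (scale 8)
 // with instruction offsets of 0, give scaled offsets 2 and 3; since
 // 2 + 0 + 1 == 3 + 0, the two accesses may be clustered.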
2903 
2904 /// Detect opportunities for ldp/stp formation.
2905 ///
2906 /// Only called for LdSt for which getMemOperandWithOffset returns true.
 2907 bool AArch64InstrInfo::shouldClusterMemOps(
 2908  ArrayRef<const MachineOperand *> BaseOps1,
 2909  ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
2910  unsigned NumBytes) const {
2911  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
2912  const MachineOperand &BaseOp1 = *BaseOps1.front();
2913  const MachineOperand &BaseOp2 = *BaseOps2.front();
2914  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
2915  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
2916  if (BaseOp1.getType() != BaseOp2.getType())
2917  return false;
2918 
2919  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2920  "Only base registers and frame indices are supported.");
2921 
2922  // Check for both base regs and base FI.
2923  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2924  return false;
2925 
2926  // Only cluster up to a single pair.
2927  if (NumLoads > 2)
2928  return false;
2929 
2930  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2931  return false;
2932 
2933  // Can we pair these instructions based on their opcodes?
2934  unsigned FirstOpc = FirstLdSt.getOpcode();
2935  unsigned SecondOpc = SecondLdSt.getOpcode();
2936  if (!canPairLdStOpc(FirstOpc, SecondOpc))
2937  return false;
2938 
2939  // Can't merge volatiles or load/stores that have a hint to avoid pair
2940  // formation, for example.
2941  if (!isCandidateToMergeOrPair(FirstLdSt) ||
2942  !isCandidateToMergeOrPair(SecondLdSt))
2943  return false;
2944 
2945  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2946  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2947  if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2948  return false;
2949 
2950  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2951  if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2952  return false;
2953 
2954  // Pairwise instructions have a 7-bit signed offset field.
2955  if (Offset1 > 63 || Offset1 < -64)
2956  return false;
2957 
2958  // The caller should already have ordered First/SecondLdSt by offset.
2959  // Note: except for non-equal frame index bases
2960  if (BaseOp1.isFI()) {
2961  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2962  "Caller should have ordered offsets.");
2963 
2964  const MachineFrameInfo &MFI =
2965  FirstLdSt.getParent()->getParent()->getFrameInfo();
2966  return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2967  BaseOp2.getIndex(), Offset2, SecondOpc);
2968  }
2969 
2970  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
2971 
2972  return Offset1 + 1 == Offset2;
2973 }
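
 // Editor's illustration (not part of the original source; registers are
 // hypothetical): two loads such as
 //   ldr x1, [x0, #8]      // LDRXui, scaled offset 1
 //   ldr x2, [x0, #16]     // LDRXui, scaled offset 2
 // share a base and have adjacent scaled offsets (1 + 1 == 2), so they are
 // clustered by the scheduler, keeping them adjacent for later ldp formation.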
2974 
 2975 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
 2976  unsigned Reg, unsigned SubIdx,
2977  unsigned State,
2978  const TargetRegisterInfo *TRI) {
2979  if (!SubIdx)
2980  return MIB.addReg(Reg, State);
2981 
 2982  if (Register::isPhysicalRegister(Reg))
 2983  return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2984  return MIB.addReg(Reg, State, SubIdx);
2985 }
2986 
2987 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2988  unsigned NumRegs) {
 2989  // We really want the positive remainder mod 32 here; that happens to be
2990  // easily obtainable with a mask.
2991  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2992 }
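
 // Editor's worked example (not part of the original source): copying a
 // 2-register tuple whose source starts at encoding 0 into a destination
 // starting at encoding 1 gives ((1 - 0) & 0x1f) == 1 < 2, so a forward
 // sub-register copy would clobber the source and the copy below is emitted
 // in reverse order.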
2993 
 2994 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
 2995  MachineBasicBlock::iterator I,
 2996  const DebugLoc &DL, MCRegister DestReg,
2997  MCRegister SrcReg, bool KillSrc,
2998  unsigned Opcode,
2999  ArrayRef<unsigned> Indices) const {
3000  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
 3001  const TargetRegisterInfo *TRI = &getRegisterInfo();
 3002  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
3003  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
3004  unsigned NumRegs = Indices.size();
3005 
3006  int SubReg = 0, End = NumRegs, Incr = 1;
3007  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
3008  SubReg = NumRegs - 1;
3009  End = -1;
3010  Incr = -1;
3011  }
3012 
3013  for (; SubReg != End; SubReg += Incr) {
3014  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
3015  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
3016  AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
3017  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
3018  }
3019 }
3020 
 3021 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
 3022  MachineBasicBlock::iterator I,
 3023  DebugLoc DL, unsigned DestReg,
3024  unsigned SrcReg, bool KillSrc,
3025  unsigned Opcode, unsigned ZeroReg,
3026  llvm::ArrayRef<unsigned> Indices) const {
 3027  const TargetRegisterInfo *TRI = &getRegisterInfo();
 3028  unsigned NumRegs = Indices.size();
3029 
3030 #ifndef NDEBUG
3031  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
3032  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
3033  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
3034  "GPR reg sequences should not be able to overlap");
3035 #endif
3036 
3037  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
3038  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
3039  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
3040  MIB.addReg(ZeroReg);
3041  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
3042  MIB.addImm(0);
3043  }
3044 }
3045 
 3046 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 3047  MachineBasicBlock::iterator I,
 3048  const DebugLoc &DL, MCRegister DestReg,
3049  MCRegister SrcReg, bool KillSrc) const {
3050  if (AArch64::GPR32spRegClass.contains(DestReg) &&
3051  (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
 3052  const TargetRegisterInfo *TRI = &getRegisterInfo();
 3053 
3054  if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
3055  // If either operand is WSP, expand to ADD #0.
3056  if (Subtarget.hasZeroCycleRegMove()) {
3057  // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
3058  MCRegister DestRegX = TRI->getMatchingSuperReg(
3059  DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3060  MCRegister SrcRegX = TRI->getMatchingSuperReg(
3061  SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3062  // This instruction is reading and writing X registers. This may upset
3063  // the register scavenger and machine verifier, so we need to indicate
3064  // that we are reading an undefined value from SrcRegX, but a proper
3065  // value from SrcReg.
3066  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
3067  .addReg(SrcRegX, RegState::Undef)
3068  .addImm(0)
 3069  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
 3070  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
3071  } else {
3072  BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
3073  .addReg(SrcReg, getKillRegState(KillSrc))
3074  .addImm(0)
 3075  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
 3076  }
3077  } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
3078  BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
3079  .addImm(0)
 3080  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
 3081  } else {
3082  if (Subtarget.hasZeroCycleRegMove()) {
3083  // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
3084  MCRegister DestRegX = TRI->getMatchingSuperReg(
3085  DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3086  MCRegister SrcRegX = TRI->getMatchingSuperReg(
3087  SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3088  // This instruction is reading and writing X registers. This may upset
3089  // the register scavenger and machine verifier, so we need to indicate
3090  // that we are reading an undefined value from SrcRegX, but a proper
3091  // value from SrcReg.
3092  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
3093  .addReg(AArch64::XZR)
3094  .addReg(SrcRegX, RegState::Undef)
3095  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
3096  } else {
3097  // Otherwise, expand to ORR WZR.
3098  BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
3099  .addReg(AArch64::WZR)
3100  .addReg(SrcReg, getKillRegState(KillSrc));
3101  }
3102  }
3103  return;
3104  }
3105 
3106  // Copy a Predicate register by ORRing with itself.
3107  if (AArch64::PPRRegClass.contains(DestReg) &&
3108  AArch64::PPRRegClass.contains(SrcReg)) {
3109  assert(Subtarget.hasSVE() && "Unexpected SVE register.");
3110  BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
3111  .addReg(SrcReg) // Pg
3112  .addReg(SrcReg)
3113  .addReg(SrcReg, getKillRegState(KillSrc));
3114  return;
3115  }
3116 
3117  // Copy a Z register by ORRing with itself.
3118  if (AArch64::ZPRRegClass.contains(DestReg) &&
3119  AArch64::ZPRRegClass.contains(SrcReg)) {
3120  assert(Subtarget.hasSVE() && "Unexpected SVE register.");
3121  BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
3122  .addReg(SrcReg)
3123  .addReg(SrcReg, getKillRegState(KillSrc));
3124  return;
3125  }
3126 
3127  // Copy a Z register pair by copying the individual sub-registers.
3128  if (AArch64::ZPR2RegClass.contains(DestReg) &&
3129  AArch64::ZPR2RegClass.contains(SrcReg)) {
3130  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
3131  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3132  Indices);
3133  return;
3134  }
3135 
3136  // Copy a Z register triple by copying the individual sub-registers.
3137  if (AArch64::ZPR3RegClass.contains(DestReg) &&
3138  AArch64::ZPR3RegClass.contains(SrcReg)) {
3139  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
3140  AArch64::zsub2};
3141  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3142  Indices);
3143  return;
3144  }
3145 
3146  // Copy a Z register quad by copying the individual sub-registers.
3147  if (AArch64::ZPR4RegClass.contains(DestReg) &&
3148  AArch64::ZPR4RegClass.contains(SrcReg)) {
3149  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
3150  AArch64::zsub2, AArch64::zsub3};
3151  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3152  Indices);
3153  return;
3154  }
3155 
3156  if (AArch64::GPR64spRegClass.contains(DestReg) &&
3157  (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
3158  if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
3159  // If either operand is SP, expand to ADD #0.
3160  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
3161  .addReg(SrcReg, getKillRegState(KillSrc))
3162  .addImm(0)
 3163  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
 3164  } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
3165  BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
3166  .addImm(0)
 3167  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
 3168  } else {
3169  // Otherwise, expand to ORR XZR.
3170  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
3171  .addReg(AArch64::XZR)
3172  .addReg(SrcReg, getKillRegState(KillSrc));
3173  }
3174  return;
3175  }
3176 
3177  // Copy a DDDD register quad by copying the individual sub-registers.
3178  if (AArch64::DDDDRegClass.contains(DestReg) &&
3179  AArch64::DDDDRegClass.contains(SrcReg)) {
3180  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
3181  AArch64::dsub2, AArch64::dsub3};
3182  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3183  Indices);
3184  return;
3185  }
3186 
3187  // Copy a DDD register triple by copying the individual sub-registers.
3188  if (AArch64::DDDRegClass.contains(DestReg) &&
3189  AArch64::DDDRegClass.contains(SrcReg)) {
3190  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
3191  AArch64::dsub2};
3192  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3193  Indices);
3194  return;
3195  }
3196 
3197  // Copy a DD register pair by copying the individual sub-registers.
3198  if (AArch64::DDRegClass.contains(DestReg) &&
3199  AArch64::DDRegClass.contains(SrcReg)) {
3200  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
3201  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3202  Indices);
3203  return;
3204  }
3205 
3206  // Copy a QQQQ register quad by copying the individual sub-registers.
3207  if (AArch64::QQQQRegClass.contains(DestReg) &&
3208  AArch64::QQQQRegClass.contains(SrcReg)) {
3209  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
3210  AArch64::qsub2, AArch64::qsub3};
3211  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3212  Indices);
3213  return;
3214  }
3215 
3216  // Copy a QQQ register triple by copying the individual sub-registers.
3217  if (AArch64::QQQRegClass.contains(DestReg) &&
3218  AArch64::QQQRegClass.contains(SrcReg)) {
3219  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
3220  AArch64::qsub2};
3221  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3222  Indices);
3223  return;
3224  }
3225 
3226  // Copy a QQ register pair by copying the individual sub-registers.
3227  if (AArch64::QQRegClass.contains(DestReg) &&
3228  AArch64::QQRegClass.contains(SrcReg)) {
3229  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
3230  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3231  Indices);
3232  return;
3233  }
3234 
3235  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
3236  AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
3237  static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
3238  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
3239  AArch64::XZR, Indices);
3240  return;
3241  }
3242 
3243  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
3244  AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
3245  static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
3246  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
3247  AArch64::WZR, Indices);
3248  return;
3249  }
3250 
3251  if (AArch64::FPR128RegClass.contains(DestReg) &&
3252  AArch64::FPR128RegClass.contains(SrcReg)) {
3253  if (Subtarget.hasNEON()) {
3254  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3255  .addReg(SrcReg)
3256  .addReg(SrcReg, getKillRegState(KillSrc));
3257  } else {
3258  BuildMI(MBB, I, DL, get(AArch64::STRQpre))
3259  .addReg(AArch64::SP, RegState::Define)
3260  .addReg(SrcReg, getKillRegState(KillSrc))
3261  .addReg(AArch64::SP)
3262  .addImm(-16);
3263  BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
3264  .addReg(AArch64::SP, RegState::Define)
3265  .addReg(DestReg, RegState::Define)
3266  .addReg(AArch64::SP)
3267  .addImm(16);
3268  }
3269  return;
3270  }
3271 
3272  if (AArch64::FPR64RegClass.contains(DestReg) &&
3273  AArch64::FPR64RegClass.contains(SrcReg)) {
3274  if (Subtarget.hasNEON()) {
3275  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
3276  &AArch64::FPR128RegClass);
3277  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
3278  &AArch64::FPR128RegClass);
3279  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3280  .addReg(SrcReg)
3281  .addReg(SrcReg, getKillRegState(KillSrc));
3282  } else {
3283  BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
3284  .addReg(SrcReg, getKillRegState(KillSrc));
3285  }
3286  return;
3287  }
3288 
3289  if (AArch64::FPR32RegClass.contains(DestReg) &&
3290  AArch64::FPR32RegClass.contains(SrcReg)) {
3291  if (Subtarget.hasNEON()) {
3292  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
3293  &AArch64::FPR128RegClass);
3294  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
3295  &AArch64::FPR128RegClass);
3296  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3297  .addReg(SrcReg)
3298  .addReg(SrcReg, getKillRegState(KillSrc));
3299  } else {
3300  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3301  .addReg(SrcReg, getKillRegState(KillSrc));
3302  }
3303  return;
3304  }
3305 
3306  if (AArch64::FPR16RegClass.contains(DestReg) &&
3307  AArch64::FPR16RegClass.contains(SrcReg)) {
3308  if (Subtarget.hasNEON()) {
3309  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
3310  &AArch64::FPR128RegClass);
3311  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
3312  &AArch64::FPR128RegClass);
3313  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3314  .addReg(SrcReg)
3315  .addReg(SrcReg, getKillRegState(KillSrc));
3316  } else {
3317  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
3318  &AArch64::FPR32RegClass);
3319  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
3320  &AArch64::FPR32RegClass);
3321  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3322  .addReg(SrcReg, getKillRegState(KillSrc));
3323  }
3324  return;
3325  }
3326 
3327  if (AArch64::FPR8RegClass.contains(DestReg) &&
3328  AArch64::FPR8RegClass.contains(SrcReg)) {
3329  if (Subtarget.hasNEON()) {
3330  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
3331  &AArch64::FPR128RegClass);
3332  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
3333  &AArch64::FPR128RegClass);
3334  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3335  .addReg(SrcReg)
3336  .addReg(SrcReg, getKillRegState(KillSrc));
3337  } else {
3338  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
3339  &AArch64::FPR32RegClass);
3340  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
3341  &AArch64::FPR32RegClass);
3342  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3343  .addReg(SrcReg, getKillRegState(KillSrc));
3344  }
3345  return;
3346  }
3347 
3348  // Copies between GPR64 and FPR64.
3349  if (AArch64::FPR64RegClass.contains(DestReg) &&
3350  AArch64::GPR64RegClass.contains(SrcReg)) {
3351  BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
3352  .addReg(SrcReg, getKillRegState(KillSrc));
3353  return;
3354  }
3355  if (AArch64::GPR64RegClass.contains(DestReg) &&
3356  AArch64::FPR64RegClass.contains(SrcReg)) {
3357  BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
3358  .addReg(SrcReg, getKillRegState(KillSrc));
3359  return;
3360  }
3361  // Copies between GPR32 and FPR32.
3362  if (AArch64::FPR32RegClass.contains(DestReg) &&
3363  AArch64::GPR32RegClass.contains(SrcReg)) {
3364  BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
3365  .addReg(SrcReg, getKillRegState(KillSrc));
3366  return;
3367  }
3368  if (AArch64::GPR32RegClass.contains(DestReg) &&
3369  AArch64::FPR32RegClass.contains(SrcReg)) {
3370  BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
3371  .addReg(SrcReg, getKillRegState(KillSrc));
3372  return;
3373  }
3374 
3375  if (DestReg == AArch64::NZCV) {
3376  assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
3377  BuildMI(MBB, I, DL, get(AArch64::MSR))
3378  .addImm(AArch64SysReg::NZCV)
3379  .addReg(SrcReg, getKillRegState(KillSrc))
3380  .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
3381  return;
3382  }
3383 
3384  if (SrcReg == AArch64::NZCV) {
3385  assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
3386  BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
3387  .addImm(AArch64SysReg::NZCV)
3388  .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
3389  return;
3390  }
3391 
3392  llvm_unreachable("unimplemented reg-to-reg copy");
3393 }
3394 
3397  MachineBasicBlock::iterator InsertBefore,
3398  const MCInstrDesc &MCID,
3399  Register SrcReg, bool IsKill,
3400  unsigned SubIdx0, unsigned SubIdx1, int FI,
3401  MachineMemOperand *MMO) {
3402  Register SrcReg0 = SrcReg;
3403  Register SrcReg1 = SrcReg;
3404  if (Register::isPhysicalRegister(SrcReg)) {
3405  SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
3406  SubIdx0 = 0;
3407  SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
3408  SubIdx1 = 0;
3409  }
3410  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3411  .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
3412  .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
3413  .addFrameIndex(FI)
3414  .addImm(0)
3415  .addMemOperand(MMO);
3416 }
3417 
 3418 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 3419  MachineBasicBlock::iterator MBBI, Register SrcReg,
 3420  bool isKill, int FI, const TargetRegisterClass *RC,
3421  const TargetRegisterInfo *TRI) const {
3422  MachineFunction &MF = *MBB.getParent();
3423  MachineFrameInfo &MFI = MF.getFrameInfo();
3424 
 3425  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
 3426  MachineMemOperand *MMO =
 3427  MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
 3428  MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3429  unsigned Opc = 0;
3430  bool Offset = true;
3431  unsigned StackID = TargetStackID::Default;
3432  switch (TRI->getSpillSize(*RC)) {
3433  case 1:
3434  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3435  Opc = AArch64::STRBui;
3436  break;
3437  case 2:
3438  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3439  Opc = AArch64::STRHui;
3440  else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3441  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3442  Opc = AArch64::STR_PXI;
 3443  StackID = TargetStackID::ScalableVector;
 3444  }
3445  break;
3446  case 4:
3447  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3448  Opc = AArch64::STRWui;
3449  if (Register::isVirtualRegister(SrcReg))
3450  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
3451  else
3452  assert(SrcReg != AArch64::WSP);
3453  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3454  Opc = AArch64::STRSui;
3455  break;
3456  case 8:
3457  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3458  Opc = AArch64::STRXui;
3459  if (Register::isVirtualRegister(SrcReg))
3460  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3461  else
3462  assert(SrcReg != AArch64::SP);
3463  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3464  Opc = AArch64::STRDui;
3465  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3467  get(AArch64::STPWi), SrcReg, isKill,
3468  AArch64::sube32, AArch64::subo32, FI, MMO);
3469  return;
3470  }
3471  break;
3472  case 16:
3473  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3474  Opc = AArch64::STRQui;
3475  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3476  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3477  Opc = AArch64::ST1Twov1d;
3478  Offset = false;
3479  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3481  get(AArch64::STPXi), SrcReg, isKill,
3482  AArch64::sube64, AArch64::subo64, FI, MMO);
3483  return;
3484  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3485  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3486  Opc = AArch64::STR_ZXI;
 3487  StackID = TargetStackID::ScalableVector;
 3488  }
3489  break;
3490  case 24:
3491  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3492  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3493  Opc = AArch64::ST1Threev1d;
3494  Offset = false;
3495  }
3496  break;
3497  case 32:
3498  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3499  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3500  Opc = AArch64::ST1Fourv1d;
3501  Offset = false;
3502  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3503  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3504  Opc = AArch64::ST1Twov2d;
3505  Offset = false;
3506  } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3507  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3508  Opc = AArch64::STR_ZZXI;
 3509  StackID = TargetStackID::ScalableVector;
 3510  }
3511  break;
3512  case 48:
3513  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3514  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3515  Opc = AArch64::ST1Threev2d;
3516  Offset = false;
3517  } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3518  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3519  Opc = AArch64::STR_ZZZXI;
 3520  StackID = TargetStackID::ScalableVector;
 3521  }
3522  break;
3523  case 64:
3524  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3525  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3526  Opc = AArch64::ST1Fourv2d;
3527  Offset = false;
3528  } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3529  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3530  Opc = AArch64::STR_ZZZZXI;
 3531  StackID = TargetStackID::ScalableVector;
 3532  }
3533  break;
3534  }
3535  assert(Opc && "Unknown register class");
3536  MFI.setStackID(FI, StackID);
3537 
3538  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3539  .addReg(SrcReg, getKillRegState(isKill))
3540  .addFrameIndex(FI);
3541 
3542  if (Offset)
3543  MI.addImm(0);
3544  MI.addMemOperand(MMO);
3545 }
3546 
3549  MachineBasicBlock::iterator InsertBefore,
3550  const MCInstrDesc &MCID,
3551  Register DestReg, unsigned SubIdx0,
3552  unsigned SubIdx1, int FI,
3553  MachineMemOperand *MMO) {
3554  Register DestReg0 = DestReg;
3555  Register DestReg1 = DestReg;
3556  bool IsUndef = true;
3557  if (Register::isPhysicalRegister(DestReg)) {
3558  DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
3559  SubIdx0 = 0;
3560  DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
3561  SubIdx1 = 0;
3562  IsUndef = false;
3563  }
3564  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3565  .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
3566  .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
3567  .addFrameIndex(FI)
3568  .addImm(0)
3569  .addMemOperand(MMO);
3570 }
3571 
 3572 void AArch64InstrInfo::loadRegFromStackSlot(
 3573  MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
 3574  int FI, const TargetRegisterClass *RC,
3575  const TargetRegisterInfo *TRI) const {
3576  MachineFunction &MF = *MBB.getParent();
3577  MachineFrameInfo &MFI = MF.getFrameInfo();
 3578  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
 3579  MachineMemOperand *MMO =
 3580  MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
 3581  MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3582 
3583  unsigned Opc = 0;
3584  bool Offset = true;
3585  unsigned StackID = TargetStackID::Default;
3586  switch (TRI->getSpillSize(*RC)) {
3587  case 1:
3588  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3589  Opc = AArch64::LDRBui;
3590  break;
3591  case 2:
3592  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3593  Opc = AArch64::LDRHui;
 3594  else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
 3595  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
 3596  Opc = AArch64::LDR_PXI;
 3597  StackID = TargetStackID::ScalableVector;
 3598  }
3599  break;
3600  case 4:
3601  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3602  Opc = AArch64::LDRWui;
3603  if (Register::isVirtualRegister(DestReg))
3604  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
3605  else
3606  assert(DestReg != AArch64::WSP);
3607  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3608  Opc = AArch64::LDRSui;
3609  break;
3610  case 8:
3611  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3612  Opc = AArch64::LDRXui;
3613  if (Register::isVirtualRegister(DestReg))
3614  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
3615  else
3616  assert(DestReg != AArch64::SP);
3617  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3618  Opc = AArch64::LDRDui;
 3619  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
 3620  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
 3621  get(AArch64::LDPWi), DestReg, AArch64::sube32,
3622  AArch64::subo32, FI, MMO);
3623  return;
3624  }
3625  break;
3626  case 16:
3627  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3628  Opc = AArch64::LDRQui;
3629  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3630  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3631  Opc = AArch64::LD1Twov1d;
3632  Offset = false;
 3633  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
 3634  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
 3635  get(AArch64::LDPXi), DestReg, AArch64::sube64,
 3636  AArch64::subo64, FI, MMO);
 3637  return;
 3638  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
 3639  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
 3640  Opc = AArch64::LDR_ZXI;
 3641  StackID = TargetStackID::ScalableVector;
 3642  }
3643  break;
3644  case 24:
3645  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3646  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3647  Opc = AArch64::LD1Threev1d;
3648  Offset = false;
3649  }
3650  break;
3651  case 32:
3652  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3653  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3654  Opc = AArch64::LD1Fourv1d;
3655  Offset = false;
3656  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3657  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3658  Opc = AArch64::LD1Twov2d;
3659  Offset = false;
 3660  } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
 3661  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
 3662  Opc = AArch64::LDR_ZZXI;
 3663  StackID = TargetStackID::ScalableVector;
 3664  }
3665  break;
3666  case 48:
3667  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3668  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3669  Opc = AArch64::LD1Threev2d;
3670  Offset = false;
 3671  } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
 3672  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
 3673  Opc = AArch64::LDR_ZZZXI;
 3674  StackID = TargetStackID::ScalableVector;
 3675  }
3676  break;
3677  case 64:
3678  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3679  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3680  Opc = AArch64::LD1Fourv2d;
3681  Offset = false;
 3682  } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
 3683  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
 3684  Opc = AArch64::LDR_ZZZZXI;
 3685  StackID = TargetStackID::ScalableVector;
 3686  }
3687  break;
3688  }
3689 
3690  assert(Opc && "Unknown register class");
3691  MFI.setStackID(FI, StackID);
3692 
3693  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3694  .addReg(DestReg, getDefRegState(true))
3695  .addFrameIndex(FI);
3696  if (Offset)
3697  MI.addImm(0);
3698  MI.addMemOperand(MMO);
3699 }
3700 
 3701 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
 3702  const MachineInstr &UseMI,
3703  const TargetRegisterInfo *TRI) {
3704  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
3705  UseMI.getIterator()),
3706  [TRI](const MachineInstr &I) {
3707  return I.modifiesRegister(AArch64::NZCV, TRI) ||
3708  I.readsRegister(AArch64::NZCV, TRI);
3709  });
3710 }
3711 
 3712 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
 3713  const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
3714  // The smallest scalable element supported by scaled SVE addressing
3715  // modes are predicates, which are 2 scalable bytes in size. So the scalable
3716  // byte offset must always be a multiple of 2.
3717  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3718 
 3719  // VGSized offsets are divided by '2', because the VG register is the
 3720  // number of 64bit granules as opposed to 128bit vector chunks,
3721  // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
3722  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
3723  // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
3724  ByteSized = Offset.getFixed();
3725  VGSized = Offset.getScalable() / 2;
3726 }
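 // For illustration (values chosen arbitrarily): a StackOffset with
 // getFixed() == 16 and getScalable() == 8 decomposes into ByteSized == 16 and
 // VGSized == 4, i.e. a DWARF expression of "16 + 4 * VG" bytes:
 //   int64_t ByteSized, VGSized;
 //   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
 //       StackOffset::get(/*Fixed=*/16, /*Scalable=*/8), ByteSized, VGSized);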
3727 
3728 /// Returns the offset in parts to which this frame offset can be
3729 /// decomposed for the purpose of describing a frame offset.
3730 /// For non-scalable offsets this is simply its byte size.
 3731 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
 3732  const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
3733  int64_t &NumDataVectors) {
3734  // The smallest scalable element supported by scaled SVE addressing
3735  // modes are predicates, which are 2 scalable bytes in size. So the scalable
3736  // byte offset must always be a multiple of 2.
3737  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3738 
3739  NumBytes = Offset.getFixed();
3740  NumDataVectors = 0;
3741  NumPredicateVectors = Offset.getScalable() / 2;
3742  // This method is used to get the offsets to adjust the frame offset.
3743  // If the function requires ADDPL to be used and needs more than two ADDPL
3744  // instructions, part of the offset is folded into NumDataVectors so that it
3745  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
3746  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
3747  NumPredicateVectors > 62) {
3748  NumDataVectors = NumPredicateVectors / 8;
3749  NumPredicateVectors -= NumDataVectors * 8;
3750  }
3751 }
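 // Rough examples of the folding above (scalable part counted in 2-byte
 // predicate granules):
 //   getScalable() ==  80 -> NumPredicateVectors = 40, a multiple of 8, so it
 //                           becomes NumDataVectors = 5 (one ADDVL) and
 //                           NumPredicateVectors = 0;
 //   getScalable() == 130 -> NumPredicateVectors = 65 > 62, so it becomes
 //                           NumDataVectors = 8 and NumPredicateVectors = 1.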
3752 
3753 // Helper function to emit a frame offset adjustment from a given
3754 // pointer (SrcReg), stored into DestReg. This function is explicit
3755 // in that it requires the opcode.
 3756 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
 3757  MachineBasicBlock::iterator MBBI,
 3758  const DebugLoc &DL, unsigned DestReg,
3759  unsigned SrcReg, int64_t Offset, unsigned Opc,
3760  const TargetInstrInfo *TII,
3761  MachineInstr::MIFlag Flag, bool NeedsWinCFI,
3762  bool *HasWinCFI) {
3763  int Sign = 1;
3764  unsigned MaxEncoding, ShiftSize;
3765  switch (Opc) {
3766  case AArch64::ADDXri:
3767  case AArch64::ADDSXri:
3768  case AArch64::SUBXri:
3769  case AArch64::SUBSXri:
3770  MaxEncoding = 0xfff;
3771  ShiftSize = 12;
3772  break;
3773  case AArch64::ADDVL_XXI:
3774  case AArch64::ADDPL_XXI:
3775  MaxEncoding = 31;
3776  ShiftSize = 0;
3777  if (Offset < 0) {
3778  MaxEncoding = 32;
3779  Sign = -1;
3780  Offset = -Offset;
3781  }
3782  break;
3783  default:
3784  llvm_unreachable("Unsupported opcode");
3785  }
3786 
3787  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
3788  // scratch register. If DestReg is a virtual register, use it as the
3789  // scratch register; otherwise, create a new virtual register (to be
3790  // replaced by the scavenger at the end of PEI). That case can be optimized
3791  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
3792  // register can be loaded with offset%8 and the add/sub can use an extending
3793  // instruction with LSL#3.
3794  // Currently the function handles any offsets but generates a poor sequence
3795  // of code.
3796  // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
3797 
3798  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
3799  Register TmpReg = DestReg;
 3800  if (TmpReg == AArch64::XZR)
 3801  TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
 3802  &AArch64::GPR64RegClass);
3803  do {
3804  uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
3805  unsigned LocalShiftSize = 0;
3806  if (ThisVal > MaxEncoding) {
3807  ThisVal = ThisVal >> ShiftSize;
3808  LocalShiftSize = ShiftSize;
3809  }
3810  assert((ThisVal >> ShiftSize) <= MaxEncoding &&
3811  "Encoding cannot handle value that big");
3812 
3813  Offset -= ThisVal << LocalShiftSize;
3814  if (Offset == 0)
3815  TmpReg = DestReg;
3816  auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
3817  .addReg(SrcReg)
3818  .addImm(Sign * (int)ThisVal);
3819  if (ShiftSize)
3820  MBI = MBI.addImm(
3821  AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
3822  MBI = MBI.setMIFlag(Flag);
3823 
3824  if (NeedsWinCFI) {
3825  assert(Sign == 1 && "SEH directives should always have a positive sign");
3826  int Imm = (int)(ThisVal << LocalShiftSize);
3827  if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
3828  (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
3829  if (HasWinCFI)
3830  *HasWinCFI = true;
3831  if (Imm == 0)
3832  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
3833  else
3834  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
3835  .addImm(Imm)
3836  .setMIFlag(Flag);
3837  assert(Offset == 0 && "Expected remaining offset to be zero to "
3838  "emit a single SEH directive");
3839  } else if (DestReg == AArch64::SP) {
3840  if (HasWinCFI)
3841  *HasWinCFI = true;
3842  assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
3843  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
3844  .addImm(Imm)
3845  .setMIFlag(Flag);
3846  }
3847  if (HasWinCFI)
3848  *HasWinCFI = true;
3849  }
3850 
3851  SrcReg = TmpReg;
3852  } while (Offset);
3853 }
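 // Worked example (assuming Opc == AArch64::ADDXri, i.e. MaxEncoding == 0xfff
 // and ShiftSize == 12): an Offset of 0x201001 is emitted as two chunks,
 //   add Dst, Src, #0x201, lsl #12   // consumes 0x201000
 //   add Dst, Dst, #1                // remaining 1
 // each iteration peeling off the largest piece that fits the 12-bit
 // immediate, optionally shifted left by 12.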
3854 
 3855 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
 3856  MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
 3857  unsigned DestReg, unsigned SrcReg,
 3858  StackOffset Offset, const TargetInstrInfo *TII,
 3859  MachineInstr::MIFlag Flag, bool SetNZCV,
 3860  bool NeedsWinCFI, bool *HasWinCFI) {
 3861  int64_t Bytes, NumPredicateVectors, NumDataVectors;
 3862  AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
 3863  Offset, Bytes, NumPredicateVectors, NumDataVectors);
3864 
3865  // First emit non-scalable frame offsets, or a simple 'mov'.
3866  if (Bytes || (!Offset && SrcReg != DestReg)) {
3867  assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
3868  "SP increment/decrement not 8-byte aligned");
3869  unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
3870  if (Bytes < 0) {
3871  Bytes = -Bytes;
3872  Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
3873  }
3874  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
3875  NeedsWinCFI, HasWinCFI);
3876  SrcReg = DestReg;
3877  }
3878 
3879  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
3880  "SetNZCV not supported with SVE vectors");
3881  assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
3882  "WinCFI not supported with SVE vectors");
3883 
3884  if (NumDataVectors) {
3885  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
3886  AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3887  SrcReg = DestReg;
3888  }
3889 
3890  if (NumPredicateVectors) {
3891  assert(DestReg != AArch64::SP && "Unaligned access to SP");
3892  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
3893  AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3894  }
3895 }
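 // Rough example: a StackOffset with a fixed part of 16 bytes and a scalable
 // part of 32 bytes (two full SVE vectors) would be lowered as
 //   add   Dst, Src, #16
 //   addvl Dst, Dst, #2
 // while a scalable remainder that is not a whole number of vectors gets an
 // extra ADDPL for the leftover predicate-sized granules.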
3896 
 3897 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
 3898  MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
 3899  MachineBasicBlock::iterator InsertPt, int FrameIndex,
 3900  LiveIntervals *LIS, VirtRegMap *VRM) const {
3901  // This is a bit of a hack. Consider this instruction:
3902  //
3903  // %0 = COPY %sp; GPR64all:%0
3904  //
3905  // We explicitly chose GPR64all for the virtual register so such a copy might
3906  // be eliminated by RegisterCoalescer. However, that may not be possible, and
3907  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
3908  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
3909  //
3910  // To prevent that, we are going to constrain the %0 register class here.
3911  //
3912  // <rdar://problem/11522048>
3913  //
3914  if (MI.isFullCopy()) {
3915  Register DstReg = MI.getOperand(0).getReg();
3916  Register SrcReg = MI.getOperand(1).getReg();
3917  if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
3918  MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
3919  return nullptr;
3920  }
3921  if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
3922  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3923  return nullptr;
3924  }
3925  }
3926 
3927  // Handle the case where a copy is being spilled or filled but the source
3928  // and destination register class don't match. For example:
3929  //
3930  // %0 = COPY %xzr; GPR64common:%0
3931  //
3932  // In this case we can still safely fold away the COPY and generate the
3933  // following spill code:
3934  //
3935  // STRXui %xzr, %stack.0
3936  //
3937  // This also eliminates spilled cross register class COPYs (e.g. between x and
3938  // d regs) of the same size. For example:
3939  //
3940  // %0 = COPY %1; GPR64:%0, FPR64:%1
3941  //
3942  // will be filled as
3943  //
3944  // LDRDui %0, fi<#0>
3945  //
3946  // instead of
3947  //
3948  // LDRXui %Temp, fi<#0>
3949  // %0 = FMOV %Temp
3950  //
3951  if (MI.isCopy() && Ops.size() == 1 &&
3952  // Make sure we're only folding the explicit COPY defs/uses.
3953  (Ops[0] == 0 || Ops[0] == 1)) {
3954  bool IsSpill = Ops[0] == 0;
 3955  bool IsFill = !IsSpill;
 3956  const TargetRegisterInfo &TRI = getRegisterInfo();
 3957  const MachineRegisterInfo &MRI = MF.getRegInfo();
3958  MachineBasicBlock &MBB = *MI.getParent();
3959  const MachineOperand &DstMO = MI.getOperand(0);
3960  const MachineOperand &SrcMO = MI.getOperand(1);
3961  Register DstReg = DstMO.getReg();
3962  Register SrcReg = SrcMO.getReg();
3963  // This is slightly expensive to compute for physical regs since
3964  // getMinimalPhysRegClass is slow.
 3965  auto getRegClass = [&](unsigned Reg) {
 3966  return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
 3967  : TRI.getMinimalPhysRegClass(Reg);
 3968  };
3969 
3970  if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3971  assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3972  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3973  "Mismatched register size in non subreg COPY");
3974  if (IsSpill)
3975  storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3976  getRegClass(SrcReg), &TRI);
3977  else
3978  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3979  getRegClass(DstReg), &TRI);
3980  return &*--InsertPt;
3981  }
3982 
3983  // Handle cases like spilling def of:
3984  //
3985  // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3986  //
3987  // where the physical register source can be widened and stored to the full
3988  // virtual reg destination stack slot, in this case producing:
3989  //
3990  // STRXui %xzr, %stack.0
3991  //
3992  if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
3993  assert(SrcMO.getSubReg() == 0 &&
3994  "Unexpected subreg on physical register");
3995  const TargetRegisterClass *SpillRC;
3996  unsigned SpillSubreg;
3997  switch (DstMO.getSubReg()) {
3998  default:
3999  SpillRC = nullptr;
4000  break;
4001  case AArch64::sub_32:
4002  case AArch64::ssub:
4003  if (AArch64::GPR32RegClass.contains(SrcReg)) {
4004  SpillRC = &AArch64::GPR64RegClass;
4005  SpillSubreg = AArch64::sub_32;
4006  } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
4007  SpillRC = &AArch64::FPR64RegClass;
4008  SpillSubreg = AArch64::ssub;
4009  } else
4010  SpillRC = nullptr;
4011  break;
4012  case AArch64::dsub:
4013  if (AArch64::FPR64RegClass.contains(SrcReg)) {
4014  SpillRC = &AArch64::FPR128RegClass;
4015  SpillSubreg = AArch64::dsub;
4016  } else
4017  SpillRC = nullptr;
4018  break;
4019  }
4020 
4021  if (SpillRC)
4022  if (unsigned WidenedSrcReg =
4023  TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
4024  storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
4025  FrameIndex, SpillRC, &TRI);
4026  return &*--InsertPt;
4027  }
4028  }
4029 
4030  // Handle cases like filling use of:
4031  //
4032  // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
4033  //
4034  // where we can load the full virtual reg source stack slot, into the subreg
4035  // destination, in this case producing:
4036  //
4037  // LDRWui %0:sub_32<def,read-undef>, %stack.0
4038  //
4039  if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
4040  const TargetRegisterClass *FillRC;
4041  switch (DstMO.getSubReg()) {
4042  default:
4043  FillRC = nullptr;
4044  break;
4045  case AArch64::sub_32:
4046  FillRC = &AArch64::GPR32RegClass;
4047  break;
4048  case AArch64::ssub:
4049  FillRC = &AArch64::FPR32RegClass;
4050  break;
4051  case AArch64::dsub:
4052  FillRC = &AArch64::FPR64RegClass;
4053  break;
4054  }
4055 
4056  if (FillRC) {
4057  assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
4058  TRI.getRegSizeInBits(*FillRC) &&
4059  "Mismatched regclass size on folded subreg COPY");
4060  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
4061  MachineInstr &LoadMI = *--InsertPt;
4062  MachineOperand &LoadDst = LoadMI.getOperand(0);
4063  assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
4064  LoadDst.setSubReg(DstMO.getSubReg());
4065  LoadDst.setIsUndef();
4066  return &LoadMI;
4067  }
4068  }
4069  }
4070 
4071  // Cannot fold.
4072  return nullptr;
4073 }
4074 
 4075 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
 4076  StackOffset &SOffset,
4077  bool *OutUseUnscaledOp,
4078  unsigned *OutUnscaledOp,
4079  int64_t *EmittableOffset) {
4080  // Set output values in case of early exit.
4081  if (EmittableOffset)
4082  *EmittableOffset = 0;
4083  if (OutUseUnscaledOp)
4084  *OutUseUnscaledOp = false;
4085  if (OutUnscaledOp)
4086  *OutUnscaledOp = 0;
4087 
4088  // Exit early for structured vector spills/fills as they can't take an
4089  // immediate offset.
4090  switch (MI.getOpcode()) {
4091  default:
4092  break;
4093  case AArch64::LD1Twov2d:
4094  case AArch64::LD1Threev2d:
4095  case AArch64::LD1Fourv2d:
4096  case AArch64::LD1Twov1d:
4097  case AArch64::LD1Threev1d:
4098  case AArch64::LD1Fourv1d:
4099  case AArch64::ST1Twov2d:
4100  case AArch64::ST1Threev2d:
4101  case AArch64::ST1Fourv2d:
4102  case AArch64::ST1Twov1d:
4103  case AArch64::ST1Threev1d:
4104  case AArch64::ST1Fourv1d:
4105  case AArch64::IRG:
4106  case AArch64::IRGstack:
4107  case AArch64::STGloop:
 4108  case AArch64::STZGloop:
 4109  return AArch64FrameOffsetCannotUpdate;
 4110  }
4111 
4112  // Get the min/max offset and the scale.
4113  TypeSize ScaleValue(0U, false);
4114  unsigned Width;
4115  int64_t MinOff, MaxOff;
4116  if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
4117  MaxOff))
4118  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
4119 
4120  // Construct the complete offset.
4121  bool IsMulVL = ScaleValue.isScalable();
4122  unsigned Scale = ScaleValue.getKnownMinSize();
4123  int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
4124 
4125  const MachineOperand &ImmOpnd =
4126  MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
4127  Offset += ImmOpnd.getImm() * Scale;
4128 
4129  // If the offset doesn't match the scale, we rewrite the instruction to
4130  // use the unscaled instruction instead. Likewise, if we have a negative
4131  // offset and there is an unscaled op to use.
 4132  Optional<unsigned> UnscaledOp =
 4133  AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
4134  bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
4135  if (useUnscaledOp &&
4136  !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
4137  MaxOff))
4138  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
4139 
4140  Scale = ScaleValue.getKnownMinSize();
4141  assert(IsMulVL == ScaleValue.isScalable() &&
4142  "Unscaled opcode has different value for scalable");
4143 
4144  int64_t Remainder = Offset % Scale;
4145  assert(!(Remainder && useUnscaledOp) &&
4146  "Cannot have remainder when using unscaled op");
4147 
4148  assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
4149  int64_t NewOffset = Offset / Scale;
4150  if (MinOff <= NewOffset && NewOffset <= MaxOff)
4151  Offset = Remainder;
4152  else {
4153  NewOffset = NewOffset < 0 ? MinOff : MaxOff;
4154  Offset = Offset - NewOffset * Scale + Remainder;
4155  }
4156 
4157  if (EmittableOffset)
4158  *EmittableOffset = NewOffset;
4159  if (OutUseUnscaledOp)
4160  *OutUseUnscaledOp = useUnscaledOp;
4161  if (OutUnscaledOp && UnscaledOp)
4162  *OutUnscaledOp = *UnscaledOp;
4163 
4164  if (IsMulVL)
4165  SOffset = StackOffset::get(SOffset.getFixed(), Offset);
4166  else
 4167  SOffset = StackOffset::get(Offset, SOffset.getScalable());
 4168  return AArch64FrameOffsetCanUpdate |
 4169  (SOffset ? 0 : AArch64FrameOffsetIsLegal);
4170 }
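 // Worked example: for an LDRXui (scale 8) whose current immediate is 0 and a
 // requested fixed offset of 20, the offset is not a multiple of 8, so the
 // unscaled LDURXi form (scale 1, range [-256, 255]) is preferred; 20 becomes
 // the emittable immediate, no residual offset remains, and the result is
 // AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.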
4171 
4172 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
4173  unsigned FrameReg, StackOffset &Offset,
4174  const AArch64InstrInfo *TII) {
4175  unsigned Opcode = MI.getOpcode();
4176  unsigned ImmIdx = FrameRegIdx + 1;
4177 
4178  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
4179  Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
4180  emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
4181  MI.getOperand(0).getReg(), FrameReg, Offset, TII,
4182  MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
4183  MI.eraseFromParent();
4184  Offset = StackOffset();
4185  return true;
4186  }
4187 
4188  int64_t NewOffset;
4189  unsigned UnscaledOp;
4190  bool UseUnscaledOp;
4191  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
 4192  &UnscaledOp, &NewOffset);
 4193  if (Status & AArch64FrameOffsetCanUpdate) {
 4195  // Replace the FrameIndex with FrameReg.
4196  MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
4197  if (UseUnscaledOp)
4198  MI.setDesc(TII->get(UnscaledOp));
4199 
4200  MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
4201  return !Offset;
4202  }
4203 
4204  return false;
4205 }
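 // For example, an ADDXri of a frame index folds completely: the accumulated
 // offset is re-emitted with emitFrameOffset based on FrameReg and the
 // original ADD is erased, whereas a load/store merely has its frame-index
 // operand replaced by FrameReg and its immediate legalized (possibly
 // switching to the unscaled opcode).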
4206 
 4207 MCInst AArch64InstrInfo::getNop() const {
 4208  return MCInstBuilder(AArch64::HINT).addImm(0);
4209 }
4210 
4211 // AArch64 supports MachineCombiner.
4212 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
4213 
4214 // True when Opc sets flag
4215 static bool isCombineInstrSettingFlag(unsigned Opc) {
4216  switch (Opc) {
4217  case AArch64::ADDSWrr:
4218  case AArch64::ADDSWri:
4219  case AArch64::ADDSXrr:
4220  case AArch64::ADDSXri:
4221  case AArch64::SUBSWrr:
4222  case AArch64::SUBSXrr:
4223  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4224  case AArch64::SUBSWri:
4225  case AArch64::SUBSXri:
4226  return true;
4227  default:
4228  break;
4229  }
4230  return false;
4231 }
4232 
4233 // 32b Opcodes that can be combined with a MUL
4234 static bool isCombineInstrCandidate32(unsigned Opc) {
4235  switch (Opc) {
4236  case AArch64::ADDWrr:
4237  case AArch64::ADDWri:
4238  case AArch64::SUBWrr:
4239  case AArch64::ADDSWrr:
4240  case AArch64::ADDSWri:
4241  case AArch64::SUBSWrr:
4242  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4243  case AArch64::SUBWri:
4244  case AArch64::SUBSWri:
4245  return true;
4246  default:
4247  break;
4248  }
4249  return false;
4250 }
4251 
4252 // 64b Opcodes that can be combined with a MUL
4253 static bool isCombineInstrCandidate64(unsigned Opc) {
4254  switch (Opc) {
4255  case AArch64::ADDXrr:
4256  case AArch64::ADDXri:
4257  case AArch64::SUBXrr:
4258  case AArch64::ADDSXrr:
4259  case AArch64::ADDSXri:
4260  case AArch64::SUBSXrr:
4261  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4262  case AArch64::SUBXri:
4263  case AArch64::SUBSXri:
4264  case AArch64::ADDv8i8:
4265  case AArch64::ADDv16i8:
4266  case AArch64::ADDv4i16:
4267  case AArch64::ADDv8i16:
4268  case AArch64::ADDv2i32:
4269  case AArch64::ADDv4i32:
4270  case AArch64::SUBv8i8:
4271  case AArch64::SUBv16i8:
4272  case AArch64::SUBv4i16:
4273  case AArch64::SUBv8i16:
4274  case AArch64::SUBv2i32:
4275  case AArch64::SUBv4i32:
4276  return true;
4277  default:
4278  break;
4279  }
4280  return false;
4281 }
4282 
4283 // FP Opcodes that can be combined with a FMUL.
4284 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
4285  switch (Inst.getOpcode()) {
4286  default:
4287  break;
4288  case AArch64::FADDHrr:
4289  case AArch64::FADDSrr:
4290  case AArch64::FADDDrr:
4291  case AArch64::FADDv4f16:
4292  case AArch64::FADDv8f16:
4293  case AArch64::FADDv2f32:
4294  case AArch64::FADDv2f64:
4295  case AArch64::FADDv4f32:
4296  case AArch64::FSUBHrr:
4297  case AArch64::FSUBSrr:
4298  case AArch64::FSUBDrr:
4299  case AArch64::FSUBv4f16:
4300  case AArch64::FSUBv8f16:
4301  case AArch64::FSUBv2f32:
4302  case AArch64::FSUBv2f64:
4303  case AArch64::FSUBv4f32:
4304  TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
4305  // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
4306  // the target options or if FADD/FSUB has the contract fast-math flag.
4307  return Options.UnsafeFPMath ||
 4308  Options.AllowFPOpFusion == FPOpFusion::Fast ||
 4309  Inst.getFlag(MachineInstr::FmContract);
 4310  return true;
4311  }
4312  return false;
4313 }
4314 
4315 // Opcodes that can be combined with a MUL
 4316 static bool isCombineInstrCandidate(unsigned Opc) {
 4317  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
 4318 }
4319 
4320 //
4321 // Utility routine that checks if \param MO is defined by an
4322 // \param CombineOpc instruction in the basic block \param MBB
 4323 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
 4324  unsigned CombineOpc, unsigned ZeroReg = 0,
 4325  bool CheckZeroReg = false) {
 4326  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 4327  MachineInstr *MI = nullptr;
4328 
4329  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
4330  MI = MRI.getUniqueVRegDef(MO.getReg());
4331  // And it needs to be in the trace (otherwise, it won't have a depth).
4332  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
4333  return false;
 4334  // Must only be used by the user we combine with.
4335  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
4336  return false;
4337 
4338  if (CheckZeroReg) {
4339  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
4340  MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
4341  MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
4342  // The third input reg must be zero.
4343  if (MI->getOperand(3).getReg() != ZeroReg)
4344  return false;
4345  }
4346 
4347  return true;
4348 }
4349 
4350 //
4351 // Is \param MO defined by an integer multiply and can be combined?
 4352 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
 4353  unsigned MulOpc, unsigned ZeroReg) {
4354  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
4355 }
4356 
4357 //
4358 // Is \param MO defined by a floating-point multiply and can be combined?
 4359 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
 4360  unsigned MulOpc) {
4361  return canCombine(MBB, MO, MulOpc);
4362 }
4363 
4364 // TODO: There are many more machine instruction opcodes to match:
4365 // 1. Other data types (integer, vectors)
4366 // 2. Other math / logic operations (xor, or)
4367 // 3. Other forms of the same operation (intrinsics and other variants)
 4368 bool AArch64InstrInfo::isAssociativeAndCommutative(
 4369  const MachineInstr &Inst) const {
4370  switch (Inst.getOpcode()) {
4371  case AArch64::FADDDrr:
4372  case AArch64::FADDSrr:
4373  case AArch64::FADDv2f32:
4374  case AArch64::FADDv2f64:
4375  case AArch64::FADDv4f32:
4376  case AArch64::FMULDrr:
4377  case AArch64::FMULSrr:
4378  case AArch64::FMULX32:
4379  case AArch64::FMULX64:
4380  case AArch64::FMULXv2f32:
4381  case AArch64::FMULXv2f64:
4382  case AArch64::FMULXv4f32:
4383  case AArch64::FMULv2f32:
4384  case AArch64::FMULv2f64:
4385  case AArch64::FMULv4f32:
4386  return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
4387  default:
4388  return false;
4389  }
4390 }
4391 
4392 /// Find instructions that can be turned into madd.
 4393 static bool getMaddPatterns(MachineInstr &Root,
 4394  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
 4395  unsigned Opc = Root.getOpcode();
4396  MachineBasicBlock &MBB = *Root.getParent();
4397  bool Found = false;
4398 
4399  if (!isCombineInstrCandidate(Opc))
4400  return false;
4401  if (isCombineInstrSettingFlag(Opc)) {
4402  int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
4403  // When NZCV is live bail out.
4404  if (Cmp_NZCV == -1)
4405  return false;
4406  unsigned NewOpc = convertToNonFlagSettingOpc(Root);
4407  // When opcode can't change bail out.
4408  // CHECKME: do we miss any cases for opcode conversion?
4409  if (NewOpc == Opc)
4410  return false;
4411  Opc = NewOpc;
4412  }
4413 
 4414  auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
 4415  MachineCombinerPattern Pattern) {
4416  if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
4417  Patterns.push_back(Pattern);
4418  Found = true;
4419  }
4420  };
4421 
4422  auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
4423  if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
4424  Patterns.push_back(Pattern);
4425  Found = true;
4426  }
4427  };
4428 
4429  typedef MachineCombinerPattern MCP;
4430 
4431  switch (Opc) {
4432  default:
4433  break;
4434  case AArch64::ADDWrr:
4435  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4436  "ADDWrr does not have register operands");
4437  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
4438  setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
4439  break;
4440  case AArch64::ADDXrr:
4441  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
4442  setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
4443  break;
4444  case AArch64::SUBWrr:
4445  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
4446  setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
4447  break;
4448  case AArch64::SUBXrr:
4449  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
4450  setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
4451  break;
4452  case AArch64::ADDWri:
4453  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
4454  break;
4455  case AArch64::ADDXri:
4456  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
4457  break;
4458  case AArch64::SUBWri:
4459  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
4460  break;
4461  case AArch64::SUBXri:
4462  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
4463  break;
4464  case AArch64::ADDv8i8:
4465  setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
4466  setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
4467  break;
4468  case AArch64::ADDv16i8:
4469  setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
4470  setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
4471  break;
4472  case AArch64::ADDv4i16:
4473  setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
4474  setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
4475  setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
4476  setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
4477  break;
4478  case AArch64::ADDv8i16:
4479  setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
4480  setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
4481  setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
4482  setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
4483  break;
4484  case AArch64::ADDv2i32:
4485  setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
4486  setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
4487  setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
4488  setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
4489  break;
4490  case AArch64::ADDv4i32:
4491  setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
4492  setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
4493  setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
4494  setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
4495  break;
4496  case AArch64::SUBv8i8:
4497  setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
4498  setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
4499  break;
4500  case AArch64::SUBv16i8:
4501  setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
4502  setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
4503  break;
4504  case AArch64::SUBv4i16:
4505  setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
4506  setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
4507  setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
4508  setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
4509  break;
4510  case AArch64::SUBv8i16:
4511  setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
4512  setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
4513  setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
4514  setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
4515  break;
4516  case AArch64::SUBv2i32:
4517  setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
4518  setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
4519  setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
4520  setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
4521  break;
4522  case AArch64::SUBv4i32:
4523  setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
4524  setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
4525  setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
4526  setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
4527  break;
4528  }
4529  return Found;
4530 }
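 // For instance (virtual register numbers invented), given
 //   %2 = MADDWrrr %0, %1, $wzr      ; plain 32-bit multiply
 //   %3 = ADDWrr killed %2, %4
 // the multiply feeds operand 1 of the add, so MULADDW_OP1 is recorded and the
 // combiner may later replace the pair with a single MADDWrrr %0, %1, %4.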
4531 /// Floating-Point Support
4532 
4533 /// Find instructions that can be turned into madd.
 4534 static bool getFMAPatterns(MachineInstr &Root,
 4535  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4536 
4537  if (!isCombineInstrCandidateFP(Root))
4538  return false;
4539 
4540  MachineBasicBlock &MBB = *Root.getParent();
4541  bool Found = false;
4542 
4543  auto Match = [&](int Opcode, int Operand,
4544  MachineCombinerPattern Pattern) -> bool {
4545  if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
4546  Patterns.push_back(Pattern);
4547  return true;
4548  }
4549  return false;
4550  };
4551 
4552  typedef MachineCombinerPattern MCP;
4553 
4554  switch (Root.getOpcode()) {
4555  default:
4556  assert(false && "Unsupported FP instruction in combiner\n");
4557  break;
4558  case AArch64::FADDHrr:
4559  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4560  "FADDHrr does not have register operands");
4561 
4562  Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
4563  Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
4564  break;
4565  case AArch64::FADDSrr:
4566  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
4567  "FADDSrr does not have register operands");
4568 
4569  Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
4570  Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
4571 
4572  Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
4573  Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
4574  break;
4575  case AArch64::FADDDrr:
4576  Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
4577  Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
4578 
4579  Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
4580  Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
4581  break;
4582  case AArch64::FADDv4f16:
4583  Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
4584  Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
4585 
4586  Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
4587  Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
4588  break;
4589  case AArch64::FADDv8f16:
4590  Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
4591  Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
4592 
4593  Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
4594  Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
4595  break;
4596  case AArch64::FADDv2f32:
4597  Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
4598  Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
4599 
4600  Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
4601  Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
4602  break;
4603  case AArch64::FADDv2f64:
4604  Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
4605  Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
4606 
4607  Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
4608  Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
4609  break;
4610  case AArch64::FADDv4f32:
4611  Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
4612  Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
4613 
4614  Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
4615  Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
4616  break;
4617  case AArch64::FSUBHrr:
4618  Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
4619  Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
4620  Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
4621  break;
4622  case AArch64::FSUBSrr:
4623  Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
4624 
4625  Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
4626  Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
4627 
4628  Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
4629  break;
4630  case AArch64::FSUBDrr:
4631  Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
4632 
4633  Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
4634  Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
4635 
4636  Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
4637  break;
4638  case AArch64::FSUBv4f16:
4639  Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
4640  Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
4641 
4642  Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
4643  Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
4644  break;
4645  case AArch64::FSUBv8f16:
4646  Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
4647  Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
4648 
4649  Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
4650  Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
4651  break;
4652  case AArch64::FSUBv2f32:
4653  Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
4654  Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
4655 
4656  Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
4657  Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
4658  break;
4659  case AArch64::FSUBv2f64:
4660  Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
4661  Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
4662 
4663  Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
4664  Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
4665  break;
4666  case AArch64::FSUBv4f32:
4667  Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
4668  Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
4669 
4670  Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
4671  Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
4672  break;
4673  }
4674  return Found;
4675 }
4676 
4677 /// Return true when a code sequence can improve throughput. It
4678 /// should be called only for instructions in loops.
4679 /// \param Pattern - combiner pattern
 4680 bool AArch64InstrInfo::isThroughputPattern(
 4681  MachineCombinerPattern Pattern) const {
 4682  switch (Pattern) {
4683  default:
4684  break;
4780  return true;
4781  } // end switch (Pattern)
4782  return false;
4783 }
4784 /// Return true when there is potentially a faster code sequence for an
4785 /// instruction chain ending in \p Root. All potential patterns are listed in
4786 /// the \p Pattern vector. Pattern should be sorted in priority order since the
4787 /// pattern evaluator stops checking as soon as it finds a faster sequence.
4788 
 4789 bool AArch64InstrInfo::getMachineCombinerPatterns(
 4790  MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
 4791  bool DoRegPressureReduce) const {
4792  // Integer patterns
4793  if (getMaddPatterns(Root, Patterns))
4794  return true;
4795  // Floating point patterns
4796  if (getFMAPatterns(Root, Patterns))
4797  return true;
4798 
4799  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
4800  DoRegPressureReduce);
4801 }
4802 
4803 enum class FMAInstKind { Default, Indexed, Accumulator };
4804 /// genFusedMultiply - Generate fused multiply instructions.
4805 /// This function supports both integer and floating point instructions.
4806 /// A typical example:
4807 /// F|MUL I=A,B,0
4808 /// F|ADD R,I,C
4809 /// ==> F|MADD R,A,B,C
4810 /// \param MF Containing MachineFunction
4811 /// \param MRI Register information
4812 /// \param TII Target information
4813 /// \param Root is the F|ADD instruction
4814 /// \param [out] InsInstrs is a vector of machine instructions and will
4815 /// contain the generated madd instruction
4816 /// \param IdxMulOpd is index of operand in Root that is the result of
4817 /// the F|MUL. In the example above IdxMulOpd is 1.
4818 /// \param MaddOpc the opcode fo the f|madd instruction
4819 /// \param RC Register class of operands
4820 /// \param kind of fma instruction (addressing mode) to be generated
4821 /// \param ReplacedAddend is the result register from the instruction
4822 /// replacing the non-combined operand, if any.
 4823 static MachineInstr *
 4824 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
 4825  const TargetInstrInfo *TII, MachineInstr &Root,
 4826  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
 4827  unsigned MaddOpc, const TargetRegisterClass *RC,
 4828  FMAInstKind kind = FMAInstKind::Default,
 4829  const Register *ReplacedAddend = nullptr) {
4830  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4831 
4832  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
4833  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4834  Register ResultReg = Root.getOperand(0).getReg();
4835  Register SrcReg0 = MUL->getOperand(1).getReg();
4836  bool Src0IsKill = MUL->getOperand(1).isKill();
4837  Register SrcReg1 = MUL->getOperand(2).getReg();
4838  bool Src1IsKill = MUL->getOperand(2).isKill();
4839 
4840  unsigned SrcReg2;
4841  bool Src2IsKill;
4842  if (ReplacedAddend) {
 4843  // If we just generated a new addend, we must be its only use.
4844  SrcReg2 = *ReplacedAddend;
4845  Src2IsKill = true;
4846  } else {
4847  SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4848  Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4849  }
4850 
4851  if (Register::isVirtualRegister(ResultReg))
4852  MRI.constrainRegClass(ResultReg, RC);
4853  if (Register::isVirtualRegister(SrcReg0))
4854  MRI.constrainRegClass(SrcReg0, RC);
4855  if (Register::isVirtualRegister(SrcReg1))
4856  MRI.constrainRegClass(SrcReg1, RC);
4857  if (Register::isVirtualRegister(SrcReg2))
4858  MRI.constrainRegClass(SrcReg2, RC);
4859 
4860  MachineInstrBuilder MIB;
4861  if (kind == FMAInstKind::Default)
4862  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4863  .addReg(SrcReg0, getKillRegState(Src0IsKill))
4864  .addReg(SrcReg1, getKillRegState(Src1IsKill))
4865  .addReg(SrcReg2, getKillRegState(Src2IsKill));
4866  else if (kind == FMAInstKind::Indexed)
4867  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4868  .addReg(SrcReg2, getKillRegState(Src2IsKill))
4869  .addReg(SrcReg0, getKillRegState(Src0IsKill))
4870  .addReg(SrcReg1, getKillRegState(Src1IsKill))
4871  .addImm(MUL->getOperand(3).getImm());
4872  else if (kind == FMAInstKind::Accumulator)
4873  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4874  .addReg(SrcReg2, getKillRegState(Src2IsKill))
4875  .addReg(SrcReg0, getKillRegState(Src0IsKill))
4876  .addReg(SrcReg1, getKillRegState(Src1IsKill));
4877  else
4878  assert(false && "Invalid FMA instruction kind \n");
4879  // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
4880  InsInstrs.push_back(MIB);
4881  return MUL;
4882 }
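 // As a rough sketch, for the MULADDW_OP1 example above the Default kind emits
 //   %3 = MADDWrrr %0, %1, %4
 // (multiply operands first, addend last). The Indexed and Accumulator kinds
 // put the non-multiply operand first to match the MLA/FMLA operand order, and
 // Indexed also copies the lane immediate from the original multiply.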
4883 
4884 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
4885 /// instructions.
4886 ///
4887 /// \see genFusedMultiply
 4888 static MachineInstr *genFusedMultiplyAcc(
 4889  MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
 4890  MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
 4891  unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
 4892  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
 4893  FMAInstKind::Accumulator);
 4894 }
4895 
4896 /// genNeg - Helper to generate an intermediate negation of the second operand
4897 /// of Root
 4898 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
 4899  const TargetInstrInfo *TII, MachineInstr &Root,
 4900  SmallVectorImpl<MachineInstr *> &InsInstrs,
 4901  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
4902  unsigned MnegOpc, const TargetRegisterClass *RC) {
4903  Register NewVR = MRI.createVirtualRegister(RC);
4904  MachineInstrBuilder MIB =
4905  BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
4906  .add(Root.getOperand(2));
4907  InsInstrs.push_back(MIB);
4908 
4909  assert(InstrIdxForVirtReg.empty());
4910  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4911 
4912  return NewVR;
4913 }
4914 
4915 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
4916 /// instructions with an additional negation of the accumulator
 4917 static MachineInstr *genFusedMultiplyAccNeg(
 4918  MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
 4919  MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
 4920  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4921  unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4922  assert(IdxMulOpd == 1);
4923 
4924  Register NewVR =
4925  genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4926  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4927  FMAInstKind::Accumulator, &NewVR);
4928 }
4929 
4930 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
4931 /// instructions.
4932 ///
4933 /// \see genFusedMultiply
 4934 static MachineInstr *genFusedMultiplyIdx(
 4935  MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
 4936  MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
 4937  unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
 4938  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
 4939  FMAInstKind::Indexed);
 4940 }
4941 
 4942 /// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
 4943 /// instructions with an additional negation of the accumulator
 4944 static MachineInstr *genFusedMultiplyIdxNeg(
 4945  MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
 4946  MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4948  unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4949  assert(IdxMulOpd == 1);
4950 
4951  Register NewVR =
4952  genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4953 
4954  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4955  FMAInstKind::Indexed, &NewVR);
4956 }
4957 
4958 /// genMaddR - Generate madd instruction and combine mul and add using
4959 /// an extra virtual register
4960 /// Example - an ADD intermediate needs to be stored in a register:
4961 /// MUL I=A,B,0
4962 /// ADD R,I,Imm
4963 /// ==> ORR V, ZR, Imm
4964 /// ==> MADD R,A,B,V
4965 /// \param MF Containing MachineFunction
4966 /// \param MRI Register information
4967 /// \param TII Target information
4968 /// \param Root is the ADD instruction
4969 /// \param [out] InsInstrs is a vector of machine instructions and will
4970 /// contain the generated madd instruction
4971 /// \param IdxMulOpd is index of operand in Root that is the result of
4972 /// the MUL. In the example above IdxMulOpd is 1.
4973 /// \param MaddOpc the opcode fo the madd instruction
4974 /// \param VR is a virtual register that holds the value of an ADD operand
4975 /// (V in the example above).
4976 /// \param RC Register class of operands
 4977 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
 4978  const TargetInstrInfo *TII, MachineInstr &Root,
 4979  SmallVectorImpl<MachineInstr *> &InsInstrs,
 4980  unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4981  const TargetRegisterClass *RC) {
4982  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4983 
4984  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4985  Register ResultReg = Root.getOperand(0).getReg();
4986  Register SrcReg0 = MUL->getOperand(1).getReg();
4987  bool Src0IsKill = MUL->getOperand(1).isKill();
4988  Register SrcReg1 = MUL->getOperand(2).getReg();
4989  bool Src1IsKill = MUL->getOperand(2).isKill();
4990 
4991  if (Register::isVirtualRegister(ResultReg))
4992  MRI.constrainRegClass(ResultReg, RC);
4993  if (Register::isVirtualRegister(SrcReg0))
4994  MRI.constrainRegClass(SrcReg0, RC);
4995  if (Register::isVirtualRegister(SrcReg1))
 4996  MRI.constrainRegClass(SrcReg1, RC);
 4997  if (Register::isVirtualRegister(VR))
 4998  MRI.constrainRegClass(VR, RC);
4999 
5000  MachineInstrBuilder MIB =
5001  BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
5002  .addReg(SrcReg0, getKillRegState(Src0IsKill))
5003  .addReg(SrcReg1, getKillRegState(Src1IsKill))
5004  .addReg(VR);
5005  // Insert the MADD
5006  InsInstrs.push_back(MIB);
5007  return MUL;
5008 }
5009 
5010 /// When getMachineCombinerPatterns() finds potential patterns,
5011 /// this function generates the instructions that could replace the
5012 /// original code sequence
 5013 void AArch64InstrInfo::genAlternativeCodeSequence(
 5014  MachineInstr &Root, MachineCombinerPattern Pattern,
 5015  SmallVectorImpl<MachineInstr *> &InsInstrs,
 5016  SmallVectorImpl<MachineInstr *> &DelInstrs,
 5017  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
 5018  MachineBasicBlock &MBB = *Root.getParent();
 5019  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 5020  MachineFunction &MF = *MBB.getParent();
5021  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
5022 
5023  MachineInstr *MUL = nullptr;
5024  const TargetRegisterClass *RC;
5025  unsigned Opc;
5026  switch (Pattern) {
5027  default:
 5028  // Reassociate instructions.
 5029  TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
 5030  DelInstrs, InstrIdxForVirtReg);
 5031  return;
 5032  case MachineCombinerPattern::MULADDW_OP1:
 5033  case MachineCombinerPattern::MULADDX_OP1:
 5034  // MUL I=A,B,0
5035  // ADD R,I,C
5036  // ==> MADD R,A,B,C
 5037  // --- Create(MADD);
 5038  if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
 5039  Opc = AArch64::MADDWrrr;
5040  RC = &AArch64::GPR32RegClass;
5041  } else {
5042  Opc = AArch64::MADDXrrr;
5043  RC = &AArch64::GPR64RegClass;
5044  }
5045  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
 5046  break;
 5047  case MachineCombinerPattern::MULADDW_OP2:
 5048  case MachineCombinerPattern::MULADDX_OP2:
 5049  // MUL I=A,B,0
5050  // ADD R,C,I
5051  // ==> MADD R,A,B,C
 5052  // --- Create(MADD);
 5053  if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
 5054  Opc = AArch64::MADDWrrr;
5055  RC = &AArch64::GPR32RegClass;
5056  } else {
5057  Opc = AArch64::MADDXrrr;
5058  RC = &AArch64::GPR64RegClass;
5059  }
5060  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
 5061  break;
 5062  case MachineCombinerPattern::MULADDWI_OP1:
 5063  case MachineCombinerPattern::MULADDXI_OP1: {
 5064  // MUL I=A,B,0
5065  // ADD R,I,Imm
5066  // ==> ORR V, ZR, Imm
5067  // ==> MADD R,A,B,V
5068  // --- Create(MADD);
5069  const TargetRegisterClass *OrrRC;
 5070  unsigned BitSize, OrrOpc, ZeroReg;
 5071  if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
 5072  OrrOpc = AArch64::ORRWri;
5073  OrrRC = &AArch64::GPR32spRegClass;
5074  BitSize = 32;
5075  ZeroReg = AArch64::WZR;
5076  Opc = AArch64::MADDWrrr;
5077  RC = &AArch64::GPR32RegClass;
5078  } else {
5079  OrrOpc = AArch64::ORRXri;
5080  OrrRC = &AArch64::GPR64spRegClass;
5081  BitSize = 64;
5082  ZeroReg = AArch64::XZR;
5083  Opc = AArch64::MADDXrrr;
5084  RC = &AArch64::GPR64RegClass;
5085  }
5086  Register NewVR = MRI.createVirtualRegister(OrrRC);
5087  uint64_t Imm = Root.getOperand(2).getImm();
5088 
5089  if (Root.getOperand(3).isImm()) {
5090  unsigned Val = Root.getOperand(3).getImm();
5091  Imm = Imm << Val;
5092  }
5093  uint64_t UImm = SignExtend64(Imm, BitSize);
5094  uint64_t Encoding;
5095  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
5096  MachineInstrBuilder MIB1 =
5097  BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
5098  .addReg(ZeroReg)
5099  .addImm(Encoding);
5100  InsInstrs.push_back(MIB1);
5101  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5102  MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
5103  }
5104  break;
 5105  }
 5106  case MachineCombinerPattern::MULSUBW_OP1:
 5107  case MachineCombinerPattern::MULSUBX_OP1: {
 5108  // MUL I=A,B,0
5109  // SUB R,I, C
5110  // ==> SUB V, 0, C
5111  // ==> MADD R,A,B,V // = -C + A*B
5112  // --- Create(MADD);
5113  const TargetRegisterClass *SubRC;
 5114  unsigned SubOpc, ZeroReg;
 5115  if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
 5116  SubOpc = AArch64::SUBWrr;
5117  SubRC = &AArch64::GPR32spRegClass;
5118  ZeroReg = AArch64::WZR;
5119  Opc = AArch64::MADDWrrr;
5120  RC = &AArch64::GPR32RegClass;
5121  } else {
5122  SubOpc = AArch64::SUBXrr;
5123  SubRC = &AArch64::GPR64spRegClass;
5124  ZeroReg = AArch64::XZR;
5125  Opc = AArch64::MADDXrrr;
5126  RC = &AArch64::GPR64RegClass;
5127  }
5128  Register NewVR = MRI.createVirtualRegister(SubRC);
5129  // SUB NewVR, 0, C
5130  MachineInstrBuilder MIB1 =
5131  BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
5132  .addReg(ZeroReg)
5133  .add(Root.getOperand(2));
5134  InsInstrs.push_back(MIB1);
5135  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5136  MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
5137  break;
 5138  }
 5139  case MachineCombinerPattern::MULSUBW_OP2:
 5140  case MachineCombinerPattern::MULSUBX_OP2:
 5141  // MUL I=A,B,0
5142  // SUB R,C,I
5143  // ==> MSUB R,A,B,C (computes C - A*B)
 5144  // --- Create(MSUB);
 5145  if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
 5146  Opc = AArch64::MSUBWrrr;
5147  RC = &AArch64::GPR32RegClass;
5148  } else {
5149  Opc = AArch64::MSUBXrrr;
5150  RC = &AArch64::GPR64RegClass;
5151  }
5152  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
 5153  break;
 5154  case MachineCombinerPattern::MULSUBWI_OP1:
 5155  case MachineCombinerPattern::MULSUBXI_OP1: {
 5156  // MUL I=A,B,0
5157  // SUB R,I, Imm
5158  // ==> ORR V, ZR, -Imm
5159  // ==> MADD R,A,B,V // = -Imm + A*B
5160  // --- Create(MADD);
5161  const TargetRegisterClass *OrrRC;
 5162  unsigned BitSize, OrrOpc, ZeroReg;
 5163  if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
 5164  OrrOpc = AArch64::ORRWri;
5165  OrrRC = &AArch64::GPR32spRegClass;
5166  BitSize = 32;
5167  ZeroReg = AArch64::WZR;
5168  Opc = AArch64::MADDWrrr;
5169  RC = &AArch64::GPR32RegClass;
5170  } else {
5171  OrrOpc = AArch64::ORRXri;
5172  OrrRC = &AArch64::GPR64spRegClass;
5173  BitSize = 64;
5174  ZeroReg = AArch64::XZR;
5175  Opc = AArch64::MADDXrrr;
5176  RC = &AArch64::GPR64RegClass;
5177  }
5178  Register NewVR = MRI.createVirtualRegister(OrrRC);
5179  uint64_t Imm = Root.getOperand(2).getImm();
5180  if (Root.getOperand(3).isImm()) {
5181  unsigned Val = Root.getOperand(3).getImm();
5182  Imm = Imm << Val;
5183  }
5184  uint64_t UImm = SignExtend64(-Imm, BitSize);
5185  uint64_t Encoding;
5186  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
5187  MachineInstrBuilder MIB1 =
5188  BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
5189  .addReg(ZeroReg)
5190  .addImm(Encoding);
5191  InsInstrs.push_back(MIB1);
5192  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5193  MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
5194  }
5195  break;
5196  }
5197 
 5198  case MachineCombinerPattern::MULADDv8i8_OP1:
 5199  Opc = AArch64::MLAv8i8;
5200  RC = &AArch64::FPR64RegClass;
5201  MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
5202  break;
5203  case