1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallSet.h"
43#include "llvm/IR/DebugLoc.h"
44#include "llvm/IR/GlobalValue.h"
45#include "llvm/IR/Module.h"
46#include "llvm/MC/MCAsmInfo.h"
47#include "llvm/MC/MCInst.h"
49#include "llvm/MC/MCInstrDesc.h"
54#include "llvm/Support/LEB128.h"
58#include <cassert>
59#include <cstdint>
60#include <iterator>
61#include <utility>
62
63using namespace llvm;
64
65#define GET_INSTRINFO_CTOR_DTOR
66#include "AArch64GenInstrInfo.inc"
67
68static cl::opt<unsigned>
69 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
70 cl::desc("Restrict range of CB instructions (DEBUG)"));
71
72static cl::opt<unsigned> TBZDisplacementBits(
73 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
74 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
75
76static cl::opt<unsigned> CBZDisplacementBits(
77 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
78 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
79
80static cl::opt<unsigned>
81 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
82 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
83
84static cl::opt<unsigned>
85 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
86 cl::desc("Restrict range of B instructions (DEBUG)"));
87
88static cl::opt<unsigned> GatherOptSearchLimit(
89 "aarch64-search-limit", cl::Hidden, cl::init(2048),
90 cl::desc("Restrict range of instructions to search for the "
91 "machine-combiner gather pattern optimization"));
92
94 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
95 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
96 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
97
98/// GetInstSize - Return the number of bytes of code the specified
99/// instruction may occupy. This returns the maximum number of bytes.
101 const MachineBasicBlock &MBB = *MI.getParent();
102 const MachineFunction *MF = MBB.getParent();
103 const Function &F = MF->getFunction();
104 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
105
106 {
107 auto Op = MI.getOpcode();
108 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
109 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
110 }
111
112 // Meta-instructions emit no code.
113 if (MI.isMetaInstruction())
114 return 0;
115
116 // FIXME: We currently only handle pseudoinstructions that don't get expanded
117 // before the assembly printer.
118 unsigned NumBytes = 0;
119 const MCInstrDesc &Desc = MI.getDesc();
120
121 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
122 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
123
124 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
125 if (!MFI->shouldSignReturnAddress(*MF))
126 return NumBytes;
127
128 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
129 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
130 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
131 return NumBytes;
132 }
133
134 // The size should preferably be set in
135 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (the default case).
136 // The specific cases below handle instructions of variable size.
137 switch (Desc.getOpcode()) {
138 default:
139 if (Desc.getSize())
140 return Desc.getSize();
141
142 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
143 // with fixed constant size but not specified in .td file) is a normal
144 // 4-byte insn.
145 NumBytes = 4;
146 break;
147 case TargetOpcode::STACKMAP:
148 // The upper bound for a stackmap intrinsic is the full length of its shadow
149 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
150 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
151 break;
152 case TargetOpcode::PATCHPOINT:
153 // The size of the patchpoint intrinsic is the number of bytes requested
154 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
155 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
156 break;
157 case TargetOpcode::STATEPOINT:
158 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
159 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
160 // No patch bytes means a normal call inst is emitted
161 if (NumBytes == 0)
162 NumBytes = 4;
163 break;
164 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
165 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
166 // instructions are expanded to the specified number of NOPs. Otherwise,
167 // they are expanded to 36-byte XRay sleds.
168 NumBytes =
169 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
170 break;
171 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
172 case TargetOpcode::PATCHABLE_TAIL_CALL:
173 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
174 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
175 NumBytes = 36;
176 break;
177 case TargetOpcode::PATCHABLE_EVENT_CALL:
178 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
179 NumBytes = 24;
180 break;
181
182 case AArch64::SPACE:
183 NumBytes = MI.getOperand(1).getImm();
184 break;
185 case TargetOpcode::BUNDLE:
186 NumBytes = getInstBundleLength(MI);
187 break;
188 }
189
190 return NumBytes;
191}
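// A rough sketch of the sizes computed above (values follow directly from the
// cases in the switch; operands are illustrative):
//   ADDXri %x0, %x1, 1, 0   -> 4   (fixed-size instruction, from the .td file)
//   DBG_VALUE ...           -> 0   (meta-instruction, emits no code)
//   STACKMAP 0, 16, ...     -> 16  (the requested shadow, a multiple of 4)
//   SPACE 128, ...          -> 128 (operand 1 holds the byte count)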
192
193unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
194 unsigned Size = 0;
195 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
196 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
197 while (++I != E && I->isInsideBundle()) {
198 assert(!I->isBundle() && "No nested bundle!");
199 Size += getInstSizeInBytes(*I);
200 }
201 return Size;
202}
203
206 // Block ends with fall-through condbranch.
207 switch (LastInst->getOpcode()) {
208 default:
209 llvm_unreachable("Unknown branch instruction?");
210 case AArch64::Bcc:
211 Target = LastInst->getOperand(1).getMBB();
212 Cond.push_back(LastInst->getOperand(0));
213 break;
214 case AArch64::CBZW:
215 case AArch64::CBZX:
216 case AArch64::CBNZW:
217 case AArch64::CBNZX:
218 Target = LastInst->getOperand(1).getMBB();
219 Cond.push_back(MachineOperand::CreateImm(-1));
220 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
221 Cond.push_back(LastInst->getOperand(0));
222 break;
223 case AArch64::TBZW:
224 case AArch64::TBZX:
225 case AArch64::TBNZW:
226 case AArch64::TBNZX:
227 Target = LastInst->getOperand(2).getMBB();
228 Cond.push_back(MachineOperand::CreateImm(-1));
229 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
230 Cond.push_back(LastInst->getOperand(0));
231 Cond.push_back(LastInst->getOperand(1));
232 break;
233 case AArch64::CBWPri:
234 case AArch64::CBXPri:
235 case AArch64::CBWPrr:
236 case AArch64::CBXPrr:
237 Target = LastInst->getOperand(3).getMBB();
238 Cond.push_back(MachineOperand::CreateImm(-1));
239 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
240 Cond.push_back(LastInst->getOperand(0));
241 Cond.push_back(LastInst->getOperand(1));
242 Cond.push_back(LastInst->getOperand(2));
243 break;
244 case AArch64::CBBAssertExt:
245 case AArch64::CBHAssertExt:
246 Target = LastInst->getOperand(3).getMBB();
247 Cond.push_back(MachineOperand::CreateImm(-1)); // -1
248 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
249 Cond.push_back(LastInst->getOperand(0)); // Cond
250 Cond.push_back(LastInst->getOperand(1)); // Op0
251 Cond.push_back(LastInst->getOperand(2)); // Op1
252 Cond.push_back(LastInst->getOperand(4)); // Ext0
253 Cond.push_back(LastInst->getOperand(5)); // Ext1
254 break;
255 }
256}
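// A sketch of the Cond encodings produced above (operands are illustrative);
// reverseBranchCondition, instantiateCondBranch and insertSelect below all
// rely on this layout:
//   Bcc NE, %bb.1             -> Cond = { NE }
//   CBNZW %w0, %bb.1          -> Cond = { -1, CBNZW, %w0 }
//   TBZX %x0, 63, %bb.1       -> Cond = { -1, TBZX, %x0, 63 }
//   CBWPri CC, %w0, 7, %bb.1  -> Cond = { -1, CBWPri, CC, %w0, 7 }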
257
258static unsigned getBranchDisplacementBits(unsigned Opc) {
259 switch (Opc) {
260 default:
261 llvm_unreachable("unexpected opcode!");
262 case AArch64::B:
263 return BDisplacementBits;
264 case AArch64::TBNZW:
265 case AArch64::TBZW:
266 case AArch64::TBNZX:
267 case AArch64::TBZX:
268 return TBZDisplacementBits;
269 case AArch64::CBNZW:
270 case AArch64::CBZW:
271 case AArch64::CBNZX:
272 case AArch64::CBZX:
273 return CBZDisplacementBits;
274 case AArch64::Bcc:
275 return BCCDisplacementBits;
276 case AArch64::CBWPri:
277 case AArch64::CBXPri:
278 case AArch64::CBBAssertExt:
279 case AArch64::CBHAssertExt:
280 case AArch64::CBWPrr:
281 case AArch64::CBXPrr:
282 return CBDisplacementBits;
283 }
284}
285
287 int64_t BrOffset) const {
288 unsigned Bits = getBranchDisplacementBits(BranchOp);
289 assert(Bits >= 3 && "max branch displacement must be enough to jump "
290 "over conditional branch expansion");
291 return isIntN(Bits, BrOffset / 4);
292}
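// Worked example with the default widths above: Bcc has 19 displacement bits,
// and the displacement is measured in 4-byte words, so the reachable range is
// roughly +/-2^18 words = +/-1 MiB; TB[N]Z reaches about +/-32 KiB and B about
// +/-128 MiB. isBranchOffsetInRange(AArch64::Bcc, 2 << 20) is therefore false
// and such a branch has to be relaxed.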
293
296 switch (MI.getOpcode()) {
297 default:
298 llvm_unreachable("unexpected opcode!");
299 case AArch64::B:
300 return MI.getOperand(0).getMBB();
301 case AArch64::TBZW:
302 case AArch64::TBNZW:
303 case AArch64::TBZX:
304 case AArch64::TBNZX:
305 return MI.getOperand(2).getMBB();
306 case AArch64::CBZW:
307 case AArch64::CBNZW:
308 case AArch64::CBZX:
309 case AArch64::CBNZX:
310 case AArch64::Bcc:
311 return MI.getOperand(1).getMBB();
312 case AArch64::CBWPri:
313 case AArch64::CBXPri:
314 case AArch64::CBBAssertExt:
315 case AArch64::CBHAssertExt:
316 case AArch64::CBWPrr:
317 case AArch64::CBXPrr:
318 return MI.getOperand(3).getMBB();
319 }
320}
321
323 MachineBasicBlock &NewDestBB,
324 MachineBasicBlock &RestoreBB,
325 const DebugLoc &DL,
326 int64_t BrOffset,
327 RegScavenger *RS) const {
328 assert(RS && "RegScavenger required for long branching");
329 assert(MBB.empty() &&
330 "new block should be inserted for expanding unconditional branch");
331 assert(MBB.pred_size() == 1);
332 assert(RestoreBB.empty() &&
333 "restore block should be inserted for restoring clobbered registers");
334
335 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
336 // Offsets outside of the signed 33-bit range are not supported for ADRP +
337 // ADD.
338 if (!isInt<33>(BrOffset))
339 report_fatal_error(
340 "Branch offsets outside of the signed 33-bit range not supported");
341
342 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
343 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
344 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
345 .addReg(Reg)
346 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
347 .addImm(0);
348 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
349 };
350
351 RS->enterBasicBlockEnd(MBB);
352 // If X16 is unused, we can rely on the linker to insert a range extension
353 // thunk if NewDestBB is out of range of a single B instruction.
354 constexpr Register Reg = AArch64::X16;
355 if (!RS->isRegUsed(Reg)) {
356 insertUnconditionalBranch(MBB, &NewDestBB, DL);
357 RS->setRegUsed(Reg);
358 return;
359 }
360
361 // If there's a free register and it's worth inflating the code size,
362 // manually insert the indirect branch.
363 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
364 if (Scavenged != AArch64::NoRegister &&
365 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
366 buildIndirectBranch(Scavenged, NewDestBB);
367 RS->setRegUsed(Scavenged);
368 return;
369 }
370
371 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
372 // with red zones.
373 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
374 if (!AFI || AFI->hasRedZone().value_or(true))
375 report_fatal_error(
376 "Unable to insert indirect branch inside function that has red zone");
377
378 // Otherwise, spill X16 and defer range extension to the linker.
379 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
380 .addReg(AArch64::SP, RegState::Define)
381 .addReg(Reg)
382 .addReg(AArch64::SP)
383 .addImm(-16);
384
385 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
386
387 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
388 .addReg(AArch64::SP, RegState::Define)
389 .addReg(Reg, RegState::Define)
390 .addReg(AArch64::SP)
391 .addImm(16);
392}
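// A sketch of the three expansions above (labels are illustrative):
//   X16 unused:            b    NewDestBB          ; linker thunk covers range
//   scavenged xN in cold:  adrp xN, NewDestBB
//                          add  xN, xN, :lo12:NewDestBB
//                          br   xN
//   otherwise:             str  x16, [sp, #-16]!   ; free X16 for the linker
//                          b    RestoreBB
//       RestoreBB:         ldr  x16, [sp], #16     ; then fall through to the
//                                                  ; real destination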
393
394// Branch analysis.
397 MachineBasicBlock *&FBB,
399 bool AllowModify) const {
400 // If the block has no terminators, it just falls into the block after it.
401 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
402 if (I == MBB.end())
403 return false;
404
405 // Skip over SpeculationBarrierEndBB terminators
406 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
407 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
408 --I;
409 }
410
411 if (!isUnpredicatedTerminator(*I))
412 return false;
413
414 // Get the last instruction in the block.
415 MachineInstr *LastInst = &*I;
416
417 // If there is only one terminator instruction, process it.
418 unsigned LastOpc = LastInst->getOpcode();
419 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
420 if (isUncondBranchOpcode(LastOpc)) {
421 TBB = LastInst->getOperand(0).getMBB();
422 return false;
423 }
424 if (isCondBranchOpcode(LastOpc)) {
425 // Block ends with fall-through condbranch.
426 parseCondBranch(LastInst, TBB, Cond);
427 return false;
428 }
429 return true; // Can't handle indirect branch.
430 }
431
432 // Get the instruction before it if it is a terminator.
433 MachineInstr *SecondLastInst = &*I;
434 unsigned SecondLastOpc = SecondLastInst->getOpcode();
435
436 // If AllowModify is true and the block ends with two or more unconditional
437 // branches, delete all but the first unconditional branch.
438 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
439 while (isUncondBranchOpcode(SecondLastOpc)) {
440 LastInst->eraseFromParent();
441 LastInst = SecondLastInst;
442 LastOpc = LastInst->getOpcode();
443 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
444 // Return now; the only remaining terminator is an unconditional branch.
445 TBB = LastInst->getOperand(0).getMBB();
446 return false;
447 }
448 SecondLastInst = &*I;
449 SecondLastOpc = SecondLastInst->getOpcode();
450 }
451 }
452
453 // If we're allowed to modify and the block ends in an unconditional branch
454 // which could simply fallthrough, remove the branch. (Note: This case only
455 // matters when we can't understand the whole sequence, otherwise it's also
456 // handled by BranchFolding.cpp.)
457 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
458 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
459 LastInst->eraseFromParent();
460 LastInst = SecondLastInst;
461 LastOpc = LastInst->getOpcode();
462 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
463 assert(!isUncondBranchOpcode(LastOpc) &&
464 "unreachable unconditional branches removed above");
465
466 if (isCondBranchOpcode(LastOpc)) {
467 // Block ends with fall-through condbranch.
468 parseCondBranch(LastInst, TBB, Cond);
469 return false;
470 }
471 return true; // Can't handle indirect branch.
472 }
473 SecondLastInst = &*I;
474 SecondLastOpc = SecondLastInst->getOpcode();
475 }
476
477 // If there are three terminators, we don't know what sort of block this is.
478 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
479 return true;
480
481 // If the block ends with a B and a Bcc, handle it.
482 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
483 parseCondBranch(SecondLastInst, TBB, Cond);
484 FBB = LastInst->getOperand(0).getMBB();
485 return false;
486 }
487
488 // If the block ends with two unconditional branches, handle it. The second
489 // one is not executed, so remove it.
490 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
491 TBB = SecondLastInst->getOperand(0).getMBB();
492 I = LastInst;
493 if (AllowModify)
494 I->eraseFromParent();
495 return false;
496 }
497
498 // ...likewise if it ends with an indirect branch followed by an unconditional
499 // branch.
500 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
501 I = LastInst;
502 if (AllowModify)
503 I->eraseFromParent();
504 return true;
505 }
506
507 // Otherwise, can't handle this.
508 return true;
509}
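// Illustrative analyzeBranch results for common terminator shapes (register
// and block names are made up):
//   b %bb.7                 -> TBB = %bb.7, Cond = {}
//   cbz w0, %bb.3           -> TBB = %bb.3, Cond = { -1, CBZW, w0 }
//   b.eq %bb.3 ; b %bb.4    -> TBB = %bb.3, FBB = %bb.4, Cond = { EQ }
//   br x0                   -> returns true (indirect branch, not analyzable)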
510
512 MachineBranchPredicate &MBP,
513 bool AllowModify) const {
514 // For the moment, handle only a block which ends with a cb(n)zx followed by
515 // a fallthrough. Why this? Because it is a common form.
516 // TODO: Should we handle b.cc?
517
518 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
519 if (I == MBB.end())
520 return true;
521
522 // Skip over SpeculationBarrierEndBB terminators
523 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
524 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
525 --I;
526 }
527
528 if (!isUnpredicatedTerminator(*I))
529 return true;
530
531 // Get the last instruction in the block.
532 MachineInstr *LastInst = &*I;
533 unsigned LastOpc = LastInst->getOpcode();
534 if (!isCondBranchOpcode(LastOpc))
535 return true;
536
537 switch (LastOpc) {
538 default:
539 return true;
540 case AArch64::CBZW:
541 case AArch64::CBZX:
542 case AArch64::CBNZW:
543 case AArch64::CBNZX:
544 break;
545 };
546
547 MBP.TrueDest = LastInst->getOperand(1).getMBB();
548 assert(MBP.TrueDest && "expected!");
549 MBP.FalseDest = MBB.getNextNode();
550
551 MBP.ConditionDef = nullptr;
552 MBP.SingleUseCondition = false;
553
554 MBP.LHS = LastInst->getOperand(0);
555 MBP.RHS = MachineOperand::CreateImm(0);
556 MBP.Predicate = (LastOpc == AArch64::CBNZX || LastOpc == AArch64::CBNZW)
557 ? MachineBranchPredicate::PRED_NE
558 : MachineBranchPredicate::PRED_EQ;
559 return false;
560}
561
564 if (Cond[0].getImm() != -1) {
565 // Regular Bcc
566 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
567 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
568 } else {
569 // Folded compare-and-branch
570 switch (Cond[1].getImm()) {
571 default:
572 llvm_unreachable("Unknown conditional branch!");
573 case AArch64::CBZW:
574 Cond[1].setImm(AArch64::CBNZW);
575 break;
576 case AArch64::CBNZW:
577 Cond[1].setImm(AArch64::CBZW);
578 break;
579 case AArch64::CBZX:
580 Cond[1].setImm(AArch64::CBNZX);
581 break;
582 case AArch64::CBNZX:
583 Cond[1].setImm(AArch64::CBZX);
584 break;
585 case AArch64::TBZW:
586 Cond[1].setImm(AArch64::TBNZW);
587 break;
588 case AArch64::TBNZW:
589 Cond[1].setImm(AArch64::TBZW);
590 break;
591 case AArch64::TBZX:
592 Cond[1].setImm(AArch64::TBNZX);
593 break;
594 case AArch64::TBNZX:
595 Cond[1].setImm(AArch64::TBZX);
596 break;
597
598 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
599 case AArch64::CBWPri:
600 case AArch64::CBXPri:
601 case AArch64::CBBAssertExt:
602 case AArch64::CBHAssertExt:
603 case AArch64::CBWPrr:
604 case AArch64::CBXPrr: {
605 // Pseudos using the standard 4-bit Arm condition codes
606 AArch64CC::CondCode CC =
607 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
608 Cond[2].setImm(AArch64CC::getInvertedCondCode(CC));
609 }
610 }
611 }
612
613 return false;
614}
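// For example, the inversions performed above (sketched on the Cond layout
// from parseCondBranch):
//   { EQ }                      -> { NE }
//   { -1, CBZW, w0 }            -> { -1, CBNZW, w0 }
//   { -1, TBNZX, x0, 63 }       -> { -1, TBZX, x0, 63 }
//   { -1, CBWPri, GE, w0, 7 }   -> { -1, CBWPri, LT, w0, 7 }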
615
617 int *BytesRemoved) const {
618 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
619 if (I == MBB.end())
620 return 0;
621
622 if (!isUncondBranchOpcode(I->getOpcode()) &&
623 !isCondBranchOpcode(I->getOpcode()))
624 return 0;
625
626 // Remove the branch.
627 I->eraseFromParent();
628
629 I = MBB.end();
630
631 if (I == MBB.begin()) {
632 if (BytesRemoved)
633 *BytesRemoved = 4;
634 return 1;
635 }
636 --I;
637 if (!isCondBranchOpcode(I->getOpcode())) {
638 if (BytesRemoved)
639 *BytesRemoved = 4;
640 return 1;
641 }
642
643 // Remove the branch.
644 I->eraseFromParent();
645 if (BytesRemoved)
646 *BytesRemoved = 8;
647
648 return 2;
649}
650
651void AArch64InstrInfo::instantiateCondBranch(
654 if (Cond[0].getImm() != -1) {
655 // Regular Bcc
656 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
657 } else {
658 // Folded compare-and-branch
659 // Note that we use addOperand instead of addReg to keep the flags.
660
661 // cbz, cbnz
662 const MachineInstrBuilder MIB =
663 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
664
665 // tbz/tbnz
666 if (Cond.size() > 3)
667 MIB.add(Cond[3]);
668
669 // cb
670 if (Cond.size() > 4)
671 MIB.add(Cond[4]);
672
673 MIB.addMBB(TBB);
674
675 // cb[b,h]
676 if (Cond.size() > 5) {
677 MIB.addImm(Cond[5].getImm());
678 MIB.addImm(Cond[6].getImm());
679 }
680 }
681}
682
685 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
686 // Shouldn't be a fall through.
687 assert(TBB && "insertBranch must not be told to insert a fallthrough");
688
689 if (!FBB) {
690 if (Cond.empty()) // Unconditional branch?
691 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
692 else
693 instantiateCondBranch(MBB, DL, TBB, Cond);
694
695 if (BytesAdded)
696 *BytesAdded = 4;
697
698 return 1;
699 }
700
701 // Two-way conditional branch.
702 instantiateCondBranch(MBB, DL, TBB, Cond);
703 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
704
705 if (BytesAdded)
706 *BytesAdded = 8;
707
708 return 2;
709}
710
712 const TargetInstrInfo &TII) {
713 for (MachineInstr &MI : MBB->terminators()) {
714 unsigned Opc = MI.getOpcode();
715 switch (Opc) {
716 case AArch64::CBZW:
717 case AArch64::CBZX:
718 case AArch64::TBZW:
719 case AArch64::TBZX:
720 // CBZ/TBZ with WZR/XZR -> unconditional B
721 if (MI.getOperand(0).getReg() == AArch64::WZR ||
722 MI.getOperand(0).getReg() == AArch64::XZR) {
723 DEBUG_WITH_TYPE("optimizeTerminators",
724 dbgs() << "Removing always taken branch: " << MI);
725 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
726 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
727 for (auto *S : Succs)
728 if (S != Target)
729 MBB->removeSuccessor(S);
730 DebugLoc DL = MI.getDebugLoc();
731 while (MBB->rbegin() != &MI)
732 MBB->rbegin()->eraseFromParent();
733 MI.eraseFromParent();
734 BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
735 return true;
736 }
737 break;
738 case AArch64::CBNZW:
739 case AArch64::CBNZX:
740 case AArch64::TBNZW:
741 case AArch64::TBNZX:
742 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
743 if (MI.getOperand(0).getReg() == AArch64::WZR ||
744 MI.getOperand(0).getReg() == AArch64::XZR) {
745 DEBUG_WITH_TYPE("optimizeTerminators",
746 dbgs() << "Removing never taken branch: " << MI);
747 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
748 MI.getParent()->removeSuccessor(Target);
749 MI.eraseFromParent();
750 return true;
751 }
752 break;
753 }
754 }
755 return false;
756}
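// For example (illustrative blocks):
//   cbz  wzr, %bb.2    ; always taken -> replaced by "b %bb.2", and any other
//                      ; successors of the block are removed
//   tbnz xzr, 4, %bb.2 ; never taken  -> erased, %bb.2 dropped as a successor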
757
758// Find the original register that VReg is copied from.
759static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
760 while (Register::isVirtualRegister(VReg)) {
761 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
762 if (!DefMI->isFullCopy())
763 return VReg;
764 VReg = DefMI->getOperand(1).getReg();
765 }
766 return VReg;
767}
768
769// Determine if VReg is defined by an instruction that can be folded into a
770// csel instruction. If so, return the folded opcode, and the replacement
771// register.
772static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
773 unsigned *NewReg = nullptr) {
774 VReg = removeCopies(MRI, VReg);
775 if (!Register::isVirtualRegister(VReg))
776 return 0;
777
778 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
779 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
780 unsigned Opc = 0;
781 unsigned SrcReg = 0;
782 switch (DefMI->getOpcode()) {
783 case AArch64::SUBREG_TO_REG:
784 // Check for the following way to define a 64-bit immediate:
785 // %0:gpr32 = MOVi32imm 1
786 // %1:gpr64 = SUBREG_TO_REG 0, %0:gpr32, %subreg.sub_32
787 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 0)
788 return 0;
789 if (!DefMI->getOperand(2).isReg())
790 return 0;
791 if (!DefMI->getOperand(3).isImm() ||
792 DefMI->getOperand(3).getImm() != AArch64::sub_32)
793 return 0;
794 DefMI = MRI.getVRegDef(DefMI->getOperand(2).getReg());
795 if (DefMI->getOpcode() != AArch64::MOVi32imm)
796 return 0;
797 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
798 return 0;
799 assert(Is64Bit);
800 SrcReg = AArch64::XZR;
801 Opc = AArch64::CSINCXr;
802 break;
803
804 case AArch64::MOVi32imm:
805 case AArch64::MOVi64imm:
806 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
807 return 0;
808 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
809 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
810 break;
811
812 case AArch64::ADDSXri:
813 case AArch64::ADDSWri:
814 // if NZCV is used, do not fold.
815 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
816 true) == -1)
817 return 0;
818 // fall-through to ADDXri and ADDWri.
819 [[fallthrough]];
820 case AArch64::ADDXri:
821 case AArch64::ADDWri:
822 // add x, 1 -> csinc.
823 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
824 DefMI->getOperand(3).getImm() != 0)
825 return 0;
826 SrcReg = DefMI->getOperand(1).getReg();
827 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
828 break;
829
830 case AArch64::ORNXrr:
831 case AArch64::ORNWrr: {
832 // not x -> csinv, represented as orn dst, xzr, src.
833 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
834 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
835 return 0;
836 SrcReg = DefMI->getOperand(2).getReg();
837 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
838 break;
839 }
840
841 case AArch64::SUBSXrr:
842 case AArch64::SUBSWrr:
843 // if NZCV is used, do not fold.
844 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
845 true) == -1)
846 return 0;
847 // fall-through to SUBXrr and SUBWrr.
848 [[fallthrough]];
849 case AArch64::SUBXrr:
850 case AArch64::SUBWrr: {
851 // neg x -> csneg, represented as sub dst, xzr, src.
852 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
853 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
854 return 0;
855 SrcReg = DefMI->getOperand(2).getReg();
856 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
857 break;
858 }
859 default:
860 return 0;
861 }
862 assert(Opc && SrcReg && "Missing parameters");
863
864 if (NewReg)
865 *NewReg = SrcReg;
866 return Opc;
867}
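// Examples of definitions recognised above (virtual register numbers are
// illustrative):
//   %1 = MOVi32imm 1       -> CSINCWr, NewReg = wzr  (selects the constant 1)
//   %1 = ADDXri %2, 1, 0   -> CSINCXr, NewReg = %2   (selects %2 + 1)
//   %1 = ORNWrr wzr, %2    -> CSINVWr, NewReg = %2   (selects ~%2)
//   %1 = SUBWrr wzr, %2    -> CSNEGWr, NewReg = %2   (selects -%2)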
868
871 Register DstReg, Register TrueReg,
872 Register FalseReg, int &CondCycles,
873 int &TrueCycles,
874 int &FalseCycles) const {
875 // Check register classes.
876 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
877 const TargetRegisterClass *RC =
878 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
879 if (!RC)
880 return false;
881
882 // Also need to check the dest regclass, in case we're trying to optimize
883 // something like:
884 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
885 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
886 return false;
887
888 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
889 unsigned ExtraCondLat = Cond.size() != 1;
890
891 // GPRs are handled by csel.
892 // FIXME: Fold in x+1, -x, and ~x when applicable.
893 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
894 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
895 // Single-cycle csel, csinc, csinv, and csneg.
896 CondCycles = 1 + ExtraCondLat;
897 TrueCycles = FalseCycles = 1;
898 if (canFoldIntoCSel(MRI, TrueReg))
899 TrueCycles = 0;
900 else if (canFoldIntoCSel(MRI, FalseReg))
901 FalseCycles = 0;
902 return true;
903 }
904
905 // Scalar floating point is handled by fcsel.
906 // FIXME: Form fabs, fmin, and fmax when applicable.
907 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
908 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
909 CondCycles = 5 + ExtraCondLat;
910 TrueCycles = FalseCycles = 2;
911 return true;
912 }
913
914 // Can't do vectors.
915 return false;
916}
917
920 const DebugLoc &DL, Register DstReg,
922 Register TrueReg, Register FalseReg) const {
923 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
924
925 // Parse the condition code, see parseCondBranch() above.
926 AArch64CC::CondCode CC;
927 switch (Cond.size()) {
928 default:
929 llvm_unreachable("Unknown condition opcode in Cond");
930 case 1: // b.cc
931 CC = AArch64CC::CondCode(Cond[0].getImm());
932 break;
933 case 3: { // cbz/cbnz
934 // We must insert a compare against 0.
935 bool Is64Bit;
936 switch (Cond[1].getImm()) {
937 default:
938 llvm_unreachable("Unknown branch opcode in Cond");
939 case AArch64::CBZW:
940 Is64Bit = false;
941 CC = AArch64CC::EQ;
942 break;
943 case AArch64::CBZX:
944 Is64Bit = true;
945 CC = AArch64CC::EQ;
946 break;
947 case AArch64::CBNZW:
948 Is64Bit = false;
949 CC = AArch64CC::NE;
950 break;
951 case AArch64::CBNZX:
952 Is64Bit = true;
953 CC = AArch64CC::NE;
954 break;
955 }
956 Register SrcReg = Cond[2].getReg();
957 if (Is64Bit) {
958 // cmp reg, #0 is actually subs xzr, reg, #0.
959 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
960 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
961 .addReg(SrcReg)
962 .addImm(0)
963 .addImm(0);
964 } else {
965 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
966 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
967 .addReg(SrcReg)
968 .addImm(0)
969 .addImm(0);
970 }
971 break;
972 }
973 case 4: { // tbz/tbnz
974 // We must insert a tst instruction.
975 switch (Cond[1].getImm()) {
976 default:
977 llvm_unreachable("Unknown branch opcode in Cond");
978 case AArch64::TBZW:
979 case AArch64::TBZX:
980 CC = AArch64CC::EQ;
981 break;
982 case AArch64::TBNZW:
983 case AArch64::TBNZX:
984 CC = AArch64CC::NE;
985 break;
986 }
987 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
988 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
989 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
990 .addReg(Cond[2].getReg())
991 .addImm(
992 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
993 else
994 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
995 .addReg(Cond[2].getReg())
996 .addImm(
997 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
998 break;
999 }
1000 case 5: { // cb
1001 // We must insert a cmp, that is a subs
1002 // 0 1 2 3 4
1003 // Cond is { -1, Opcode, CC, Op0, Op1 }
1004
1005 unsigned SubsOpc, SubsDestReg;
1006 bool IsImm = false;
1007 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1008 switch (Cond[1].getImm()) {
1009 default:
1010 llvm_unreachable("Unknown branch opcode in Cond");
1011 case AArch64::CBWPri:
1012 SubsOpc = AArch64::SUBSWri;
1013 SubsDestReg = AArch64::WZR;
1014 IsImm = true;
1015 break;
1016 case AArch64::CBXPri:
1017 SubsOpc = AArch64::SUBSXri;
1018 SubsDestReg = AArch64::XZR;
1019 IsImm = true;
1020 break;
1021 case AArch64::CBWPrr:
1022 SubsOpc = AArch64::SUBSWrr;
1023 SubsDestReg = AArch64::WZR;
1024 IsImm = false;
1025 break;
1026 case AArch64::CBXPrr:
1027 SubsOpc = AArch64::SUBSXrr;
1028 SubsDestReg = AArch64::XZR;
1029 IsImm = false;
1030 break;
1031 }
1032
1033 if (IsImm)
1034 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1035 .addReg(Cond[3].getReg())
1036 .addImm(Cond[4].getImm())
1037 .addImm(0);
1038 else
1039 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1040 .addReg(Cond[3].getReg())
1041 .addReg(Cond[4].getReg());
1042 } break;
1043 case 7: { // cb[b,h]
1044 // We must insert a cmp, that is a subs, but also zero- or sign-extensions
1045 // that have been folded. For the first operand we codegen an explicit
1046 // extension, for the second operand we fold the extension into cmp.
1047 // 0 1 2 3 4 5 6
1048 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1049
1050 // We need a new register for the now explicitly extended register
1051 Register Reg = Cond[4].getReg();
1053 unsigned ExtOpc;
1054 unsigned ExtBits;
1055 AArch64_AM::ShiftExtendType ExtendType =
1057 switch (ExtendType) {
1058 default:
1059 llvm_unreachable("Unknown shift-extend for CB instruction");
1060 case AArch64_AM::SXTB:
1061 assert(
1062 Cond[1].getImm() == AArch64::CBBAssertExt &&
1063 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1064 ExtOpc = AArch64::SBFMWri;
1065 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1066 break;
1067 case AArch64_AM::SXTH:
1068 assert(
1069 Cond[1].getImm() == AArch64::CBHAssertExt &&
1070 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1071 ExtOpc = AArch64::SBFMWri;
1072 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1073 break;
1074 case AArch64_AM::UXTB:
1075 assert(
1076 Cond[1].getImm() == AArch64::CBBAssertExt &&
1077 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1078 ExtOpc = AArch64::ANDWri;
1079 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1080 break;
1081 case AArch64_AM::UXTH:
1082 assert(
1083 Cond[1].getImm() == AArch64::CBHAssertExt &&
1084 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1085 ExtOpc = AArch64::ANDWri;
1086 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1087 break;
1088 }
1089
1090 // Build the explicit extension of the first operand
1091 Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
1093 BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
1094 if (ExtOpc != AArch64::ANDWri)
1095 MBBI.addImm(0);
1096 MBBI.addImm(ExtBits);
1097 }
1098
1099 // Now, subs with an extended second operand
1101 AArch64_AM::ShiftExtendType ExtendType =
1103 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1104 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1105 BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
1106 .addReg(Cond[3].getReg())
1107 .addReg(Reg)
1108 .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
1109 } // If no extension is needed, just a regular subs
1110 else {
1111 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1112 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1113 BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
1114 .addReg(Cond[3].getReg())
1115 .addReg(Reg);
1116 }
1117
1118 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1119 } break;
1120 }
1121
1122 unsigned Opc = 0;
1123 const TargetRegisterClass *RC = nullptr;
1124 bool TryFold = false;
1125 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
1126 RC = &AArch64::GPR64RegClass;
1127 Opc = AArch64::CSELXr;
1128 TryFold = true;
1129 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
1130 RC = &AArch64::GPR32RegClass;
1131 Opc = AArch64::CSELWr;
1132 TryFold = true;
1133 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
1134 RC = &AArch64::FPR64RegClass;
1135 Opc = AArch64::FCSELDrrr;
1136 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
1137 RC = &AArch64::FPR32RegClass;
1138 Opc = AArch64::FCSELSrrr;
1139 }
1140 assert(RC && "Unsupported regclass");
1141
1142 // Try folding simple instructions into the csel.
1143 if (TryFold) {
1144 unsigned NewReg = 0;
1145 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
1146 if (FoldedOpc) {
1147 // The folded opcodes csinc, csinv and csneg apply the operation to
1148 // FalseReg, so we need to invert the condition.
1149 CC = AArch64CC::getInvertedCondCode(CC);
1150 TrueReg = FalseReg;
1151 } else
1152 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
1153
1154 // Fold the operation. Leave any dead instructions for DCE to clean up.
1155 if (FoldedOpc) {
1156 FalseReg = NewReg;
1157 Opc = FoldedOpc;
1158 // Extend the live range of NewReg.
1159 MRI.clearKillFlags(NewReg);
1160 }
1161 }
1162
1163 // Pull all virtual registers into the appropriate class.
1164 MRI.constrainRegClass(TrueReg, RC);
1165 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1166 assert(
1167 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1168 FalseReg == AArch64::XZR) &&
1169 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1170 if (FalseReg.isVirtual())
1171 MRI.constrainRegClass(FalseReg, RC);
1172
1173 // Insert the csel.
1174 BuildMI(MBB, I, DL, get(Opc), DstReg)
1175 .addReg(TrueReg)
1176 .addReg(FalseReg)
1177 .addImm(CC);
1178}
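// A sketch of a select lowered by the code above for Cond = { -1, CBNZW, %w1 }
// (virtual register names are made up):
//   SUBSWri $wzr, %w1, 0, 0         ; materialise NZCV from the cbnz condition
//   CSELWr  %dst, %true, %false, NE
// If %true were defined by "ADDWri %a, 1, 0", the fold above would instead
// emit "CSINCWr %dst, %false, %a, EQ" and leave the add for DCE.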
1179
1180// Return true if Imm can be loaded into a register by a "cheap" sequence of
1181// instructions. For now, "cheap" means at most two instructions.
1182static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
1183 if (BitSize == 32)
1184 return true;
1185
1186 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1187 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
1188 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
1189 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
1190
1191 return Is.size() <= 2;
1192}
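// Illustrative expansion counts (as produced by AArch64_IMM::expandMOVImm):
//   MOVi64imm 0x12345678         -> 2 insns (MOVZ + MOVK)      -> cheap
//   MOVi64imm 0xffff00000000     -> 1 insn                     -> cheap
//   MOVi64imm 0x1234567890abcdef -> 4 insns (MOVZ + 3 x MOVK)  -> not cheap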
1193
1194// Check if a COPY instruction is cheap.
1195static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1196 assert(MI.isCopy() && "Expected COPY instruction");
1197 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1198
1199 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1200 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1201 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1202 if (Reg.isVirtual())
1203 return MRI.getRegClass(Reg);
1204 if (Reg.isPhysical())
1205 return RI.getMinimalPhysRegClass(Reg);
1206 return nullptr;
1207 };
1208 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1209 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1210 if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1211 return false;
1212
1213 return MI.isAsCheapAsAMove();
1214}
1215
1216// FIXME: this implementation should be micro-architecture dependent, so a
1217// micro-architecture target hook should be introduced here in future.
1219 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1220 if (isExynosCheapAsMove(MI))
1221 return true;
1222 return MI.isAsCheapAsAMove();
1223 }
1224
1225 switch (MI.getOpcode()) {
1226 default:
1227 return MI.isAsCheapAsAMove();
1228
1229 case TargetOpcode::COPY:
1230 return isCheapCopy(MI, RI);
1231
1232 case AArch64::ADDWrs:
1233 case AArch64::ADDXrs:
1234 case AArch64::SUBWrs:
1235 case AArch64::SUBXrs:
1236 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
1237
1238 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1239 // ORRXri, it is as cheap as MOV.
1240 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1241 case AArch64::MOVi32imm:
1242 return isCheapImmediate(MI, 32);
1243 case AArch64::MOVi64imm:
1244 return isCheapImmediate(MI, 64);
1245 }
1246}
1247
1248bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
1249 switch (MI.getOpcode()) {
1250 default:
1251 return false;
1252
1253 case AArch64::ADDWrs:
1254 case AArch64::ADDXrs:
1255 case AArch64::ADDSWrs:
1256 case AArch64::ADDSXrs: {
1257 unsigned Imm = MI.getOperand(3).getImm();
1258 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1259 if (ShiftVal == 0)
1260 return true;
1261 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
1262 }
1263
1264 case AArch64::ADDWrx:
1265 case AArch64::ADDXrx:
1266 case AArch64::ADDXrx64:
1267 case AArch64::ADDSWrx:
1268 case AArch64::ADDSXrx:
1269 case AArch64::ADDSXrx64: {
1270 unsigned Imm = MI.getOperand(3).getImm();
1271 switch (AArch64_AM::getArithExtendType(Imm)) {
1272 default:
1273 return false;
1274 case AArch64_AM::UXTB:
1275 case AArch64_AM::UXTH:
1276 case AArch64_AM::UXTW:
1277 case AArch64_AM::UXTX:
1278 return AArch64_AM::getArithShiftValue(Imm) <= 4;
1279 }
1280 }
1281
1282 case AArch64::SUBWrs:
1283 case AArch64::SUBSWrs: {
1284 unsigned Imm = MI.getOperand(3).getImm();
1285 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1286 return ShiftVal == 0 ||
1287 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
1288 }
1289
1290 case AArch64::SUBXrs:
1291 case AArch64::SUBSXrs: {
1292 unsigned Imm = MI.getOperand(3).getImm();
1293 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1294 return ShiftVal == 0 ||
1295 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
1296 }
1297
1298 case AArch64::SUBWrx:
1299 case AArch64::SUBXrx:
1300 case AArch64::SUBXrx64:
1301 case AArch64::SUBSWrx:
1302 case AArch64::SUBSXrx:
1303 case AArch64::SUBSXrx64: {
1304 unsigned Imm = MI.getOperand(3).getImm();
1305 switch (AArch64_AM::getArithExtendType(Imm)) {
1306 default:
1307 return false;
1308 case AArch64_AM::UXTB:
1309 case AArch64_AM::UXTH:
1310 case AArch64_AM::UXTW:
1311 case AArch64_AM::UXTX:
1312 return AArch64_AM::getArithShiftValue(Imm) == 0;
1313 }
1314 }
1315
1316 case AArch64::LDRBBroW:
1317 case AArch64::LDRBBroX:
1318 case AArch64::LDRBroW:
1319 case AArch64::LDRBroX:
1320 case AArch64::LDRDroW:
1321 case AArch64::LDRDroX:
1322 case AArch64::LDRHHroW:
1323 case AArch64::LDRHHroX:
1324 case AArch64::LDRHroW:
1325 case AArch64::LDRHroX:
1326 case AArch64::LDRQroW:
1327 case AArch64::LDRQroX:
1328 case AArch64::LDRSBWroW:
1329 case AArch64::LDRSBWroX:
1330 case AArch64::LDRSBXroW:
1331 case AArch64::LDRSBXroX:
1332 case AArch64::LDRSHWroW:
1333 case AArch64::LDRSHWroX:
1334 case AArch64::LDRSHXroW:
1335 case AArch64::LDRSHXroX:
1336 case AArch64::LDRSWroW:
1337 case AArch64::LDRSWroX:
1338 case AArch64::LDRSroW:
1339 case AArch64::LDRSroX:
1340 case AArch64::LDRWroW:
1341 case AArch64::LDRWroX:
1342 case AArch64::LDRXroW:
1343 case AArch64::LDRXroX:
1344 case AArch64::PRFMroW:
1345 case AArch64::PRFMroX:
1346 case AArch64::STRBBroW:
1347 case AArch64::STRBBroX:
1348 case AArch64::STRBroW:
1349 case AArch64::STRBroX:
1350 case AArch64::STRDroW:
1351 case AArch64::STRDroX:
1352 case AArch64::STRHHroW:
1353 case AArch64::STRHHroX:
1354 case AArch64::STRHroW:
1355 case AArch64::STRHroX:
1356 case AArch64::STRQroW:
1357 case AArch64::STRQroX:
1358 case AArch64::STRSroW:
1359 case AArch64::STRSroX:
1360 case AArch64::STRWroW:
1361 case AArch64::STRWroX:
1362 case AArch64::STRXroW:
1363 case AArch64::STRXroX: {
1364 unsigned IsSigned = MI.getOperand(3).getImm();
1365 return !IsSigned;
1366 }
1367 }
1368}
1369
1370bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1371 unsigned Opc = MI.getOpcode();
1372 switch (Opc) {
1373 default:
1374 return false;
1375 case AArch64::SEH_StackAlloc:
1376 case AArch64::SEH_SaveFPLR:
1377 case AArch64::SEH_SaveFPLR_X:
1378 case AArch64::SEH_SaveReg:
1379 case AArch64::SEH_SaveReg_X:
1380 case AArch64::SEH_SaveRegP:
1381 case AArch64::SEH_SaveRegP_X:
1382 case AArch64::SEH_SaveFReg:
1383 case AArch64::SEH_SaveFReg_X:
1384 case AArch64::SEH_SaveFRegP:
1385 case AArch64::SEH_SaveFRegP_X:
1386 case AArch64::SEH_SetFP:
1387 case AArch64::SEH_AddFP:
1388 case AArch64::SEH_Nop:
1389 case AArch64::SEH_PrologEnd:
1390 case AArch64::SEH_EpilogStart:
1391 case AArch64::SEH_EpilogEnd:
1392 case AArch64::SEH_PACSignLR:
1393 case AArch64::SEH_SaveAnyRegI:
1394 case AArch64::SEH_SaveAnyRegIP:
1395 case AArch64::SEH_SaveAnyRegQP:
1396 case AArch64::SEH_SaveAnyRegQPX:
1397 case AArch64::SEH_AllocZ:
1398 case AArch64::SEH_SaveZReg:
1399 case AArch64::SEH_SavePReg:
1400 return true;
1401 }
1402}
1403
1405 Register &SrcReg, Register &DstReg,
1406 unsigned &SubIdx) const {
1407 switch (MI.getOpcode()) {
1408 default:
1409 return false;
1410 case AArch64::SBFMXri: // aka sxtw
1411 case AArch64::UBFMXri: // aka uxtw
1412 // Check for the 32 -> 64 bit extension case, these instructions can do
1413 // much more.
1414 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1415 return false;
1416 // This is a signed or unsigned 32 -> 64 bit extension.
1417 SrcReg = MI.getOperand(1).getReg();
1418 DstReg = MI.getOperand(0).getReg();
1419 SubIdx = AArch64::sub_32;
1420 return true;
1421 }
1422}
1423
1425 const MachineInstr &MIa, const MachineInstr &MIb) const {
1427 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1428 int64_t OffsetA = 0, OffsetB = 0;
1429 TypeSize WidthA(0, false), WidthB(0, false);
1430 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1431
1432 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1433 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1434
1437 return false;
1438
1439 // Retrieve the base, offset from the base and width. Width
1440 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1441 // the bases are identical, and the offset of the lower memory access plus
1442 // its width doesn't reach the offset of the higher memory access,
1443 // then the memory accesses are disjoint.
1444 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1445 // are assumed to have the same scale (vscale).
1446 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1447 WidthA, TRI) &&
1448 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1449 WidthB, TRI)) {
1450 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1451 OffsetAIsScalable == OffsetBIsScalable) {
1452 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1453 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1454 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1455 if (LowWidth.isScalable() == OffsetAIsScalable &&
1456 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1457 return true;
1458 }
1459 }
1460 return false;
1461}
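// For example (a sketch with scaled 64-bit stores off the same base register):
//   STRXui %x1, %fp, 2    ; byte offset 16, width 8
//   STRXui %x2, %fp, 3    ; byte offset 24, width 8
// The bases match and 16 + 8 <= 24, so the accesses are reported as disjoint;
// with equal offsets the function would return false.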
1462
1464 const MachineBasicBlock *MBB,
1465 const MachineFunction &MF) const {
1467 return true;
1468
1469 // Do not move an instruction that can be recognized as a branch target.
1470 if (hasBTISemantics(MI))
1471 return true;
1472
1473 switch (MI.getOpcode()) {
1474 case AArch64::HINT:
1475 // CSDB hints are scheduling barriers.
1476 if (MI.getOperand(0).getImm() == 0x14)
1477 return true;
1478 break;
1479 case AArch64::DSB:
1480 case AArch64::ISB:
1481 // DSB and ISB also are scheduling barriers.
1482 return true;
1483 case AArch64::MSRpstatesvcrImm1:
1484 // SMSTART and SMSTOP are also scheduling barriers.
1485 return true;
1486 default:;
1487 }
1488 if (isSEHInstruction(MI))
1489 return true;
1490 auto Next = std::next(MI.getIterator());
1491 return Next != MBB->end() && Next->isCFIInstruction();
1492}
1493
1494/// analyzeCompare - For a comparison instruction, return the source registers
1495/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1496/// Return true if the comparison instruction can be analyzed.
1498 Register &SrcReg2, int64_t &CmpMask,
1499 int64_t &CmpValue) const {
1500 // The first operand can be a frame index where we'd normally expect a
1501 // register.
1502 // FIXME: Pass subregisters out of analyzeCompare
1503 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1504 if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
1505 return false;
1506
1507 switch (MI.getOpcode()) {
1508 default:
1509 break;
1510 case AArch64::PTEST_PP:
1511 case AArch64::PTEST_PP_ANY:
1512 case AArch64::PTEST_PP_FIRST:
1513 SrcReg = MI.getOperand(0).getReg();
1514 SrcReg2 = MI.getOperand(1).getReg();
1515 if (MI.getOperand(2).getSubReg())
1516 return false;
1517
1518 // Not sure about the mask and value for now...
1519 CmpMask = ~0;
1520 CmpValue = 0;
1521 return true;
1522 case AArch64::SUBSWrr:
1523 case AArch64::SUBSWrs:
1524 case AArch64::SUBSWrx:
1525 case AArch64::SUBSXrr:
1526 case AArch64::SUBSXrs:
1527 case AArch64::SUBSXrx:
1528 case AArch64::ADDSWrr:
1529 case AArch64::ADDSWrs:
1530 case AArch64::ADDSWrx:
1531 case AArch64::ADDSXrr:
1532 case AArch64::ADDSXrs:
1533 case AArch64::ADDSXrx:
1534 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1535 SrcReg = MI.getOperand(1).getReg();
1536 SrcReg2 = MI.getOperand(2).getReg();
1537
1538 // FIXME: Pass subregisters out of analyzeCompare
1539 if (MI.getOperand(2).getSubReg())
1540 return false;
1541
1542 CmpMask = ~0;
1543 CmpValue = 0;
1544 return true;
1545 case AArch64::SUBSWri:
1546 case AArch64::ADDSWri:
1547 case AArch64::SUBSXri:
1548 case AArch64::ADDSXri:
1549 SrcReg = MI.getOperand(1).getReg();
1550 SrcReg2 = 0;
1551 CmpMask = ~0;
1552 CmpValue = MI.getOperand(2).getImm();
1553 return true;
1554 case AArch64::ANDSWri:
1555 case AArch64::ANDSXri:
1556 // ANDS does not use the same encoding scheme as the other xxxS
1557 // instructions.
1558 SrcReg = MI.getOperand(1).getReg();
1559 SrcReg2 = 0;
1560 CmpMask = ~0;
1561 CmpValue = AArch64_AM::decodeLogicalImmediate(
1562 MI.getOperand(2).getImm(),
1563 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1564 return true;
1565 }
1566
1567 return false;
1568}
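// Illustrative analyzeCompare results (virtual registers are made up):
//   $wzr = SUBSWri %w1, 42, 0  -> SrcReg = %w1, SrcReg2 = 0, CmpValue = 42
//   $xzr = SUBSXrr %x1, %x2    -> SrcReg = %x1, SrcReg2 = %x2, CmpValue = 0
//   $wzr = ANDSWri %w1, imm    -> CmpValue is the decoded logical immediate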
1569
1571 MachineBasicBlock *MBB = Instr.getParent();
1572 assert(MBB && "Can't get MachineBasicBlock here");
1573 MachineFunction *MF = MBB->getParent();
1574 assert(MF && "Can't get MachineFunction here");
1578
1579 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1580 ++OpIdx) {
1581 MachineOperand &MO = Instr.getOperand(OpIdx);
1582 const TargetRegisterClass *OpRegCstraints =
1583 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1584
1585 // If there's no constraint, there's nothing to do.
1586 if (!OpRegCstraints)
1587 continue;
1588 // If the operand is a frame index, there's nothing to do here.
1589 // A frame index operand will resolve correctly during PEI.
1590 if (MO.isFI())
1591 continue;
1592
1593 assert(MO.isReg() &&
1594 "Operand has register constraints without being a register!");
1595
1596 Register Reg = MO.getReg();
1597 if (Reg.isPhysical()) {
1598 if (!OpRegCstraints->contains(Reg))
1599 return false;
1600 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1601 !MRI->constrainRegClass(Reg, OpRegCstraints))
1602 return false;
1603 }
1604
1605 return true;
1606}
1607
1608/// Return the opcode that does not set flags when possible - otherwise
1609/// return the original opcode. The caller is responsible to do the actual
1610/// substitution and legality checking.
1612 // Don't convert all compare instructions, because for some the zero register
1613 // encoding becomes the sp register.
1614 bool MIDefinesZeroReg = false;
1615 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1616 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1617 MIDefinesZeroReg = true;
1618
1619 switch (MI.getOpcode()) {
1620 default:
1621 return MI.getOpcode();
1622 case AArch64::ADDSWrr:
1623 return AArch64::ADDWrr;
1624 case AArch64::ADDSWri:
1625 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1626 case AArch64::ADDSWrs:
1627 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1628 case AArch64::ADDSWrx:
1629 return AArch64::ADDWrx;
1630 case AArch64::ADDSXrr:
1631 return AArch64::ADDXrr;
1632 case AArch64::ADDSXri:
1633 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1634 case AArch64::ADDSXrs:
1635 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1636 case AArch64::ADDSXrx:
1637 return AArch64::ADDXrx;
1638 case AArch64::SUBSWrr:
1639 return AArch64::SUBWrr;
1640 case AArch64::SUBSWri:
1641 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1642 case AArch64::SUBSWrs:
1643 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1644 case AArch64::SUBSWrx:
1645 return AArch64::SUBWrx;
1646 case AArch64::SUBSXrr:
1647 return AArch64::SUBXrr;
1648 case AArch64::SUBSXri:
1649 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1650 case AArch64::SUBSXrs:
1651 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1652 case AArch64::SUBSXrx:
1653 return AArch64::SUBXrx;
1654 }
1655}
1656
1657enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1658
1659/// True when condition flags are accessed (either by writing or reading)
1660/// on the instruction trace starting at From and ending at To.
1661///
1662/// Note: If From and To are from different blocks it's assumed CC are accessed
1663/// on the path.
1666 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1667 // Early exit if To is at the beginning of the BB.
1668 if (To == To->getParent()->begin())
1669 return true;
1670
1671 // Check whether the instructions are in the same basic block
1672 // If not, assume the condition flags might get modified somewhere.
1673 if (To->getParent() != From->getParent())
1674 return true;
1675
1676 // From must be above To.
1677 assert(std::any_of(
1678 ++To.getReverse(), To->getParent()->rend(),
1679 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1680
1681 // We iterate backward starting at \p To until we hit \p From.
1682 for (const MachineInstr &Instr :
1684 if (((AccessToCheck & AK_Write) &&
1685 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1686 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1687 return true;
1688 }
1689 return false;
1690}
1691
1692std::optional<unsigned>
1693AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1694 MachineInstr *Pred,
1695 const MachineRegisterInfo *MRI) const {
1696 unsigned MaskOpcode = Mask->getOpcode();
1697 unsigned PredOpcode = Pred->getOpcode();
1698 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1699 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1700
1701 if (PredIsWhileLike) {
1702 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1703 // instruction and the condition is "any" since WHILEcc does an implicit
1704 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1705 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1706 return PredOpcode;
1707
1708 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1709 // redundant since WHILE performs an implicit PTEST with an all active
1710 // mask.
1711 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1712 getElementSizeForOpcode(MaskOpcode) ==
1713 getElementSizeForOpcode(PredOpcode))
1714 return PredOpcode;
1715
1716 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1717 // WHILEcc performs an implicit PTEST with an all active mask, setting
1718 // the N flag as the PTEST_FIRST would.
1719 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1720 isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
1721 return PredOpcode;
1722
1723 return {};
1724 }
1725
1726 if (PredIsPTestLike) {
1727 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1728 // instruction that sets the flags as PTEST would and the condition is
1729 // "any" since PG is always a subset of the governing predicate of the
1730 // ptest-like instruction.
1731 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1732 return PredOpcode;
1733
1734 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1735
1736 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1737 // to look through a copy and try again. This is because some instructions
1738 // take a predicate whose register class is a subset of its result class.
1739 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1740 PTestLikeMask->getOperand(1).getReg().isVirtual())
1741 PTestLikeMask =
1742 MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());
1743
1744 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1745 // element size matches and either the PTEST_LIKE instruction uses
1746 // the same all-active mask or the condition is "any".
1747 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1748 getElementSizeForOpcode(MaskOpcode) ==
1749 getElementSizeForOpcode(PredOpcode)) {
1750 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1751 return PredOpcode;
1752 }
1753
1754 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1755 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1756 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1757 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1758 // performed by the compare could consider fewer lanes for these element
1759 // sizes.
1760 //
1761 // For example, consider
1762 //
1763 // ptrue p0.b ; P0=1111-1111-1111-1111
1764 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1765 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1766 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1767 // ; ^ last active
1768 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1769 // ; ^ last active
1770 //
1771 // where the compare generates a canonical all active 32-bit predicate
1772 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1773 // active flag, whereas the PTEST instruction with the same mask doesn't.
1774 // For PTEST_ANY this doesn't apply as the flags in this case would be
1775 // identical regardless of element size.
1776 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1777 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1778 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1779 return PredOpcode;
1780
1781 return {};
1782 }
1783
1784 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1785 // opcode so the PTEST becomes redundant.
1786 switch (PredOpcode) {
1787 case AArch64::AND_PPzPP:
1788 case AArch64::BIC_PPzPP:
1789 case AArch64::EOR_PPzPP:
1790 case AArch64::NAND_PPzPP:
1791 case AArch64::NOR_PPzPP:
1792 case AArch64::ORN_PPzPP:
1793 case AArch64::ORR_PPzPP:
1794 case AArch64::BRKA_PPzP:
1795 case AArch64::BRKPA_PPzPP:
1796 case AArch64::BRKB_PPzP:
1797 case AArch64::BRKPB_PPzPP:
1798 case AArch64::RDFFR_PPz: {
1799 // Check to see if our mask is the same. If not the resulting flag bits
1800 // may be different and we can't remove the ptest.
1801 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1802 if (Mask != PredMask)
1803 return {};
1804 break;
1805 }
1806 case AArch64::BRKN_PPzP: {
1807 // BRKN uses an all active implicit mask to set flags unlike the other
1808 // flag-setting instructions.
1809 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1810 if ((MaskOpcode != AArch64::PTRUE_B) ||
1811 (Mask->getOperand(1).getImm() != 31))
1812 return {};
1813 break;
1814 }
1815 case AArch64::PTRUE_B:
1816 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1817 break;
1818 default:
1819 // Bail out if we don't recognize the input
1820 return {};
1821 }
1822
1823 return convertToFlagSettingOpc(PredOpcode);
1824}
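// Illustrative sketch of the rewrite this enables (register names are made
// up): given
//   %pg  = PTRUE_B 31
//   %and = AND_PPzPP %pg, %p1, %p2
//   PTEST_PP %pg, %and
// the AND can be converted to its flag-setting form ANDS_PPzPP, after which
// the PTEST is redundant because the ANDS already sets NZCV from the same
// governing predicate.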
1825
1826/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1827/// operation which could set the flags in an identical manner
1828bool AArch64InstrInfo::optimizePTestInstr(
1829 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1830 const MachineRegisterInfo *MRI) const {
1831 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1832 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1833
1834 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1835 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1836 // before the branch to extract each subregister.
1837 auto Op = Pred->getOperand(1);
1838 if (Op.isReg() && Op.getReg().isVirtual() &&
1839 Op.getSubReg() == AArch64::psub0)
1840 Pred = MRI->getUniqueVRegDef(Op.getReg());
1841 }
1842
1843 unsigned PredOpcode = Pred->getOpcode();
1844 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1845 if (!NewOp)
1846 return false;
1847
1848 const TargetRegisterInfo *TRI = &getRegisterInfo();
1849
1850 // If another instruction between Pred and PTest accesses flags, don't remove
1851 // the ptest or update the earlier instruction to modify them.
1852 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1853 return false;
1854
1855 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1856 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1857 // operand to be replaced with an equivalent instruction that also sets the
1858 // flags.
1859 PTest->eraseFromParent();
1860 if (*NewOp != PredOpcode) {
1861 Pred->setDesc(get(*NewOp));
1862 bool succeeded = UpdateOperandRegClass(*Pred);
1863 (void)succeeded;
1864 assert(succeeded && "Operands have incompatible register classes!");
1865 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1866 }
1867
1868 // Ensure that the flags def is live.
1869 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1870 unsigned i = 0, e = Pred->getNumOperands();
1871 for (; i != e; ++i) {
1872 MachineOperand &MO = Pred->getOperand(i);
1873 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1874 MO.setIsDead(false);
1875 break;
1876 }
1877 }
1878 }
1879 return true;
1880}
1881
1882/// Try to optimize a compare instruction. A compare instruction is an
1883 /// instruction which produces AArch64::NZCV. It is only a pure compare
1884 /// instruction when there are no uses of its destination
1885 /// register.
1886///
1887/// The following steps are tried in order:
1888/// 1. Convert CmpInstr into an unconditional version.
1889/// 2. Remove CmpInstr if above there is an instruction producing a needed
1890/// condition code or an instruction which can be converted into such an
1891/// instruction.
1892/// Only comparison with zero is supported.
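/// For example (illustrative), step 1 turns
/// \code
///   subs w8, w0, w1   ; NZCV unused, w8 has further uses
/// \endcode
/// into
/// \code
///   sub w8, w0, w1
/// \endcode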
1893 bool AArch64InstrInfo::optimizeCompareInstr(
1894 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1895 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1896 assert(CmpInstr.getParent());
1897 assert(MRI);
1898
1899 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1900 int DeadNZCVIdx =
1901 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1902 if (DeadNZCVIdx != -1) {
1903 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1904 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1905 CmpInstr.eraseFromParent();
1906 return true;
1907 }
1908 unsigned Opc = CmpInstr.getOpcode();
1909 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1910 if (NewOpc == Opc)
1911 return false;
1912 const MCInstrDesc &MCID = get(NewOpc);
1913 CmpInstr.setDesc(MCID);
1914 CmpInstr.removeOperand(DeadNZCVIdx);
1915 bool succeeded = UpdateOperandRegClass(CmpInstr);
1916 (void)succeeded;
1917 assert(succeeded && "Some operands reg class are incompatible!");
1918 return true;
1919 }
1920
1921 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1922 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
1923 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
1924 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1925
1926 if (SrcReg2 != 0)
1927 return false;
1928
1929 // CmpInstr is a Compare instruction if destination register is not used.
1930 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1931 return false;
1932
1933 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1934 return true;
1935 return (CmpValue == 0 || CmpValue == 1) &&
1936 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1937}
1938
1939/// Get opcode of S version of Instr.
1940/// If Instr is S version its opcode is returned.
1941/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1942/// or we are not interested in it.
1943static unsigned sForm(MachineInstr &Instr) {
1944 switch (Instr.getOpcode()) {
1945 default:
1946 return AArch64::INSTRUCTION_LIST_END;
1947
1948 case AArch64::ADDSWrr:
1949 case AArch64::ADDSWri:
1950 case AArch64::ADDSXrr:
1951 case AArch64::ADDSXri:
1952 case AArch64::ADDSWrx:
1953 case AArch64::ADDSXrx:
1954 case AArch64::SUBSWrr:
1955 case AArch64::SUBSWri:
1956 case AArch64::SUBSWrx:
1957 case AArch64::SUBSXrr:
1958 case AArch64::SUBSXri:
1959 case AArch64::SUBSXrx:
1960 case AArch64::ANDSWri:
1961 case AArch64::ANDSWrr:
1962 case AArch64::ANDSWrs:
1963 case AArch64::ANDSXri:
1964 case AArch64::ANDSXrr:
1965 case AArch64::ANDSXrs:
1966 case AArch64::BICSWrr:
1967 case AArch64::BICSXrr:
1968 case AArch64::BICSWrs:
1969 case AArch64::BICSXrs:
1970 return Instr.getOpcode();
1971
1972 case AArch64::ADDWrr:
1973 return AArch64::ADDSWrr;
1974 case AArch64::ADDWri:
1975 return AArch64::ADDSWri;
1976 case AArch64::ADDXrr:
1977 return AArch64::ADDSXrr;
1978 case AArch64::ADDXri:
1979 return AArch64::ADDSXri;
1980 case AArch64::ADDWrx:
1981 return AArch64::ADDSWrx;
1982 case AArch64::ADDXrx:
1983 return AArch64::ADDSXrx;
1984 case AArch64::ADCWr:
1985 return AArch64::ADCSWr;
1986 case AArch64::ADCXr:
1987 return AArch64::ADCSXr;
1988 case AArch64::SUBWrr:
1989 return AArch64::SUBSWrr;
1990 case AArch64::SUBWri:
1991 return AArch64::SUBSWri;
1992 case AArch64::SUBXrr:
1993 return AArch64::SUBSXrr;
1994 case AArch64::SUBXri:
1995 return AArch64::SUBSXri;
1996 case AArch64::SUBWrx:
1997 return AArch64::SUBSWrx;
1998 case AArch64::SUBXrx:
1999 return AArch64::SUBSXrx;
2000 case AArch64::SBCWr:
2001 return AArch64::SBCSWr;
2002 case AArch64::SBCXr:
2003 return AArch64::SBCSXr;
2004 case AArch64::ANDWri:
2005 return AArch64::ANDSWri;
2006 case AArch64::ANDXri:
2007 return AArch64::ANDSXri;
2008 case AArch64::ANDWrr:
2009 return AArch64::ANDSWrr;
2010 case AArch64::ANDWrs:
2011 return AArch64::ANDSWrs;
2012 case AArch64::ANDXrr:
2013 return AArch64::ANDSXrr;
2014 case AArch64::ANDXrs:
2015 return AArch64::ANDSXrs;
2016 case AArch64::BICWrr:
2017 return AArch64::BICSWrr;
2018 case AArch64::BICXrr:
2019 return AArch64::BICSXrr;
2020 case AArch64::BICWrs:
2021 return AArch64::BICSWrs;
2022 case AArch64::BICXrs:
2023 return AArch64::BICSXrs;
2024 }
2025}
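// For example, sForm on an ADDWrr yields ADDSWrr, and sForm on an ADDSWrr
// returns ADDSWrr itself; opcodes with no flag-setting form (e.g. a multiply)
// yield AArch64::INSTRUCTION_LIST_END.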
2026
2027/// Check if AArch64::NZCV should be alive in successors of MBB.
2028 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
2029 for (auto *BB : MBB->successors())
2030 if (BB->isLiveIn(AArch64::NZCV))
2031 return true;
2032 return false;
2033}
2034
2035/// \returns The condition code operand index for \p Instr if it is a branch
2036/// or select and -1 otherwise.
2037static int
2038 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
2039 switch (Instr.getOpcode()) {
2040 default:
2041 return -1;
2042
2043 case AArch64::Bcc: {
2044 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2045 assert(Idx >= 2);
2046 return Idx - 2;
2047 }
2048
2049 case AArch64::CSINVWr:
2050 case AArch64::CSINVXr:
2051 case AArch64::CSINCWr:
2052 case AArch64::CSINCXr:
2053 case AArch64::CSELWr:
2054 case AArch64::CSELXr:
2055 case AArch64::CSNEGWr:
2056 case AArch64::CSNEGXr:
2057 case AArch64::FCSELSrrr:
2058 case AArch64::FCSELDrrr: {
2059 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2060 assert(Idx >= 1);
2061 return Idx - 1;
2062 }
2063 }
2064}
2065
2066/// Find a condition code used by the instruction.
2067/// Returns AArch64CC::Invalid if either the instruction does not use condition
2068/// codes or we don't optimize CmpInstr in the presence of such instructions.
2069 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
2070 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2071 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2072 Instr.getOperand(CCIdx).getImm())
2073 : AArch64CC::Invalid;
2074}
2075
2078 UsedNZCV UsedFlags;
2079 switch (CC) {
2080 default:
2081 break;
2082
2083 case AArch64CC::EQ: // Z set
2084 case AArch64CC::NE: // Z clear
2085 UsedFlags.Z = true;
2086 break;
2087
2088 case AArch64CC::HI: // Z clear and C set
2089 case AArch64CC::LS: // Z set or C clear
2090 UsedFlags.Z = true;
2091 [[fallthrough]];
2092 case AArch64CC::HS: // C set
2093 case AArch64CC::LO: // C clear
2094 UsedFlags.C = true;
2095 break;
2096
2097 case AArch64CC::MI: // N set
2098 case AArch64CC::PL: // N clear
2099 UsedFlags.N = true;
2100 break;
2101
2102 case AArch64CC::VS: // V set
2103 case AArch64CC::VC: // V clear
2104 UsedFlags.V = true;
2105 break;
2106
2107 case AArch64CC::GT: // Z clear, N and V the same
2108 case AArch64CC::LE: // Z set, N and V differ
2109 UsedFlags.Z = true;
2110 [[fallthrough]];
2111 case AArch64CC::GE: // N and V the same
2112 case AArch64CC::LT: // N and V differ
2113 UsedFlags.N = true;
2114 UsedFlags.V = true;
2115 break;
2116 }
2117 return UsedFlags;
2118}
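// For example, getUsedNZCV(AArch64CC::HI) reports Z and C as used, while
// getUsedNZCV(AArch64CC::EQ) reports only Z.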
2119
2120 /// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
2121/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
2122/// \returns std::nullopt otherwise.
2123///
2124 /// Collect instructions using those flags in \p CCUseInstrs if provided.
2125std::optional<UsedNZCV>
2126 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
2127 const TargetRegisterInfo &TRI,
2128 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2129 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2130 if (MI.getParent() != CmpParent)
2131 return std::nullopt;
2132
2133 if (areCFlagsAliveInSuccessors(CmpParent))
2134 return std::nullopt;
2135
2136 UsedNZCV NZCVUsedAfterCmp;
2137 for (MachineInstr &Instr : instructionsWithoutDebug(
2138 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2139 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
2140 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
2141 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2142 return std::nullopt;
2143 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2144 if (CCUseInstrs)
2145 CCUseInstrs->push_back(&Instr);
2146 }
2147 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2148 break;
2149 }
2150 return NZCVUsedAfterCmp;
2151}
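// Illustrative example: if CmpInstr is followed in its block only by
//   b.eq ...
//   csel x0, x1, x2, gt
// before any redefinition of NZCV, this returns {Z, N, V} as used (and, if
// requested, records both instructions in CCUseInstrs).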
2152
2153static bool isADDSRegImm(unsigned Opcode) {
2154 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2155}
2156
2157static bool isSUBSRegImm(unsigned Opcode) {
2158 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2159}
2160
2161 static bool isANDOpcode(MachineInstr &MI) {
2162 unsigned Opc = sForm(MI);
2163 switch (Opc) {
2164 case AArch64::ANDSWri:
2165 case AArch64::ANDSWrr:
2166 case AArch64::ANDSWrs:
2167 case AArch64::ANDSXri:
2168 case AArch64::ANDSXrr:
2169 case AArch64::ANDSXrs:
2170 case AArch64::BICSWrr:
2171 case AArch64::BICSXrr:
2172 case AArch64::BICSWrs:
2173 case AArch64::BICSXrs:
2174 return true;
2175 default:
2176 return false;
2177 }
2178}
2179
2180/// Check if CmpInstr can be substituted by MI.
2181///
2182/// CmpInstr can be substituted:
2183/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2184/// - and, MI and CmpInstr are from the same MachineBB
2185/// - and, condition flags are not alive in successors of the CmpInstr parent
2186/// - and, if MI opcode is the S form there must be no defs of flags between
2187/// MI and CmpInstr
2188/// or if MI opcode is not the S form there must be neither defs of flags
2189/// nor uses of flags between MI and CmpInstr.
2190/// - and, if C/V flags are not used after CmpInstr
2191/// or if N flag is used but MI produces poison value if signed overflow
2192/// occurs.
2193 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
2194 const TargetRegisterInfo &TRI) {
2195 // NOTE this assertion guarantees that MI.getOpcode() is an add/sub or logical
2196 // operation that has a flag-setting (S) form, whether or not it sets flags itself.
2197 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2198
2199 const unsigned CmpOpcode = CmpInstr.getOpcode();
2200 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2201 return false;
2202
2203 assert((CmpInstr.getOperand(2).isImm() &&
2204 CmpInstr.getOperand(2).getImm() == 0) &&
2205 "Caller guarantees that CmpInstr compares with constant 0");
2206
2207 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2208 if (!NZVCUsed || NZVCUsed->C)
2209 return false;
2210
2211 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2212 // '%vreg = add ...' or '%vreg = sub ...'.
2213 // Condition flag V is used to indicate signed overflow.
2214 // 1) MI and CmpInstr set N and V to the same value.
2215 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2216 // signed overflow occurs, so CmpInstr could still be simplified away.
2217 // Note that Ands and Bics instructions always clear the V flag.
2218 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2219 return false;
2220
2221 AccessKind AccessToCheck = AK_Write;
2222 if (sForm(MI) != MI.getOpcode())
2223 AccessToCheck = AK_All;
2224 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2225}
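// Illustrative example (register names are made up): for
//   %w8 = ADDWrr %w0, %w1      ; add carrying the nsw flag
//   SUBSWri %w8, 0             ; only a b.lt follows
// the compare may be removed after converting the add to ADDSWrr, since b.lt
// only needs N and V, and the nsw flag means the result is poison on signed
// overflow anyway.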
2226
2227/// Substitute an instruction comparing to zero with another instruction
2228/// which produces needed condition flags.
2229///
2230/// Return true on success.
2231bool AArch64InstrInfo::substituteCmpToZero(
2232 MachineInstr &CmpInstr, unsigned SrcReg,
2233 const MachineRegisterInfo &MRI) const {
2234 // Get the unique definition of SrcReg.
2235 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2236 if (!MI)
2237 return false;
2238
2239 const TargetRegisterInfo &TRI = getRegisterInfo();
2240
2241 unsigned NewOpc = sForm(*MI);
2242 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2243 return false;
2244
2245 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2246 return false;
2247
2248 // Update the instruction to set NZCV.
2249 MI->setDesc(get(NewOpc));
2250 CmpInstr.eraseFromParent();
2251 bool succeeded = UpdateOperandRegClass(*MI);
2252 (void)succeeded;
2253 assert(succeeded && "Some operands reg class are incompatible!");
2254 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2255 return true;
2256}
2257
2258/// \returns True if \p CmpInstr can be removed.
2259///
2260/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2261/// codes used in \p CCUseInstrs must be inverted.
2262 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
2263 int CmpValue, const TargetRegisterInfo &TRI,
2264 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
2265 bool &IsInvertCC) {
2266 assert((CmpValue == 0 || CmpValue == 1) &&
2267 "Only comparisons to 0 or 1 considered for removal!");
2268
2269 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2270 unsigned MIOpc = MI.getOpcode();
2271 if (MIOpc == AArch64::CSINCWr) {
2272 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2273 MI.getOperand(2).getReg() != AArch64::WZR)
2274 return false;
2275 } else if (MIOpc == AArch64::CSINCXr) {
2276 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2277 MI.getOperand(2).getReg() != AArch64::XZR)
2278 return false;
2279 } else {
2280 return false;
2281 }
2282 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
2283 if (MICC == AArch64CC::Invalid)
2284 return false;
2285
2286 // NZCV needs to be defined
2287 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2288 return false;
2289
2290 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2291 const unsigned CmpOpcode = CmpInstr.getOpcode();
2292 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2293 if (CmpValue && !IsSubsRegImm)
2294 return false;
2295 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2296 return false;
2297
2298 // MI conditions allowed: eq, ne, mi, pl
2299 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2300 if (MIUsedNZCV.C || MIUsedNZCV.V)
2301 return false;
2302
2303 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2304 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2305 // Condition flags are not used in CmpInstr basic block successors and only
2306 // Z or N flags are allowed to be used after CmpInstr within its basic block.
2307 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2308 return false;
2309 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2310 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2311 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2312 return false;
2313 // If CmpInstr is a comparison to zero, MI conditions are limited to eq and ne.
2314 if (MIUsedNZCV.N && !CmpValue)
2315 return false;
2316
2317 // There must be no defs of flags between MI and CmpInstr
2318 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2319 return false;
2320
2321 // Condition code is inverted in the following cases:
2322 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2323 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2324 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2325 (!CmpValue && MICC == AArch64CC::NE);
2326 return true;
2327}
2328
2329/// Remove comparison in csinc-cmp sequence
2330///
2331/// Examples:
2332/// 1. \code
2333/// csinc w9, wzr, wzr, ne
2334/// cmp w9, #0
2335/// b.eq
2336/// \endcode
2337/// to
2338/// \code
2339/// csinc w9, wzr, wzr, ne
2340/// b.ne
2341/// \endcode
2342///
2343/// 2. \code
2344/// csinc x2, xzr, xzr, mi
2345/// cmp x2, #1
2346/// b.pl
2347/// \endcode
2348/// to
2349/// \code
2350/// csinc x2, xzr, xzr, mi
2351/// b.pl
2352/// \endcode
2353///
2354/// \param CmpInstr comparison instruction
2355/// \return True when comparison removed
2356bool AArch64InstrInfo::removeCmpToZeroOrOne(
2357 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2358 const MachineRegisterInfo &MRI) const {
2359 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2360 if (!MI)
2361 return false;
2362 const TargetRegisterInfo &TRI = getRegisterInfo();
2363 SmallVector<MachineInstr *, 4> CCUseInstrs;
2364 bool IsInvertCC = false;
2365 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2366 IsInvertCC))
2367 return false;
2368 // Make transformation
2369 CmpInstr.eraseFromParent();
2370 if (IsInvertCC) {
2371 // Invert condition codes in CmpInstr CC users
2372 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2373 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2374 assert(Idx >= 0 && "Unexpected instruction using CC.");
2375 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2376 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
2377 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2378 CCOperand.setImm(CCUse);
2379 }
2380 }
2381 return true;
2382}
2383
2384bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2385 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2386 MI.getOpcode() != AArch64::CATCHRET)
2387 return false;
2388
2389 MachineBasicBlock &MBB = *MI.getParent();
2390 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2391 auto TRI = Subtarget.getRegisterInfo();
2392 DebugLoc DL = MI.getDebugLoc();
2393
2394 if (MI.getOpcode() == AArch64::CATCHRET) {
2395 // Skip to the first instruction before the epilog.
2396 const TargetInstrInfo *TII =
2397 MBB.getParent()->getSubtarget().getInstrInfo();
2398 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2399 auto MBBI = MachineBasicBlock::iterator(MI);
2400 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2401 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2402 FirstEpilogSEH != MBB.begin())
2403 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2404 if (FirstEpilogSEH != MBB.begin())
2405 FirstEpilogSEH = std::next(FirstEpilogSEH);
2406 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2407 .addReg(AArch64::X0, RegState::Define)
2408 .addMBB(TargetMBB);
2409 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2410 .addReg(AArch64::X0, RegState::Define)
2411 .addReg(AArch64::X0)
2412 .addMBB(TargetMBB)
2413 .addImm(0);
2414 TargetMBB->setMachineBlockAddressTaken();
2415 return true;
2416 }
2417
2418 Register Reg = MI.getOperand(0).getReg();
2419 const Module &M = *MBB.getParent()->getFunction().getParent();
2420 if (M.getStackProtectorGuard() == "sysreg") {
2421 const AArch64SysReg::SysReg *SrcReg =
2422 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2423 if (!SrcReg)
2424 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2425
2426 // mrs xN, sysreg
2427 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2429 .addImm(SrcReg->Encoding);
2430 int Offset = M.getStackProtectorGuardOffset();
2431 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2432 // ldr xN, [xN, #offset]
2433 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2434 .addDef(Reg)
2436 .addImm(Offset / 8);
2437 } else if (Offset >= -256 && Offset <= 255) {
2438 // ldur xN, [xN, #offset]
2439 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2440 .addDef(Reg)
2442 .addImm(Offset);
2443 } else if (Offset >= -4095 && Offset <= 4095) {
2444 if (Offset > 0) {
2445 // add xN, xN, #offset
2446 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2447 .addDef(Reg)
2449 .addImm(Offset)
2450 .addImm(0);
2451 } else {
2452 // sub xN, xN, #offset
2453 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2454 .addDef(Reg)
2456 .addImm(-Offset)
2457 .addImm(0);
2458 }
2459 // ldr xN, [xN]
2460 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2461 .addDef(Reg)
2463 .addImm(0);
2464 } else {
2465 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2466 // than 32760.
2467 // It might be nice to use AArch64::MOVi32imm here, which would get
2468 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2469 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2470 // AArch64FrameLowering might help us find such a scratch register
2471 // though. If we failed to find a scratch register, we could emit a
2472 // stream of add instructions to build up the immediate. Or, we could try
2473 // to insert an AArch64::MOVi32imm before register allocation so that we
2474 // didn't need to scavenge for a scratch register.
2475 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2476 }
2477 MBB.erase(MI);
2478 return true;
2479 }
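// For illustration (assuming a "sysreg" guard of SP_EL0 with offset 16), the
// path above emits roughly:
//   mrs  xN, SP_EL0
//   ldr  xN, [xN, #16]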
2480
2481 const GlobalValue *GV =
2482 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2483 const TargetMachine &TM = MBB.getParent()->getTarget();
2484 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2485 const unsigned char MO_NC = AArch64II::MO_NC;
2486
2487 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2488 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2489 .addGlobalAddress(GV, 0, OpFlags);
2490 if (Subtarget.isTargetILP32()) {
2491 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2492 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2493 .addDef(Reg32, RegState::Dead)
2495 .addImm(0)
2496 .addMemOperand(*MI.memoperands_begin())
2498 } else {
2499 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2501 .addImm(0)
2502 .addMemOperand(*MI.memoperands_begin());
2503 }
2504 } else if (TM.getCodeModel() == CodeModel::Large) {
2505 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2506 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2507 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2508 .addImm(0);
2509 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2511 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2512 .addImm(16);
2513 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2515 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2516 .addImm(32);
2517 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2520 .addImm(48);
2521 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2523 .addImm(0)
2524 .addMemOperand(*MI.memoperands_begin());
2525 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2526 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2527 .addGlobalAddress(GV, 0, OpFlags);
2528 } else {
2529 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2530 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2531 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2532 if (Subtarget.isTargetILP32()) {
2533 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2534 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2535 .addDef(Reg32, RegState::Dead)
2537 .addGlobalAddress(GV, 0, LoFlags)
2538 .addMemOperand(*MI.memoperands_begin())
2540 } else {
2541 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2543 .addGlobalAddress(GV, 0, LoFlags)
2544 .addMemOperand(*MI.memoperands_begin());
2545 }
2546 }
2547
2548 MBB.erase(MI);
2549
2550 return true;
2551}
2552
2553// Return true if this instruction simply sets its single destination register
2554// to zero. This is equivalent to a register rename of the zero-register.
2556 switch (MI.getOpcode()) {
2557 default:
2558 break;
2559 case AArch64::MOVZWi:
2560 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2561 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2562 assert(MI.getDesc().getNumOperands() == 3 &&
2563 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2564 return true;
2565 }
2566 break;
2567 case AArch64::ANDWri: // and Rd, Rzr, #imm
2568 return MI.getOperand(1).getReg() == AArch64::WZR;
2569 case AArch64::ANDXri:
2570 return MI.getOperand(1).getReg() == AArch64::XZR;
2571 case TargetOpcode::COPY:
2572 return MI.getOperand(1).getReg() == AArch64::WZR;
2573 }
2574 return false;
2575}
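// For example, "movz w0, #0" and "and w0, wzr, #0xff" are recognized as
// zeroing moves, whereas "movz w0, #1" is not.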
2576
2577// Return true if this instruction simply renames a general register without
2578// modifying bits.
2580 switch (MI.getOpcode()) {
2581 default:
2582 break;
2583 case TargetOpcode::COPY: {
2584 // GPR32 copies will be lowered to ORRXrs
2585 Register DstReg = MI.getOperand(0).getReg();
2586 return (AArch64::GPR32RegClass.contains(DstReg) ||
2587 AArch64::GPR64RegClass.contains(DstReg));
2588 }
2589 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2590 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2591 assert(MI.getDesc().getNumOperands() == 4 &&
2592 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2593 return true;
2594 }
2595 break;
2596 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2597 if (MI.getOperand(2).getImm() == 0) {
2598 assert(MI.getDesc().getNumOperands() == 4 &&
2599 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2600 return true;
2601 }
2602 break;
2603 }
2604 return false;
2605}
2606
2607 // Return true if this instruction simply renames an FP register without
2608 // modifying bits.
2610 switch (MI.getOpcode()) {
2611 default:
2612 break;
2613 case TargetOpcode::COPY: {
2614 Register DstReg = MI.getOperand(0).getReg();
2615 return AArch64::FPR128RegClass.contains(DstReg);
2616 }
2617 case AArch64::ORRv16i8:
2618 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2619 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2620 "invalid ORRv16i8 operands");
2621 return true;
2622 }
2623 break;
2624 }
2625 return false;
2626}
2627
2628static bool isFrameLoadOpcode(int Opcode) {
2629 switch (Opcode) {
2630 default:
2631 return false;
2632 case AArch64::LDRWui:
2633 case AArch64::LDRXui:
2634 case AArch64::LDRBui:
2635 case AArch64::LDRHui:
2636 case AArch64::LDRSui:
2637 case AArch64::LDRDui:
2638 case AArch64::LDRQui:
2639 case AArch64::LDR_PXI:
2640 return true;
2641 }
2642}
2643
2645 int &FrameIndex) const {
2646 if (!isFrameLoadOpcode(MI.getOpcode()))
2647 return Register();
2648
2649 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2650 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2651 FrameIndex = MI.getOperand(1).getIndex();
2652 return MI.getOperand(0).getReg();
2653 }
2654 return Register();
2655}
2656
2657static bool isFrameStoreOpcode(int Opcode) {
2658 switch (Opcode) {
2659 default:
2660 return false;
2661 case AArch64::STRWui:
2662 case AArch64::STRXui:
2663 case AArch64::STRBui:
2664 case AArch64::STRHui:
2665 case AArch64::STRSui:
2666 case AArch64::STRDui:
2667 case AArch64::STRQui:
2668 case AArch64::STR_PXI:
2669 return true;
2670 }
2671}
2672
2674 int &FrameIndex) const {
2675 if (!isFrameStoreOpcode(MI.getOpcode()))
2676 return Register();
2677
2678 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2679 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2680 FrameIndex = MI.getOperand(1).getIndex();
2681 return MI.getOperand(0).getReg();
2682 }
2683 return Register();
2684}
2685
2687 int &FrameIndex) const {
2688 if (!isFrameStoreOpcode(MI.getOpcode()))
2689 return Register();
2690
2691 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2692 return Reg;
2693
2694 SmallVector<const MachineMemOperand *, 1> Accesses;
2695 if (hasStoreToStackSlot(MI, Accesses)) {
2696 if (Accesses.size() > 1)
2697 return Register();
2698
2699 FrameIndex =
2700 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2701 ->getFrameIndex();
2702 return MI.getOperand(0).getReg();
2703 }
2704 return Register();
2705}
2706
2708 int &FrameIndex) const {
2709 if (!isFrameLoadOpcode(MI.getOpcode()))
2710 return Register();
2711
2712 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2713 return Reg;
2714
2715 SmallVector<const MachineMemOperand *, 1> Accesses;
2716 if (hasLoadFromStackSlot(MI, Accesses)) {
2717 if (Accesses.size() > 1)
2718 return Register();
2719
2720 FrameIndex =
2721 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2722 ->getFrameIndex();
2723 return MI.getOperand(0).getReg();
2724 }
2725 return Register();
2726}
2727
2728/// Check all MachineMemOperands for a hint to suppress pairing.
2729 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2730 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2731 return MMO->getFlags() & MOSuppressPair;
2732 });
2733}
2734
2735/// Set a flag on the first MachineMemOperand to suppress pairing.
2737 if (MI.memoperands_empty())
2738 return;
2739 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2740}
2741
2742/// Check all MachineMemOperands for a hint that the load/store is strided.
2744 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2745 return MMO->getFlags() & MOStridedAccess;
2746 });
2747}
2748
2750 switch (Opc) {
2751 default:
2752 return false;
2753 case AArch64::STURSi:
2754 case AArch64::STRSpre:
2755 case AArch64::STURDi:
2756 case AArch64::STRDpre:
2757 case AArch64::STURQi:
2758 case AArch64::STRQpre:
2759 case AArch64::STURBBi:
2760 case AArch64::STURHHi:
2761 case AArch64::STURWi:
2762 case AArch64::STRWpre:
2763 case AArch64::STURXi:
2764 case AArch64::STRXpre:
2765 case AArch64::LDURSi:
2766 case AArch64::LDRSpre:
2767 case AArch64::LDURDi:
2768 case AArch64::LDRDpre:
2769 case AArch64::LDURQi:
2770 case AArch64::LDRQpre:
2771 case AArch64::LDURWi:
2772 case AArch64::LDRWpre:
2773 case AArch64::LDURXi:
2774 case AArch64::LDRXpre:
2775 case AArch64::LDRSWpre:
2776 case AArch64::LDURSWi:
2777 case AArch64::LDURHHi:
2778 case AArch64::LDURBBi:
2779 case AArch64::LDURSBWi:
2780 case AArch64::LDURSHWi:
2781 return true;
2782 }
2783}
2784
2785std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2786 switch (Opc) {
2787 default: return {};
2788 case AArch64::PRFMui: return AArch64::PRFUMi;
2789 case AArch64::LDRXui: return AArch64::LDURXi;
2790 case AArch64::LDRWui: return AArch64::LDURWi;
2791 case AArch64::LDRBui: return AArch64::LDURBi;
2792 case AArch64::LDRHui: return AArch64::LDURHi;
2793 case AArch64::LDRSui: return AArch64::LDURSi;
2794 case AArch64::LDRDui: return AArch64::LDURDi;
2795 case AArch64::LDRQui: return AArch64::LDURQi;
2796 case AArch64::LDRBBui: return AArch64::LDURBBi;
2797 case AArch64::LDRHHui: return AArch64::LDURHHi;
2798 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2799 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2800 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2801 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2802 case AArch64::LDRSWui: return AArch64::LDURSWi;
2803 case AArch64::STRXui: return AArch64::STURXi;
2804 case AArch64::STRWui: return AArch64::STURWi;
2805 case AArch64::STRBui: return AArch64::STURBi;
2806 case AArch64::STRHui: return AArch64::STURHi;
2807 case AArch64::STRSui: return AArch64::STURSi;
2808 case AArch64::STRDui: return AArch64::STURDi;
2809 case AArch64::STRQui: return AArch64::STURQi;
2810 case AArch64::STRBBui: return AArch64::STURBBi;
2811 case AArch64::STRHHui: return AArch64::STURHHi;
2812 }
2813}
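// For example, getUnscaledLdSt(AArch64::LDRXui) returns AArch64::LDURXi,
// while an opcode with no unscaled variant returns std::nullopt.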
2814
2815 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2816 switch (Opc) {
2817 default:
2818 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2819 case AArch64::ADDG:
2820 case AArch64::LDAPURBi:
2821 case AArch64::LDAPURHi:
2822 case AArch64::LDAPURi:
2823 case AArch64::LDAPURSBWi:
2824 case AArch64::LDAPURSBXi:
2825 case AArch64::LDAPURSHWi:
2826 case AArch64::LDAPURSHXi:
2827 case AArch64::LDAPURSWi:
2828 case AArch64::LDAPURXi:
2829 case AArch64::LDR_PPXI:
2830 case AArch64::LDR_PXI:
2831 case AArch64::LDR_ZXI:
2832 case AArch64::LDR_ZZXI:
2833 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2834 case AArch64::LDR_ZZZXI:
2835 case AArch64::LDR_ZZZZXI:
2836 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2837 case AArch64::LDRBBui:
2838 case AArch64::LDRBui:
2839 case AArch64::LDRDui:
2840 case AArch64::LDRHHui:
2841 case AArch64::LDRHui:
2842 case AArch64::LDRQui:
2843 case AArch64::LDRSBWui:
2844 case AArch64::LDRSBXui:
2845 case AArch64::LDRSHWui:
2846 case AArch64::LDRSHXui:
2847 case AArch64::LDRSui:
2848 case AArch64::LDRSWui:
2849 case AArch64::LDRWui:
2850 case AArch64::LDRXui:
2851 case AArch64::LDURBBi:
2852 case AArch64::LDURBi:
2853 case AArch64::LDURDi:
2854 case AArch64::LDURHHi:
2855 case AArch64::LDURHi:
2856 case AArch64::LDURQi:
2857 case AArch64::LDURSBWi:
2858 case AArch64::LDURSBXi:
2859 case AArch64::LDURSHWi:
2860 case AArch64::LDURSHXi:
2861 case AArch64::LDURSi:
2862 case AArch64::LDURSWi:
2863 case AArch64::LDURWi:
2864 case AArch64::LDURXi:
2865 case AArch64::PRFMui:
2866 case AArch64::PRFUMi:
2867 case AArch64::ST2Gi:
2868 case AArch64::STGi:
2869 case AArch64::STLURBi:
2870 case AArch64::STLURHi:
2871 case AArch64::STLURWi:
2872 case AArch64::STLURXi:
2873 case AArch64::StoreSwiftAsyncContext:
2874 case AArch64::STR_PPXI:
2875 case AArch64::STR_PXI:
2876 case AArch64::STR_ZXI:
2877 case AArch64::STR_ZZXI:
2878 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2879 case AArch64::STR_ZZZXI:
2880 case AArch64::STR_ZZZZXI:
2881 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2882 case AArch64::STRBBui:
2883 case AArch64::STRBui:
2884 case AArch64::STRDui:
2885 case AArch64::STRHHui:
2886 case AArch64::STRHui:
2887 case AArch64::STRQui:
2888 case AArch64::STRSui:
2889 case AArch64::STRWui:
2890 case AArch64::STRXui:
2891 case AArch64::STURBBi:
2892 case AArch64::STURBi:
2893 case AArch64::STURDi:
2894 case AArch64::STURHHi:
2895 case AArch64::STURHi:
2896 case AArch64::STURQi:
2897 case AArch64::STURSi:
2898 case AArch64::STURWi:
2899 case AArch64::STURXi:
2900 case AArch64::STZ2Gi:
2901 case AArch64::STZGi:
2902 case AArch64::TAGPstack:
2903 return 2;
2904 case AArch64::LD1B_D_IMM:
2905 case AArch64::LD1B_H_IMM:
2906 case AArch64::LD1B_IMM:
2907 case AArch64::LD1B_S_IMM:
2908 case AArch64::LD1D_IMM:
2909 case AArch64::LD1H_D_IMM:
2910 case AArch64::LD1H_IMM:
2911 case AArch64::LD1H_S_IMM:
2912 case AArch64::LD1RB_D_IMM:
2913 case AArch64::LD1RB_H_IMM:
2914 case AArch64::LD1RB_IMM:
2915 case AArch64::LD1RB_S_IMM:
2916 case AArch64::LD1RD_IMM:
2917 case AArch64::LD1RH_D_IMM:
2918 case AArch64::LD1RH_IMM:
2919 case AArch64::LD1RH_S_IMM:
2920 case AArch64::LD1RSB_D_IMM:
2921 case AArch64::LD1RSB_H_IMM:
2922 case AArch64::LD1RSB_S_IMM:
2923 case AArch64::LD1RSH_D_IMM:
2924 case AArch64::LD1RSH_S_IMM:
2925 case AArch64::LD1RSW_IMM:
2926 case AArch64::LD1RW_D_IMM:
2927 case AArch64::LD1RW_IMM:
2928 case AArch64::LD1SB_D_IMM:
2929 case AArch64::LD1SB_H_IMM:
2930 case AArch64::LD1SB_S_IMM:
2931 case AArch64::LD1SH_D_IMM:
2932 case AArch64::LD1SH_S_IMM:
2933 case AArch64::LD1SW_D_IMM:
2934 case AArch64::LD1W_D_IMM:
2935 case AArch64::LD1W_IMM:
2936 case AArch64::LD2B_IMM:
2937 case AArch64::LD2D_IMM:
2938 case AArch64::LD2H_IMM:
2939 case AArch64::LD2W_IMM:
2940 case AArch64::LD3B_IMM:
2941 case AArch64::LD3D_IMM:
2942 case AArch64::LD3H_IMM:
2943 case AArch64::LD3W_IMM:
2944 case AArch64::LD4B_IMM:
2945 case AArch64::LD4D_IMM:
2946 case AArch64::LD4H_IMM:
2947 case AArch64::LD4W_IMM:
2948 case AArch64::LDG:
2949 case AArch64::LDNF1B_D_IMM:
2950 case AArch64::LDNF1B_H_IMM:
2951 case AArch64::LDNF1B_IMM:
2952 case AArch64::LDNF1B_S_IMM:
2953 case AArch64::LDNF1D_IMM:
2954 case AArch64::LDNF1H_D_IMM:
2955 case AArch64::LDNF1H_IMM:
2956 case AArch64::LDNF1H_S_IMM:
2957 case AArch64::LDNF1SB_D_IMM:
2958 case AArch64::LDNF1SB_H_IMM:
2959 case AArch64::LDNF1SB_S_IMM:
2960 case AArch64::LDNF1SH_D_IMM:
2961 case AArch64::LDNF1SH_S_IMM:
2962 case AArch64::LDNF1SW_D_IMM:
2963 case AArch64::LDNF1W_D_IMM:
2964 case AArch64::LDNF1W_IMM:
2965 case AArch64::LDNPDi:
2966 case AArch64::LDNPQi:
2967 case AArch64::LDNPSi:
2968 case AArch64::LDNPWi:
2969 case AArch64::LDNPXi:
2970 case AArch64::LDNT1B_ZRI:
2971 case AArch64::LDNT1D_ZRI:
2972 case AArch64::LDNT1H_ZRI:
2973 case AArch64::LDNT1W_ZRI:
2974 case AArch64::LDPDi:
2975 case AArch64::LDPQi:
2976 case AArch64::LDPSi:
2977 case AArch64::LDPWi:
2978 case AArch64::LDPXi:
2979 case AArch64::LDRBBpost:
2980 case AArch64::LDRBBpre:
2981 case AArch64::LDRBpost:
2982 case AArch64::LDRBpre:
2983 case AArch64::LDRDpost:
2984 case AArch64::LDRDpre:
2985 case AArch64::LDRHHpost:
2986 case AArch64::LDRHHpre:
2987 case AArch64::LDRHpost:
2988 case AArch64::LDRHpre:
2989 case AArch64::LDRQpost:
2990 case AArch64::LDRQpre:
2991 case AArch64::LDRSpost:
2992 case AArch64::LDRSpre:
2993 case AArch64::LDRWpost:
2994 case AArch64::LDRWpre:
2995 case AArch64::LDRXpost:
2996 case AArch64::LDRXpre:
2997 case AArch64::ST1B_D_IMM:
2998 case AArch64::ST1B_H_IMM:
2999 case AArch64::ST1B_IMM:
3000 case AArch64::ST1B_S_IMM:
3001 case AArch64::ST1D_IMM:
3002 case AArch64::ST1H_D_IMM:
3003 case AArch64::ST1H_IMM:
3004 case AArch64::ST1H_S_IMM:
3005 case AArch64::ST1W_D_IMM:
3006 case AArch64::ST1W_IMM:
3007 case AArch64::ST2B_IMM:
3008 case AArch64::ST2D_IMM:
3009 case AArch64::ST2H_IMM:
3010 case AArch64::ST2W_IMM:
3011 case AArch64::ST3B_IMM:
3012 case AArch64::ST3D_IMM:
3013 case AArch64::ST3H_IMM:
3014 case AArch64::ST3W_IMM:
3015 case AArch64::ST4B_IMM:
3016 case AArch64::ST4D_IMM:
3017 case AArch64::ST4H_IMM:
3018 case AArch64::ST4W_IMM:
3019 case AArch64::STGPi:
3020 case AArch64::STGPreIndex:
3021 case AArch64::STZGPreIndex:
3022 case AArch64::ST2GPreIndex:
3023 case AArch64::STZ2GPreIndex:
3024 case AArch64::STGPostIndex:
3025 case AArch64::STZGPostIndex:
3026 case AArch64::ST2GPostIndex:
3027 case AArch64::STZ2GPostIndex:
3028 case AArch64::STNPDi:
3029 case AArch64::STNPQi:
3030 case AArch64::STNPSi:
3031 case AArch64::STNPWi:
3032 case AArch64::STNPXi:
3033 case AArch64::STNT1B_ZRI:
3034 case AArch64::STNT1D_ZRI:
3035 case AArch64::STNT1H_ZRI:
3036 case AArch64::STNT1W_ZRI:
3037 case AArch64::STPDi:
3038 case AArch64::STPQi:
3039 case AArch64::STPSi:
3040 case AArch64::STPWi:
3041 case AArch64::STPXi:
3042 case AArch64::STRBBpost:
3043 case AArch64::STRBBpre:
3044 case AArch64::STRBpost:
3045 case AArch64::STRBpre:
3046 case AArch64::STRDpost:
3047 case AArch64::STRDpre:
3048 case AArch64::STRHHpost:
3049 case AArch64::STRHHpre:
3050 case AArch64::STRHpost:
3051 case AArch64::STRHpre:
3052 case AArch64::STRQpost:
3053 case AArch64::STRQpre:
3054 case AArch64::STRSpost:
3055 case AArch64::STRSpre:
3056 case AArch64::STRWpost:
3057 case AArch64::STRWpre:
3058 case AArch64::STRXpost:
3059 case AArch64::STRXpre:
3060 return 3;
3061 case AArch64::LDPDpost:
3062 case AArch64::LDPDpre:
3063 case AArch64::LDPQpost:
3064 case AArch64::LDPQpre:
3065 case AArch64::LDPSpost:
3066 case AArch64::LDPSpre:
3067 case AArch64::LDPWpost:
3068 case AArch64::LDPWpre:
3069 case AArch64::LDPXpost:
3070 case AArch64::LDPXpre:
3071 case AArch64::STGPpre:
3072 case AArch64::STGPpost:
3073 case AArch64::STPDpost:
3074 case AArch64::STPDpre:
3075 case AArch64::STPQpost:
3076 case AArch64::STPQpre:
3077 case AArch64::STPSpost:
3078 case AArch64::STPSpre:
3079 case AArch64::STPWpost:
3080 case AArch64::STPWpre:
3081 case AArch64::STPXpost:
3082 case AArch64::STPXpre:
3083 return 4;
3084 }
3085}
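// For example, for LDRXui (ldr xN, [xM, #imm]) the immediate is operand 2,
// while for LDPXi (ldp xN, xM, [xK, #imm]) it is operand 3.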
3086
3087 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
3088 switch (MI.getOpcode()) {
3089 default:
3090 return false;
3091 // Scaled instructions.
3092 case AArch64::STRSui:
3093 case AArch64::STRDui:
3094 case AArch64::STRQui:
3095 case AArch64::STRXui:
3096 case AArch64::STRWui:
3097 case AArch64::LDRSui:
3098 case AArch64::LDRDui:
3099 case AArch64::LDRQui:
3100 case AArch64::LDRXui:
3101 case AArch64::LDRWui:
3102 case AArch64::LDRSWui:
3103 // Unscaled instructions.
3104 case AArch64::STURSi:
3105 case AArch64::STRSpre:
3106 case AArch64::STURDi:
3107 case AArch64::STRDpre:
3108 case AArch64::STURQi:
3109 case AArch64::STRQpre:
3110 case AArch64::STURWi:
3111 case AArch64::STRWpre:
3112 case AArch64::STURXi:
3113 case AArch64::STRXpre:
3114 case AArch64::LDURSi:
3115 case AArch64::LDRSpre:
3116 case AArch64::LDURDi:
3117 case AArch64::LDRDpre:
3118 case AArch64::LDURQi:
3119 case AArch64::LDRQpre:
3120 case AArch64::LDURWi:
3121 case AArch64::LDRWpre:
3122 case AArch64::LDURXi:
3123 case AArch64::LDRXpre:
3124 case AArch64::LDURSWi:
3125 case AArch64::LDRSWpre:
3126 // SVE instructions.
3127 case AArch64::LDR_ZXI:
3128 case AArch64::STR_ZXI:
3129 return true;
3130 }
3131}
3132
3133 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
3134 switch (MI.getOpcode()) {
3135 default:
3136 assert((!MI.isCall() || !MI.isReturn()) &&
3137 "Unexpected instruction - was a new tail call opcode introduced?");
3138 return false;
3139 case AArch64::TCRETURNdi:
3140 case AArch64::TCRETURNri:
3141 case AArch64::TCRETURNrix16x17:
3142 case AArch64::TCRETURNrix17:
3143 case AArch64::TCRETURNrinotx16:
3144 case AArch64::TCRETURNriALL:
3145 case AArch64::AUTH_TCRETURN:
3146 case AArch64::AUTH_TCRETURN_BTI:
3147 return true;
3148 }
3149}
3150
3151 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
3152 switch (Opc) {
3153 default:
3154 llvm_unreachable("Opcode has no flag setting equivalent!");
3155 // 32-bit cases:
3156 case AArch64::ADDWri:
3157 return AArch64::ADDSWri;
3158 case AArch64::ADDWrr:
3159 return AArch64::ADDSWrr;
3160 case AArch64::ADDWrs:
3161 return AArch64::ADDSWrs;
3162 case AArch64::ADDWrx:
3163 return AArch64::ADDSWrx;
3164 case AArch64::ANDWri:
3165 return AArch64::ANDSWri;
3166 case AArch64::ANDWrr:
3167 return AArch64::ANDSWrr;
3168 case AArch64::ANDWrs:
3169 return AArch64::ANDSWrs;
3170 case AArch64::BICWrr:
3171 return AArch64::BICSWrr;
3172 case AArch64::BICWrs:
3173 return AArch64::BICSWrs;
3174 case AArch64::SUBWri:
3175 return AArch64::SUBSWri;
3176 case AArch64::SUBWrr:
3177 return AArch64::SUBSWrr;
3178 case AArch64::SUBWrs:
3179 return AArch64::SUBSWrs;
3180 case AArch64::SUBWrx:
3181 return AArch64::SUBSWrx;
3182 // 64-bit cases:
3183 case AArch64::ADDXri:
3184 return AArch64::ADDSXri;
3185 case AArch64::ADDXrr:
3186 return AArch64::ADDSXrr;
3187 case AArch64::ADDXrs:
3188 return AArch64::ADDSXrs;
3189 case AArch64::ADDXrx:
3190 return AArch64::ADDSXrx;
3191 case AArch64::ANDXri:
3192 return AArch64::ANDSXri;
3193 case AArch64::ANDXrr:
3194 return AArch64::ANDSXrr;
3195 case AArch64::ANDXrs:
3196 return AArch64::ANDSXrs;
3197 case AArch64::BICXrr:
3198 return AArch64::BICSXrr;
3199 case AArch64::BICXrs:
3200 return AArch64::BICSXrs;
3201 case AArch64::SUBXri:
3202 return AArch64::SUBSXri;
3203 case AArch64::SUBXrr:
3204 return AArch64::SUBSXrr;
3205 case AArch64::SUBXrs:
3206 return AArch64::SUBSXrs;
3207 case AArch64::SUBXrx:
3208 return AArch64::SUBSXrx;
3209 // SVE instructions:
3210 case AArch64::AND_PPzPP:
3211 return AArch64::ANDS_PPzPP;
3212 case AArch64::BIC_PPzPP:
3213 return AArch64::BICS_PPzPP;
3214 case AArch64::EOR_PPzPP:
3215 return AArch64::EORS_PPzPP;
3216 case AArch64::NAND_PPzPP:
3217 return AArch64::NANDS_PPzPP;
3218 case AArch64::NOR_PPzPP:
3219 return AArch64::NORS_PPzPP;
3220 case AArch64::ORN_PPzPP:
3221 return AArch64::ORNS_PPzPP;
3222 case AArch64::ORR_PPzPP:
3223 return AArch64::ORRS_PPzPP;
3224 case AArch64::BRKA_PPzP:
3225 return AArch64::BRKAS_PPzP;
3226 case AArch64::BRKPA_PPzPP:
3227 return AArch64::BRKPAS_PPzPP;
3228 case AArch64::BRKB_PPzP:
3229 return AArch64::BRKBS_PPzP;
3230 case AArch64::BRKPB_PPzPP:
3231 return AArch64::BRKPBS_PPzPP;
3232 case AArch64::BRKN_PPzP:
3233 return AArch64::BRKNS_PPzP;
3234 case AArch64::RDFFR_PPz:
3235 return AArch64::RDFFRS_PPz;
3236 case AArch64::PTRUE_B:
3237 return AArch64::PTRUES_B;
3238 }
3239}
3240
3241// Is this a candidate for ld/st merging or pairing? For example, we don't
3242// touch volatiles or load/stores that have a hint to avoid pair formation.
3243 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
3244
3245 bool IsPreLdSt = isPreLdSt(MI);
3246
3247 // If this is a volatile load/store, don't mess with it.
3248 if (MI.hasOrderedMemoryRef())
3249 return false;
3250
3251 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3252 // For Pre-inc LD/ST, the operand is shifted by one.
3253 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3254 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3255 "Expected a reg or frame index operand.");
3256
3257 // For Pre-indexed addressing quadword instructions, the third operand is the
3258 // immediate value.
3259 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3260
3261 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3262 return false;
3263
3264 // Can't merge/pair if the instruction modifies the base register.
3265 // e.g., ldr x0, [x0]
3266 // This case will never occur with an FI base.
3267 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3268 // STR<S,D,Q,W,X>pre, it can be merged.
3269 // For example:
3270 // ldr q0, [x11, #32]!
3271 // ldr q1, [x11, #16]
3272 // to
3273 // ldp q0, q1, [x11, #32]!
3274 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3275 Register BaseReg = MI.getOperand(1).getReg();
3276 const TargetRegisterInfo *TRI = &getRegisterInfo();
3277 if (MI.modifiesRegister(BaseReg, TRI))
3278 return false;
3279 }
3280
3281 // Pairing SVE fills/spills is only valid for little-endian targets that
3282 // implement VLS 128.
3283 switch (MI.getOpcode()) {
3284 default:
3285 break;
3286 case AArch64::LDR_ZXI:
3287 case AArch64::STR_ZXI:
3288 if (!Subtarget.isLittleEndian() ||
3289 Subtarget.getSVEVectorSizeInBits() != 128)
3290 return false;
3291 }
3292
3293 // Check if this load/store has a hint to avoid pair formation.
3294 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3295 if (isLdStPairSuppressed(MI))
3296 return false;
3297
3298 // Do not pair any callee-save store/reload instructions in the
3299 // prologue/epilogue if the CFI information encoded the operations as separate
3300 // instructions, as that will cause the size of the actual prologue to mismatch
3301 // with the prologue size recorded in the Windows CFI.
3302 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
3303 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
3304 MI.getMF()->getFunction().needsUnwindTableEntry();
3305 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3306 MI.getFlag(MachineInstr::FrameDestroy)))
3307 return false;
3308
3309 // On some CPUs quad load/store pairs are slower than two single load/stores.
3310 if (Subtarget.isPaired128Slow()) {
3311 switch (MI.getOpcode()) {
3312 default:
3313 break;
3314 case AArch64::LDURQi:
3315 case AArch64::STURQi:
3316 case AArch64::LDRQui:
3317 case AArch64::STRQui:
3318 return false;
3319 }
3320 }
3321
3322 return true;
3323}
3324
3327 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3328 const TargetRegisterInfo *TRI) const {
3329 if (!LdSt.mayLoadOrStore())
3330 return false;
3331
3332 const MachineOperand *BaseOp;
3333 TypeSize WidthN(0, false);
3334 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3335 WidthN, TRI))
3336 return false;
3337 // The maximum vscale is 16 under AArch64; return the maximal extent for the
3338 // vector.
3339 Width = LocationSize::precise(WidthN);
3340 BaseOps.push_back(BaseOp);
3341 return true;
3342}
3343
3344std::optional<ExtAddrMode>
3346 const TargetRegisterInfo *TRI) const {
3347 const MachineOperand *Base; // Filled with the base operand of MI.
3348 int64_t Offset; // Filled with the offset of MI.
3349 bool OffsetIsScalable;
3350 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3351 return std::nullopt;
3352
3353 if (!Base->isReg())
3354 return std::nullopt;
3355 ExtAddrMode AM;
3356 AM.BaseReg = Base->getReg();
3357 AM.Displacement = Offset;
3358 AM.ScaledReg = 0;
3359 AM.Scale = 0;
3360 return AM;
3361}
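// For example, for "ldr x0, [x1, #16]" this returns an ExtAddrMode with
// BaseReg = x1, Displacement = 16 and no scaled register.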
3362
3364 Register Reg,
3365 const MachineInstr &AddrI,
3366 ExtAddrMode &AM) const {
3367 // Filter out instructions into which we cannot fold.
3368 unsigned NumBytes;
3369 int64_t OffsetScale = 1;
3370 switch (MemI.getOpcode()) {
3371 default:
3372 return false;
3373
3374 case AArch64::LDURQi:
3375 case AArch64::STURQi:
3376 NumBytes = 16;
3377 break;
3378
3379 case AArch64::LDURDi:
3380 case AArch64::STURDi:
3381 case AArch64::LDURXi:
3382 case AArch64::STURXi:
3383 NumBytes = 8;
3384 break;
3385
3386 case AArch64::LDURWi:
3387 case AArch64::LDURSWi:
3388 case AArch64::STURWi:
3389 NumBytes = 4;
3390 break;
3391
3392 case AArch64::LDURHi:
3393 case AArch64::STURHi:
3394 case AArch64::LDURHHi:
3395 case AArch64::STURHHi:
3396 case AArch64::LDURSHXi:
3397 case AArch64::LDURSHWi:
3398 NumBytes = 2;
3399 break;
3400
3401 case AArch64::LDRBroX:
3402 case AArch64::LDRBBroX:
3403 case AArch64::LDRSBXroX:
3404 case AArch64::LDRSBWroX:
3405 case AArch64::STRBroX:
3406 case AArch64::STRBBroX:
3407 case AArch64::LDURBi:
3408 case AArch64::LDURBBi:
3409 case AArch64::LDURSBXi:
3410 case AArch64::LDURSBWi:
3411 case AArch64::STURBi:
3412 case AArch64::STURBBi:
3413 case AArch64::LDRBui:
3414 case AArch64::LDRBBui:
3415 case AArch64::LDRSBXui:
3416 case AArch64::LDRSBWui:
3417 case AArch64::STRBui:
3418 case AArch64::STRBBui:
3419 NumBytes = 1;
3420 break;
3421
3422 case AArch64::LDRQroX:
3423 case AArch64::STRQroX:
3424 case AArch64::LDRQui:
3425 case AArch64::STRQui:
3426 NumBytes = 16;
3427 OffsetScale = 16;
3428 break;
3429
3430 case AArch64::LDRDroX:
3431 case AArch64::STRDroX:
3432 case AArch64::LDRXroX:
3433 case AArch64::STRXroX:
3434 case AArch64::LDRDui:
3435 case AArch64::STRDui:
3436 case AArch64::LDRXui:
3437 case AArch64::STRXui:
3438 NumBytes = 8;
3439 OffsetScale = 8;
3440 break;
3441
3442 case AArch64::LDRWroX:
3443 case AArch64::LDRSWroX:
3444 case AArch64::STRWroX:
3445 case AArch64::LDRWui:
3446 case AArch64::LDRSWui:
3447 case AArch64::STRWui:
3448 NumBytes = 4;
3449 OffsetScale = 4;
3450 break;
3451
3452 case AArch64::LDRHroX:
3453 case AArch64::STRHroX:
3454 case AArch64::LDRHHroX:
3455 case AArch64::STRHHroX:
3456 case AArch64::LDRSHXroX:
3457 case AArch64::LDRSHWroX:
3458 case AArch64::LDRHui:
3459 case AArch64::STRHui:
3460 case AArch64::LDRHHui:
3461 case AArch64::STRHHui:
3462 case AArch64::LDRSHXui:
3463 case AArch64::LDRSHWui:
3464 NumBytes = 2;
3465 OffsetScale = 2;
3466 break;
3467 }
3468
3469 // Check the fold operand is not the loaded/stored value.
3470 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3471 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3472 return false;
3473
3474 // Handle memory instructions with a [Reg, Reg] addressing mode.
3475 if (MemI.getOperand(2).isReg()) {
3476 // Bail if the addressing mode already includes extension of the offset
3477 // register.
3478 if (MemI.getOperand(3).getImm())
3479 return false;
3480
3481 // Check if we actually have a scaled offset.
3482 if (MemI.getOperand(4).getImm() == 0)
3483 OffsetScale = 1;
3484
3485 // If the address instruction is folded into the base register, then the
3486 // addressing mode must not have a scale. Then we can swap the base and the
3487 // scaled registers.
3488 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3489 return false;
3490
3491 switch (AddrI.getOpcode()) {
3492 default:
3493 return false;
3494
3495 case AArch64::SBFMXri:
3496 // sxtw Xa, Wm
3497 // ldr Xd, [Xn, Xa, lsl #N]
3498 // ->
3499 // ldr Xd, [Xn, Wm, sxtw #N]
3500 if (AddrI.getOperand(2).getImm() != 0 ||
3501 AddrI.getOperand(3).getImm() != 31)
3502 return false;
3503
3504 AM.BaseReg = MemI.getOperand(1).getReg();
3505 if (AM.BaseReg == Reg)
3506 AM.BaseReg = MemI.getOperand(2).getReg();
3507 AM.ScaledReg = AddrI.getOperand(1).getReg();
3508 AM.Scale = OffsetScale;
3509 AM.Displacement = 0;
3511 return true;
3512
3513 case TargetOpcode::SUBREG_TO_REG: {
3514 // mov Wa, Wm
3515 // ldr Xd, [Xn, Xa, lsl #N]
3516 // ->
3517 // ldr Xd, [Xn, Wm, uxtw #N]
3518
3519 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3520 if (AddrI.getOperand(1).getImm() != 0 ||
3521 AddrI.getOperand(3).getImm() != AArch64::sub_32)
3522 return false;
3523
3524 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3525 Register OffsetReg = AddrI.getOperand(2).getReg();
3526 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3527 return false;
3528
3529 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3530 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3531 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3532 DefMI.getOperand(3).getImm() != 0)
3533 return false;
3534
3535 AM.BaseReg = MemI.getOperand(1).getReg();
3536 if (AM.BaseReg == Reg)
3537 AM.BaseReg = MemI.getOperand(2).getReg();
3538 AM.ScaledReg = DefMI.getOperand(2).getReg();
3539 AM.Scale = OffsetScale;
3540 AM.Displacement = 0;
3542 return true;
3543 }
3544 }
3545 }
3546
3547 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3548
3549 // Check we are not breaking a potential conversion to an LDP.
3550 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3551 int64_t NewOffset) -> bool {
3552 int64_t MinOffset, MaxOffset;
3553 switch (NumBytes) {
3554 default:
3555 return true;
3556 case 4:
3557 MinOffset = -256;
3558 MaxOffset = 252;
3559 break;
3560 case 8:
3561 MinOffset = -512;
3562 MaxOffset = 504;
3563 break;
3564 case 16:
3565 MinOffset = -1024;
3566 MaxOffset = 1008;
3567 break;
3568 }
3569 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3570 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3571 };
3572 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3573 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3574 int64_t NewOffset = OldOffset + Disp;
3575 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3576 return false;
3577 // If the old offset would fit into an LDP, but the new offset wouldn't,
3578 // bail out.
3579 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3580 return false;
3581 AM.BaseReg = AddrI.getOperand(1).getReg();
3582 AM.ScaledReg = 0;
3583 AM.Scale = 0;
3584 AM.Displacement = NewOffset;
3586 return true;
3587 };
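// Illustrative example: folding "add x8, x9, #16" into "ldr x0, [x8, #8]"
// gives "ldr x0, [x9, #24]", provided the combined offset is still legal for
// the access size and does not break a possible LDP formation.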
3588
3589 auto canFoldAddRegIntoAddrMode =
3590 [&](int64_t Scale,
3592 if (MemI.getOperand(2).getImm() != 0)
3593 return false;
3594 if ((unsigned)Scale != Scale)
3595 return false;
3596 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3597 return false;
3598 AM.BaseReg = AddrI.getOperand(1).getReg();
3599 AM.ScaledReg = AddrI.getOperand(2).getReg();
3600 AM.Scale = Scale;
3601 AM.Displacement = 0;
3602 AM.Form = Form;
3603 return true;
3604 };
3605
3606 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3607 unsigned Opcode = MemI.getOpcode();
3608 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3609 Subtarget.isSTRQroSlow();
3610 };
3611
3612 int64_t Disp = 0;
3613 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3614 switch (AddrI.getOpcode()) {
3615 default:
3616 return false;
3617
3618 case AArch64::ADDXri:
3619 // add Xa, Xn, #N
3620 // ldr Xd, [Xa, #M]
3621 // ->
3622 // ldr Xd, [Xn, #N'+M]
3623 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3624 return canFoldAddSubImmIntoAddrMode(Disp);
3625
3626 case AArch64::SUBXri:
3627 // sub Xa, Xn, #N
3628 // ldr Xd, [Xa, #M]
3629 // ->
3630 // ldr Xd, [Xn, #N'+M]
3631 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3632 return canFoldAddSubImmIntoAddrMode(-Disp);
3633
3634 case AArch64::ADDXrs: {
3635 // add Xa, Xn, Xm, lsl #N
3636 // ldr Xd, [Xa]
3637 // ->
3638 // ldr Xd, [Xn, Xm, lsl #N]
3639
3640 // Don't fold the add if the result would be slower, unless optimising for
3641 // size.
3642 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3644 return false;
3645 Shift = AArch64_AM::getShiftValue(Shift);
3646 if (!OptSize) {
3647 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3648 return false;
3649 if (avoidSlowSTRQ(MemI))
3650 return false;
3651 }
3652 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3653 }
3654
3655 case AArch64::ADDXrr:
3656 // add Xa, Xn, Xm
3657 // ldr Xd, [Xa]
3658 // ->
3659 // ldr Xd, [Xn, Xm, lsl #0]
3660
3661 // Don't fold the add if the result would be slower, unless optimising for
3662 // size.
3663 if (!OptSize && avoidSlowSTRQ(MemI))
3664 return false;
3665 return canFoldAddRegIntoAddrMode(1);
3666
3667 case AArch64::ADDXrx:
3668 // add Xa, Xn, Wm, {s,u}xtw #N
3669 // ldr Xd, [Xa]
3670 // ->
3671 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3672
3673 // Don't fold the add if the result would be slower, unless optimising for
3674 // size.
3675 if (!OptSize && avoidSlowSTRQ(MemI))
3676 return false;
3677
3678 // Can fold only sign-/zero-extend of a word.
3679 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3681 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3682 return false;
3683
3684 return canFoldAddRegIntoAddrMode(
3685 1ULL << AArch64_AM::getArithShiftValue(Imm),
3688 }
3689}
3690
3691// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3692// return the opcode of an instruction performing the same operation, but using
3693// the [Reg, Reg] addressing mode.
3694static unsigned regOffsetOpcode(unsigned Opcode) {
3695 switch (Opcode) {
3696 default:
3697 llvm_unreachable("Address folding not implemented for instruction");
3698
3699 case AArch64::LDURQi:
3700 case AArch64::LDRQui:
3701 return AArch64::LDRQroX;
3702 case AArch64::STURQi:
3703 case AArch64::STRQui:
3704 return AArch64::STRQroX;
3705 case AArch64::LDURDi:
3706 case AArch64::LDRDui:
3707 return AArch64::LDRDroX;
3708 case AArch64::STURDi:
3709 case AArch64::STRDui:
3710 return AArch64::STRDroX;
3711 case AArch64::LDURXi:
3712 case AArch64::LDRXui:
3713 return AArch64::LDRXroX;
3714 case AArch64::STURXi:
3715 case AArch64::STRXui:
3716 return AArch64::STRXroX;
3717 case AArch64::LDURWi:
3718 case AArch64::LDRWui:
3719 return AArch64::LDRWroX;
3720 case AArch64::LDURSWi:
3721 case AArch64::LDRSWui:
3722 return AArch64::LDRSWroX;
3723 case AArch64::STURWi:
3724 case AArch64::STRWui:
3725 return AArch64::STRWroX;
3726 case AArch64::LDURHi:
3727 case AArch64::LDRHui:
3728 return AArch64::LDRHroX;
3729 case AArch64::STURHi:
3730 case AArch64::STRHui:
3731 return AArch64::STRHroX;
3732 case AArch64::LDURHHi:
3733 case AArch64::LDRHHui:
3734 return AArch64::LDRHHroX;
3735 case AArch64::STURHHi:
3736 case AArch64::STRHHui:
3737 return AArch64::STRHHroX;
3738 case AArch64::LDURSHXi:
3739 case AArch64::LDRSHXui:
3740 return AArch64::LDRSHXroX;
3741 case AArch64::LDURSHWi:
3742 case AArch64::LDRSHWui:
3743 return AArch64::LDRSHWroX;
3744 case AArch64::LDURBi:
3745 case AArch64::LDRBui:
3746 return AArch64::LDRBroX;
3747 case AArch64::LDURBBi:
3748 case AArch64::LDRBBui:
3749 return AArch64::LDRBBroX;
3750 case AArch64::LDURSBXi:
3751 case AArch64::LDRSBXui:
3752 return AArch64::LDRSBXroX;
3753 case AArch64::LDURSBWi:
3754 case AArch64::LDRSBWui:
3755 return AArch64::LDRSBWroX;
3756 case AArch64::STURBi:
3757 case AArch64::STRBui:
3758 return AArch64::STRBroX;
3759 case AArch64::STURBBi:
3760 case AArch64::STRBBui:
3761 return AArch64::STRBBroX;
3762 }
3763}
3764
3765// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3766// the opcode of an instruction performing the same operation, but using the
3767// [Reg, #Imm] addressing mode with a scaled offset, and set Scale accordingly.
3768unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3769 switch (Opcode) {
3770 default:
3771 llvm_unreachable("Address folding not implemented for instruction");
3772
3773 case AArch64::LDURQi:
3774 Scale = 16;
3775 return AArch64::LDRQui;
3776 case AArch64::STURQi:
3777 Scale = 16;
3778 return AArch64::STRQui;
3779 case AArch64::LDURDi:
3780 Scale = 8;
3781 return AArch64::LDRDui;
3782 case AArch64::STURDi:
3783 Scale = 8;
3784 return AArch64::STRDui;
3785 case AArch64::LDURXi:
3786 Scale = 8;
3787 return AArch64::LDRXui;
3788 case AArch64::STURXi:
3789 Scale = 8;
3790 return AArch64::STRXui;
3791 case AArch64::LDURWi:
3792 Scale = 4;
3793 return AArch64::LDRWui;
3794 case AArch64::LDURSWi:
3795 Scale = 4;
3796 return AArch64::LDRSWui;
3797 case AArch64::STURWi:
3798 Scale = 4;
3799 return AArch64::STRWui;
3800 case AArch64::LDURHi:
3801 Scale = 2;
3802 return AArch64::LDRHui;
3803 case AArch64::STURHi:
3804 Scale = 2;
3805 return AArch64::STRHui;
3806 case AArch64::LDURHHi:
3807 Scale = 2;
3808 return AArch64::LDRHHui;
3809 case AArch64::STURHHi:
3810 Scale = 2;
3811 return AArch64::STRHHui;
3812 case AArch64::LDURSHXi:
3813 Scale = 2;
3814 return AArch64::LDRSHXui;
3815 case AArch64::LDURSHWi:
3816 Scale = 2;
3817 return AArch64::LDRSHWui;
3818 case AArch64::LDURBi:
3819 Scale = 1;
3820 return AArch64::LDRBui;
3821 case AArch64::LDURBBi:
3822 Scale = 1;
3823 return AArch64::LDRBBui;
3824 case AArch64::LDURSBXi:
3825 Scale = 1;
3826 return AArch64::LDRSBXui;
3827 case AArch64::LDURSBWi:
3828 Scale = 1;
3829 return AArch64::LDRSBWui;
3830 case AArch64::STURBi:
3831 Scale = 1;
3832 return AArch64::STRBui;
3833 case AArch64::STURBBi:
3834 Scale = 1;
3835 return AArch64::STRBBui;
3836 case AArch64::LDRQui:
3837 case AArch64::STRQui:
3838 Scale = 16;
3839 return Opcode;
3840 case AArch64::LDRDui:
3841 case AArch64::STRDui:
3842 case AArch64::LDRXui:
3843 case AArch64::STRXui:
3844 Scale = 8;
3845 return Opcode;
3846 case AArch64::LDRWui:
3847 case AArch64::LDRSWui:
3848 case AArch64::STRWui:
3849 Scale = 4;
3850 return Opcode;
3851 case AArch64::LDRHui:
3852 case AArch64::STRHui:
3853 case AArch64::LDRHHui:
3854 case AArch64::STRHHui:
3855 case AArch64::LDRSHXui:
3856 case AArch64::LDRSHWui:
3857 Scale = 2;
3858 return Opcode;
3859 case AArch64::LDRBui:
3860 case AArch64::LDRBBui:
3861 case AArch64::LDRSBXui:
3862 case AArch64::LDRSBWui:
3863 case AArch64::STRBui:
3864 case AArch64::STRBBui:
3865 Scale = 1;
3866 return Opcode;
3867 }
3868}
3869
3870// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3871// the opcode of an instruction performing the same operation, but using the
3872// [Reg, #Imm] addressing mode with unscaled offset.
3873unsigned unscaledOffsetOpcode(unsigned Opcode) {
3874 switch (Opcode) {
3875 default:
3876 llvm_unreachable("Address folding not implemented for instruction");
3877
3878 case AArch64::LDURQi:
3879 case AArch64::STURQi:
3880 case AArch64::LDURDi:
3881 case AArch64::STURDi:
3882 case AArch64::LDURXi:
3883 case AArch64::STURXi:
3884 case AArch64::LDURWi:
3885 case AArch64::LDURSWi:
3886 case AArch64::STURWi:
3887 case AArch64::LDURHi:
3888 case AArch64::STURHi:
3889 case AArch64::LDURHHi:
3890 case AArch64::STURHHi:
3891 case AArch64::LDURSHXi:
3892 case AArch64::LDURSHWi:
3893 case AArch64::LDURBi:
3894 case AArch64::STURBi:
3895 case AArch64::LDURBBi:
3896 case AArch64::STURBBi:
3897 case AArch64::LDURSBWi:
3898 case AArch64::LDURSBXi:
3899 return Opcode;
3900 case AArch64::LDRQui:
3901 return AArch64::LDURQi;
3902 case AArch64::STRQui:
3903 return AArch64::STURQi;
3904 case AArch64::LDRDui:
3905 return AArch64::LDURDi;
3906 case AArch64::STRDui:
3907 return AArch64::STURDi;
3908 case AArch64::LDRXui:
3909 return AArch64::LDURXi;
3910 case AArch64::STRXui:
3911 return AArch64::STURXi;
3912 case AArch64::LDRWui:
3913 return AArch64::LDURWi;
3914 case AArch64::LDRSWui:
3915 return AArch64::LDURSWi;
3916 case AArch64::STRWui:
3917 return AArch64::STURWi;
3918 case AArch64::LDRHui:
3919 return AArch64::LDURHi;
3920 case AArch64::STRHui:
3921 return AArch64::STURHi;
3922 case AArch64::LDRHHui:
3923 return AArch64::LDURHHi;
3924 case AArch64::STRHHui:
3925 return AArch64::STURHHi;
3926 case AArch64::LDRSHXui:
3927 return AArch64::LDURSHXi;
3928 case AArch64::LDRSHWui:
3929 return AArch64::LDURSHWi;
3930 case AArch64::LDRBBui:
3931 return AArch64::LDURBBi;
3932 case AArch64::LDRBui:
3933 return AArch64::LDURBi;
3934 case AArch64::STRBBui:
3935 return AArch64::STURBBi;
3936 case AArch64::STRBui:
3937 return AArch64::STURBi;
3938 case AArch64::LDRSBWui:
3939 return AArch64::LDURSBWi;
3940 case AArch64::LDRSBXui:
3941 return AArch64::LDURSBXi;
3942 }
3943}
3944
3945// Given the opcode of a memory load/store instruction, return the opcode of an
3946// instruction performing the same operation, but using
3947// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3948// offset register.
3949static unsigned offsetExtendOpcode(unsigned Opcode) {
3950 switch (Opcode) {
3951 default:
3952 llvm_unreachable("Address folding not implemented for instruction");
3953
3954 case AArch64::LDRQroX:
3955 case AArch64::LDURQi:
3956 case AArch64::LDRQui:
3957 return AArch64::LDRQroW;
3958 case AArch64::STRQroX:
3959 case AArch64::STURQi:
3960 case AArch64::STRQui:
3961 return AArch64::STRQroW;
3962 case AArch64::LDRDroX:
3963 case AArch64::LDURDi:
3964 case AArch64::LDRDui:
3965 return AArch64::LDRDroW;
3966 case AArch64::STRDroX:
3967 case AArch64::STURDi:
3968 case AArch64::STRDui:
3969 return AArch64::STRDroW;
3970 case AArch64::LDRXroX:
3971 case AArch64::LDURXi:
3972 case AArch64::LDRXui:
3973 return AArch64::LDRXroW;
3974 case AArch64::STRXroX:
3975 case AArch64::STURXi:
3976 case AArch64::STRXui:
3977 return AArch64::STRXroW;
3978 case AArch64::LDRWroX:
3979 case AArch64::LDURWi:
3980 case AArch64::LDRWui:
3981 return AArch64::LDRWroW;
3982 case AArch64::LDRSWroX:
3983 case AArch64::LDURSWi:
3984 case AArch64::LDRSWui:
3985 return AArch64::LDRSWroW;
3986 case AArch64::STRWroX:
3987 case AArch64::STURWi:
3988 case AArch64::STRWui:
3989 return AArch64::STRWroW;
3990 case AArch64::LDRHroX:
3991 case AArch64::LDURHi:
3992 case AArch64::LDRHui:
3993 return AArch64::LDRHroW;
3994 case AArch64::STRHroX:
3995 case AArch64::STURHi:
3996 case AArch64::STRHui:
3997 return AArch64::STRHroW;
3998 case AArch64::LDRHHroX:
3999 case AArch64::LDURHHi:
4000 case AArch64::LDRHHui:
4001 return AArch64::LDRHHroW;
4002 case AArch64::STRHHroX:
4003 case AArch64::STURHHi:
4004 case AArch64::STRHHui:
4005 return AArch64::STRHHroW;
4006 case AArch64::LDRSHXroX:
4007 case AArch64::LDURSHXi:
4008 case AArch64::LDRSHXui:
4009 return AArch64::LDRSHXroW;
4010 case AArch64::LDRSHWroX:
4011 case AArch64::LDURSHWi:
4012 case AArch64::LDRSHWui:
4013 return AArch64::LDRSHWroW;
4014 case AArch64::LDRBroX:
4015 case AArch64::LDURBi:
4016 case AArch64::LDRBui:
4017 return AArch64::LDRBroW;
4018 case AArch64::LDRBBroX:
4019 case AArch64::LDURBBi:
4020 case AArch64::LDRBBui:
4021 return AArch64::LDRBBroW;
4022 case AArch64::LDRSBXroX:
4023 case AArch64::LDURSBXi:
4024 case AArch64::LDRSBXui:
4025 return AArch64::LDRSBXroW;
4026 case AArch64::LDRSBWroX:
4027 case AArch64::LDURSBWi:
4028 case AArch64::LDRSBWui:
4029 return AArch64::LDRSBWroW;
4030 case AArch64::STRBroX:
4031 case AArch64::STURBi:
4032 case AArch64::STRBui:
4033 return AArch64::STRBroW;
4034 case AArch64::STRBBroX:
4035 case AArch64::STURBBi:
4036 case AArch64::STRBBui:
4037 return AArch64::STRBBroW;
4038 }
4039}
4040
4042 const ExtAddrMode &AM) const {
4043
4044 const DebugLoc &DL = MemI.getDebugLoc();
4045 MachineBasicBlock &MBB = *MemI.getParent();
4047
4049 if (AM.ScaledReg) {
4050 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4051 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4052 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4053 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4054 .addReg(MemI.getOperand(0).getReg(),
4055 MemI.mayLoad() ? RegState::Define : 0)
4056 .addReg(AM.BaseReg)
4057 .addReg(AM.ScaledReg)
4058 .addImm(0)
4059 .addImm(AM.Scale > 1)
4060 .setMemRefs(MemI.memoperands())
4061 .setMIFlags(MemI.getFlags());
4062 return B.getInstr();
4063 }
4064
4065 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4066 "Addressing mode not supported for folding");
4067
4068 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4069 unsigned Scale = 1;
4070 unsigned Opcode = MemI.getOpcode();
4071 if (isInt<9>(AM.Displacement))
4072 Opcode = unscaledOffsetOpcode(Opcode);
4073 else
4074 Opcode = scaledOffsetOpcode(Opcode, Scale);
4075
4076 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4077 .addReg(MemI.getOperand(0).getReg(),
4078 MemI.mayLoad() ? RegState::Define : 0)
4079 .addReg(AM.BaseReg)
4080 .addImm(AM.Displacement / Scale)
4081 .setMemRefs(MemI.memoperands())
4082 .setMIFlags(MemI.getFlags());
4083 return B.getInstr();
4084 }
4085
4088 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4089 assert(AM.ScaledReg && !AM.Displacement &&
4090 "Address offset can be a register or an immediate, but not both");
4091 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4092 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4093 // Make sure the offset register is in the correct register class.
4094 Register OffsetReg = AM.ScaledReg;
4095 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4096 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4097 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4098 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4099 .addReg(AM.ScaledReg, 0, AArch64::sub_32);
4100 }
4101 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4102 .addReg(MemI.getOperand(0).getReg(),
4103 MemI.mayLoad() ? RegState::Define : 0)
4104 .addReg(AM.BaseReg)
4105 .addReg(OffsetReg)
4107 .addImm(AM.Scale != 1)
4108 .setMemRefs(MemI.memoperands())
4109 .setMIFlags(MemI.getFlags());
4110
4111 return B.getInstr();
4112 }
4113
4115 "Function must not be called with an addressing mode it can't handle");
4116}
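// For example, an addressing mode with base x0 and displacement 24 on an
// LDRXui fits a signed 9-bit immediate and is re-emitted through
// unscaledOffsetOpcode as
//   ldur x2, [x0, #24]
// whereas a displacement of 4096 would take the scaledOffsetOpcode path and
// be emitted as LDRXui with immediate 4096 / 8 = 512. A register-offset mode
// (base x0, scaled register x2, scale 8) goes through regOffsetOpcode and
// becomes
//   ldr x3, [x0, x2, lsl #3]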
4117
4118/// Return true if the opcode is a post-index ld/st instruction, which really
4119/// loads from or stores to base+0.
4120static bool isPostIndexLdStOpcode(unsigned Opcode) {
4121 switch (Opcode) {
4122 default:
4123 return false;
4124 case AArch64::LD1Fourv16b_POST:
4125 case AArch64::LD1Fourv1d_POST:
4126 case AArch64::LD1Fourv2d_POST:
4127 case AArch64::LD1Fourv2s_POST:
4128 case AArch64::LD1Fourv4h_POST:
4129 case AArch64::LD1Fourv4s_POST:
4130 case AArch64::LD1Fourv8b_POST:
4131 case AArch64::LD1Fourv8h_POST:
4132 case AArch64::LD1Onev16b_POST:
4133 case AArch64::LD1Onev1d_POST:
4134 case AArch64::LD1Onev2d_POST:
4135 case AArch64::LD1Onev2s_POST:
4136 case AArch64::LD1Onev4h_POST:
4137 case AArch64::LD1Onev4s_POST:
4138 case AArch64::LD1Onev8b_POST:
4139 case AArch64::LD1Onev8h_POST:
4140 case AArch64::LD1Rv16b_POST:
4141 case AArch64::LD1Rv1d_POST:
4142 case AArch64::LD1Rv2d_POST:
4143 case AArch64::LD1Rv2s_POST:
4144 case AArch64::LD1Rv4h_POST:
4145 case AArch64::LD1Rv4s_POST:
4146 case AArch64::LD1Rv8b_POST:
4147 case AArch64::LD1Rv8h_POST:
4148 case AArch64::LD1Threev16b_POST:
4149 case AArch64::LD1Threev1d_POST:
4150 case AArch64::LD1Threev2d_POST:
4151 case AArch64::LD1Threev2s_POST:
4152 case AArch64::LD1Threev4h_POST:
4153 case AArch64::LD1Threev4s_POST:
4154 case AArch64::LD1Threev8b_POST:
4155 case AArch64::LD1Threev8h_POST:
4156 case AArch64::LD1Twov16b_POST:
4157 case AArch64::LD1Twov1d_POST:
4158 case AArch64::LD1Twov2d_POST:
4159 case AArch64::LD1Twov2s_POST:
4160 case AArch64::LD1Twov4h_POST:
4161 case AArch64::LD1Twov4s_POST:
4162 case AArch64::LD1Twov8b_POST:
4163 case AArch64::LD1Twov8h_POST:
4164 case AArch64::LD1i16_POST:
4165 case AArch64::LD1i32_POST:
4166 case AArch64::LD1i64_POST:
4167 case AArch64::LD1i8_POST:
4168 case AArch64::LD2Rv16b_POST:
4169 case AArch64::LD2Rv1d_POST:
4170 case AArch64::LD2Rv2d_POST:
4171 case AArch64::LD2Rv2s_POST:
4172 case AArch64::LD2Rv4h_POST:
4173 case AArch64::LD2Rv4s_POST:
4174 case AArch64::LD2Rv8b_POST:
4175 case AArch64::LD2Rv8h_POST:
4176 case AArch64::LD2Twov16b_POST:
4177 case AArch64::LD2Twov2d_POST:
4178 case AArch64::LD2Twov2s_POST:
4179 case AArch64::LD2Twov4h_POST:
4180 case AArch64::LD2Twov4s_POST:
4181 case AArch64::LD2Twov8b_POST:
4182 case AArch64::LD2Twov8h_POST:
4183 case AArch64::LD2i16_POST:
4184 case AArch64::LD2i32_POST:
4185 case AArch64::LD2i64_POST:
4186 case AArch64::LD2i8_POST:
4187 case AArch64::LD3Rv16b_POST:
4188 case AArch64::LD3Rv1d_POST:
4189 case AArch64::LD3Rv2d_POST:
4190 case AArch64::LD3Rv2s_POST:
4191 case AArch64::LD3Rv4h_POST:
4192 case AArch64::LD3Rv4s_POST:
4193 case AArch64::LD3Rv8b_POST:
4194 case AArch64::LD3Rv8h_POST:
4195 case AArch64::LD3Threev16b_POST:
4196 case AArch64::LD3Threev2d_POST:
4197 case AArch64::LD3Threev2s_POST:
4198 case AArch64::LD3Threev4h_POST:
4199 case AArch64::LD3Threev4s_POST:
4200 case AArch64::LD3Threev8b_POST:
4201 case AArch64::LD3Threev8h_POST:
4202 case AArch64::LD3i16_POST:
4203 case AArch64::LD3i32_POST:
4204 case AArch64::LD3i64_POST:
4205 case AArch64::LD3i8_POST:
4206 case AArch64::LD4Fourv16b_POST:
4207 case AArch64::LD4Fourv2d_POST:
4208 case AArch64::LD4Fourv2s_POST:
4209 case AArch64::LD4Fourv4h_POST:
4210 case AArch64::LD4Fourv4s_POST:
4211 case AArch64::LD4Fourv8b_POST:
4212 case AArch64::LD4Fourv8h_POST:
4213 case AArch64::LD4Rv16b_POST:
4214 case AArch64::LD4Rv1d_POST:
4215 case AArch64::LD4Rv2d_POST:
4216 case AArch64::LD4Rv2s_POST:
4217 case AArch64::LD4Rv4h_POST:
4218 case AArch64::LD4Rv4s_POST:
4219 case AArch64::LD4Rv8b_POST:
4220 case AArch64::LD4Rv8h_POST:
4221 case AArch64::LD4i16_POST:
4222 case AArch64::LD4i32_POST:
4223 case AArch64::LD4i64_POST:
4224 case AArch64::LD4i8_POST:
4225 case AArch64::LDAPRWpost:
4226 case AArch64::LDAPRXpost:
4227 case AArch64::LDIAPPWpost:
4228 case AArch64::LDIAPPXpost:
4229 case AArch64::LDPDpost:
4230 case AArch64::LDPQpost:
4231 case AArch64::LDPSWpost:
4232 case AArch64::LDPSpost:
4233 case AArch64::LDPWpost:
4234 case AArch64::LDPXpost:
4235 case AArch64::LDRBBpost:
4236 case AArch64::LDRBpost:
4237 case AArch64::LDRDpost:
4238 case AArch64::LDRHHpost:
4239 case AArch64::LDRHpost:
4240 case AArch64::LDRQpost:
4241 case AArch64::LDRSBWpost:
4242 case AArch64::LDRSBXpost:
4243 case AArch64::LDRSHWpost:
4244 case AArch64::LDRSHXpost:
4245 case AArch64::LDRSWpost:
4246 case AArch64::LDRSpost:
4247 case AArch64::LDRWpost:
4248 case AArch64::LDRXpost:
4249 case AArch64::ST1Fourv16b_POST:
4250 case AArch64::ST1Fourv1d_POST:
4251 case AArch64::ST1Fourv2d_POST:
4252 case AArch64::ST1Fourv2s_POST:
4253 case AArch64::ST1Fourv4h_POST:
4254 case AArch64::ST1Fourv4s_POST:
4255 case AArch64::ST1Fourv8b_POST:
4256 case AArch64::ST1Fourv8h_POST:
4257 case AArch64::ST1Onev16b_POST:
4258 case AArch64::ST1Onev1d_POST:
4259 case AArch64::ST1Onev2d_POST:
4260 case AArch64::ST1Onev2s_POST:
4261 case AArch64::ST1Onev4h_POST:
4262 case AArch64::ST1Onev4s_POST:
4263 case AArch64::ST1Onev8b_POST:
4264 case AArch64::ST1Onev8h_POST:
4265 case AArch64::ST1Threev16b_POST:
4266 case AArch64::ST1Threev1d_POST:
4267 case AArch64::ST1Threev2d_POST:
4268 case AArch64::ST1Threev2s_POST:
4269 case AArch64::ST1Threev4h_POST:
4270 case AArch64::ST1Threev4s_POST:
4271 case AArch64::ST1Threev8b_POST:
4272 case AArch64::ST1Threev8h_POST:
4273 case AArch64::ST1Twov16b_POST:
4274 case AArch64::ST1Twov1d_POST:
4275 case AArch64::ST1Twov2d_POST:
4276 case AArch64::ST1Twov2s_POST:
4277 case AArch64::ST1Twov4h_POST:
4278 case AArch64::ST1Twov4s_POST:
4279 case AArch64::ST1Twov8b_POST:
4280 case AArch64::ST1Twov8h_POST:
4281 case AArch64::ST1i16_POST:
4282 case AArch64::ST1i32_POST:
4283 case AArch64::ST1i64_POST:
4284 case AArch64::ST1i8_POST:
4285 case AArch64::ST2GPostIndex:
4286 case AArch64::ST2Twov16b_POST:
4287 case AArch64::ST2Twov2d_POST:
4288 case AArch64::ST2Twov2s_POST:
4289 case AArch64::ST2Twov4h_POST:
4290 case AArch64::ST2Twov4s_POST:
4291 case AArch64::ST2Twov8b_POST:
4292 case AArch64::ST2Twov8h_POST:
4293 case AArch64::ST2i16_POST:
4294 case AArch64::ST2i32_POST:
4295 case AArch64::ST2i64_POST:
4296 case AArch64::ST2i8_POST:
4297 case AArch64::ST3Threev16b_POST:
4298 case AArch64::ST3Threev2d_POST:
4299 case AArch64::ST3Threev2s_POST:
4300 case AArch64::ST3Threev4h_POST:
4301 case AArch64::ST3Threev4s_POST:
4302 case AArch64::ST3Threev8b_POST:
4303 case AArch64::ST3Threev8h_POST:
4304 case AArch64::ST3i16_POST:
4305 case AArch64::ST3i32_POST:
4306 case AArch64::ST3i64_POST:
4307 case AArch64::ST3i8_POST:
4308 case AArch64::ST4Fourv16b_POST:
4309 case AArch64::ST4Fourv2d_POST:
4310 case AArch64::ST4Fourv2s_POST:
4311 case AArch64::ST4Fourv4h_POST:
4312 case AArch64::ST4Fourv4s_POST:
4313 case AArch64::ST4Fourv8b_POST:
4314 case AArch64::ST4Fourv8h_POST:
4315 case AArch64::ST4i16_POST:
4316 case AArch64::ST4i32_POST:
4317 case AArch64::ST4i64_POST:
4318 case AArch64::ST4i8_POST:
4319 case AArch64::STGPostIndex:
4320 case AArch64::STGPpost:
4321 case AArch64::STPDpost:
4322 case AArch64::STPQpost:
4323 case AArch64::STPSpost:
4324 case AArch64::STPWpost:
4325 case AArch64::STPXpost:
4326 case AArch64::STRBBpost:
4327 case AArch64::STRBpost:
4328 case AArch64::STRDpost:
4329 case AArch64::STRHHpost:
4330 case AArch64::STRHpost:
4331 case AArch64::STRQpost:
4332 case AArch64::STRSpost:
4333 case AArch64::STRWpost:
4334 case AArch64::STRXpost:
4335 case AArch64::STZ2GPostIndex:
4336 case AArch64::STZGPostIndex:
4337 return true;
4338 }
4339}
4340
4342 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4343 bool &OffsetIsScalable, TypeSize &Width,
4344 const TargetRegisterInfo *TRI) const {
4345 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4346 // Handle only loads/stores with base register followed by immediate offset.
4347 if (LdSt.getNumExplicitOperands() == 3) {
4348 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4349 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4350 !LdSt.getOperand(2).isImm())
4351 return false;
4352 } else if (LdSt.getNumExplicitOperands() == 4) {
4353 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4354 if (!LdSt.getOperand(1).isReg() ||
4355 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4356 !LdSt.getOperand(3).isImm())
4357 return false;
4358 } else
4359 return false;
4360
4361 // Get the scaling factor for the instruction and set the width of its
4362 // memory access.
4363 TypeSize Scale(0U, false);
4364 int64_t Dummy1, Dummy2;
4365
4366 // If this returns false, then it's an instruction we don't want to handle.
4367 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4368 return false;
4369
4370 // Compute the offset. The offset is the immediate operand multiplied by the
4371 // scaling factor; unscaled instructions have a scaling factor of 1.
4372 // Post-index instructions are a special case and report an offset of 0.
4373 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4374 BaseOp = &LdSt.getOperand(2);
4375 Offset = 0;
4376 } else if (LdSt.getNumExplicitOperands() == 3) {
4377 BaseOp = &LdSt.getOperand(1);
4378 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4379 } else {
4380 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4381 BaseOp = &LdSt.getOperand(2);
4382 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4383 }
4384 OffsetIsScalable = Scale.isScalable();
4385
4386 return BaseOp->isReg() || BaseOp->isFI();
4387}
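// For example, an LDRXui with base x0 and immediate 3 (i.e. ldr x2, [x0, #24])
// reports BaseOp = x0 and Offset = 3 * 8 = 24 with an 8-byte width, while a
// post-index load such as LDRXpost reports operand 2 as the base and an
// Offset of 0, because the access itself is made at base + 0.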
4388
4391 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4392 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4393 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4394 return OfsOp;
4395}
4396
4397bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4398 TypeSize &Width, int64_t &MinOffset,
4399 int64_t &MaxOffset) {
4400 switch (Opcode) {
4401 // Not a memory operation or something we want to handle.
4402 default:
4403 Scale = TypeSize::getFixed(0);
4404 Width = TypeSize::getFixed(0);
4405 MinOffset = MaxOffset = 0;
4406 return false;
4407 // LDR / STR
4408 case AArch64::LDRQui:
4409 case AArch64::STRQui:
4410 Scale = TypeSize::getFixed(16);
4411 Width = TypeSize::getFixed(16);
4412 MinOffset = 0;
4413 MaxOffset = 4095;
4414 break;
4415 case AArch64::LDRXui:
4416 case AArch64::LDRDui:
4417 case AArch64::STRXui:
4418 case AArch64::STRDui:
4419 case AArch64::PRFMui:
4420 Scale = TypeSize::getFixed(8);
4421 Width = TypeSize::getFixed(8);
4422 MinOffset = 0;
4423 MaxOffset = 4095;
4424 break;
4425 case AArch64::LDRWui:
4426 case AArch64::LDRSui:
4427 case AArch64::LDRSWui:
4428 case AArch64::STRWui:
4429 case AArch64::STRSui:
4430 Scale = TypeSize::getFixed(4);
4431 Width = TypeSize::getFixed(4);
4432 MinOffset = 0;
4433 MaxOffset = 4095;
4434 break;
4435 case AArch64::LDRHui:
4436 case AArch64::LDRHHui:
4437 case AArch64::LDRSHWui:
4438 case AArch64::LDRSHXui:
4439 case AArch64::STRHui:
4440 case AArch64::STRHHui:
4441 Scale = TypeSize::getFixed(2);
4442 Width = TypeSize::getFixed(2);
4443 MinOffset = 0;
4444 MaxOffset = 4095;
4445 break;
4446 case AArch64::LDRBui:
4447 case AArch64::LDRBBui:
4448 case AArch64::LDRSBWui:
4449 case AArch64::LDRSBXui:
4450 case AArch64::STRBui:
4451 case AArch64::STRBBui:
4452 Scale = TypeSize::getFixed(1);
4453 Width = TypeSize::getFixed(1);
4454 MinOffset = 0;
4455 MaxOffset = 4095;
4456 break;
4457 // post/pre inc
4458 case AArch64::STRQpre:
4459 case AArch64::LDRQpost:
4460 Scale = TypeSize::getFixed(1);
4461 Width = TypeSize::getFixed(16);
4462 MinOffset = -256;
4463 MaxOffset = 255;
4464 break;
4465 case AArch64::LDRDpost:
4466 case AArch64::LDRDpre:
4467 case AArch64::LDRXpost:
4468 case AArch64::LDRXpre:
4469 case AArch64::STRDpost:
4470 case AArch64::STRDpre:
4471 case AArch64::STRXpost:
4472 case AArch64::STRXpre:
4473 Scale = TypeSize::getFixed(1);
4474 Width = TypeSize::getFixed(8);
4475 MinOffset = -256;
4476 MaxOffset = 255;
4477 break;
4478 case AArch64::STRWpost:
4479 case AArch64::STRWpre:
4480 case AArch64::LDRWpost:
4481 case AArch64::LDRWpre:
4482 case AArch64::STRSpost:
4483 case AArch64::STRSpre:
4484 case AArch64::LDRSpost:
4485 case AArch64::LDRSpre:
4486 Scale = TypeSize::getFixed(1);
4487 Width = TypeSize::getFixed(4);
4488 MinOffset = -256;
4489 MaxOffset = 255;
4490 break;
4491 case AArch64::LDRHpost:
4492 case AArch64::LDRHpre:
4493 case AArch64::STRHpost:
4494 case AArch64::STRHpre:
4495 case AArch64::LDRHHpost:
4496 case AArch64::LDRHHpre:
4497 case AArch64::STRHHpost:
4498 case AArch64::STRHHpre:
4499 Scale = TypeSize::getFixed(1);
4500 Width = TypeSize::getFixed(2);
4501 MinOffset = -256;
4502 MaxOffset = 255;
4503 break;
4504 case AArch64::LDRBpost:
4505 case AArch64::LDRBpre:
4506 case AArch64::STRBpost:
4507 case AArch64::STRBpre:
4508 case AArch64::LDRBBpost:
4509 case AArch64::LDRBBpre:
4510 case AArch64::STRBBpost:
4511 case AArch64::STRBBpre:
4512 Scale = TypeSize::getFixed(1);
4513 Width = TypeSize::getFixed(1);
4514 MinOffset = -256;
4515 MaxOffset = 255;
4516 break;
4517 // Unscaled
4518 case AArch64::LDURQi:
4519 case AArch64::STURQi:
4520 Scale = TypeSize::getFixed(1);
4521 Width = TypeSize::getFixed(16);
4522 MinOffset = -256;
4523 MaxOffset = 255;
4524 break;
4525 case AArch64::LDURXi:
4526 case AArch64::LDURDi:
4527 case AArch64::LDAPURXi:
4528 case AArch64::STURXi:
4529 case AArch64::STURDi:
4530 case AArch64::STLURXi:
4531 case AArch64::PRFUMi:
4532 Scale = TypeSize::getFixed(1);
4533 Width = TypeSize::getFixed(8);
4534 MinOffset = -256;
4535 MaxOffset = 255;
4536 break;
4537 case AArch64::LDURWi:
4538 case AArch64::LDURSi:
4539 case AArch64::LDURSWi:
4540 case AArch64::LDAPURi:
4541 case AArch64::LDAPURSWi:
4542 case AArch64::STURWi:
4543 case AArch64::STURSi:
4544 case AArch64::STLURWi:
4545 Scale = TypeSize::getFixed(1);
4546 Width = TypeSize::getFixed(4);
4547 MinOffset = -256;
4548 MaxOffset = 255;
4549 break;
4550 case AArch64::LDURHi:
4551 case AArch64::LDURHHi:
4552 case AArch64::LDURSHXi:
4553 case AArch64::LDURSHWi:
4554 case AArch64::LDAPURHi:
4555 case AArch64::LDAPURSHWi:
4556 case AArch64::LDAPURSHXi:
4557 case AArch64::STURHi:
4558 case AArch64::STURHHi:
4559 case AArch64::STLURHi:
4560 Scale = TypeSize::getFixed(1);
4561 Width = TypeSize::getFixed(2);
4562 MinOffset = -256;
4563 MaxOffset = 255;
4564 break;
4565 case AArch64::LDURBi:
4566 case AArch64::LDURBBi:
4567 case AArch64::LDURSBXi:
4568 case AArch64::LDURSBWi:
4569 case AArch64::LDAPURBi:
4570 case AArch64::LDAPURSBWi:
4571 case AArch64::LDAPURSBXi:
4572 case AArch64::STURBi:
4573 case AArch64::STURBBi:
4574 case AArch64::STLURBi:
4575 Scale = TypeSize::getFixed(1);
4576 Width = TypeSize::getFixed(1);
4577 MinOffset = -256;
4578 MaxOffset = 255;
4579 break;
4580 // LDP / STP (including pre/post inc)
4581 case AArch64::LDPQi:
4582 case AArch64::LDNPQi:
4583 case AArch64::STPQi:
4584 case AArch64::STNPQi:
4585 case AArch64::LDPQpost:
4586 case AArch64::LDPQpre:
4587 case AArch64::STPQpost:
4588 case AArch64::STPQpre:
4589 Scale = TypeSize::getFixed(16);
4590 Width = TypeSize::getFixed(16 * 2);
4591 MinOffset = -64;
4592 MaxOffset = 63;
4593 break;
4594 case AArch64::LDPXi:
4595 case AArch64::LDPDi:
4596 case AArch64::LDNPXi:
4597 case AArch64::LDNPDi:
4598 case AArch64::STPXi:
4599 case AArch64::STPDi:
4600 case AArch64::STNPXi:
4601 case AArch64::STNPDi:
4602 case AArch64::LDPDpost:
4603 case AArch64::LDPDpre:
4604 case AArch64::LDPXpost:
4605 case AArch64::LDPXpre:
4606 case AArch64::STPDpost:
4607 case AArch64::STPDpre:
4608 case AArch64::STPXpost:
4609 case AArch64::STPXpre:
4610 Scale = TypeSize::getFixed(8);
4611 Width = TypeSize::getFixed(8 * 2);
4612 MinOffset = -64;
4613 MaxOffset = 63;
4614 break;
4615 case AArch64::LDPWi:
4616 case AArch64::LDPSi:
4617 case AArch64::LDNPWi:
4618 case AArch64::LDNPSi:
4619 case AArch64::STPWi:
4620 case AArch64::STPSi:
4621 case AArch64::STNPWi:
4622 case AArch64::STNPSi:
4623 case AArch64::LDPSpost:
4624 case AArch64::LDPSpre:
4625 case AArch64::LDPWpost:
4626 case AArch64::LDPWpre:
4627 case AArch64::STPSpost:
4628 case AArch64::STPSpre:
4629 case AArch64::STPWpost:
4630 case AArch64::STPWpre:
4631 Scale = TypeSize::getFixed(4);
4632 Width = TypeSize::getFixed(4 * 2);
4633 MinOffset = -64;
4634 MaxOffset = 63;
4635 break;
4636 case AArch64::StoreSwiftAsyncContext:
4637 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4638 Scale = TypeSize::getFixed(1);
4639 Width = TypeSize::getFixed(8);
4640 MinOffset = 0;
4641 MaxOffset = 4095;
4642 break;
4643 case AArch64::ADDG:
4644 Scale = TypeSize::getFixed(16);
4645 Width = TypeSize::getFixed(0);
4646 MinOffset = 0;
4647 MaxOffset = 63;
4648 break;
4649 case AArch64::TAGPstack:
4650 Scale = TypeSize::getFixed(16);
4651 Width = TypeSize::getFixed(0);
4652 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4653 // of 63 (not 64!).
4654 MinOffset = -63;
4655 MaxOffset = 63;
4656 break;
4657 case AArch64::LDG:
4658 case AArch64::STGi:
4659 case AArch64::STGPreIndex:
4660 case AArch64::STGPostIndex:
4661 case AArch64::STZGi:
4662 case AArch64::STZGPreIndex:
4663 case AArch64::STZGPostIndex:
4664 Scale = TypeSize::getFixed(16);
4665 Width = TypeSize::getFixed(16);
4666 MinOffset = -256;
4667 MaxOffset = 255;
4668 break;
4669 // SVE
4670 case AArch64::STR_ZZZZXI:
4671 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4672 case AArch64::LDR_ZZZZXI:
4673 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4674 Scale = TypeSize::getScalable(16);
4675 Width = TypeSize::getScalable(16 * 4);
4676 MinOffset = -256;
4677 MaxOffset = 252;
4678 break;
4679 case AArch64::STR_ZZZXI:
4680 case AArch64::LDR_ZZZXI:
4681 Scale = TypeSize::getScalable(16);
4682 Width = TypeSize::getScalable(16 * 3);
4683 MinOffset = -256;
4684 MaxOffset = 253;
4685 break;
4686 case AArch64::STR_ZZXI:
4687 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4688 case AArch64::LDR_ZZXI:
4689 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4690 Scale = TypeSize::getScalable(16);
4691 Width = TypeSize::getScalable(16 * 2);
4692 MinOffset = -256;
4693 MaxOffset = 254;
4694 break;
4695 case AArch64::LDR_PXI:
4696 case AArch64::STR_PXI:
4697 Scale = TypeSize::getScalable(2);
4698 Width = TypeSize::getScalable(2);
4699 MinOffset = -256;
4700 MaxOffset = 255;
4701 break;
4702 case AArch64::LDR_PPXI:
4703 case AArch64::STR_PPXI:
4704 Scale = TypeSize::getScalable(2);
4705 Width = TypeSize::getScalable(2 * 2);
4706 MinOffset = -256;
4707 MaxOffset = 254;
4708 break;
4709 case AArch64::LDR_ZXI:
4710 case AArch64::STR_ZXI:
4711 Scale = TypeSize::getScalable(16);
4712 Width = TypeSize::getScalable(16);
4713 MinOffset = -256;
4714 MaxOffset = 255;
4715 break;
4716 case AArch64::LD1B_IMM:
4717 case AArch64::LD1H_IMM:
4718 case AArch64::LD1W_IMM:
4719 case AArch64::LD1D_IMM:
4720 case AArch64::LDNT1B_ZRI:
4721 case AArch64::LDNT1H_ZRI:
4722 case AArch64::LDNT1W_ZRI:
4723 case AArch64::LDNT1D_ZRI:
4724 case AArch64::ST1B_IMM:
4725 case AArch64::ST1H_IMM:
4726 case AArch64::ST1W_IMM:
4727 case AArch64::ST1D_IMM:
4728 case AArch64::STNT1B_ZRI:
4729 case AArch64::STNT1H_ZRI:
4730 case AArch64::STNT1W_ZRI:
4731 case AArch64::STNT1D_ZRI:
4732 case AArch64::LDNF1B_IMM:
4733 case AArch64::LDNF1H_IMM:
4734 case AArch64::LDNF1W_IMM:
4735 case AArch64::LDNF1D_IMM:
4736 // A full vector's worth of data
4737 // Width = mbytes * elements
4738 Scale = TypeSize::getScalable(16);
4739 Width = TypeSize::getScalable(16);
4740 MinOffset = -8;
4741 MaxOffset = 7;
4742 break;
4743 case AArch64::LD2B_IMM:
4744 case AArch64::LD2H_IMM:
4745 case AArch64::LD2W_IMM:
4746 case AArch64::LD2D_IMM:
4747 case AArch64::ST2B_IMM:
4748 case AArch64::ST2H_IMM:
4749 case AArch64::ST2W_IMM:
4750 case AArch64::ST2D_IMM:
4751 Scale = TypeSize::getScalable(32);
4752 Width = TypeSize::getScalable(16 * 2);
4753 MinOffset = -8;
4754 MaxOffset = 7;
4755 break;
4756 case AArch64::LD3B_IMM:
4757 case AArch64::LD3H_IMM:
4758 case AArch64::LD3W_IMM:
4759 case AArch64::LD3D_IMM:
4760 case AArch64::ST3B_IMM:
4761 case AArch64::ST3H_IMM:
4762 case AArch64::ST3W_IMM:
4763 case AArch64::ST3D_IMM:
4764 Scale = TypeSize::getScalable(48);
4765 Width = TypeSize::getScalable(16 * 3);
4766 MinOffset = -8;
4767 MaxOffset = 7;
4768 break;
4769 case AArch64::LD4B_IMM:
4770 case AArch64::LD4H_IMM:
4771 case AArch64::LD4W_IMM:
4772 case AArch64::LD4D_IMM:
4773 case AArch64::ST4B_IMM:
4774 case AArch64::ST4H_IMM:
4775 case AArch64::ST4W_IMM:
4776 case AArch64::ST4D_IMM:
4777 Scale = TypeSize::getScalable(64);
4778 Width = TypeSize::getScalable(16 * 4);
4779 MinOffset = -8;
4780 MaxOffset = 7;
4781 break;
4782 case AArch64::LD1B_H_IMM:
4783 case AArch64::LD1SB_H_IMM:
4784 case AArch64::LD1H_S_IMM:
4785 case AArch64::LD1SH_S_IMM:
4786 case AArch64::LD1W_D_IMM:
4787 case AArch64::LD1SW_D_IMM:
4788 case AArch64::ST1B_H_IMM:
4789 case AArch64::ST1H_S_IMM:
4790 case AArch64::ST1W_D_IMM:
4791 case AArch64::LDNF1B_H_IMM:
4792 case AArch64::LDNF1SB_H_IMM:
4793 case AArch64::LDNF1H_S_IMM:
4794 case AArch64::LDNF1SH_S_IMM:
4795 case AArch64::LDNF1W_D_IMM:
4796 case AArch64::LDNF1SW_D_IMM:
4798 // A half vector's worth of data
4798 // Width = mbytes * elements
4799 Scale = TypeSize::getScalable(8);
4800 Width = TypeSize::getScalable(8);
4801 MinOffset = -8;
4802 MaxOffset = 7;
4803 break;
4804 case AArch64::LD1B_S_IMM:
4805 case AArch64::LD1SB_S_IMM:
4806 case AArch64::LD1H_D_IMM:
4807 case AArch64::LD1SH_D_IMM:
4808 case AArch64::ST1B_S_IMM:
4809 case AArch64::ST1H_D_IMM:
4810 case AArch64::LDNF1B_S_IMM:
4811 case AArch64::LDNF1SB_S_IMM:
4812 case AArch64::LDNF1H_D_IMM:
4813 case AArch64::LDNF1SH_D_IMM:
4814 // A quarter vector's worth of data
4815 // Width = mbytes * elements
4816 Scale = TypeSize::getScalable(4);
4817 Width = TypeSize::getScalable(4);
4818 MinOffset = -8;
4819 MaxOffset = 7;
4820 break;
4821 case AArch64::LD1B_D_IMM:
4822 case AArch64::LD1SB_D_IMM:
4823 case AArch64::ST1B_D_IMM:
4824 case AArch64::LDNF1B_D_IMM:
4825 case AArch64::LDNF1SB_D_IMM:
4826 // An eighth vector's worth of data
4827 // Width = mbytes * elements
4828 Scale = TypeSize::getScalable(2);
4829 Width = TypeSize::getScalable(2);
4830 MinOffset = -8;
4831 MaxOffset = 7;
4832 break;
4833 case AArch64::ST2Gi:
4834 case AArch64::ST2GPreIndex:
4835 case AArch64::ST2GPostIndex:
4836 case AArch64::STZ2Gi:
4837 case AArch64::STZ2GPreIndex:
4838 case AArch64::STZ2GPostIndex:
4839 Scale = TypeSize::getFixed(16);
4840 Width = TypeSize::getFixed(32);
4841 MinOffset = -256;
4842 MaxOffset = 255;
4843 break;
4844 case AArch64::STGPi:
4845 case AArch64::STGPpost:
4846 case AArch64::STGPpre:
4847 Scale = TypeSize::getFixed(16);
4848 Width = TypeSize::getFixed(16);
4849 MinOffset = -64;
4850 MaxOffset = 63;
4851 break;
4852 case AArch64::LD1RB_IMM:
4853 case AArch64::LD1RB_H_IMM:
4854 case AArch64::LD1RB_S_IMM:
4855 case AArch64::LD1RB_D_IMM:
4856 case AArch64::LD1RSB_H_IMM:
4857 case AArch64::LD1RSB_S_IMM:
4858 case AArch64::LD1RSB_D_IMM:
4859 Scale = TypeSize::getFixed(1);
4860 Width = TypeSize::getFixed(1);
4861 MinOffset = 0;
4862 MaxOffset = 63;
4863 break;
4864 case AArch64::LD1RH_IMM:
4865 case AArch64::LD1RH_S_IMM:
4866 case AArch64::LD1RH_D_IMM:
4867 case AArch64::LD1RSH_S_IMM:
4868 case AArch64::LD1RSH_D_IMM:
4869 Scale = TypeSize::getFixed(2);
4870 Width = TypeSize::getFixed(2);
4871 MinOffset = 0;
4872 MaxOffset = 63;
4873 break;
4874 case AArch64::LD1RW_IMM:
4875 case AArch64::LD1RW_D_IMM:
4876 case AArch64::LD1RSW_IMM:
4877 Scale = TypeSize::getFixed(4);
4878 Width = TypeSize::getFixed(4);
4879 MinOffset = 0;
4880 MaxOffset = 63;
4881 break;
4882 case AArch64::LD1RD_IMM:
4883 Scale = TypeSize::getFixed(8);
4884 Width = TypeSize::getFixed(8);
4885 MinOffset = 0;
4886 MaxOffset = 63;
4887 break;
4888 }
4889
4890 return true;
4891}
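// A minimal caller-side sketch of how these values are typically combined to
// test whether a byte offset is encodable for a given opcode. The helper name
// isEncodableByteOffset is purely illustrative (it is not part of LLVM), the
// sketch ignores scalable-vector subtleties, and it assumes the declarations
// already included by this file.
static bool isEncodableByteOffset(unsigned Opcode, int64_t ByteOffset) {
  TypeSize Scale(0U, false), Width(0U, false);
  int64_t MinOffset, MaxOffset;
  // Reject opcodes that getMemOpInfo does not describe.
  if (!AArch64InstrInfo::getMemOpInfo(Opcode, Scale, Width, MinOffset,
                                      MaxOffset))
    return false;
  // The immediate field holds ByteOffset / Scale, so the offset must be a
  // multiple of Scale and the scaled value must lie in [MinOffset, MaxOffset].
  int64_t S = int64_t(Scale.getKnownMinValue());
  return ByteOffset % S == 0 && ByteOffset / S >= MinOffset &&
         ByteOffset / S <= MaxOffset;
}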
4892
4893// Scaling factor (in bytes) for scaled and unscaled loads and stores.
4895 switch (Opc) {
4896 default:
4897 llvm_unreachable("Opcode has unknown scale!");
4898 case AArch64::LDRBBui:
4899 case AArch64::LDURBBi:
4900 case AArch64::LDRSBWui:
4901 case AArch64::LDURSBWi:
4902 case AArch64::STRBBui:
4903 case AArch64::STURBBi:
4904 return 1;
4905 case AArch64::LDRHHui:
4906 case AArch64::LDURHHi:
4907 case AArch64::LDRSHWui:
4908 case AArch64::LDURSHWi:
4909 case AArch64::STRHHui:
4910 case AArch64::STURHHi:
4911 return 2;
4912 case AArch64::LDRSui:
4913 case AArch64::LDURSi:
4914 case AArch64::LDRSpre:
4915 case AArch64::LDRSWui:
4916 case AArch64::LDURSWi:
4917 case AArch64::LDRSWpre:
4918 case AArch64::LDRWpre:
4919 case AArch64::LDRWui:
4920 case AArch64::LDURWi:
4921 case AArch64::STRSui:
4922 case AArch64::STURSi:
4923 case AArch64::STRSpre:
4924 case AArch64::STRWui:
4925 case AArch64::STURWi:
4926 case AArch64::STRWpre:
4927 case AArch64::LDPSi:
4928 case AArch64::LDPSWi:
4929 case AArch64::LDPWi:
4930 case AArch64::STPSi:
4931 case AArch64::STPWi:
4932 return 4;
4933 case AArch64::LDRDui:
4934 case AArch64::LDURDi:
4935 case AArch64::LDRDpre:
4936 case AArch64::LDRXui:
4937 case AArch64::LDURXi:
4938 case AArch64::LDRXpre:
4939 case AArch64::STRDui:
4940 case AArch64::STURDi:
4941 case AArch64::STRDpre:
4942 case AArch64::STRXui:
4943 case AArch64::STURXi:
4944 case AArch64::STRXpre:
4945 case AArch64::LDPDi:
4946 case AArch64::LDPXi:
4947 case AArch64::STPDi:
4948 case AArch64::STPXi:
4949 return 8;
4950 case AArch64::LDRQui:
4951 case AArch64::LDURQi:
4952 case AArch64::STRQui:
4953 case AArch64::STURQi:
4954 case AArch64::STRQpre:
4955 case AArch64::LDPQi:
4956 case AArch64::LDRQpre:
4957 case AArch64::STPQi:
4958 case AArch64::STGi:
4959 case AArch64::STZGi:
4960 case AArch64::ST2Gi:
4961 case AArch64::STZ2Gi:
4962 case AArch64::STGPi:
4963 return 16;
4964 }
4965}
4966
4968 switch (MI.getOpcode()) {
4969 default:
4970 return false;
4971 case AArch64::LDRWpre:
4972 case AArch64::LDRXpre:
4973 case AArch64::LDRSWpre:
4974 case AArch64::LDRSpre:
4975 case AArch64::LDRDpre:
4976 case AArch64::LDRQpre:
4977 return true;
4978 }
4979}
4980
4982 switch (MI.getOpcode()) {
4983 default:
4984 return false;
4985 case AArch64::STRWpre:
4986 case AArch64::STRXpre:
4987 case AArch64::STRSpre:
4988 case AArch64::STRDpre:
4989 case AArch64::STRQpre:
4990 return true;
4991 }
4992}
4993
4995 return isPreLd(MI) || isPreSt(MI);
4996}
4997
4999 switch (MI.getOpcode()) {
5000 default:
5001 return false;
5002 case AArch64::LDPSi:
5003 case AArch64::LDPSWi:
5004 case AArch64::LDPDi:
5005 case AArch64::LDPQi:
5006 case AArch64::LDPWi:
5007 case AArch64::LDPXi:
5008 case AArch64::STPSi:
5009 case AArch64::STPDi:
5010 case AArch64::STPQi:
5011 case AArch64::STPWi:
5012 case AArch64::STPXi:
5013 case AArch64::STGPi:
5014 return true;
5015 }
5016}
5017
5019 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5020 unsigned Idx =
5022 : 1;
5023 return MI.getOperand(Idx);
5024}
5025
5026const MachineOperand &
5028 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5029 unsigned Idx =
5031 : 2;
5032 return MI.getOperand(Idx);
5033}
5034
5035const MachineOperand &
5037 switch (MI.getOpcode()) {
5038 default:
5039 llvm_unreachable("Unexpected opcode");
5040 case AArch64::LDRBroX:
5041 case AArch64::LDRBBroX:
5042 case AArch64::LDRSBXroX:
5043 case AArch64::LDRSBWroX:
5044 case AArch64::LDRHroX:
5045 case AArch64::LDRHHroX:
5046 case AArch64::LDRSHXroX:
5047 case AArch64::LDRSHWroX:
5048 case AArch64::LDRWroX:
5049 case AArch64::LDRSroX:
5050 case AArch64::LDRSWroX:
5051 case AArch64::LDRDroX:
5052 case AArch64::LDRXroX:
5053 case AArch64::LDRQroX:
5054 return MI.getOperand(4);
5055 }
5056}
5057
5059 Register Reg) {
5060 if (MI.getParent() == nullptr)
5061 return nullptr;
5062 const MachineFunction *MF = MI.getParent()->getParent();
5063 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5064}
5065
5067 auto IsHFPR = [&](const MachineOperand &Op) {
5068 if (!Op.isReg())
5069 return false;
5070 auto Reg = Op.getReg();
5071 if (Reg.isPhysical())
5072 return AArch64::FPR16RegClass.contains(Reg);
5073 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5074 return TRC == &AArch64::FPR16RegClass ||
5075 TRC == &AArch64::FPR16_loRegClass;
5076 };
5077 return llvm::any_of(MI.operands(), IsHFPR);
5078}
5079
5081 auto IsQFPR = [&](const MachineOperand &Op) {
5082 if (!Op.isReg())
5083 return false;
5084 auto Reg = Op.getReg();
5085 if (Reg.isPhysical())
5086 return AArch64::FPR128RegClass.contains(Reg);
5087 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5088 return TRC == &AArch64::FPR128RegClass ||
5089 TRC == &AArch64::FPR128_loRegClass;
5090 };
5091 return llvm::any_of(MI.operands(), IsQFPR);
5092}
5093
5095 switch (MI.getOpcode()) {
5096 case AArch64::BRK:
5097 case AArch64::HLT:
5098 case AArch64::PACIASP:
5099 case AArch64::PACIBSP:
5100 // Implicit BTI behavior.
5101 return true;
5102 case AArch64::PAUTH_PROLOGUE:
5103 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5104 return true;
5105 case AArch64::HINT: {
5106 unsigned Imm = MI.getOperand(0).getImm();
5107 // Explicit BTI instruction.
5108 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5109 return true;
5110 // PACI(A|B)SP instructions.
5111 if (Imm == 25 || Imm == 27)
5112 return true;
5113 return false;
5114 }
5115 default:
5116 return false;
5117 }
5118}
5119
5121 if (Reg == 0)
5122 return false;
5123 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5124 return AArch64::FPR128RegClass.contains(Reg) ||
5125 AArch64::FPR64RegClass.contains(Reg) ||
5126 AArch64::FPR32RegClass.contains(Reg) ||
5127 AArch64::FPR16RegClass.contains(Reg) ||
5128 AArch64::FPR8RegClass.contains(Reg);
5129}
5130
5132 auto IsFPR = [&](const MachineOperand &Op) {
5133 if (!Op.isReg())
5134 return false;
5135 auto Reg = Op.getReg();
5136 if (Reg.isPhysical())
5137 return isFpOrNEON(Reg);
5138
5139 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5140 return TRC == &AArch64::FPR128RegClass ||
5141 TRC == &AArch64::FPR128_loRegClass ||
5142 TRC == &AArch64::FPR64RegClass ||
5143 TRC == &AArch64::FPR64_loRegClass ||
5144 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5145 TRC == &AArch64::FPR8RegClass;
5146 };
5147 return llvm::any_of(MI.operands(), IsFPR);
5148}
5149
5150// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5151// scaled.
5152static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5154
5155 // If the byte-offset isn't a multiple of the stride, we can't scale this
5156 // offset.
5157 if (Offset % Scale != 0)
5158 return false;
5159
5160 // Convert the byte-offset used by unscaled into an "element" offset used
5161 // by the scaled pair load/store instructions.
5162 Offset /= Scale;
5163 return true;
5164}
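// For example, LDURXi has a memory scale of 8, so a byte offset of 24 becomes
// the element offset 3, while a byte offset of 20 is not a multiple of 8 and
// makes the function return false.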
5165
5166static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5167 if (FirstOpc == SecondOpc)
5168 return true;
5169 // We can also pair sign-ext and zero-ext instructions.
5170 switch (FirstOpc) {
5171 default:
5172 return false;
5173 case AArch64::STRSui:
5174 case AArch64::STURSi:
5175 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5176 case AArch64::STRDui:
5177 case AArch64::STURDi:
5178 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5179 case AArch64::STRQui:
5180 case AArch64::STURQi:
5181 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5182 case AArch64::STRWui:
5183 case AArch64::STURWi:
5184 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5185 case AArch64::STRXui:
5186 case AArch64::STURXi:
5187 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5188 case AArch64::LDRSui:
5189 case AArch64::LDURSi:
5190 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5191 case AArch64::LDRDui:
5192 case AArch64::LDURDi:
5193 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5194 case AArch64::LDRQui:
5195 case AArch64::LDURQi:
5196 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5197 case AArch64::LDRWui:
5198 case AArch64::LDURWi:
5199 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5200 case AArch64::LDRSWui:
5201 case AArch64::LDURSWi:
5202 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5203 case AArch64::LDRXui:
5204 case AArch64::LDURXi:
5205 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5206 }
5207 // These instructions can't be paired based on their opcodes.
5208 return false;
5209}
5210
5211static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5212 int64_t Offset1, unsigned Opcode1, int FI2,
5213 int64_t Offset2, unsigned Opcode2) {
5214 // Accesses through fixed stack object frame indices may access a different
5215 // fixed stack slot. Check that the object offsets + offsets match.
5216 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5217 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5218 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5219 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5220 // Convert to scaled object offsets.
5221 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5222 if (ObjectOffset1 % Scale1 != 0)
5223 return false;
5224 ObjectOffset1 /= Scale1;
5225 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5226 if (ObjectOffset2 % Scale2 != 0)
5227 return false;
5228 ObjectOffset2 /= Scale2;
5229 ObjectOffset1 += Offset1;
5230 ObjectOffset2 += Offset2;
5231 return ObjectOffset1 + 1 == ObjectOffset2;
5232 }
5233
5234 return FI1 == FI2;
5235}
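// For example, two 8-byte fixed objects at object offsets 0 and 8, each
// accessed with an instruction offset of 0, scale to object offsets 0 and 1,
// and 0 + 1 == 1, so the accesses are considered adjacent and may be
// clustered.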
5236
5237/// Detect opportunities for ldp/stp formation.
5238///
5239/// Only called for LdSt for which getMemOperandWithOffset returns true.
5241 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5242 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5243 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5244 unsigned NumBytes) const {
5245 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5246 const MachineOperand &BaseOp1 = *BaseOps1.front();
5247 const MachineOperand &BaseOp2 = *BaseOps2.front();
5248 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5249 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5250 if (BaseOp1.getType() != BaseOp2.getType())
5251 return false;
5252
5253 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5254 "Only base registers and frame indices are supported.");
5255
5256 // Check for both base regs and base FI.
5257 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5258 return false;
5259
5260 // Only cluster up to a single pair.
5261 if (ClusterSize > 2)
5262 return false;
5263
5264 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
5265 return false;
5266
5267 // Can we pair these instructions based on their opcodes?
5268 unsigned FirstOpc = FirstLdSt.getOpcode();
5269 unsigned SecondOpc = SecondLdSt.getOpcode();
5270 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5271 return false;
5272
5273 // Can't merge volatiles or load/stores that have a hint to avoid pair
5274 // formation, for example.
5275 if (!isCandidateToMergeOrPair(FirstLdSt) ||
5276 !isCandidateToMergeOrPair(SecondLdSt))
5277 return false;
5278
5279 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
5280 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
5281 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
5282 return false;
5283
5284 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
5285 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
5286 return false;
5287
5288 // Pairwise instructions have a 7-bit signed offset field.
5289 if (Offset1 > 63 || Offset1 < -64)
5290 return false;
5291
5292 // The caller should already have ordered First/SecondLdSt by offset.
5293 // Note: except for non-equal frame index bases
5294 if (BaseOp1.isFI()) {
5295 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5296 "Caller should have ordered offsets.");
5297
5298 const MachineFrameInfo &MFI =
5299 FirstLdSt.getParent()->getParent()->getFrameInfo();
5300 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
5301 BaseOp2.getIndex(), Offset2, SecondOpc);
5302 }
5303
5304 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5305
5306 return Offset1 + 1 == Offset2;
5307}
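// For example, two LDRXui loads from the same base register with immediates 0
// and 1 (bytes 0 and 8) satisfy Offset1 + 1 == Offset2 and can be clustered,
// assuming the earlier checks pass, while the equivalent LDURXi pair must
// first have its byte offsets scaled by 8 via scaleOffset before the same
// check applies.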
5308
5310 MCRegister Reg, unsigned SubIdx,
5311 unsigned State,
5312 const TargetRegisterInfo *TRI) {
5313 if (!SubIdx)
5314 return MIB.addReg(Reg, State);
5315
5316 if (Reg.isPhysical())
5317 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
5318 return MIB.addReg(Reg, State, SubIdx);
5319}
5320
5321static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
5322 unsigned NumRegs) {
5323 // We really want the positive remainder mod 32 here, which happens to be
5324 // easily obtainable with a mask.
5325 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
5326}
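// For example, when a two-register tuple is copied to a destination whose
// first register encoding is one above the source (DestReg - SrcReg == 1,
// NumRegs == 2), the result 1 < 2 signals an overlap, and copyPhysRegTuple
// below copies the sub-registers in reverse order so no source register is
// overwritten before it has been read.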
5327
5330 const DebugLoc &DL, MCRegister DestReg,
5331 MCRegister SrcReg, bool KillSrc,
5332 unsigned Opcode,
5333 ArrayRef<unsigned> Indices) const {
5334 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5336 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5337 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5338 unsigned NumRegs = Indices.size();
5339
5340 int SubReg = 0, End = NumRegs, Incr = 1;
5341 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
5342 SubReg = NumRegs - 1;
5343 End = -1;
5344 Incr = -1;
5345 }
5346
5347 for (; SubReg != End; SubReg += Incr) {
5348 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5349 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5350 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
5351 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5352 }
5353}
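// For example, a non-overlapping DD-pair copy with Opcode == ORRv8i8 expands
// to one register-to-register ORR per sub-register, roughly
//   orr v4.8b, v0.8b, v0.8b
//   orr v5.8b, v1.8b, v1.8b
// which is why AddSubReg adds the source sub-register twice: ORR reads two
// source operands.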
5354
5357 const DebugLoc &DL, MCRegister DestReg,
5358 MCRegister SrcReg, bool KillSrc,
5359 unsigned Opcode, unsigned ZeroReg,
5360 llvm::ArrayRef<unsigned> Indices) const {
5362 unsigned NumRegs = Indices.size();
5363
5364#ifndef NDEBUG
5365 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5366 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5367 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5368 "GPR reg sequences should not be able to overlap");
5369#endif
5370
5371 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5372 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5373 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5374 MIB.addReg(ZeroReg);
5375 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5376 MIB.addImm(0);
5377 }
5378}
5379
5382 const DebugLoc &DL, Register DestReg,
5383 Register SrcReg, bool KillSrc,
5384 bool RenamableDest,
5385 bool RenamableSrc) const {
5386 if (AArch64::GPR32spRegClass.contains(DestReg) &&
5387 AArch64::GPR32spRegClass.contains(SrcReg)) {
5388 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5389 // If either operand is WSP, expand to ADD #0.
5390 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5391 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5392 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5393 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5394 &AArch64::GPR64spRegClass);
5395 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5396 &AArch64::GPR64spRegClass);
5397 // This instruction is reading and writing X registers. This may upset
5398 // the register scavenger and machine verifier, so we need to indicate
5399 // that we are reading an undefined value from SrcRegX, but a proper
5400 // value from SrcReg.
5401 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
5402 .addReg(SrcRegX, RegState::Undef)
5403 .addImm(0)
5405 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5406 } else {
5407 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
5408 .addReg(SrcReg, getKillRegState(KillSrc))
5409 .addImm(0)
5411 }
5412 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5413 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5414 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5415 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5416 &AArch64::GPR64spRegClass);
5417 assert(DestRegX.isValid() && "Destination super-reg not valid");
5418 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5419 &AArch64::GPR64spRegClass);
5420 assert(SrcRegX.isValid() && "Source super-reg not valid");
5421 // This instruction is reading and writing X registers. This may upset
5422 // the register scavenger and machine verifier, so we need to indicate
5423 // that we are reading an undefined value from SrcRegX, but a proper
5424 // value from SrcReg.
5425 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
5426 .addReg(AArch64::XZR)
5427 .addReg(SrcRegX, RegState::Undef)
5428 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5429 } else {
5430 // Otherwise, expand to ORR WZR.
5431 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5432 .addReg(AArch64::WZR)
5433 .addReg(SrcReg, getKillRegState(KillSrc));
5434 }
5435 return;
5436 }
5437
5438 // GPR32 zeroing
5439 if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
5440 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5441 !Subtarget.hasZeroCycleZeroingGPR32()) {
5442 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5443 &AArch64::GPR64spRegClass);
5444 assert(DestRegX.isValid() && "Destination super-reg not valid");
5445 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
5446 .addImm(0)
5448 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5449 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
5450 .addImm(0)
5452 } else {
5453 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5454 .addReg(AArch64::WZR)
5455 .addReg(AArch64::WZR);
5456 }
5457 return;
5458 }
5459
5460 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5461 AArch64::GPR64spRegClass.contains(SrcReg)) {
5462 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5463 // If either operand is SP, expand to ADD #0.
5464 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5465 .addReg(SrcReg, getKillRegState(KillSrc))
5466 .addImm(0)
5468 } else {
5469 // Otherwise, expand to ORR XZR.
5470 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5471 .addReg(AArch64::XZR)
5472 .addReg(SrcReg, getKillRegState(KillSrc));
5473 }
5474 return;
5475 }
5476
5477 // GPR64 zeroing
5478 if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
5479 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5480 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5481 .addImm(0)
5483 } else {
5484 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5485 .addReg(AArch64::XZR)
5486 .addReg(AArch64::XZR);
5487 }
5488 return;
5489 }
5490
5491 // Copy a Predicate register by ORRing with itself.
5492 if (AArch64::PPRRegClass.contains(DestReg) &&
5493 AArch64::PPRRegClass.contains(SrcReg)) {
5494 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5495 "Unexpected SVE register.");
5496 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
5497 .addReg(SrcReg) // Pg
5498 .addReg(SrcReg)
5499 .addReg(SrcReg, getKillRegState(KillSrc));
5500 return;
5501 }
5502
5503 // Copy a predicate-as-counter register by ORRing with itself as if it
5504 // were a regular predicate (mask) register.
5505 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
5506 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
5507 if (DestIsPNR || SrcIsPNR) {
5508 auto ToPPR = [](MCRegister R) -> MCRegister {
5509 return (R - AArch64::PN0) + AArch64::P0;
5510 };
5511 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5512 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5513
5514 if (PPRSrcReg != PPRDestReg) {
5515 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
5516 .addReg(PPRSrcReg) // Pg
5517 .addReg(PPRSrcReg)
5518 .addReg(PPRSrcReg, getKillRegState(KillSrc));
5519 if (DestIsPNR)
5520 NewMI.addDef(DestReg, RegState::Implicit);
5521 }
5522 return;
5523 }
5524
5525 // Copy a Z register by ORRing with itself.
5526 if (AArch64::ZPRRegClass.contains(DestReg) &&
5527 AArch64::ZPRRegClass.contains(SrcReg)) {
5528 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5529 "Unexpected SVE register.");
5530 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
5531 .addReg(SrcReg)
5532 .addReg(SrcReg, getKillRegState(KillSrc));
5533 return;
5534 }
5535
5536 // Copy a Z register pair by copying the individual sub-registers.
5537 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5538 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5539 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5540 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5541 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5542 "Unexpected SVE register.");
5543 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5544 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5545 Indices);
5546 return;
5547 }
5548
5549 // Copy a Z register triple by copying the individual sub-registers.
5550 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5551 AArch64::ZPR3RegClass.contains(SrcReg)) {
5552 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5553 "Unexpected SVE register.");
5554 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5555 AArch64::zsub2};
5556 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5557 Indices);
5558 return;
5559 }
5560
5561 // Copy a Z register quad by copying the individual sub-registers.
5562 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5563 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5564 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5565 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5566 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5567 "Unexpected SVE register.");
5568 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5569 AArch64::zsub2, AArch64::zsub3};
5570 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5571 Indices);
5572 return;
5573 }
5574
5575 // Copy a DDDD register quad by copying the individual sub-registers.
5576 if (AArch64::DDDDRegClass.contains(DestReg) &&
5577 AArch64::DDDDRegClass.contains(SrcReg)) {
5578 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5579 AArch64::dsub2, AArch64::dsub3};
5580 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5581 Indices);
5582 return;
5583 }
5584
5585 // Copy a DDD register triple by copying the individual sub-registers.
5586 if (AArch64::DDDRegClass.contains(DestReg) &&
5587 AArch64::DDDRegClass.contains(SrcReg)) {
5588 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5589 AArch64::dsub2};
5590 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5591 Indices);
5592 return;
5593 }
5594
5595 // Copy a DD register pair by copying the individual sub-registers.
5596 if (AArch64::DDRegClass.contains(DestReg) &&
5597 AArch64::DDRegClass.contains(SrcReg)) {
5598 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5599 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5600 Indices);
5601 return;
5602 }
5603
5604 // Copy a QQQQ register quad by copying the individual sub-registers.
5605 if (AArch64::QQQQRegClass.contains(DestReg) &&
5606 AArch64::QQQQRegClass.contains(SrcReg)) {
5607 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5608 AArch64::qsub2, AArch64::qsub3};
5609 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5610 Indices);
5611 return;
5612 }
5613
5614 // Copy a QQQ register triple by copying the individual sub-registers.
5615 if (AArch64::QQQRegClass.contains(DestReg) &&
5616 AArch64::QQQRegClass.contains(SrcReg)) {
5617 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5618 AArch64::qsub2};
5619 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5620 Indices);
5621 return;
5622 }
5623
5624 // Copy a QQ register pair by copying the individual sub-registers.
5625 if (AArch64::QQRegClass.contains(DestReg) &&
5626 AArch64::QQRegClass.contains(SrcReg)) {
5627 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5628 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5629 Indices);
5630 return;
5631 }
5632
5633 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
5634 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
5635 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5636 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
5637 AArch64::XZR, Indices);
5638 return;
5639 }
5640
5641 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
5642 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
5643 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5644 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
5645 AArch64::WZR, Indices);
5646 return;
5647 }
5648
5649 if (AArch64::FPR128RegClass.contains(DestReg) &&
5650 AArch64::FPR128RegClass.contains(SrcReg)) {
5651 if (Subtarget.isSVEorStreamingSVEAvailable() &&
5652 !Subtarget.isNeonAvailable())
5653 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
5654 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
5655 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
5656 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
5657 else if (Subtarget.isNeonAvailable())
5658 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
5659 .addReg(SrcReg)
5660 .addReg(SrcReg, getKillRegState(KillSrc));
5661 else {
5662 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
5663 .addReg(AArch64::SP, RegState::Define)
5664 .addReg(SrcReg, getKillRegState(KillSrc))
5665 .addReg(AArch64::SP)
5666 .addImm(-16);
5667 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
5668 .addReg(AArch64::SP, RegState::Define)
5669 .addReg(DestReg, RegState::Define)
5670 .addReg(AArch64::SP)
5671 .addImm(16);
5672 }
5673 return;
5674 }
5675
5676 if (AArch64::FPR64RegClass.contains(DestReg) &&
5677 AArch64::FPR64RegClass.contains(SrcReg)) {
5678 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5679 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5680 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
5681 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
5682 &AArch64::FPR128RegClass);
5683 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
5684 &AArch64::FPR128RegClass);
5685 // This instruction is reading and writing Q registers. This may upset
5686 // the register scavenger and machine verifier, so we need to indicate
5687 // that we are reading an undefined value from SrcRegQ, but a proper
5688 // value from SrcReg.
5689 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5690 .addReg(SrcRegQ, RegState::Undef)
5691 .addReg(SrcRegQ, RegState::Undef)
5692 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5693 } else {
5694 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
5695 .addReg(SrcReg, getKillRegState(KillSrc));
5696 }
5697 return;
5698 }
5699
5700 if (AArch64::FPR32RegClass.contains(DestReg) &&
5701 AArch64::FPR32RegClass.contains(SrcReg)) {
5702 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5703 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5704 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
5705 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5706 &AArch64::FPR128RegClass);
5707 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5708 &AArch64::FPR128RegClass);
5709 // This instruction is reading and writing Q registers. This may upset
5710 // the register scavenger and machine verifier, so we need to indicate
5711 // that we are reading an undefined value from SrcRegQ, but a proper
5712 // value from SrcReg.
5713 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5714 .addReg(SrcRegQ, RegState::Undef)
5715 .addReg(SrcRegQ, RegState::Undef)
5716 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5717 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5718 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5719 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5720 &AArch64::FPR64RegClass);
5721 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5722 &AArch64::FPR64RegClass);
5723 // This instruction is reading and writing D registers. This may upset
5724 // the register scavenger and machine verifier, so we need to indicate
5725 // that we are reading an undefined value from SrcRegD, but a proper
5726 // value from SrcReg.
5727 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5728 .addReg(SrcRegD, RegState::Undef)
5729 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5730 } else {
5731 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5732 .addReg(SrcReg, getKillRegState(KillSrc));
5733 }
5734 return;
5735 }
5736
5737 if (AArch64::FPR16RegClass.contains(DestReg) &&
5738 AArch64::FPR16RegClass.contains(SrcReg)) {
5739 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5740 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5741 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
5742 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5743 &AArch64::FPR128RegClass);
5744 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5745 &AArch64::FPR128RegClass);
5746 // This instruction is reading and writing Q registers. This may upset
5747 // the register scavenger and machine verifier, so we need to indicate
5748 // that we are reading an undefined value from SrcRegQ, but a proper
5749 // value from SrcReg.
5750 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5751 .addReg(SrcRegQ, RegState::Undef)
5752 .addReg(SrcRegQ, RegState::Undef)
5753 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5754 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5755 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5756 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5757 &AArch64::FPR64RegClass);
5758 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5759 &AArch64::FPR64RegClass);
5760 // This instruction is reading and writing D registers. This may upset
5761 // the register scavenger and machine verifier, so we need to indicate
5762 // that we are reading an undefined value from SrcRegD, but a proper
5763 // value from SrcReg.
5764 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5765 .addReg(SrcRegD, RegState::Undef)
5766 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5767 } else {
5768 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5769 &AArch64::FPR32RegClass);
5770 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5771 &AArch64::FPR32RegClass);
5772 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5773 .addReg(SrcReg, getKillRegState(KillSrc));
5774 }
5775 return;
5776 }
5777
5778 if (AArch64::FPR8RegClass.contains(DestReg) &&
5779 AArch64::FPR8RegClass.contains(SrcReg)) {
5780 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5781 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5782 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
5783 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5784 &AArch64::FPR128RegClass);
5785 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5786 &AArch64::FPR128RegClass);
5787 // This instruction is reading and writing Q registers. This may upset
5788 // the register scavenger and machine verifier, so we need to indicate
5789 // that we are reading an undefined value from SrcRegQ, but a proper
5790 // value from SrcReg.
5791 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5792 .addReg(SrcRegQ, RegState::Undef)
5793 .addReg(SrcRegQ, RegState::Undef)
5794 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5795 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5796 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5797 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5798 &AArch64::FPR64RegClass);
5799 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5800 &AArch64::FPR64RegClass);
5801 // This instruction is reading and writing D registers. This may upset
5802 // the register scavenger and machine verifier, so we need to indicate
5803 // that we are reading an undefined value from SrcRegD, but a proper
5804 // value from SrcReg.
5805 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5806 .addReg(SrcRegD, RegState::Undef)
5807 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5808 } else {
5809 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5810 &AArch64::FPR32RegClass);
5811 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5812 &AArch64::FPR32RegClass);
5813 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5814 .addReg(SrcReg, getKillRegState(KillSrc));
5815 }
5816 return;
5817 }
5818
5819 // Copies between GPR64 and FPR64.
5820 if (AArch64::FPR64RegClass.contains(DestReg) &&
5821 AArch64::GPR64RegClass.contains(SrcReg)) {
5822 if (AArch64::XZR == SrcReg) {
5823 BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
5824 } else {
5825 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
5826 .addReg(SrcReg, getKillRegState(KillSrc));
5827 }
5828 return;
5829 }
5830 if (AArch64::GPR64RegClass.contains(DestReg) &&
5831 AArch64::FPR64RegClass.contains(SrcReg)) {
5832 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
5833 .addReg(SrcReg, getKillRegState(KillSrc));
5834 return;
5835 }
5836 // Copies between GPR32 and FPR32.
5837 if (AArch64::FPR32RegClass.contains(DestReg) &&
5838 AArch64::GPR32RegClass.contains(SrcReg)) {
5839 if (AArch64::WZR == SrcReg) {
5840 BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
5841 } else {
5842 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
5843 .addReg(SrcReg, getKillRegState(KillSrc));
5844 }
5845 return;
5846 }
5847 if (AArch64::GPR32RegClass.contains(DestReg) &&
5848 AArch64::FPR32RegClass.contains(SrcReg)) {
5849 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
5850 .addReg(SrcReg, getKillRegState(KillSrc));
5851 return;
5852 }
5853
5854 if (DestReg == AArch64::NZCV) {
5855 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
5856 BuildMI(MBB, I, DL, get(AArch64::MSR))
5857 .addImm(AArch64SysReg::NZCV)
5858 .addReg(SrcReg, getKillRegState(KillSrc))
5859 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
5860 return;
5861 }
5862
5863 if (SrcReg == AArch64::NZCV) {
5864 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
5865 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
5866 .addImm(AArch64SysReg::NZCV)
5867 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
5868 return;
5869 }
5870
5871#ifndef NDEBUG
5872 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
5873 << "\n";
5874#endif
5875 llvm_unreachable("unimplemented reg-to-reg copy");
5876}
5877
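// Helper to spill a register pair (e.g. an XSeqPairs/WSeqPairs tuple) to a
// stack slot with a single store-pair instruction, splitting a physical
// register into its two sub-registers via SubIdx0/SubIdx1.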
5878 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
5879 MachineBasicBlock &MBB,
5880 MachineBasicBlock::iterator InsertBefore,
5881 const MCInstrDesc &MCID,
5882 Register SrcReg, bool IsKill,
5883 unsigned SubIdx0, unsigned SubIdx1, int FI,
5884 MachineMemOperand *MMO) {
5885 Register SrcReg0 = SrcReg;
5886 Register SrcReg1 = SrcReg;
5887 if (SrcReg.isPhysical()) {
5888 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
5889 SubIdx0 = 0;
5890 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
5891 SubIdx1 = 0;
5892 }
5893 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
5894 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
5895 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
5896 .addFrameIndex(FI)
5897 .addImm(0)
5898 .addMemOperand(MMO);
5899}
5900
5901 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
5902 MachineBasicBlock::iterator MBBI,
5903 Register SrcReg, bool isKill, int FI,
5904 const TargetRegisterClass *RC,
5905 Register VReg,
5906 MachineInstr::MIFlag Flags) const {
5907 MachineFunction &MF = *MBB.getParent();
5908 MachineFrameInfo &MFI = MF.getFrameInfo();
5909
5910 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5911 MachineMemOperand *MMO =
5912 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
5913 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5914 unsigned Opc = 0;
5915 bool Offset = true;
5916 Register PNRReg = MCRegister::NoRegister;
5917 unsigned StackID = TargetStackID::Default;
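 // Select a store opcode based on the spill size of the register class:
 // register tuples without a single store instruction use ST1 variants (which
 // take no immediate offset), SVE registers use VL-scaled STR forms, and
 // sequential GPR pairs are delegated to storeRegPairToStackSlot (STP).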
5918 switch (RI.getSpillSize(*RC)) {
5919 case 1:
5920 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5921 Opc = AArch64::STRBui;
5922 break;
5923 case 2: {
5924 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5925 Opc = AArch64::STRHui;
5926 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
5927 AArch64::PPRRegClass.hasSubClassEq(RC)) {
5928 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5929 "Unexpected register store without SVE store instructions");
5930 Opc = AArch64::STR_PXI;
5932 }
5933 break;
5934 }
5935 case 4:
5936 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5937 Opc = AArch64::STRWui;
5938 if (SrcReg.isVirtual())
5939 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
5940 else
5941 assert(SrcReg != AArch64::WSP);
5942 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5943 Opc = AArch64::STRSui;
5944 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5945 Opc = AArch64::STR_PPXI;
5947 }
5948 break;
5949 case 8:
5950 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5951 Opc = AArch64::STRXui;
5952 if (SrcReg.isVirtual())
5953 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5954 else
5955 assert(SrcReg != AArch64::SP);
5956 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5957 Opc = AArch64::STRDui;
5958 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5959 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
5960 get(AArch64::STPWi), SrcReg, isKill,
5961 AArch64::sube32, AArch64::subo32, FI, MMO);
5962 return;
5963 }
5964 break;
5965 case 16:
5966 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5967 Opc = AArch64::STRQui;
5968 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5969 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5970 Opc = AArch64::ST1Twov1d;
5971 Offset = false;
5972 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5973 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
5974 get(AArch64::STPXi), SrcReg, isKill,
5975 AArch64::sube64, AArch64::subo64, FI, MMO);
5976 return;
5977 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5978 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5979 "Unexpected register store without SVE store instructions");
5980 Opc = AArch64::STR_ZXI;
5982 }
5983 break;
5984 case 24:
5985 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5986 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5987 Opc = AArch64::ST1Threev1d;
5988 Offset = false;
5989 }
5990 break;
5991 case 32:
5992 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5993 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5994 Opc = AArch64::ST1Fourv1d;
5995 Offset = false;
5996 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5997 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5998 Opc = AArch64::ST1Twov2d;
5999 Offset = false;
6000 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6001 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6002 "Unexpected register store without SVE store instructions");
6003 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6005 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6006 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6007 "Unexpected register store without SVE store instructions");
6008 Opc = AArch64::STR_ZZXI;
6010 }
6011 break;
6012 case 48:
6013 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6014 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6015 Opc = AArch64::ST1Threev2d;
6016 Offset = false;
6017 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6018 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6019 "Unexpected register store without SVE store instructions");
6020 Opc = AArch64::STR_ZZZXI;
6022 }
6023 break;
6024 case 64:
6025 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6026 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6027 Opc = AArch64::ST1Fourv2d;
6028 Offset = false;
6029 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6030 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6031 "Unexpected register store without SVE store instructions");
6032 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6034 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6035 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6036 "Unexpected register store without SVE store instructions");
6037 Opc = AArch64::STR_ZZZZXI;
6039 }
6040 break;
6041 }
6042 assert(Opc && "Unknown register class");
6043 MFI.setStackID(FI, StackID);
6044
6045 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
6046 .addReg(SrcReg, getKillRegState(isKill))
6047 .addFrameIndex(FI);
6048
6049 if (Offset)
6050 MI.addImm(0);
6051 if (PNRReg.isValid())
6052 MI.addDef(PNRReg, RegState::Implicit);
6053 MI.addMemOperand(MMO);
6054}
6055
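// Helper to reload a register pair from a stack slot with a single load-pair
// instruction, mirroring storeRegPairToStackSlot above.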
6056 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
6057 MachineBasicBlock &MBB,
6058 MachineBasicBlock::iterator InsertBefore,
6059 const MCInstrDesc &MCID,
6060 Register DestReg, unsigned SubIdx0,
6061 unsigned SubIdx1, int FI,
6062 MachineMemOperand *MMO) {
6063 Register DestReg0 = DestReg;
6064 Register DestReg1 = DestReg;
6065 bool IsUndef = true;
6066 if (DestReg.isPhysical()) {
6067 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6068 SubIdx0 = 0;
6069 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6070 SubIdx1 = 0;
6071 IsUndef = false;
6072 }
6073 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6074 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6075 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6076 .addFrameIndex(FI)
6077 .addImm(0)
6078 .addMemOperand(MMO);
6079}
6080
6081 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
6082 MachineBasicBlock::iterator MBBI,
6083 Register DestReg, int FI,
6084 const TargetRegisterClass *RC,
6085 Register VReg,
6086 MachineInstr::MIFlag Flags) const {
6087 MachineFunction &MF = *MBB.getParent();
6088 MachineFrameInfo &MFI = MF.getFrameInfo();
6089 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6090 MachineMemOperand *MMO =
6091 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
6092 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6093
6094 unsigned Opc = 0;
6095 bool Offset = true;
6096 unsigned StackID = TargetStackID::Default;
6097 Register PNRReg = MCRegister::NoRegister;
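 // Select a load opcode based on the spill size of the register class,
 // mirroring the opcode selection in storeRegToStackSlot above.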
6098 switch (TRI.getSpillSize(*RC)) {
6099 case 1:
6100 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6101 Opc = AArch64::LDRBui;
6102 break;
6103 case 2: {
6104 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6105 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6106 Opc = AArch64::LDRHui;
6107 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6108 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6109 "Unexpected register load without SVE load instructions");
6110 if (IsPNR)
6111 PNRReg = DestReg;
6112 Opc = AArch64::LDR_PXI;
6114 }
6115 break;
6116 }
6117 case 4:
6118 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6119 Opc = AArch64::LDRWui;
6120 if (DestReg.isVirtual())
6121 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6122 else
6123 assert(DestReg != AArch64::WSP);
6124 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6125 Opc = AArch64::LDRSui;
6126 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6127 Opc = AArch64::LDR_PPXI;
6129 }
6130 break;
6131 case 8:
6132 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6133 Opc = AArch64::LDRXui;
6134 if (DestReg.isVirtual())
6135 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6136 else
6137 assert(DestReg != AArch64::SP);
6138 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6139 Opc = AArch64::LDRDui;
6140 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6141 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
6142 get(AArch64::LDPWi), DestReg, AArch64::sube32,
6143 AArch64::subo32, FI, MMO);
6144 return;
6145 }
6146 break;
6147 case 16:
6148 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6149 Opc = AArch64::LDRQui;
6150 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6151 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6152 Opc = AArch64::LD1Twov1d;
6153 Offset = false;
6154 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6155 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
6156 get(AArch64::LDPXi), DestReg, AArch64::sube64,
6157 AArch64::subo64, FI, MMO);
6158 return;
6159 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6160 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6161 "Unexpected register load without SVE load instructions");
6162 Opc = AArch64::LDR_ZXI;
6164 }
6165 break;
6166 case 24:
6167 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6168 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6169 Opc = AArch64::LD1Threev1d;
6170 Offset = false;
6171 }
6172 break;
6173 case 32:
6174 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6175 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6176 Opc = AArch64::LD1Fourv1d;
6177 Offset = false;
6178 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6179 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6180 Opc = AArch64::LD1Twov2d;
6181 Offset = false;
6182 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6183 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6184 "Unexpected register load without SVE load instructions");
6185 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6187 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6188 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6189 "Unexpected register load without SVE load instructions");
6190 Opc = AArch64::LDR_ZZXI;
6192 }
6193 break;
6194 case 48:
6195 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6196 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6197 Opc = AArch64::LD1Threev2d;
6198 Offset = false;
6199 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6200 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6201 "Unexpected register load without SVE load instructions");
6202 Opc = AArch64::LDR_ZZZXI;
6204 }
6205 break;
6206 case 64:
6207 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6208 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6209 Opc = AArch64::LD1Fourv2d;
6210 Offset = false;
6211 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6212 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6213 "Unexpected register load without SVE load instructions");
6214 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6216 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6217 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6218 "Unexpected register load without SVE load instructions");
6219 Opc = AArch64::LDR_ZZZZXI;
6221 }
6222 break;
6223 }
6224
6225 assert(Opc && "Unknown register class");
6226 MFI.setStackID(FI, StackID);
6227
6228 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
6229 .addReg(DestReg, getDefRegState(true))
6230 .addFrameIndex(FI);
6231 if (Offset)
6232 MI.addImm(0);
6233 if (PNRReg.isValid() && !PNRReg.isVirtual())
6234 MI.addDef(PNRReg, RegState::Implicit);
6235 MI.addMemOperand(MMO);
6236}
6237
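// Returns true if any instruction strictly between DefMI and UseMI (ignoring
// debug instructions) reads or writes the NZCV flags.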
6238 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
6239 const MachineInstr &UseMI,
6240 const TargetRegisterInfo *TRI) {
6241 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6242 UseMI.getIterator()),
6243 [TRI](const MachineInstr &I) {
6244 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6245 I.readsRegister(AArch64::NZCV, TRI);
6246 });
6247}
6248
6249void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6250 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6251 // The smallest scalable elements supported by scaled SVE addressing
6252 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6253 // byte offset must always be a multiple of 2.
6254 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6255
6256 // VGSized offsets are divided by '2', because the VG register is the
6257 // number of 64bit granules as opposed to 128bit vector chunks,
6258 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6259 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6260 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
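 // For example, StackOffset::get(16, 32) (16 fixed + 32 scalable bytes)
 // decomposes into ByteSized = 16 and VGSized = 16, i.e. 16 + 16 * VG bytes.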
6261 ByteSized = Offset.getFixed();
6262 VGSized = Offset.getScalable() / 2;
6263}
6264
6265/// Returns the offset in parts to which this frame offset can be
6266/// decomposed for the purpose of describing a frame offset.
6267/// For non-scalable offsets this is simply its byte size.
6268void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6269 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6270 int64_t &NumDataVectors) {
6271 // The smallest scalable elements supported by scaled SVE addressing
6272 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6273 // byte offset must always be a multiple of 2.
6274 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6275
6276 NumBytes = Offset.getFixed();
6277 NumDataVectors = 0;
6278 NumPredicateVectors = Offset.getScalable() / 2;
6279 // This method is used to get the offsets to adjust the frame offset.
6280 // If the function requires ADDPL to be used and needs more than two ADDPL
6281 // instructions, part of the offset is folded into NumDataVectors so that it
6282 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
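 // For example, a scalable offset of 146 bytes (73 predicate-sized units)
 // becomes NumDataVectors = 9 and NumPredicateVectors = 1, so it can be
 // materialized as one ADDVL plus one ADDPL instead of several ADDPLs.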
6283 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6284 NumPredicateVectors > 62) {
6285 NumDataVectors = NumPredicateVectors / 8;
6286 NumPredicateVectors -= NumDataVectors * 8;
6287 }
6288}
6289
6290// Convenience function to create a DWARF expression for: Constant `Operation`.
6291 // This helper emits compact sequences for common cases. For example, for `-15
6292// DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
6293 static void appendConstantExpr(SmallVectorImpl<char> &Expr, int64_t Constant,
6294 dwarf::LocationAtom Operation) {
6295 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6296 // -Constant (1 to 31)
6297 Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6298 Operation = dwarf::DW_OP_minus;
6299 } else if (Constant >= 0 && Constant <= 31) {
6300 // Literal value 0 to 31
6301 Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6302 } else {
6303 // Signed constant
6304 Expr.push_back(dwarf::DW_OP_consts);
6305 appendLEB128<LEB128Sign::Signed>(Expr, Constant);
6306 }
6307 return Expr.push_back(Operation);
6308}
6309
6310// Convenience function to create a DWARF expression for a register.
6311static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6312 Expr.push_back((char)dwarf::DW_OP_bregx);
6313 appendLEB128<LEB128Sign::Unsigned>(Expr, RegNum);
6314 Expr.push_back(0);
6315}
6316
6317// Convenience function to create a DWARF expression for loading a register from
6318// a CFA offset.
6319 static void appendLoadRegExpr(SmallVectorImpl<char> &Expr,
6320 int64_t OffsetFromDefCFA) {
6321 // This assumes the top of the DWARF stack contains the CFA.
6322 Expr.push_back(dwarf::DW_OP_dup);
6323 // Add the offset to the register.
6324 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6325 // Dereference the address (loads a 64-bit value).
6326 Expr.push_back(dwarf::DW_OP_deref);
6327}
6328
6329// Convenience function to create a comment for
6330// (+/-) NumBytes (* RegScale)?
6331static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6332 StringRef RegScale = {}) {
6333 if (NumBytes) {
6334 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6335 if (!RegScale.empty())
6336 Comment << ' ' << RegScale;
6337 }
6338}
6339
6340// Creates an MCCFIInstruction:
6341// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
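// For example, a CFA of fp + 16 + 8 * VG (fp is DWARF reg 29) produces the
// expression DW_OP_breg29 16, DW_OP_bregx VG 0, DW_OP_lit8, DW_OP_mul,
// DW_OP_plus.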
6342 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
6343 unsigned Reg,
6344 const StackOffset &Offset) {
6345 int64_t NumBytes, NumVGScaledBytes;
6346 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6347 NumVGScaledBytes);
6348 std::string CommentBuffer;
6349 llvm::raw_string_ostream Comment(CommentBuffer);
6350
6351 if (Reg == AArch64::SP)
6352 Comment << "sp";
6353 else if (Reg == AArch64::FP)
6354 Comment << "fp";
6355 else
6356 Comment << printReg(Reg, &TRI);
6357
6358 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6359 SmallString<64> Expr;
6360 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6361 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6362 // Reg + NumBytes
6363 Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6364 appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6365 appendOffsetComment(NumBytes, Comment);
6366 if (NumVGScaledBytes) {
6367 // + VG * NumVGScaledBytes
6368 appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6369 appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6370 appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6371 Expr.push_back(dwarf::DW_OP_plus);
6372 }
6373
6374 // Wrap this into DW_CFA_def_cfa.
6375 SmallString<64> DefCfaExpr;
6376 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6377 appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6378 DefCfaExpr.append(Expr.str());
6379 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6380 Comment.str());
6381}
6382
6383 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
6384 unsigned FrameReg, unsigned Reg,
6385 const StackOffset &Offset,
6386 bool LastAdjustmentWasScalable) {
6387 if (Offset.getScalable())
6388 return createDefCFAExpression(TRI, Reg, Offset);
6389
6390 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6391 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6392
6393 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6394 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6395}
6396
6397 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
6398 unsigned Reg,
6399 const StackOffset &OffsetFromDefCFA,
6400 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6401 int64_t NumBytes, NumVGScaledBytes;
6402 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6403 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6404
6405 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6406
6407 // Non-scalable offsets can use DW_CFA_offset directly.
6408 if (!NumVGScaledBytes)
6409 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6410
6411 std::string CommentBuffer;
6412 llvm::raw_string_ostream Comment(CommentBuffer);
6413 Comment << printReg(Reg, &TRI) << " @ cfa";
6414
6415 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6416 assert(NumVGScaledBytes && "Expected scalable offset");
6417 SmallString<64> OffsetExpr;
6418 // + VG * NumVGScaledBytes
6419 StringRef VGRegScale;
6420 if (IncomingVGOffsetFromDefCFA) {
6421 appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6422 VGRegScale = "* IncomingVG";
6423 } else {
6424 appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6425 VGRegScale = "* VG";
6426 }
6427 appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6428 appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6429 OffsetExpr.push_back(dwarf::DW_OP_plus);
6430 if (NumBytes) {
6431 // + NumBytes
6432 appendOffsetComment(NumBytes, Comment);
6433 appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6434 }
6435
6436 // Wrap this into DW_CFA_expression
6437 SmallString<64> CfaExpr;
6438 CfaExpr.push_back(dwarf::DW_CFA_expression);
6439 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6440 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6441 CfaExpr.append(OffsetExpr.str());
6442
6443 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6444 Comment.str());
6445}
6446
6447// Helper function to emit a frame offset adjustment from a given
6448// pointer (SrcReg), stored into DestReg. This function is explicit
6449// in that it requires the opcode.
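// The adjustment is split into chunks that fit the opcode's immediate field,
// e.g. an ADDXri offset of 0x1001 is emitted as ADD #1, LSL #12 followed by
// ADD #1.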
6450 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
6451 MachineBasicBlock::iterator MBBI,
6452 const DebugLoc &DL, unsigned DestReg,
6453 unsigned SrcReg, int64_t Offset, unsigned Opc,
6454 const TargetInstrInfo *TII,
6455 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6456 bool *HasWinCFI, bool EmitCFAOffset,
6457 StackOffset CFAOffset, unsigned FrameReg) {
6458 int Sign = 1;
6459 unsigned MaxEncoding, ShiftSize;
6460 switch (Opc) {
6461 case AArch64::ADDXri:
6462 case AArch64::ADDSXri:
6463 case AArch64::SUBXri:
6464 case AArch64::SUBSXri:
6465 MaxEncoding = 0xfff;
6466 ShiftSize = 12;
6467 break;
6468 case AArch64::ADDVL_XXI:
6469 case AArch64::ADDPL_XXI:
6470 case AArch64::ADDSVL_XXI:
6471 case AArch64::ADDSPL_XXI:
6472 MaxEncoding = 31;
6473 ShiftSize = 0;
6474 if (Offset < 0) {
6475 MaxEncoding = 32;
6476 Sign = -1;
6477 Offset = -Offset;
6478 }
6479 break;
6480 default:
6481 llvm_unreachable("Unsupported opcode");
6482 }
6483
6484 // `Offset` can be in bytes or in "scalable bytes".
6485 int VScale = 1;
6486 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6487 VScale = 16;
6488 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6489 VScale = 2;
6490
6491 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6492 // scratch register. If DestReg is a virtual register, use it as the
6493 // scratch register; otherwise, create a new virtual register (to be
6494 // replaced by the scavenger at the end of PEI). That case can be optimized
6495 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6496 // register can be loaded with offset%8 and the add/sub can use an extending
6497 // instruction with LSL#3.
6498 // Currently the function handles any offsets but generates a poor sequence
6499 // of code.
6500 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6501
6502 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6503 Register TmpReg = DestReg;
6504 if (TmpReg == AArch64::XZR)
6505 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6506 &AArch64::GPR64RegClass);
6507 do {
6508 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6509 unsigned LocalShiftSize = 0;
6510 if (ThisVal > MaxEncoding) {
6511 ThisVal = ThisVal >> ShiftSize;
6512 LocalShiftSize = ShiftSize;
6513 }
6514 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6515 "Encoding cannot handle value that big");
6516
6517 Offset -= ThisVal << LocalShiftSize;
6518 if (Offset == 0)
6519 TmpReg = DestReg;
6520 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6521 .addReg(SrcReg)
6522 .addImm(Sign * (int)ThisVal);
6523 if (ShiftSize)
6524 MBI = MBI.addImm(
6525 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
6526 MBI = MBI.setMIFlag(Flag);
6527
6528 auto Change =
6529 VScale == 1
6530 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6531 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6532 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6533 CFAOffset += Change;
6534 else
6535 CFAOffset -= Change;
6536 if (EmitCFAOffset && DestReg == TmpReg) {
6537 MachineFunction &MF = *MBB.getParent();
6538 const TargetSubtargetInfo &STI = MF.getSubtarget();
6539 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6540
6541 unsigned CFIIndex = MF.addFrameInst(
6542 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6543 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6544 .addCFIIndex(CFIIndex)
6545 .setMIFlags(Flag);
6546 }
6547
6548 if (NeedsWinCFI) {
6549 int Imm = (int)(ThisVal << LocalShiftSize);
6550 if (VScale != 1 && DestReg == AArch64::SP) {
6551 if (HasWinCFI)
6552 *HasWinCFI = true;
6553 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6554 .addImm(ThisVal)
6555 .setMIFlag(Flag);
6556 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6557 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6558 assert(VScale == 1 && "Expected non-scalable operation");
6559 if (HasWinCFI)
6560 *HasWinCFI = true;
6561 if (Imm == 0)
6562 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6563 else
6564 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6565 .addImm(Imm)
6566 .setMIFlag(Flag);
6567 assert(Offset == 0 && "Expected remaining offset to be zero to "
6568 "emit a single SEH directive");
6569 } else if (DestReg == AArch64::SP) {
6570 assert(VScale == 1 && "Expected non-scalable operation");
6571 if (HasWinCFI)
6572 *HasWinCFI = true;
6573 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6574 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6575 .addImm(Imm)
6576 .setMIFlag(Flag);
6577 }
6578 }
6579
6580 SrcReg = TmpReg;
6581 } while (Offset);
6582}
6583
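// Emit a sequence of instructions computing DestReg = SrcReg + Offset by
// splitting the offset into a fixed byte part (ADD/SUB), whole SVE data
// vectors (ADDVL/ADDSVL) and predicate-sized parts (ADDPL/ADDSPL).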
6584 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
6585 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
6586 unsigned DestReg, unsigned SrcReg,
6587 StackOffset Offset, const TargetInstrInfo *TII,
6588 MachineInstr::MIFlag Flag, bool SetNZCV,
6589 bool NeedsWinCFI, bool *HasWinCFI,
6590 bool EmitCFAOffset, StackOffset CFAOffset,
6591 unsigned FrameReg) {
6592 // If a function is marked as arm_locally_streaming, then the runtime value of
6593 // vscale in the prologue/epilogue is different from the runtime value of vscale
6594 // in the function's body. To avoid having to consider multiple vscales,
6595 // we can use `addsvl` to allocate any scalable stack-slots, which under
6596 // most circumstances will be only locals, not callee-save slots.
6597 const Function &F = MBB.getParent()->getFunction();
6598 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6599
6600 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6601 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6602 Offset, Bytes, NumPredicateVectors, NumDataVectors);
6603
6604 // Insert ADDSXri for scalable offset at the end.
6605 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6606 if (NeedsFinalDefNZCV)
6607 SetNZCV = false;
6608
6609 // First emit non-scalable frame offsets, or a simple 'mov'.
6610 if (Bytes || (!Offset && SrcReg != DestReg)) {
6611 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6612 "SP increment/decrement not 8-byte aligned");
6613 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6614 if (Bytes < 0) {
6615 Bytes = -Bytes;
6616 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6617 }
6618 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
6619 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6620 FrameReg);
6621 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6622 ? StackOffset::getFixed(-Bytes)
6623 : StackOffset::getFixed(Bytes);
6624 SrcReg = DestReg;
6625 FrameReg = DestReg;
6626 }
6627
6628 assert(!(NeedsWinCFI && NumPredicateVectors) &&
6629 "WinCFI can't allocate fractions of an SVE data vector");
6630
6631 if (NumDataVectors) {
6632 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
6633 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6634 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6635 FrameReg);
6636 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
6637 SrcReg = DestReg;
6638 }
6639
6640 if (NumPredicateVectors) {
6641 assert(DestReg != AArch64::SP && "Unaligned access to SP");
6642 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
6643 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6644 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6645 FrameReg);
6646 }
6647
6648 if (NeedsFinalDefNZCV)
6649 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
6650 .addReg(DestReg)
6651 .addImm(0)
6652 .addImm(0);
6653}
6654
6655 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
6656 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
6657 MachineBasicBlock::iterator InsertPt, int FrameIndex,
6658 LiveIntervals *LIS, VirtRegMap *VRM) const {
6659 // This is a bit of a hack. Consider this instruction:
6660 //
6661 // %0 = COPY %sp; GPR64all:%0
6662 //
6663 // We explicitly chose GPR64all for the virtual register so such a copy might
6664 // be eliminated by RegisterCoalescer. However, that may not be possible, and
6665 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6666 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6667 //
6668 // To prevent that, we are going to constrain the %0 register class here.
6669 if (MI.isFullCopy()) {
6670 Register DstReg = MI.getOperand(0).getReg();
6671 Register SrcReg = MI.getOperand(1).getReg();
6672 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6673 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
6674 return nullptr;
6675 }
6676 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6677 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6678 return nullptr;
6679 }
6680 // Nothing can be folded with a copy from/to NZCV.
6681 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6682 return nullptr;
6683 }
6684
6685 // Handle the case where a copy is being spilled or filled but the source
6686 // and destination register class don't match. For example:
6687 //
6688 // %0 = COPY %xzr; GPR64common:%0
6689 //
6690 // In this case we can still safely fold away the COPY and generate the
6691 // following spill code:
6692 //
6693 // STRXui %xzr, %stack.0
6694 //
6695 // This also eliminates spilled cross register class COPYs (e.g. between x and
6696 // d regs) of the same size. For example:
6697 //
6698 // %0 = COPY %1; GPR64:%0, FPR64:%1
6699 //
6700 // will be filled as
6701 //
6702 // LDRDui %0, fi<#0>
6703 //
6704 // instead of
6705 //
6706 // LDRXui %Temp, fi<#0>
6707 // %0 = FMOV %Temp
6708 //
6709 if (MI.isCopy() && Ops.size() == 1 &&
6710 // Make sure we're only folding the explicit COPY defs/uses.
6711 (Ops[0] == 0 || Ops[0] == 1)) {
6712 bool IsSpill = Ops[0] == 0;
6713 bool IsFill = !IsSpill;
6714 const TargetRegisterInfo &TRI = getRegisterInfo();
6715 const MachineRegisterInfo &MRI = MF.getRegInfo();
6716 MachineBasicBlock &MBB = *MI.getParent();
6717 const MachineOperand &DstMO = MI.getOperand(0);
6718 const MachineOperand &SrcMO = MI.getOperand(1);
6719 Register DstReg = DstMO.getReg();
6720 Register SrcReg = SrcMO.getReg();
6721 // This is slightly expensive to compute for physical regs since
6722 // getMinimalPhysRegClass is slow.
6723 auto getRegClass = [&](unsigned Reg) {
6724 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6725 : TRI.getMinimalPhysRegClass(Reg);
6726 };
6727
6728 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6729 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6730 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6731 "Mismatched register size in non subreg COPY");
6732 if (IsSpill)
6733 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6734 getRegClass(SrcReg), Register());
6735 else
6736 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6737 getRegClass(DstReg), Register());
6738 return &*--InsertPt;
6739 }
6740
6741 // Handle cases like spilling def of:
6742 //
6743 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6744 //
6745 // where the physical register source can be widened and stored to the full
6746 // virtual reg destination stack slot, in this case producing:
6747 //
6748 // STRXui %xzr, %stack.0
6749 //
6750 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6751 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6752 assert(SrcMO.getSubReg() == 0 &&
6753 "Unexpected subreg on physical register");
6754 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6755 FrameIndex, &AArch64::GPR64RegClass, Register());
6756 return &*--InsertPt;
6757 }
6758
6759 // Handle cases like filling use of:
6760 //
6761 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6762 //
6763 // where we can load the full virtual reg source stack slot, into the subreg
6764 // destination, in this case producing:
6765 //
6766 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6767 //
6768 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6769 const TargetRegisterClass *FillRC = nullptr;
6770 switch (DstMO.getSubReg()) {
6771 default:
6772 break;
6773 case AArch64::sub_32:
6774 if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
6775 FillRC = &AArch64::GPR32RegClass;
6776 break;
6777 case AArch64::ssub:
6778 FillRC = &AArch64::FPR32RegClass;
6779 break;
6780 case AArch64::dsub:
6781 FillRC = &AArch64::FPR64RegClass;
6782 break;
6783 }
6784
6785 if (FillRC) {
6786 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6787 TRI.getRegSizeInBits(*FillRC) &&
6788 "Mismatched regclass size on folded subreg COPY");
6789 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
6790 Register());
6791 MachineInstr &LoadMI = *--InsertPt;
6792 MachineOperand &LoadDst = LoadMI.getOperand(0);
6793 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6794 LoadDst.setSubReg(DstMO.getSubReg());
6795 LoadDst.setIsUndef();
6796 return &LoadMI;
6797 }
6798 }
6799 }
6800
6801 // Cannot fold.
6802 return nullptr;
6803}
6804
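// Check whether the offset of a frame-index load/store can be encoded in the
// instruction's immediate field. Returns AArch64FrameOffset* status flags and,
// on request, the encodable immediate, whether an unscaled opcode should be
// used instead, and leaves any remainder in SOffset.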
6805 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
6806 StackOffset &SOffset,
6807 bool *OutUseUnscaledOp,
6808 unsigned *OutUnscaledOp,
6809 int64_t *EmittableOffset) {
6810 // Set output values in case of early exit.
6811 if (EmittableOffset)
6812 *EmittableOffset = 0;
6813 if (OutUseUnscaledOp)
6814 *OutUseUnscaledOp = false;
6815 if (OutUnscaledOp)
6816 *OutUnscaledOp = 0;
6817
6818 // Exit early for structured vector spills/fills as they can't take an
6819 // immediate offset.
6820 switch (MI.getOpcode()) {
6821 default:
6822 break;
6823 case AArch64::LD1Rv1d:
6824 case AArch64::LD1Rv2s:
6825 case AArch64::LD1Rv2d:
6826 case AArch64::LD1Rv4h:
6827 case AArch64::LD1Rv4s:
6828 case AArch64::LD1Rv8b:
6829 case AArch64::LD1Rv8h:
6830 case AArch64::LD1Rv16b:
6831 case AArch64::LD1Twov2d:
6832 case AArch64::LD1Threev2d:
6833 case AArch64::LD1Fourv2d:
6834 case AArch64::LD1Twov1d:
6835 case AArch64::LD1Threev1d:
6836 case AArch64::LD1Fourv1d:
6837 case AArch64::ST1Twov2d:
6838 case AArch64::ST1Threev2d:
6839 case AArch64::ST1Fourv2d:
6840 case AArch64::ST1Twov1d:
6841 case AArch64::ST1Threev1d:
6842 case AArch64::ST1Fourv1d:
6843 case AArch64::ST1i8:
6844 case AArch64::ST1i16:
6845 case AArch64::ST1i32:
6846 case AArch64::ST1i64:
6847 case AArch64::IRG:
6848 case AArch64::IRGstack:
6849 case AArch64::STGloop:
6850 case AArch64::STZGloop:
6851 return AArch64FrameOffsetCannotUpdate;
6852 }
6853
6854 // Get the min/max offset and the scale.
6855 TypeSize ScaleValue(0U, false), Width(0U, false);
6856 int64_t MinOff, MaxOff;
6857 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
6858 MaxOff))
6859 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6860
6861 // Construct the complete offset.
6862 bool IsMulVL = ScaleValue.isScalable();
6863 unsigned Scale = ScaleValue.getKnownMinValue();
6864 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
6865
6866 const MachineOperand &ImmOpnd =
6867 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
6868 Offset += ImmOpnd.getImm() * Scale;
6869
6870 // If the offset doesn't match the scale, we rewrite the instruction to
6871 // use the unscaled instruction instead. Likewise, if we have a negative
6872 // offset and there is an unscaled op to use.
6873 std::optional<unsigned> UnscaledOp =
6874 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
6875 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
6876 if (useUnscaledOp &&
6877 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
6878 MaxOff))
6879 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6880
6881 Scale = ScaleValue.getKnownMinValue();
6882 assert(IsMulVL == ScaleValue.isScalable() &&
6883 "Unscaled opcode has different value for scalable");
6884
6885 int64_t Remainder = Offset % Scale;
6886 assert(!(Remainder && useUnscaledOp) &&
6887 "Cannot have remainder when using unscaled op");
6888
6889 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
6890 int64_t NewOffset = Offset / Scale;
6891 if (MinOff <= NewOffset && NewOffset <= MaxOff)
6892 Offset = Remainder;
6893 else {
6894 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
6895 Offset = Offset - (NewOffset * Scale);
6896 }
6897
6898 if (EmittableOffset)
6899 *EmittableOffset = NewOffset;
6900 if (OutUseUnscaledOp)
6901 *OutUseUnscaledOp = useUnscaledOp;
6902 if (OutUnscaledOp && UnscaledOp)
6903 *OutUnscaledOp = *UnscaledOp;
6904
6905 if (IsMulVL)
6906 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
6907 else
6908 SOffset = StackOffset::get(Offset, SOffset.getScalable());
6909 return AArch64FrameOffsetCanUpdate |
6910 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
6911}
6912
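// Rewrite the frame-index operand of MI to use FrameReg plus an encodable
// immediate, folding as much of Offset as possible into the instruction;
// returns true if the offset was folded completely.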
6913 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
6914 unsigned FrameReg, StackOffset &Offset,
6915 const AArch64InstrInfo *TII) {
6916 unsigned Opcode = MI.getOpcode();
6917 unsigned ImmIdx = FrameRegIdx + 1;
6918
6919 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
6920 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
6921 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
6922 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
6923 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
6924 MI.eraseFromParent();
6925 Offset = StackOffset();
6926 return true;
6927 }
6928
6929 int64_t NewOffset;
6930 unsigned UnscaledOp;
6931 bool UseUnscaledOp;
6932 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
6933 &UnscaledOp, &NewOffset);
6934 if (Status & AArch64FrameOffsetCanUpdate) {
6935 if (Status & AArch64FrameOffsetIsLegal)
6936 // Replace the FrameIndex with FrameReg.
6937 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
6938 if (UseUnscaledOp)
6939 MI.setDesc(TII->get(UnscaledOp));
6940
6941 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
6942 return !Offset;
6943 }
6944
6945 return false;
6946}
6947
6953
6954MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
6955
6956// AArch64 supports MachineCombiner.
6957bool AArch64InstrInfo::useMachineCombiner() const { return true; }
6958
6959// True when Opc sets flag
6960static bool isCombineInstrSettingFlag(unsigned Opc) {
6961 switch (Opc) {
6962 case AArch64::ADDSWrr:
6963 case AArch64::ADDSWri:
6964 case AArch64::ADDSXrr:
6965 case AArch64::ADDSXri:
6966 case AArch64::SUBSWrr:
6967 case AArch64::SUBSXrr:
6968 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6969 case AArch64::SUBSWri:
6970 case AArch64::SUBSXri:
6971 return true;
6972 default:
6973 break;
6974 }
6975 return false;
6976}
6977
6978// 32b Opcodes that can be combined with a MUL
6979static bool isCombineInstrCandidate32(unsigned Opc) {
6980 switch (Opc) {
6981 case AArch64::ADDWrr:
6982 case AArch64::ADDWri:
6983 case AArch64::SUBWrr:
6984 case AArch64::ADDSWrr:
6985 case AArch64::ADDSWri:
6986 case AArch64::SUBSWrr:
6987 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6988 case AArch64::SUBWri:
6989 case AArch64::SUBSWri:
6990 return true;
6991 default:
6992 break;
6993 }
6994 return false;
6995}
6996
6997// 64b Opcodes that can be combined with a MUL
6998static bool isCombineInstrCandidate64(unsigned Opc) {
6999 switch (Opc) {
7000 case AArch64::ADDXrr:
7001 case AArch64::ADDXri:
7002 case AArch64::SUBXrr:
7003 case AArch64::ADDSXrr:
7004 case AArch64::ADDSXri:
7005 case AArch64::SUBSXrr:
7006 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7007 case AArch64::SUBXri:
7008 case AArch64::SUBSXri:
7009 case AArch64::ADDv8i8:
7010 case AArch64::ADDv16i8:
7011 case AArch64::ADDv4i16:
7012 case AArch64::ADDv8i16:
7013 case AArch64::ADDv2i32:
7014 case AArch64::ADDv4i32:
7015 case AArch64::SUBv8i8:
7016 case AArch64::SUBv16i8:
7017 case AArch64::SUBv4i16:
7018 case AArch64::SUBv8i16:
7019 case AArch64::SUBv2i32:
7020 case AArch64::SUBv4i32:
7021 return true;
7022 default:
7023 break;
7024 }
7025 return false;
7026}
7027
7028// FP Opcodes that can be combined with a FMUL.
7029static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7030 switch (Inst.getOpcode()) {
7031 default:
7032 break;
7033 case AArch64::FADDHrr:
7034 case AArch64::FADDSrr:
7035 case AArch64::FADDDrr:
7036 case AArch64::FADDv4f16:
7037 case AArch64::FADDv8f16:
7038 case AArch64::FADDv2f32:
7039 case AArch64::FADDv2f64:
7040 case AArch64::FADDv4f32:
7041 case AArch64::FSUBHrr:
7042 case AArch64::FSUBSrr:
7043 case AArch64::FSUBDrr:
7044 case AArch64::FSUBv4f16:
7045 case AArch64::FSUBv8f16:
7046 case AArch64::FSUBv2f32:
7047 case AArch64::FSUBv2f64:
7048 case AArch64::FSUBv4f32:
7049 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
7050 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7051 // the target options or if FADD/FSUB has the contract fast-math flag.
7052 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7053 Inst.getFlag(MachineInstr::FmContract);
7054 }
7055 return false;
7056}
7057
7058// Opcodes that can be combined with a MUL
7059 static bool isCombineInstrCandidate(unsigned Opc) {
7060 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
7061 }
7062
7063//
7064// Utility routine that checks if \param MO is defined by an
7065// \param CombineOpc instruction in the basic block \param MBB
7066 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
7067 unsigned CombineOpc, unsigned ZeroReg = 0,
7068 bool CheckZeroReg = false) {
7069 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7070 MachineInstr *MI = nullptr;
7071
7072 if (MO.isReg() && MO.getReg().isVirtual())
7073 MI = MRI.getUniqueVRegDef(MO.getReg());
7074 // And it needs to be in the trace (otherwise, it won't have a depth).
7075 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7076 return false;
7077 // Must only be used by the user we combine with.
7078 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7079 return false;
7080
7081 if (CheckZeroReg) {
7082 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7083 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7084 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
7085 // The third input reg must be zero.
7086 if (MI->getOperand(3).getReg() != ZeroReg)
7087 return false;
7088 }
7089
7090 if (isCombineInstrSettingFlag(CombineOpc) &&
7091 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7092 return false;
7093
7094 return true;
7095}
7096
7097//
7098// Is \param MO defined by an integer multiply and can be combined?
7099static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7100                              unsigned MulOpc, unsigned ZeroReg) {
7101 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7102}
7103
7104//
7105// Is \param MO defined by a floating-point multiply and can be combined?
7106static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7107                               unsigned MulOpc) {
7108 return canCombine(MBB, MO, MulOpc);
7109}
7110
7111// TODO: There are many more machine instruction opcodes to match:
7112// 1. Other data types (integer, vectors)
7113// 2. Other math / logic operations (xor, or)
7114// 3. Other forms of the same operation (intrinsics and other variants)
7115bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7116 bool Invert) const {
7117 if (Invert)
7118 return false;
7119 switch (Inst.getOpcode()) {
7120 // == Floating-point types ==
7121 // -- Floating-point instructions --
7122 case AArch64::FADDHrr:
7123 case AArch64::FADDSrr:
7124 case AArch64::FADDDrr:
7125 case AArch64::FMULHrr:
7126 case AArch64::FMULSrr:
7127 case AArch64::FMULDrr:
7128 case AArch64::FMULX16:
7129 case AArch64::FMULX32:
7130 case AArch64::FMULX64:
7131 // -- Advanced SIMD instructions --
7132 case AArch64::FADDv4f16:
7133 case AArch64::FADDv8f16:
7134 case AArch64::FADDv2f32:
7135 case AArch64::FADDv4f32:
7136 case AArch64::FADDv2f64:
7137 case AArch64::FMULv4f16:
7138 case AArch64::FMULv8f16:
7139 case AArch64::FMULv2f32:
7140 case AArch64::FMULv4f32:
7141 case AArch64::FMULv2f64:
7142 case AArch64::FMULXv4f16:
7143 case AArch64::FMULXv8f16:
7144 case AArch64::FMULXv2f32:
7145 case AArch64::FMULXv4f32:
7146 case AArch64::FMULXv2f64:
7147 // -- SVE instructions --
7148 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7149 // in the SVE instruction set (though there are predicated ones).
7150 case AArch64::FADD_ZZZ_H:
7151 case AArch64::FADD_ZZZ_S:
7152 case AArch64::FADD_ZZZ_D:
7153 case AArch64::FMUL_ZZZ_H:
7154 case AArch64::FMUL_ZZZ_S:
7155 case AArch64::FMUL_ZZZ_D:
7156    return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
7157           Inst.getFlag(MachineInstr::MIFlag::FmNsz);
7158
7159 // == Integer types ==
7160 // -- Base instructions --
7161 // Opcodes MULWrr and MULXrr don't exist because
7162 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7163 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7164  // The machine-combiner does not support three-source-operand machine
7165  // instructions, so we cannot reassociate MULs.
7166 case AArch64::ADDWrr:
7167 case AArch64::ADDXrr:
7168 case AArch64::ANDWrr:
7169 case AArch64::ANDXrr:
7170 case AArch64::ORRWrr:
7171 case AArch64::ORRXrr:
7172 case AArch64::EORWrr:
7173 case AArch64::EORXrr:
7174 case AArch64::EONWrr:
7175 case AArch64::EONXrr:
7176 // -- Advanced SIMD instructions --
7177 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7178 // in the Advanced SIMD instruction set.
7179 case AArch64::ADDv8i8:
7180 case AArch64::ADDv16i8:
7181 case AArch64::ADDv4i16:
7182 case AArch64::ADDv8i16:
7183 case AArch64::ADDv2i32:
7184 case AArch64::ADDv4i32:
7185 case AArch64::ADDv1i64:
7186 case AArch64::ADDv2i64:
7187 case AArch64::MULv8i8:
7188 case AArch64::MULv16i8:
7189 case AArch64::MULv4i16:
7190 case AArch64::MULv8i16:
7191 case AArch64::MULv2i32:
7192 case AArch64::MULv4i32:
7193 case AArch64::ANDv8i8:
7194 case AArch64::ANDv16i8:
7195 case AArch64::ORRv8i8:
7196 case AArch64::ORRv16i8:
7197 case AArch64::EORv8i8:
7198 case AArch64::EORv16i8:
7199 // -- SVE instructions --
7200 case AArch64::ADD_ZZZ_B:
7201 case AArch64::ADD_ZZZ_H:
7202 case AArch64::ADD_ZZZ_S:
7203 case AArch64::ADD_ZZZ_D:
7204 case AArch64::MUL_ZZZ_B:
7205 case AArch64::MUL_ZZZ_H:
7206 case AArch64::MUL_ZZZ_S:
7207 case AArch64::MUL_ZZZ_D:
7208 case AArch64::AND_ZZZ:
7209 case AArch64::ORR_ZZZ:
7210 case AArch64::EOR_ZZZ:
7211 return true;
7212
7213 default:
7214 return false;
7215 }
7216}
7217
7218/// Find instructions that can be turned into madd.
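/// For example (illustrative; the concrete registers are arbitrary, the
/// combiner operates on virtual registers):
///   mul  w8, w0, w1
///   add  w9, w8, w2
///   ==>  madd w9, w0, w1, w2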
7219static bool getMaddPatterns(MachineInstr &Root,
7220                            SmallVectorImpl<unsigned> &Patterns) {
7221 unsigned Opc = Root.getOpcode();
7222 MachineBasicBlock &MBB = *Root.getParent();
7223 bool Found = false;
7224
7225  if (!isCombineInstrCandidate(Opc))
7226    return false;
7227  if (isCombineInstrSettingFlag(Opc)) {
7228    int Cmp_NZCV =
7229 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
7230    // When NZCV is live, bail out.
7231 if (Cmp_NZCV == -1)
7232 return false;
7233 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
7234    // When the opcode can't be changed, bail out.
7235    // CHECKME: do we miss any cases for opcode conversion?
7236 if (NewOpc == Opc)
7237 return false;
7238 Opc = NewOpc;
7239 }
7240
7241 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7242 unsigned Pattern) {
7243 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7244 Patterns.push_back(Pattern);
7245 Found = true;
7246 }
7247 };
7248
7249 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7250 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7251 Patterns.push_back(Pattern);
7252 Found = true;
7253 }
7254 };
7255
7256  typedef AArch64MachineCombinerPattern MCP;
7257
7258 switch (Opc) {
7259 default:
7260 break;
7261 case AArch64::ADDWrr:
7262 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7263 "ADDWrr does not have register operands");
7264 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7265 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7266 break;
7267 case AArch64::ADDXrr:
7268 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7269 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7270 break;
7271 case AArch64::SUBWrr:
7272 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7273 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7274 break;
7275 case AArch64::SUBXrr:
7276 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7277 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7278 break;
7279 case AArch64::ADDWri:
7280 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7281 break;
7282 case AArch64::ADDXri:
7283 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7284 break;
7285 case AArch64::SUBWri:
7286 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7287 break;
7288 case AArch64::SUBXri:
7289 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7290 break;
7291 case AArch64::ADDv8i8:
7292 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7293 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7294 break;
7295 case AArch64::ADDv16i8:
7296 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7297 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7298 break;
7299 case AArch64::ADDv4i16:
7300 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7301 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7302 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7303 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7304 break;
7305 case AArch64::ADDv8i16:
7306 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7307 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7308 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7309 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7310 break;
7311 case AArch64::ADDv2i32:
7312 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7313 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7314 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7315 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7316 break;
7317 case AArch64::ADDv4i32:
7318 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7319 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7320 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7321 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7322 break;
7323 case AArch64::SUBv8i8:
7324 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7325 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7326 break;
7327 case AArch64::SUBv16i8:
7328 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7329 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7330 break;
7331 case AArch64::SUBv4i16:
7332 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7333 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7334 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7335 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7336 break;
7337 case AArch64::SUBv8i16:
7338 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7339 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7340 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7341 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7342 break;
7343 case AArch64::SUBv2i32:
7344 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7345 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7346 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7347 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7348 break;
7349 case AArch64::SUBv4i32:
7350 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7351 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7352 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7353 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7354 break;
7355 }
7356 return Found;
7357}
7358
7359bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7360 switch (Opcode) {
7361 default:
7362 break;
7363 case AArch64::UABALB_ZZZ_D:
7364 case AArch64::UABALB_ZZZ_H:
7365 case AArch64::UABALB_ZZZ_S:
7366 case AArch64::UABALT_ZZZ_D:
7367 case AArch64::UABALT_ZZZ_H:
7368 case AArch64::UABALT_ZZZ_S:
7369 case AArch64::SABALB_ZZZ_D:
7370 case AArch64::SABALB_ZZZ_S:
7371 case AArch64::SABALB_ZZZ_H:
7372 case AArch64::SABALT_ZZZ_D:
7373 case AArch64::SABALT_ZZZ_S:
7374 case AArch64::SABALT_ZZZ_H:
7375 case AArch64::UABALv16i8_v8i16:
7376 case AArch64::UABALv2i32_v2i64:
7377 case AArch64::UABALv4i16_v4i32:
7378 case AArch64::UABALv4i32_v2i64:
7379 case AArch64::UABALv8i16_v4i32:
7380 case AArch64::UABALv8i8_v8i16:
7381 case AArch64::UABAv16i8:
7382 case AArch64::UABAv2i32:
7383 case AArch64::UABAv4i16:
7384 case AArch64::UABAv4i32:
7385 case AArch64::UABAv8i16:
7386 case AArch64::UABAv8i8:
7387 case AArch64::SABALv16i8_v8i16:
7388 case AArch64::SABALv2i32_v2i64:
7389 case AArch64::SABALv4i16_v4i32:
7390 case AArch64::SABALv4i32_v2i64:
7391 case AArch64::SABALv8i16_v4i32:
7392 case AArch64::SABALv8i8_v8i16:
7393 case AArch64::SABAv16i8:
7394 case AArch64::SABAv2i32:
7395 case AArch64::SABAv4i16:
7396 case AArch64::SABAv4i32:
7397 case AArch64::SABAv8i16:
7398 case AArch64::SABAv8i8:
7399 return true;
7400 }
7401
7402 return false;
7403}
7404
7405unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7406 unsigned AccumulationOpcode) const {
7407 switch (AccumulationOpcode) {
7408 default:
7409 llvm_unreachable("Unsupported accumulation Opcode!");
7410 case AArch64::UABALB_ZZZ_D:
7411 return AArch64::UABDLB_ZZZ_D;
7412 case AArch64::UABALB_ZZZ_H:
7413 return AArch64::UABDLB_ZZZ_H;
7414 case AArch64::UABALB_ZZZ_S:
7415 return AArch64::UABDLB_ZZZ_S;
7416 case AArch64::UABALT_ZZZ_D:
7417 return AArch64::UABDLT_ZZZ_D;
7418 case AArch64::UABALT_ZZZ_H:
7419 return AArch64::UABDLT_ZZZ_H;
7420 case AArch64::UABALT_ZZZ_S:
7421 return AArch64::UABDLT_ZZZ_S;
7422 case AArch64::UABALv16i8_v8i16:
7423 return AArch64::UABDLv16i8_v8i16;
7424 case AArch64::UABALv2i32_v2i64:
7425 return AArch64::UABDLv2i32_v2i64;
7426 case AArch64::UABALv4i16_v4i32:
7427 return AArch64::UABDLv4i16_v4i32;
7428 case AArch64::UABALv4i32_v2i64:
7429 return AArch64::UABDLv4i32_v2i64;
7430 case AArch64::UABALv8i16_v4i32:
7431 return AArch64::UABDLv8i16_v4i32;
7432 case AArch64::UABALv8i8_v8i16:
7433 return AArch64::UABDLv8i8_v8i16;
7434 case AArch64::UABAv16i8:
7435 return AArch64::UABDv16i8;
7436 case AArch64::UABAv2i32:
7437 return AArch64::UABDv2i32;
7438 case AArch64::UABAv4i16:
7439 return AArch64::UABDv4i16;
7440 case AArch64::UABAv4i32:
7441 return AArch64::UABDv4i32;
7442 case AArch64::UABAv8i16:
7443 return AArch64::UABDv8i16;
7444 case AArch64::UABAv8i8:
7445 return AArch64::UABDv8i8;
7446 case AArch64::SABALB_ZZZ_D:
7447 return AArch64::SABDLB_ZZZ_D;
7448 case AArch64::SABALB_ZZZ_S:
7449 return AArch64::SABDLB_ZZZ_S;
7450 case AArch64::SABALB_ZZZ_H:
7451 return AArch64::SABDLB_ZZZ_H;
7452 case AArch64::SABALT_ZZZ_D:
7453 return AArch64::SABDLT_ZZZ_D;
7454 case AArch64::SABALT_ZZZ_S:
7455 return AArch64::SABDLT_ZZZ_S;
7456 case AArch64::SABALT_ZZZ_H:
7457 return AArch64::SABDLT_ZZZ_H;
7458 case AArch64::SABALv16i8_v8i16:
7459 return AArch64::SABDLv16i8_v8i16;
7460 case AArch64::SABALv2i32_v2i64:
7461 return AArch64::SABDLv2i32_v2i64;
7462 case AArch64::SABALv4i16_v4i32:
7463 return AArch64::SABDLv4i16_v4i32;
7464 case AArch64::SABALv4i32_v2i64:
7465 return AArch64::SABDLv4i32_v2i64;
7466 case AArch64::SABALv8i16_v4i32:
7467 return AArch64::SABDLv8i16_v4i32;
7468 case AArch64::SABALv8i8_v8i16:
7469 return AArch64::SABDLv8i8_v8i16;
7470 case AArch64::SABAv16i8:
7471 return AArch64::SABDv16i8;
7472 case AArch64::SABAv2i32:
7473    return AArch64::SABDv2i32;
7474 case AArch64::SABAv4i16:
7475 return AArch64::SABDv4i16;
7476 case AArch64::SABAv4i32:
7477 return AArch64::SABDv4i32;
7478 case AArch64::SABAv8i16:
7479 return AArch64::SABDv8i16;
7480 case AArch64::SABAv8i8:
7481 return AArch64::SABDv8i8;
7482 }
7483}
7484
7485/// Floating-Point Support
7486
7487/// Find instructions that can be turned into madd.
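/// For example (illustrative; fusion requires the contract fast-math flag or
/// global FP-op fusion, as checked in isCombineInstrCandidateFP):
///   fmul s2, s0, s1
///   fadd s3, s2, s4
///   ==>  fmadd s3, s0, s1, s4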
7488static bool getFMAPatterns(MachineInstr &Root,
7489                           SmallVectorImpl<unsigned> &Patterns) {
7490
7491 if (!isCombineInstrCandidateFP(Root))
7492 return false;
7493
7494 MachineBasicBlock &MBB = *Root.getParent();
7495 bool Found = false;
7496
7497 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7498 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7499 Patterns.push_back(Pattern);
7500 return true;
7501 }
7502 return false;
7503 };
7504
7505  typedef AArch64MachineCombinerPattern MCP;
7506
7507 switch (Root.getOpcode()) {
7508 default:
7509 assert(false && "Unsupported FP instruction in combiner\n");
7510 break;
7511 case AArch64::FADDHrr:
7512 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7513 "FADDHrr does not have register operands");
7514
7515 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7516 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7517 break;
7518 case AArch64::FADDSrr:
7519 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7520 "FADDSrr does not have register operands");
7521
7522 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7523 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7524
7525 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7526 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7527 break;
7528 case AArch64::FADDDrr:
7529 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7530 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7531
7532 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7533 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7534 break;
7535 case AArch64::FADDv4f16:
7536 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7537 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7538
7539 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7540 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7541 break;
7542 case AArch64::FADDv8f16:
7543 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7544 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7545
7546 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7547 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7548 break;
7549 case AArch64::FADDv2f32:
7550 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7551 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7552
7553 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7554 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7555 break;
7556 case AArch64::FADDv2f64:
7557 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7558 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7559
7560 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7561 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7562 break;
7563 case AArch64::FADDv4f32:
7564 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7565 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7566
7567 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7568 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7569 break;
7570 case AArch64::FSUBHrr:
7571 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7572 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7573 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7574 break;
7575 case AArch64::FSUBSrr:
7576 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7577
7578 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7579 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7580
7581 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7582 break;
7583 case AArch64::FSUBDrr:
7584 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7585
7586 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7587 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7588
7589 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7590 break;
7591 case AArch64::FSUBv4f16:
7592 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7593 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7594
7595 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7596 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7597 break;
7598 case AArch64::FSUBv8f16:
7599 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7600 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7601
7602 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7603 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7604 break;
7605 case AArch64::FSUBv2f32:
7606 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7607 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7608
7609 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7610 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7611 break;
7612 case AArch64::FSUBv2f64:
7613 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7614 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7615
7616 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7617 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7618 break;
7619 case AArch64::FSUBv4f32:
7620 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7621 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7622
7623 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7624 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7625 break;
7626 }
7627 return Found;
7628}
7629
7630static bool getFMULPatterns(MachineInstr &Root,
7631                            SmallVectorImpl<unsigned> &Patterns) {
7632 MachineBasicBlock &MBB = *Root.getParent();
7633 bool Found = false;
7634
7635 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7636 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7637 MachineOperand &MO = Root.getOperand(Operand);
7638 MachineInstr *MI = nullptr;
7639 if (MO.isReg() && MO.getReg().isVirtual())
7640 MI = MRI.getUniqueVRegDef(MO.getReg());
7641 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7642 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7643 MI->getOperand(1).getReg().isVirtual())
7644 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
7645 if (MI && MI->getOpcode() == Opcode) {
7646 Patterns.push_back(Pattern);
7647 return true;
7648 }
7649 return false;
7650 };
7651
7652  typedef AArch64MachineCombinerPattern MCP;
7653
7654 switch (Root.getOpcode()) {
7655 default:
7656 return false;
7657 case AArch64::FMULv2f32:
7658 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7659 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7660 break;
7661 case AArch64::FMULv2f64:
7662 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7663 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7664 break;
7665 case AArch64::FMULv4f16:
7666 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7667 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7668 break;
7669 case AArch64::FMULv4f32:
7670 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7671 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7672 break;
7673 case AArch64::FMULv8f16:
7674 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7675 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7676 break;
7677 }
7678
7679 return Found;
7680}
7681
7682static bool getFNEGPatterns(MachineInstr &Root,
7683                            SmallVectorImpl<unsigned> &Patterns) {
7684 unsigned Opc = Root.getOpcode();
7685 MachineBasicBlock &MBB = *Root.getParent();
7686 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7687
7688 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7689 MachineOperand &MO = Root.getOperand(1);
7690 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
7691 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7692 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
7693        Root.getFlag(MachineInstr::MIFlag::FmContract) &&
7694        Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
7695        MI->getFlag(MachineInstr::MIFlag::FmContract) &&
7696        MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
7697 Patterns.push_back(Pattern);
7698 return true;
7699 }
7700 return false;
7701 };
7702
7703 switch (Opc) {
7704 default:
7705 break;
7706 case AArch64::FNEGDr:
7707 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7708 case AArch64::FNEGSr:
7709 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7710 }
7711
7712 return false;
7713}
7714
7715/// Return true when a code sequence can improve throughput. It
7716/// should be called only for instructions in loops.
7717/// \param Pattern - combiner pattern
7718bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
7719  switch (Pattern) {
7720 default:
7721 break;
7827 return true;
7828 } // end switch (Pattern)
7829 return false;
7830}
7831
7832/// Find other MI combine patterns.
7833static bool getMiscPatterns(MachineInstr &Root,
7834                            SmallVectorImpl<unsigned> &Patterns) {
7835 // A - (B + C) ==> (A - B) - C or (A - C) - B
7836 unsigned Opc = Root.getOpcode();
7837 MachineBasicBlock &MBB = *Root.getParent();
7838
7839 switch (Opc) {
7840 case AArch64::SUBWrr:
7841 case AArch64::SUBSWrr:
7842 case AArch64::SUBXrr:
7843 case AArch64::SUBSXrr:
7844 // Found candidate root.
7845 break;
7846 default:
7847 return false;
7848 }
7849
7850  if (isCombineInstrSettingFlag(Opc) &&
7851      Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
7852 -1)
7853 return false;
7854
7855 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
7856 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
7857 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
7858 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
7859    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
7860    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
7861    return true;
7862 }
7863
7864 return false;
7865}
7866
7867/// Check if the given instruction forms a gather load pattern that can be
7868/// optimized for better Memory-Level Parallelism (MLP). This function
7869/// identifies chains of NEON lane load instructions that load data from
7870/// different memory addresses into individual lanes of a 128-bit vector
7871/// register, then attempts to split the pattern into parallel loads to break
7872/// the serial dependency between instructions.
7873///
7874/// Pattern Matched:
7875/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
7876/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
7877///
7878/// Transformed Into:
7879/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
7880/// to combine the results, enabling better memory-level parallelism.
7881///
7882/// Supported Element Types:
7883/// - 32-bit elements (LD1i32, 4 lanes total)
7884/// - 16-bit elements (LD1i16, 8 lanes total)
7885/// - 8-bit elements (LD1i8, 16 lanes total)
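///
/// For example (illustrative, 32-bit elements; concrete registers arbitrary):
///   ldr s0, [x0]
///   ld1 { v0.s }[1], [x1]
///   ld1 { v0.s }[2], [x2]
///   ld1 { v0.s }[3], [x3]
/// may be split into two independent two-lane chains combined with a zip:
///   ldr s0, [x0]
///   ld1 { v0.s }[1], [x1]
///   ldr s1, [x2]
///   ld1 { v1.s }[1], [x3]
///   zip1 v0.2d, v0.2d, v1.2d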
7886static bool getGatherLanePattern(MachineInstr &Root,
7887                                 SmallVectorImpl<unsigned> &Patterns,
7888 unsigned LoadLaneOpCode, unsigned NumLanes) {
7889 const MachineFunction *MF = Root.getMF();
7890
7891 // Early exit if optimizing for size.
7892 if (MF->getFunction().hasMinSize())
7893 return false;
7894
7895 const MachineRegisterInfo &MRI = MF->getRegInfo();
7896  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
7897
7898 // The root of the pattern must load into the last lane of the vector.
7899 if (Root.getOperand(2).getImm() != NumLanes - 1)
7900 return false;
7901
7902  // Check that we have loads into all lanes except lane 0.
7903 // For each load we also want to check that:
7904 // 1. It has a single non-debug use (since we will be replacing the virtual
7905 // register)
7906 // 2. That the addressing mode only uses a single pointer operand
7907 auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
7908 auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
7909 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
7910  SmallVector<const MachineInstr *, 16> LoadInstrs;
7911  while (!RemainingLanes.empty() && CurrInstr &&
7912 CurrInstr->getOpcode() == LoadLaneOpCode &&
7913 MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
7914 CurrInstr->getNumOperands() == 4) {
7915 RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
7916 LoadInstrs.push_back(CurrInstr);
7917 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
7918 }
7919
7920 // Check that we have found a match for lanes N-1.. 1.
7921 if (!RemainingLanes.empty())
7922 return false;
7923
7924 // Match the SUBREG_TO_REG sequence.
7925 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
7926 return false;
7927
7928 // Verify that the subreg to reg loads an integer into the first lane.
7929 auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
7930 unsigned SingleLaneSizeInBits = 128 / NumLanes;
7931 if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
7932 return false;
7933
7934  // Verify that it also has a single non-debug use.
7935 if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
7936 return false;
7937
7938 LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
7939
7940 // If there is any chance of aliasing, do not apply the pattern.
7941 // Walk backward through the MBB starting from Root.
7942 // Exit early if we've encountered all load instructions or hit the search
7943 // limit.
7944 auto MBBItr = Root.getIterator();
7945 unsigned RemainingSteps = GatherOptSearchLimit;
7946 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
7947 RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
7948 const MachineBasicBlock *MBB = Root.getParent();
7949
7950 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
7951 !RemainingLoadInstrs.empty();
7952 --MBBItr, --RemainingSteps) {
7953 const MachineInstr &CurrInstr = *MBBItr;
7954
7955 // Remove this instruction from remaining loads if it's one we're tracking.
7956 RemainingLoadInstrs.erase(&CurrInstr);
7957
7958 // Check for potential aliasing with any of the load instructions to
7959 // optimize.
7960 if (CurrInstr.isLoadFoldBarrier())
7961 return false;
7962 }
7963
7964 // If we hit the search limit without finding all load instructions,
7965 // don't match the pattern.
7966 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
7967 return false;
7968
7969 switch (NumLanes) {
7970  case 4:
7971    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
7972    break;
7973  case 8:
7974    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
7975    break;
7976  case 16:
7977    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
7978    break;
7979 default:
7980 llvm_unreachable("Got bad number of lanes for gather pattern.");
7981 }
7982
7983 return true;
7984}
7985
7986/// Search for patterns of LD instructions we can optimize.
7987static bool getLoadPatterns(MachineInstr &Root,
7988                            SmallVectorImpl<unsigned> &Patterns) {
7989
7990 // The pattern searches for loads into single lanes.
7991 switch (Root.getOpcode()) {
7992 case AArch64::LD1i32:
7993 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
7994 case AArch64::LD1i16:
7995 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
7996 case AArch64::LD1i8:
7997 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
7998 default:
7999 return false;
8000 }
8001}
8002
8003/// Generate optimized instruction sequence for gather load patterns to improve
8004/// Memory-Level Parallelism (MLP). This function transforms a chain of
8005/// sequential NEON lane loads into parallel vector loads that can execute
8006/// concurrently.
8007static void
8008generateGatherLanePattern(MachineInstr &Root,
8009                          SmallVectorImpl<MachineInstr *> &InsInstrs,
8010                          SmallVectorImpl<MachineInstr *> &DelInstrs,
8011                          DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8012 unsigned Pattern, unsigned NumLanes) {
8013 MachineFunction &MF = *Root.getParent()->getParent();
8014  MachineRegisterInfo &MRI = MF.getRegInfo();
8015  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8016
8017 // Gather the initial load instructions to build the pattern.
8018 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8019 MachineInstr *CurrInstr = &Root;
8020 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8021 LoadToLaneInstrs.push_back(CurrInstr);
8022 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8023 }
8024
8025 // Sort the load instructions according to the lane.
8026 llvm::sort(LoadToLaneInstrs,
8027 [](const MachineInstr *A, const MachineInstr *B) {
8028 return A->getOperand(2).getImm() > B->getOperand(2).getImm();
8029 });
8030
8031 MachineInstr *SubregToReg = CurrInstr;
8032 LoadToLaneInstrs.push_back(
8033 MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
8034 auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
8035
8036 const TargetRegisterClass *FPR128RegClass =
8037 MRI.getRegClass(Root.getOperand(0).getReg());
8038
8039 // Helper lambda to create a LD1 instruction.
8040 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8041 Register SrcRegister, unsigned Lane,
8042 Register OffsetRegister,
8043 bool OffsetRegisterKillState) {
8044 auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8045 MachineInstrBuilder LoadIndexIntoRegister =
8046 BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8047 NewRegister)
8048 .addReg(SrcRegister)
8049 .addImm(Lane)
8050 .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState));
8051 InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8052 InsInstrs.push_back(LoadIndexIntoRegister);
8053 return NewRegister;
8054 };
8055
8056 // Helper to create load instruction based on the NumLanes in the NEON
8057 // register we are rewriting.
8058 auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
8059 Register OffsetReg,
8060 bool KillState) -> MachineInstrBuilder {
8061 unsigned Opcode;
8062 switch (NumLanes) {
8063 case 4:
8064 Opcode = AArch64::LDRSui;
8065 break;
8066 case 8:
8067 Opcode = AArch64::LDRHui;
8068 break;
8069 case 16:
8070 Opcode = AArch64::LDRBui;
8071 break;
8072 default:
8073      llvm_unreachable(
8074          "Got unsupported number of lanes in machine-combiner gather pattern");
8075 }
8076 // Immediate offset load
8077 return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
8078 .addReg(OffsetReg)
8079 .addImm(0);
8080 };
8081
8082 // Load the remaining lanes into register 0.
8083 auto LanesToLoadToReg0 =
8084 llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
8085 LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8086 Register PrevReg = SubregToReg->getOperand(0).getReg();
8087 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
8088 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8089 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8090 OffsetRegOperand.getReg(),
8091 OffsetRegOperand.isKill());
8092 DelInstrs.push_back(LoadInstr);
8093 }
8094 Register LastLoadReg0 = PrevReg;
8095
8096 // First load into register 1. Perform an integer load to zero out the upper
8097 // lanes in a single instruction.
8098 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8099 MachineInstr *OriginalSplitLoad =
8100 *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
8101 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8102 MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8103
8104 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8105 OriginalSplitLoad->getOperand(3);
8106 MachineInstrBuilder MiddleIndexLoadInstr =
8107 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8108 OriginalSplitToLoadOffsetOperand.getReg(),
8109 OriginalSplitToLoadOffsetOperand.isKill());
8110
8111 InstrIdxForVirtReg.insert(
8112 std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
8113 InsInstrs.push_back(MiddleIndexLoadInstr);
8114 DelInstrs.push_back(OriginalSplitLoad);
8115
8116 // Subreg To Reg instruction for register 1.
8117 Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8118 unsigned SubregType;
8119 switch (NumLanes) {
8120 case 4:
8121 SubregType = AArch64::ssub;
8122 break;
8123 case 8:
8124 SubregType = AArch64::hsub;
8125 break;
8126 case 16:
8127 SubregType = AArch64::bsub;
8128 break;
8129 default:
8130    llvm_unreachable(
8131        "Got invalid NumLanes for machine-combiner gather pattern");
8132 }
8133
8134 auto SubRegToRegInstr =
8135 BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
8136 DestRegForSubregToReg)
8137 .addImm(0)
8138 .addReg(DestRegForMiddleIndex, getKillRegState(true))
8139 .addImm(SubregType);
8140 InstrIdxForVirtReg.insert(
8141 std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8142 InsInstrs.push_back(SubRegToRegInstr);
8143
8144 // Load remaining lanes into register 1.
8145 auto LanesToLoadToReg1 =
8146 llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8147 LoadToLaneInstrsAscending.end());
8148 PrevReg = SubRegToRegInstr->getOperand(0).getReg();
8149 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
8150 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8151 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8152 OffsetRegOperand.getReg(),
8153 OffsetRegOperand.isKill());
8154
8155 // Do not add the last reg to DelInstrs - it will be removed later.
8156 if (Index == NumLanes / 2 - 2) {
8157 break;
8158 }
8159 DelInstrs.push_back(LoadInstr);
8160 }
8161 Register LastLoadReg1 = PrevReg;
8162
8163 // Create the final zip instruction to combine the results.
8164 MachineInstrBuilder ZipInstr =
8165 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8166 Root.getOperand(0).getReg())
8167 .addReg(LastLoadReg0)
8168 .addReg(LastLoadReg1);
8169 InsInstrs.push_back(ZipInstr);
8170}
8171
8185
8186/// Return true when there is potentially a faster code sequence for an
8187/// instruction chain ending in \p Root. All potential patterns are listed in
8188/// the \p Pattern vector. Pattern should be sorted in priority order since the
8189/// pattern evaluator stops checking as soon as it finds a faster sequence.
8190
8191bool AArch64InstrInfo::getMachineCombinerPatterns(
8192 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8193 bool DoRegPressureReduce) const {
8194 // Integer patterns
8195 if (getMaddPatterns(Root, Patterns))
8196 return true;
8197 // Floating point patterns
8198 if (getFMULPatterns(Root, Patterns))
8199 return true;
8200 if (getFMAPatterns(Root, Patterns))
8201 return true;
8202 if (getFNEGPatterns(Root, Patterns))
8203 return true;
8204
8205 // Other patterns
8206 if (getMiscPatterns(Root, Patterns))
8207 return true;
8208
8209 // Load patterns
8210 if (getLoadPatterns(Root, Patterns))
8211 return true;
8212
8213 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8214 DoRegPressureReduce);
8215}
8216
8217enum class FMAInstKind { Default, Indexed, Accumulator };
8218/// genFusedMultiply - Generate fused multiply instructions.
8219/// This function supports both integer and floating point instructions.
8220/// A typical example:
8221/// F|MUL I=A,B,0
8222/// F|ADD R,I,C
8223/// ==> F|MADD R,A,B,C
8224/// \param MF Containing MachineFunction
8225/// \param MRI Register information
8226/// \param TII Target information
8227/// \param Root is the F|ADD instruction
8228/// \param [out] InsInstrs is a vector of machine instructions and will
8229/// contain the generated madd instruction
8230/// \param IdxMulOpd is index of operand in Root that is the result of
8231/// the F|MUL. In the example above IdxMulOpd is 1.
8232/// \param MaddOpc the opcode of the f|madd instruction
8233/// \param RC Register class of operands
8234/// \param kind Kind of FMA instruction (addressing mode) to be generated
8235/// \param ReplacedAddend is the result register from the instruction
8236/// replacing the non-combined operand, if any.
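///
/// For example, with FMAInstKind::Indexed the lane immediate of the F|MUL is
/// carried over (illustrative virtual-register form, IdxMulOpd == 1):
///   %3 = FMULv4i32_indexed %1, %2, 3
///   %4 = FADDv4f32 %3, %0
///   ==> %4 = FMLAv4i32_indexed %0, %1, %2, 3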
8237static MachineInstr *
8239 const TargetInstrInfo *TII, MachineInstr &Root,
8240 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8241 unsigned MaddOpc, const TargetRegisterClass *RC,
8242                 FMAInstKind kind = FMAInstKind::Default,
8243                 const Register *ReplacedAddend = nullptr) {
8244 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8245
8246 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8247 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8248 Register ResultReg = Root.getOperand(0).getReg();
8249 Register SrcReg0 = MUL->getOperand(1).getReg();
8250 bool Src0IsKill = MUL->getOperand(1).isKill();
8251 Register SrcReg1 = MUL->getOperand(2).getReg();
8252 bool Src1IsKill = MUL->getOperand(2).isKill();
8253
8254 Register SrcReg2;
8255 bool Src2IsKill;
8256 if (ReplacedAddend) {
8257    // If we just generated a new addend, we must be its only use.
8258 SrcReg2 = *ReplacedAddend;
8259 Src2IsKill = true;
8260 } else {
8261 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
8262 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
8263 }
8264
8265 if (ResultReg.isVirtual())
8266 MRI.constrainRegClass(ResultReg, RC);
8267 if (SrcReg0.isVirtual())
8268 MRI.constrainRegClass(SrcReg0, RC);
8269 if (SrcReg1.isVirtual())
8270 MRI.constrainRegClass(SrcReg1, RC);
8271 if (SrcReg2.isVirtual())
8272 MRI.constrainRegClass(SrcReg2, RC);
8273
8274  MachineInstrBuilder MIB;
8275  if (kind == FMAInstKind::Default)
8276 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8277 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8278 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8279 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8280 else if (kind == FMAInstKind::Indexed)
8281 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8282 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8283 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8284 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8285 .addImm(MUL->getOperand(3).getImm());
8286 else if (kind == FMAInstKind::Accumulator)
8287 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8288 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8289 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8290 .addReg(SrcReg1, getKillRegState(Src1IsKill));
8291 else
8292 assert(false && "Invalid FMA instruction kind \n");
8293  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
8294 InsInstrs.push_back(MIB);
8295 return MUL;
8296}
8297
8298static MachineInstr *
8299genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
8300               const TargetInstrInfo *TII, MachineInstr &Root,
8301               SmallVectorImpl<MachineInstr *> &InsInstrs) {
8302  MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8303
8304 unsigned Opc = 0;
8305 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
8306 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8307 Opc = AArch64::FNMADDSrrr;
8308 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8309 Opc = AArch64::FNMADDDrrr;
8310 else
8311 return nullptr;
8312
8313 Register ResultReg = Root.getOperand(0).getReg();
8314 Register SrcReg0 = MAD->getOperand(1).getReg();
8315 Register SrcReg1 = MAD->getOperand(2).getReg();
8316 Register SrcReg2 = MAD->getOperand(3).getReg();
8317 bool Src0IsKill = MAD->getOperand(1).isKill();
8318 bool Src1IsKill = MAD->getOperand(2).isKill();
8319 bool Src2IsKill = MAD->getOperand(3).isKill();
8320 if (ResultReg.isVirtual())
8321 MRI.constrainRegClass(ResultReg, RC);
8322 if (SrcReg0.isVirtual())
8323 MRI.constrainRegClass(SrcReg0, RC);
8324 if (SrcReg1.isVirtual())
8325 MRI.constrainRegClass(SrcReg1, RC);
8326 if (SrcReg2.isVirtual())
8327 MRI.constrainRegClass(SrcReg2, RC);
8328
8329  MachineInstrBuilder MIB =
8330      BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
8331 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8332 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8333 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8334 InsInstrs.push_back(MIB);
8335
8336 return MAD;
8337}
8338
8339/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
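/// For example (illustrative; concrete registers arbitrary):
///   dup  v1.4s, v2.s[1]
///   fmul v0.4s, v3.4s, v1.4s
///   ==>  fmul v0.4s, v3.4s, v2.s[1]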
8340static MachineInstr *
8341genIndexedMultiply(MachineInstr &Root,
8342                   SmallVectorImpl<MachineInstr *> &InsInstrs,
8343                   unsigned IdxDupOp, unsigned MulOpc,
8344                   const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8345  assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8346 "Invalid index of FMUL operand");
8347
8348 MachineFunction &MF = *Root.getMF();
8349  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8350
8351 MachineInstr *Dup =
8352 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
8353
8354 if (Dup->getOpcode() == TargetOpcode::COPY)
8355 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
8356
8357 Register DupSrcReg = Dup->getOperand(1).getReg();
8358 MRI.clearKillFlags(DupSrcReg);
8359 MRI.constrainRegClass(DupSrcReg, RC);
8360
8361 unsigned DupSrcLane = Dup->getOperand(2).getImm();
8362
8363 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8364 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
8365
8366 Register ResultReg = Root.getOperand(0).getReg();
8367
8369 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
8370 .add(MulOp)
8371 .addReg(DupSrcReg)
8372 .addImm(DupSrcLane);
8373
8374 InsInstrs.push_back(MIB);
8375 return &Root;
8376}
8377
8378/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8379/// instructions.
8380///
8381/// \see genFusedMultiply
8383static MachineInstr *genFusedMultiplyAcc(
8384    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8385    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8386 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8387                          FMAInstKind::Accumulator);
8388}
8389
8390/// genNeg - Helper to generate an intermediate negation of the second operand
8391/// of Root
8393 const TargetInstrInfo *TII, MachineInstr &Root,
8395 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8396 unsigned MnegOpc, const TargetRegisterClass *RC) {
8397 Register NewVR = MRI.createVirtualRegister(RC);
8398  MachineInstrBuilder MIB =
8399      BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
8400 .add(Root.getOperand(2));
8401 InsInstrs.push_back(MIB);
8402
8403 assert(InstrIdxForVirtReg.empty());
8404 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8405
8406 return NewVR;
8407}
8408
8409/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8410/// instructions with an additional negation of the accumulator
8411static MachineInstr *genFusedMultiplyAccNeg(
8412    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8413    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8414    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8415 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8416 assert(IdxMulOpd == 1);
8417
8418 Register NewVR =
8419 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8420 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8421 FMAInstKind::Accumulator, &NewVR);
8422}
8423
8424/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8425/// instructions.
8426///
8427/// \see genFusedMultiply
8428static MachineInstr *genFusedMultiplyIdx(
8429    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8430    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8431    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8432 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8433                          FMAInstKind::Indexed);
8434}
8435
8436/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
8437/// instructions with an additional negation of the accumulator
8438static MachineInstr *genFusedMultiplyIdxNeg(
8439    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8440    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8441    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8442 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8443 assert(IdxMulOpd == 1);
8444
8445 Register NewVR =
8446 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8447
8448 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8449 FMAInstKind::Indexed, &NewVR);
8450}
8451
8452/// genMaddR - Generate madd instruction and combine mul and add using
8453/// an extra virtual register
8454/// Example - an ADD intermediate needs to be stored in a register:
8455/// MUL I=A,B,0
8456/// ADD R,I,Imm
8457/// ==> ORR V, ZR, Imm
8458/// ==> MADD R,A,B,V
8459/// \param MF Containing MachineFunction
8460/// \param MRI Register information
8461/// \param TII Target information
8462/// \param Root is the ADD instruction
8463/// \param [out] InsInstrs is a vector of machine instructions and will
8464/// contain the generated madd instruction
8465/// \param IdxMulOpd is index of operand in Root that is the result of
8466/// the MUL. In the example above IdxMulOpd is 1.
8467/// \param MaddOpc the opcode of the madd instruction
8468/// \param VR is a virtual register that holds the value of an ADD operand
8469/// (V in the example above).
8470/// \param RC Register class of operands
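///
/// For example (illustrative; concrete registers and immediate arbitrary):
///   mul  w8, w0, w1
///   add  w9, w8, #20
///   ==>  mov  w10, #20
///        madd w9, w0, w1, w10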
8471static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
8472                              const TargetInstrInfo *TII, MachineInstr &Root,
8473                              SmallVectorImpl<MachineInstr *> &InsInstrs,
8474                              unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8475 const TargetRegisterClass *RC) {
8476 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8477
8478 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8479 Register ResultReg = Root.getOperand(0).getReg();
8480 Register SrcReg0 = MUL->getOperand(1).getReg();
8481 bool Src0IsKill = MUL->getOperand(1).isKill();
8482 Register SrcReg1 = MUL->getOperand(2).getReg();
8483 bool Src1IsKill = MUL->getOperand(2).isKill();
8484
8485 if (ResultReg.isVirtual())
8486 MRI.constrainRegClass(ResultReg, RC);
8487 if (SrcReg0.isVirtual())
8488 MRI.constrainRegClass(SrcReg0, RC);
8489 if (SrcReg1.isVirtual())
8490 MRI.constrainRegClass(SrcReg1, RC);
8491  if (Register(VR).isVirtual())
8492    MRI.constrainRegClass(VR, RC);
8493
8494  MachineInstrBuilder MIB =
8495      BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8496 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8497 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8498 .addReg(VR);
8499 // Insert the MADD
8500 InsInstrs.push_back(MIB);
8501 return MUL;
8502}
8503
8504/// Do the following transformation
8505/// A - (B + C) ==> (A - B) - C
8506/// A - (B + C) ==> (A - C) - B
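///
/// For example (illustrative; concrete registers arbitrary):
///   add x8, x2, x3
///   sub x0, x1, x8
///   ==> sub x9, x1, x2
///       sub x0, x9, x3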
8508 const TargetInstrInfo *TII, MachineInstr &Root,
8511 unsigned IdxOpd1,
8512 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8513 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8514 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8515 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
8516
8517 Register ResultReg = Root.getOperand(0).getReg();
8518 Register RegA = Root.getOperand(1).getReg();
8519 bool RegAIsKill = Root.getOperand(1).isKill();
8520 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
8521 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
8522 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
8523 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
8524 Register NewVR =
8525 MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg()));
8526
8527 unsigned Opcode = Root.getOpcode();
8528 if (Opcode == AArch64::SUBSWrr)
8529 Opcode = AArch64::SUBWrr;
8530 else if (Opcode == AArch64::SUBSXrr)
8531 Opcode = AArch64::SUBXrr;
8532 else
8533 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8534 "Unexpected instruction opcode.");
8535
8536 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
8537 Flags &= ~MachineInstr::NoSWrap;
8538 Flags &= ~MachineInstr::NoUWrap;
8539
8540 MachineInstrBuilder MIB1 =
8541 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
8542 .addReg(RegA, getKillRegState(RegAIsKill))
8543 .addReg(RegB, getKillRegState(RegBIsKill))
8544 .setMIFlags(Flags);
8545 MachineInstrBuilder MIB2 =
8546 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
8547 .addReg(NewVR, getKillRegState(true))
8548 .addReg(RegC, getKillRegState(RegCIsKill))
8549 .setMIFlags(Flags);
8550
8551 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8552 InsInstrs.push_back(MIB1);
8553 InsInstrs.push_back(MIB2);
8554 DelInstrs.push_back(AddMI);
8555 DelInstrs.push_back(&Root);
8556}
8557
8558unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8559 unsigned int AccumulatorOpCode) const {
8560 switch (AccumulatorOpCode) {
8561 case AArch64::UABALB_ZZZ_D:
8562 case AArch64::SABALB_ZZZ_D:
8563 case AArch64::UABALT_ZZZ_D:
8564 case AArch64::SABALT_ZZZ_D:
8565 return AArch64::ADD_ZZZ_D;
8566 case AArch64::UABALB_ZZZ_H:
8567 case AArch64::SABALB_ZZZ_H:
8568 case AArch64::UABALT_ZZZ_H:
8569 case AArch64::SABALT_ZZZ_H:
8570 return AArch64::ADD_ZZZ_H;
8571 case AArch64::UABALB_ZZZ_S:
8572 case AArch64::SABALB_ZZZ_S:
8573 case AArch64::UABALT_ZZZ_S:
8574 case AArch64::SABALT_ZZZ_S:
8575 return AArch64::ADD_ZZZ_S;
8576 case AArch64::UABALv16i8_v8i16:
8577 case AArch64::SABALv8i8_v8i16:
8578 case AArch64::SABAv8i16:
8579 case AArch64::UABAv8i16:
8580 return AArch64::ADDv8i16;
8581 case AArch64::SABALv2i32_v2i64:
8582 case AArch64::UABALv2i32_v2i64:
8583 case AArch64::SABALv4i32_v2i64:
8584 return AArch64::ADDv2i64;
8585 case AArch64::UABALv4i16_v4i32:
8586 case AArch64::SABALv4i16_v4i32:
8587 case AArch64::SABALv8i16_v4i32:
8588 case AArch64::SABAv4i32:
8589 case AArch64::UABAv4i32:
8590 return AArch64::ADDv4i32;
8591 case AArch64::UABALv4i32_v2i64:
8592 return AArch64::ADDv2i64;
8593 case AArch64::UABALv8i16_v4i32:
8594 return AArch64::ADDv4i32;
8595 case AArch64::UABALv8i8_v8i16:
8596 case AArch64::SABALv16i8_v8i16:
8597 return AArch64::ADDv8i16;
8598 case AArch64::UABAv16i8:
8599 case AArch64::SABAv16i8:
8600 return AArch64::ADDv16i8;
8601 case AArch64::UABAv4i16:
8602 case AArch64::SABAv4i16:
8603 return AArch64::ADDv4i16;
8604 case AArch64::UABAv2i32:
8605 case AArch64::SABAv2i32:
8606 return AArch64::ADDv2i32;
8607 case AArch64::UABAv8i8:
8608 case AArch64::SABAv8i8:
8609 return AArch64::ADDv8i8;
8610 default:
8611 llvm_unreachable("Unknown accumulator opcode");
8612 }
8613}
8614
8615/// When getMachineCombinerPatterns() finds potential patterns,
8616/// this function generates the instructions that could replace the
8617/// original code sequence
8618void AArch64InstrInfo::genAlternativeCodeSequence(
8619 MachineInstr &Root, unsigned Pattern,
8622 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8623 MachineBasicBlock &MBB = *Root.getParent();
8624 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8625 MachineFunction &MF = *MBB.getParent();
8626 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8627
8628 MachineInstr *MUL = nullptr;
8629 const TargetRegisterClass *RC;
8630 unsigned Opc;
8631 switch (Pattern) {
8632 default:
8633 // Reassociate instructions.
8634 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8635 DelInstrs, InstrIdxForVirtReg);
8636 return;
8638 // A - (B + C)
8639 // ==> (A - B) - C
8640 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
8641 InstrIdxForVirtReg);
8642 return;
8644 // A - (B + C)
8645 // ==> (A - C) - B
8646 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
8647 InstrIdxForVirtReg);
8648 return;
8651 // MUL I=A,B,0
8652 // ADD R,I,C
8653 // ==> MADD R,A,B,C
8654 // --- Create(MADD);
8656 Opc = AArch64::MADDWrrr;
8657 RC = &AArch64::GPR32RegClass;
8658 } else {
8659 Opc = AArch64::MADDXrrr;
8660 RC = &AArch64::GPR64RegClass;
8661 }
8662 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8663 break;
8666 // MUL I=A,B,0
8667 // ADD R,C,I
8668 // ==> MADD R,A,B,C
8669 // --- Create(MADD);
8671 Opc = AArch64::MADDWrrr;
8672 RC = &AArch64::GPR32RegClass;
8673 } else {
8674 Opc = AArch64::MADDXrrr;
8675 RC = &AArch64::GPR64RegClass;
8676 }
8677 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8678 break;
8683 // MUL I=A,B,0
8684 // ADD/SUB R,I,Imm
8685 // ==> MOV V, Imm/-Imm
8686 // ==> MADD R,A,B,V
8687 // --- Create(MADD);
8688 const TargetRegisterClass *RC;
8689 unsigned BitSize, MovImm;
8692 MovImm = AArch64::MOVi32imm;
8693 RC = &AArch64::GPR32spRegClass;
8694 BitSize = 32;
8695 Opc = AArch64::MADDWrrr;
8696 RC = &AArch64::GPR32RegClass;
8697 } else {
8698 MovImm = AArch64::MOVi64imm;
8699 RC = &AArch64::GPR64spRegClass;
8700 BitSize = 64;
8701 Opc = AArch64::MADDXrrr;
8702 RC = &AArch64::GPR64RegClass;
8703 }
8704 Register NewVR = MRI.createVirtualRegister(RC);
8705 uint64_t Imm = Root.getOperand(2).getImm();
8706
8707 if (Root.getOperand(3).isImm()) {
8708 unsigned Val = Root.getOperand(3).getImm();
8709 Imm = Imm << Val;
8710 }
8711 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8713 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
8714 // Check that the immediate can be composed via a single instruction.
8715    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
8716    AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
8717 if (Insn.size() != 1)
8718 return;
8719 MachineInstrBuilder MIB1 =
8720 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
8721 .addImm(IsSub ? -Imm : Imm);
8722 InsInstrs.push_back(MIB1);
8723 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8724 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8725 break;
8726 }
8729 // MUL I=A,B,0
8730 // SUB R,I, C
8731 // ==> SUB V, 0, C
8732 // ==> MADD R,A,B,V // = -C + A*B
8733 // --- Create(MADD);
8734 const TargetRegisterClass *SubRC;
8735 unsigned SubOpc, ZeroReg;
8737 SubOpc = AArch64::SUBWrr;
8738 SubRC = &AArch64::GPR32spRegClass;
8739 ZeroReg = AArch64::WZR;
8740 Opc = AArch64::MADDWrrr;
8741 RC = &AArch64::GPR32RegClass;
8742 } else {
8743 SubOpc = AArch64::SUBXrr;
8744 SubRC = &AArch64::GPR64spRegClass;
8745 ZeroReg = AArch64::XZR;
8746 Opc = AArch64::MADDXrrr;
8747 RC = &AArch64::GPR64RegClass;
8748 }
8749 Register NewVR = MRI.createVirtualRegister(SubRC);
8750 // SUB NewVR, 0, C
8751 MachineInstrBuilder MIB1 =
8752 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
8753 .addReg(ZeroReg)
8754 .add(Root.getOperand(2));
8755 InsInstrs.push_back(MIB1);
8756 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8757 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8758 break;
8759 }
8762 // MUL I=A,B,0
8763 // SUB R,C,I
8764 // ==> MSUB R,A,B,C (computes C - A*B)
8765 // --- Create(MSUB);
8767 Opc = AArch64::MSUBWrrr;
8768 RC = &AArch64::GPR32RegClass;
8769 } else {
8770 Opc = AArch64::MSUBXrrr;
8771 RC = &AArch64::GPR64RegClass;
8772 }
8773 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8774 break;
8776 Opc = AArch64::MLAv8i8;
8777 RC = &AArch64::FPR64RegClass;
8778 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8779 break;
8781 Opc = AArch64::MLAv8i8;
8782 RC = &AArch64::FPR64RegClass;
8783 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8784 break;
8786 Opc = AArch64::MLAv16i8;
8787 RC = &AArch64::FPR128RegClass;
8788 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8789 break;
8791 Opc = AArch64::MLAv16i8;
8792 RC = &AArch64::FPR128RegClass;
8793 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8794 break;
8796 Opc = AArch64::MLAv4i16;
8797 RC = &AArch64::FPR64RegClass;
8798 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8799 break;
8801 Opc = AArch64::MLAv4i16;
8802 RC = &AArch64::FPR64RegClass;
8803 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8804 break;
8806 Opc = AArch64::MLAv8i16;
8807 RC = &AArch64::FPR128RegClass;
8808 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8809 break;
8811 Opc = AArch64::MLAv8i16;
8812 RC = &AArch64::FPR128RegClass;
8813 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8814 break;
8816 Opc = AArch64::MLAv2i32;
8817 RC = &AArch64::FPR64RegClass;
8818 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8819 break;
8821 Opc = AArch64::MLAv2i32;
8822 RC = &AArch64::FPR64RegClass;
8823 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8824 break;
8826 Opc = AArch64::MLAv4i32;
8827 RC = &AArch64::FPR128RegClass;
8828 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8829 break;
8831 Opc = AArch64::MLAv4i32;
8832 RC = &AArch64::FPR128RegClass;
8833 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8834 break;
8835
8837 Opc = AArch64::MLAv8i8;
8838 RC = &AArch64::FPR64RegClass;
8839 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8840 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
8841 RC);
8842 break;
8844 Opc = AArch64::MLSv8i8;
8845 RC = &AArch64::FPR64RegClass;
8846 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8847 break;
8849 Opc = AArch64::MLAv16i8;
8850 RC = &AArch64::FPR128RegClass;
8851 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8852 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
8853 RC);
8854 break;
8856 Opc = AArch64::MLSv16i8;
8857 RC = &AArch64::FPR128RegClass;
8858 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8859 break;
8861 Opc = AArch64::MLAv4i16;
8862 RC = &AArch64::FPR64RegClass;
8863 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8864 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
8865 RC);
8866 break;
8868 Opc = AArch64::MLSv4i16;
8869 RC = &AArch64::FPR64RegClass;
8870 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8871 break;
8873 Opc = AArch64::MLAv8i16;
8874 RC = &AArch64::FPR128RegClass;
8875 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8876 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
8877 RC);
8878 break;
8880 Opc = AArch64::MLSv8i16;
8881 RC = &AArch64::FPR128RegClass;
8882 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8883 break;
8885 Opc = AArch64::MLAv2i32;
8886 RC = &AArch64::FPR64RegClass;
8887 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8888 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
8889 RC);
8890 break;
8892 Opc = AArch64::MLSv2i32;
8893 RC = &AArch64::FPR64RegClass;
8894 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8895 break;
8897 Opc = AArch64::MLAv4i32;
8898 RC = &AArch64::FPR128RegClass;
8899 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8900 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
8901 RC);
8902 break;
8904 Opc = AArch64::MLSv4i32;
8905 RC = &AArch64::FPR128RegClass;
8906 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8907 break;
8908
8910 Opc = AArch64::MLAv4i16_indexed;
8911 RC = &AArch64::FPR64RegClass;
8912 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8913 break;
8915 Opc = AArch64::MLAv4i16_indexed;
8916 RC = &AArch64::FPR64RegClass;
8917 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8918 break;
8920 Opc = AArch64::MLAv8i16_indexed;
8921 RC = &AArch64::FPR128RegClass;
8922 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8923 break;
8925 Opc = AArch64::MLAv8i16_indexed;
8926 RC = &AArch64::FPR128RegClass;
8927 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8928 break;
8930 Opc = AArch64::MLAv2i32_indexed;
8931 RC = &AArch64::FPR64RegClass;
8932 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8933 break;
8935 Opc = AArch64::MLAv2i32_indexed;
8936 RC = &AArch64::FPR64RegClass;
8937 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8938 break;
8940 Opc = AArch64::MLAv4i32_indexed;
8941 RC = &AArch64::FPR128RegClass;
8942 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8943 break;
8945 Opc = AArch64::MLAv4i32_indexed;
8946 RC = &AArch64::FPR128RegClass;
8947 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8948 break;
8949
8951 Opc = AArch64::MLAv4i16_indexed;
8952 RC = &AArch64::FPR64RegClass;
8953 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
8954 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
8955 RC);
8956 break;
8958 Opc = AArch64::MLSv4i16_indexed;
8959 RC = &AArch64::FPR64RegClass;
8960 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8961 break;
8963 Opc = AArch64::MLAv8i16_indexed;
8964 RC = &AArch64::FPR128RegClass;
8965 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
8966 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
8967 RC);
8968 break;
8970 Opc = AArch64::MLSv8i16_indexed;
8971 RC = &AArch64::FPR128RegClass;
8972 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8973 break;
8975 Opc = AArch64::MLAv2i32_indexed;
8976 RC = &AArch64::FPR64RegClass;
8977 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
8978 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
8979 RC);
8980 break;
8982 Opc = AArch64::MLSv2i32_indexed;
8983 RC = &AArch64::FPR64RegClass;
8984 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8985 break;
8987 Opc = AArch64::MLAv4i32_indexed;
8988 RC = &AArch64::FPR128RegClass;
8989 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
8990 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
8991 RC);
8992 break;
8994 Opc = AArch64::MLSv4i32_indexed;
8995 RC = &AArch64::FPR128RegClass;
8996 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8997 break;
8998
8999 // Floating Point Support
9001 Opc = AArch64::FMADDHrrr;
9002 RC = &AArch64::FPR16RegClass;
9003 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9004 break;
9006 Opc = AArch64::FMADDSrrr;
9007 RC = &AArch64::FPR32RegClass;
9008 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9009 break;
9011 Opc = AArch64::FMADDDrrr;
9012 RC = &AArch64::FPR64RegClass;
9013 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9014 break;
9015
9017 Opc = AArch64::FMADDHrrr;
9018 RC = &AArch64::FPR16RegClass;
9019 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9020 break;
9022 Opc = AArch64::FMADDSrrr;
9023 RC = &AArch64::FPR32RegClass;
9024 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9025 break;
9027 Opc = AArch64::FMADDDrrr;
9028 RC = &AArch64::FPR64RegClass;
9029 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9030 break;
9031
9033 Opc = AArch64::FMLAv1i32_indexed;
9034 RC = &AArch64::FPR32RegClass;
9035 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9037 break;
9039 Opc = AArch64::FMLAv1i32_indexed;
9040 RC = &AArch64::FPR32RegClass;
9041 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9043 break;
9044
9046 Opc = AArch64::FMLAv1i64_indexed;
9047 RC = &AArch64::FPR64RegClass;
9048 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9050 break;
9052 Opc = AArch64::FMLAv1i64_indexed;
9053 RC = &AArch64::FPR64RegClass;
9054 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9056 break;
9057
9059 RC = &AArch64::FPR64RegClass;
9060 Opc = AArch64::FMLAv4i16_indexed;
9061 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9063 break;
9065 RC = &AArch64::FPR64RegClass;
9066 Opc = AArch64::FMLAv4f16;
9067 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9069 break;
9071 RC = &AArch64::FPR64RegClass;
9072 Opc = AArch64::FMLAv4i16_indexed;
9073 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9075 break;
9077 RC = &AArch64::FPR64RegClass;
9078 Opc = AArch64::FMLAv4f16;
9079 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9081 break;
9082
9085 RC = &AArch64::FPR64RegClass;
9087 Opc = AArch64::FMLAv2i32_indexed;
9088 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9090 } else {
9091 Opc = AArch64::FMLAv2f32;
9092 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9094 }
9095 break;
9098 RC = &AArch64::FPR64RegClass;
9100 Opc = AArch64::FMLAv2i32_indexed;
9101 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9103 } else {
9104 Opc = AArch64::FMLAv2f32;
9105 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9107 }
9108 break;
9109
9111 RC = &AArch64::FPR128RegClass;
9112 Opc = AArch64::FMLAv8i16_indexed;
9113 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9115 break;
9117 RC = &AArch64::FPR128RegClass;
9118 Opc = AArch64::FMLAv8f16;
9119 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9121 break;
9123 RC = &AArch64::FPR128RegClass;
9124 Opc = AArch64::FMLAv8i16_indexed;
9125 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9127 break;
9129 RC = &AArch64::FPR128RegClass;
9130 Opc = AArch64::FMLAv8f16;
9131 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9133 break;
9134
9137 RC = &AArch64::FPR128RegClass;
9139 Opc = AArch64::FMLAv2i64_indexed;
9140 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9142 } else {
9143 Opc = AArch64::FMLAv2f64;
9144 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9146 }
9147 break;
9150 RC = &AArch64::FPR128RegClass;
9152 Opc = AArch64::FMLAv2i64_indexed;
9153 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9155 } else {
9156 Opc = AArch64::FMLAv2f64;
9157 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9159 }
9160 break;
9161
9164 RC = &AArch64::FPR128RegClass;
9166 Opc = AArch64::FMLAv4i32_indexed;
9167 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9169 } else {
9170 Opc = AArch64::FMLAv4f32;
9171 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9173 }
9174 break;
9175
9178 RC = &AArch64::FPR128RegClass;
9180 Opc = AArch64::FMLAv4i32_indexed;
9181 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9183 } else {
9184 Opc = AArch64::FMLAv4f32;
9185 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9187 }
9188 break;
9189
9191 Opc = AArch64::FNMSUBHrrr;
9192 RC = &AArch64::FPR16RegClass;
9193 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9194 break;
9196 Opc = AArch64::FNMSUBSrrr;
9197 RC = &AArch64::FPR32RegClass;
9198 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9199 break;
9201 Opc = AArch64::FNMSUBDrrr;
9202 RC = &AArch64::FPR64RegClass;
9203 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9204 break;
9205
9207 Opc = AArch64::FNMADDHrrr;
9208 RC = &AArch64::FPR16RegClass;
9209 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9210 break;
9212 Opc = AArch64::FNMADDSrrr;
9213 RC = &AArch64::FPR32RegClass;
9214 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9215 break;
9217 Opc = AArch64::FNMADDDrrr;
9218 RC = &AArch64::FPR64RegClass;
9219 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9220 break;
9221
9223 Opc = AArch64::FMSUBHrrr;
9224 RC = &AArch64::FPR16RegClass;
9225 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9226 break;
9228 Opc = AArch64::FMSUBSrrr;
9229 RC = &AArch64::FPR32RegClass;
9230 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9231 break;
9233 Opc = AArch64::FMSUBDrrr;
9234 RC = &AArch64::FPR64RegClass;
9235 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9236 break;
9237
9239 Opc = AArch64::FMLSv1i32_indexed;
9240 RC = &AArch64::FPR32RegClass;
9241 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9243 break;
9244
9246 Opc = AArch64::FMLSv1i64_indexed;
9247 RC = &AArch64::FPR64RegClass;
9248 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9250 break;
9251
9254 RC = &AArch64::FPR64RegClass;
9255 Register NewVR = MRI.createVirtualRegister(RC);
9256 MachineInstrBuilder MIB1 =
9257 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9258 .add(Root.getOperand(2));
9259 InsInstrs.push_back(MIB1);
9260 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9262 Opc = AArch64::FMLAv4f16;
9263 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9264 FMAInstKind::Accumulator, &NewVR);
9265 } else {
9266 Opc = AArch64::FMLAv4i16_indexed;
9267 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9268 FMAInstKind::Indexed, &NewVR);
9269 }
9270 break;
9271 }
9273 RC = &AArch64::FPR64RegClass;
9274 Opc = AArch64::FMLSv4f16;
9275 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9277 break;
9279 RC = &AArch64::FPR64RegClass;
9280 Opc = AArch64::FMLSv4i16_indexed;
9281 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9283 break;
9284
9287 RC = &AArch64::FPR64RegClass;
9289 Opc = AArch64::FMLSv2i32_indexed;
9290 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9292 } else {
9293 Opc = AArch64::FMLSv2f32;
9294 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9296 }
9297 break;
9298
9301 RC = &AArch64::FPR128RegClass;
9302 Register NewVR = MRI.createVirtualRegister(RC);
9303 MachineInstrBuilder MIB1 =
9304 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9305 .add(Root.getOperand(2));
9306 InsInstrs.push_back(MIB1);
9307 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9309 Opc = AArch64::FMLAv8f16;
9310 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9311 FMAInstKind::Accumulator, &NewVR);
9312 } else {
9313 Opc = AArch64::FMLAv8i16_indexed;
9314 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9315 FMAInstKind::Indexed, &NewVR);
9316 }
9317 break;
9318 }
9320 RC = &AArch64::FPR128RegClass;
9321 Opc = AArch64::FMLSv8f16;
9322 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9324 break;
9326 RC = &AArch64::FPR128RegClass;
9327 Opc = AArch64::FMLSv8i16_indexed;
9328 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9330 break;
9331
9334 RC = &AArch64::FPR128RegClass;
9336 Opc = AArch64::FMLSv2i64_indexed;
9337 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9339 } else {
9340 Opc = AArch64::FMLSv2f64;
9341 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9343 }
9344 break;
9345
9348 RC = &AArch64::FPR128RegClass;
9350 Opc = AArch64::FMLSv4i32_indexed;
9351 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9353 } else {
9354 Opc = AArch64::FMLSv4f32;
9355 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9357 }
9358 break;
9361 RC = &AArch64::FPR64RegClass;
9362 Register NewVR = MRI.createVirtualRegister(RC);
9363 MachineInstrBuilder MIB1 =
9364 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9365 .add(Root.getOperand(2));
9366 InsInstrs.push_back(MIB1);
9367 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9369 Opc = AArch64::FMLAv2i32_indexed;
9370 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9371 FMAInstKind::Indexed, &NewVR);
9372 } else {
9373 Opc = AArch64::FMLAv2f32;
9374 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9375 FMAInstKind::Accumulator, &NewVR);
9376 }
9377 break;
9378 }
9381 RC = &AArch64::FPR128RegClass;
9382 Register NewVR = MRI.createVirtualRegister(RC);
9383 MachineInstrBuilder MIB1 =
9384 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9385 .add(Root.getOperand(2));
9386 InsInstrs.push_back(MIB1);
9387 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9389 Opc = AArch64::FMLAv4i32_indexed;
9390 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9391 FMAInstKind::Indexed, &NewVR);
9392 } else {
9393 Opc = AArch64::FMLAv4f32;
9394 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9395 FMAInstKind::Accumulator, &NewVR);
9396 }
9397 break;
9398 }
9401 RC = &AArch64::FPR128RegClass;
9402 Register NewVR = MRI.createVirtualRegister(RC);
9403 MachineInstrBuilder MIB1 =
9404 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9405 .add(Root.getOperand(2));
9406 InsInstrs.push_back(MIB1);
9407 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9409 Opc = AArch64::FMLAv2i64_indexed;
9410 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9411 FMAInstKind::Indexed, &NewVR);
9412 } else {
9413 Opc = AArch64::FMLAv2f64;
9414 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9415 FMAInstKind::Accumulator, &NewVR);
9416 }
9417 break;
9418 }
9421 unsigned IdxDupOp =
9423 : 2;
9424 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9425 &AArch64::FPR128RegClass, MRI);
9426 break;
9427 }
9430 unsigned IdxDupOp =
9432 : 2;
9433 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9434 &AArch64::FPR128RegClass, MRI);
9435 break;
9436 }
9439 unsigned IdxDupOp =
9441 : 2;
9442 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9443 &AArch64::FPR128_loRegClass, MRI);
9444 break;
9445 }
9448 unsigned IdxDupOp =
9450 : 2;
9451 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9452 &AArch64::FPR128RegClass, MRI);
9453 break;
9454 }
9457 unsigned IdxDupOp =
9459 : 2;
9460 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9461 &AArch64::FPR128_loRegClass, MRI);
9462 break;
9463 }
9465 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9466 break;
9467 }
9469 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9470 Pattern, 4);
9471 break;
9472 }
9474 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9475 Pattern, 8);
9476 break;
9477 }
9479 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9480 Pattern, 16);
9481 break;
9482 }
9483
9484 } // end switch (Pattern)
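  // (Note, as a reading aid: at this point InsInstrs holds the replacement
  // sequence built above, and for the multiply-accumulate patterns MUL points
  // at the feeding multiply, which becomes dead once Root is rewritten.)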
9485 // Record MUL and ADD/SUB for deletion
9486 if (MUL)
9487 DelInstrs.push_back(MUL);
9488 DelInstrs.push_back(&Root);
9489
9490 // Set the flags on the inserted instructions to be the merged flags of the
9491 // instructions that we have combined.
9492 uint32_t Flags = Root.getFlags();
9493 if (MUL)
9494 Flags = Root.mergeFlagsWith(*MUL);
9495 for (auto *MI : InsInstrs)
9496 MI->setFlags(Flags);
9497}
9498
9499/// Replace csincr-branch sequence by simple conditional branch
9500///
9501/// Examples:
9502/// 1. \code
9503/// csinc w9, wzr, wzr, <condition code>
9504/// tbnz w9, #0, 0x44
9505/// \endcode
9506/// to
9507/// \code
9508/// b.<inverted condition code>
9509/// \endcode
9510///
9511/// 2. \code
9512/// csinc w9, wzr, wzr, <condition code>
9513/// tbz w9, #0, 0x44
9514/// \endcode
9515/// to
9516/// \code
9517/// b.<condition code>
9518/// \endcode
9519///
9520/// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
9521/// compare's constant operand is a power of 2.
9522///
9523/// Examples:
9524/// \code
9525/// and w8, w8, #0x400
9526/// cbnz w8, L1
9527/// \endcode
9528/// to
9529/// \code
9530/// tbnz w8, #10, L1
9531/// \endcode
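/// The bit index used above is the base-2 log of the decoded AND immediate;
/// for example #0x400 has its single set bit at position 10, hence the
/// "tbnz w8, #10" in the transformed sequence.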
9532///
9533/// \param MI Conditional Branch
9534/// \return True when the simple conditional branch is generated
9535///
9537 bool IsNegativeBranch = false;
9538 bool IsTestAndBranch = false;
9539 unsigned TargetBBInMI = 0;
9540 switch (MI.getOpcode()) {
9541 default:
9542 llvm_unreachable("Unknown branch instruction?");
9543 case AArch64::Bcc:
9544 case AArch64::CBWPri:
9545 case AArch64::CBXPri:
9546 case AArch64::CBBAssertExt:
9547 case AArch64::CBHAssertExt:
9548 case AArch64::CBWPrr:
9549 case AArch64::CBXPrr:
9550 return false;
9551 case AArch64::CBZW:
9552 case AArch64::CBZX:
9553 TargetBBInMI = 1;
9554 break;
9555 case AArch64::CBNZW:
9556 case AArch64::CBNZX:
9557 TargetBBInMI = 1;
9558 IsNegativeBranch = true;
9559 break;
9560 case AArch64::TBZW:
9561 case AArch64::TBZX:
9562 TargetBBInMI = 2;
9563 IsTestAndBranch = true;
9564 break;
9565 case AArch64::TBNZW:
9566 case AArch64::TBNZX:
9567 TargetBBInMI = 2;
9568 IsNegativeBranch = true;
9569 IsTestAndBranch = true;
9570 break;
9571 }
9572 // So we increment a zero register and test for bits other
9573 // than bit 0? Conservatively bail out in case the verifier
9574 // missed this case.
9575 if (IsTestAndBranch && MI.getOperand(1).getImm())
9576 return false;
9577
9578 // Find Definition.
9579 assert(MI.getParent() && "Incomplete machine instruction\n");
9580 MachineBasicBlock *MBB = MI.getParent();
9581 MachineFunction *MF = MBB->getParent();
9583 Register VReg = MI.getOperand(0).getReg();
9584 if (!VReg.isVirtual())
9585 return false;
9586
9587 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9588
9589 // Look through COPY instructions to find definition.
9590 while (DefMI->isCopy()) {
9591 Register CopyVReg = DefMI->getOperand(1).getReg();
9592 if (!MRI->hasOneNonDBGUse(CopyVReg))
9593 return false;
9594 if (!MRI->hasOneDef(CopyVReg))
9595 return false;
9596 DefMI = MRI->getVRegDef(CopyVReg);
9597 }
9598
9599 switch (DefMI->getOpcode()) {
9600 default:
9601 return false;
9602 // Fold AND into a TBZ/TBNZ if the constant operand is a power of 2.
9603 case AArch64::ANDWri:
9604 case AArch64::ANDXri: {
9605 if (IsTestAndBranch)
9606 return false;
9607 if (DefMI->getParent() != MBB)
9608 return false;
9609 if (!MRI->hasOneNonDBGUse(VReg))
9610 return false;
9611
9612 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
9614 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
9615 if (!isPowerOf2_64(Mask))
9616 return false;
9617
9618 MachineOperand &MO = DefMI->getOperand(1);
9619 Register NewReg = MO.getReg();
9620 if (!NewReg.isVirtual())
9621 return false;
9622
9623 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9624
9625 MachineBasicBlock &RefToMBB = *MBB;
9626 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
9627 DebugLoc DL = MI.getDebugLoc();
9628 unsigned Imm = Log2_64(Mask);
9629 unsigned Opc = (Imm < 32)
9630 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9631 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9632 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
9633 .addReg(NewReg)
9634 .addImm(Imm)
9635 .addMBB(TBB);
9636 // Register lives on to the TBZ/TBNZ now.
9637 MO.setIsKill(false);
9638
9639 // For immediates smaller than 32, we must use the 32-bit
9640 // variant (W) in all cases, because the 64-bit variant cannot
9641 // encode them.
9642 // Therefore, if the input register is 64-bit, we need to take its
9643 // 32-bit sub-register.
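    // Illustrative sketch (not exact MIR syntax): testing bit 3 of a 64-bit
    // vreg %src is emitted as "TBNZW %src.sub_32, 3, %bb.target".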
9644 if (!Is32Bit && Imm < 32)
9645 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
9646 MI.eraseFromParent();
9647 return true;
9648 }
9649 // Look for CSINC
9650 case AArch64::CSINCWr:
9651 case AArch64::CSINCXr: {
9652 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
9653 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
9654 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
9655 DefMI->getOperand(2).getReg() == AArch64::XZR))
9656 return false;
9657
9658 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
9659 true) != -1)
9660 return false;
9661
9662 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
9663 // Convert only when the condition code is not modified between
9664 // the CSINC and the branch. The CC may be used by other
9665 // instructions in between.
9667 return false;
9668 MachineBasicBlock &RefToMBB = *MBB;
9669 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
9670 DebugLoc DL = MI.getDebugLoc();
9671 if (IsNegativeBranch)
9673 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
9674 MI.eraseFromParent();
9675 return true;
9676 }
9677 }
9678}
9679
9680std::pair<unsigned, unsigned>
9681AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9682 const unsigned Mask = AArch64II::MO_FRAGMENT;
9683 return std::make_pair(TF & Mask, TF & ~Mask);
9684}
9685
9687AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9688 using namespace AArch64II;
9689
9690 static const std::pair<unsigned, const char *> TargetFlags[] = {
9691 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9692 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9693 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9694 {MO_HI12, "aarch64-hi12"}};
9695 return ArrayRef(TargetFlags);
9696}
9697
9699AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9700 using namespace AArch64II;
9701
9702 static const std::pair<unsigned, const char *> TargetFlags[] = {
9703 {MO_COFFSTUB, "aarch64-coffstub"},
9704 {MO_GOT, "aarch64-got"},
9705 {MO_NC, "aarch64-nc"},
9706 {MO_S, "aarch64-s"},
9707 {MO_TLS, "aarch64-tls"},
9708 {MO_DLLIMPORT, "aarch64-dllimport"},
9709 {MO_PREL, "aarch64-prel"},
9710 {MO_TAGGED, "aarch64-tagged"},
9711 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9712 };
9713 return ArrayRef(TargetFlags);
9714}
9715
9717AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9718 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9719 {{MOSuppressPair, "aarch64-suppress-pair"},
9720 {MOStridedAccess, "aarch64-strided-access"}};
9721 return ArrayRef(TargetFlags);
9722}
9723
9724/// Constants defining how certain sequences should be outlined.
9725/// This encompasses how an outlined function should be called, and what kind of
9726/// frame should be emitted for that outlined function.
9727///
9728/// \p MachineOutlinerDefault implies that the function should be called with
9729/// a save and restore of LR to the stack.
9730///
9731/// That is,
9732///
9733/// I1 Save LR OUTLINED_FUNCTION:
9734/// I2 --> BL OUTLINED_FUNCTION I1
9735/// I3 Restore LR I2
9736/// I3
9737/// RET
9738///
9739/// * Call construction overhead: 3 (save + BL + restore)
9740/// * Frame construction overhead: 1 (ret)
9741/// * Requires stack fixups? Yes
9742///
9743/// \p MachineOutlinerTailCall implies that the function is being created from
9744/// a sequence of instructions ending in a return.
9745///
9746/// That is,
9747///
9748/// I1 OUTLINED_FUNCTION:
9749/// I2 --> B OUTLINED_FUNCTION I1
9750/// RET I2
9751/// RET
9752///
9753/// * Call construction overhead: 1 (B)
9754/// * Frame construction overhead: 0 (Return included in sequence)
9755/// * Requires stack fixups? No
9756///
9757/// \p MachineOutlinerNoLRSave implies that the function should be called using
9758/// a BL instruction, but doesn't require LR to be saved and restored. This
9759/// happens when LR is known to be dead.
9760///
9761/// That is,
9762///
9763/// I1 OUTLINED_FUNCTION:
9764/// I2 --> BL OUTLINED_FUNCTION I1
9765/// I3 I2
9766/// I3
9767/// RET
9768///
9769/// * Call construction overhead: 1 (BL)
9770/// * Frame construction overhead: 1 (RET)
9771/// * Requires stack fixups? No
9772///
9773/// \p MachineOutlinerThunk implies that the function is being created from
9774/// a sequence of instructions ending in a call. The outlined function is
9775/// called with a BL instruction, and the outlined function tail-calls the
9776/// original call destination.
9777///
9778/// That is,
9779///
9780/// I1 OUTLINED_FUNCTION:
9781/// I2 --> BL OUTLINED_FUNCTION I1
9782/// BL f I2
9783/// B f
9784/// * Call construction overhead: 1 (BL)
9785/// * Frame construction overhead: 0
9786/// * Requires stack fixups? No
9787///
9788/// \p MachineOutlinerRegSave implies that the function should be called with a
9789/// save and restore of LR to an available register. This allows us to avoid
9790/// stack fixups. Note that this outlining variant is compatible with the
9791/// NoLRSave case.
9792///
9793/// That is,
9794///
9795/// I1 Save LR OUTLINED_FUNCTION:
9796/// I2 --> BL OUTLINED_FUNCTION I1
9797/// I3 Restore LR I2
9798/// I3
9799/// RET
9800///
9801/// * Call construction overhead: 3 (save + BL + restore)
9802/// * Frame construction overhead: 1 (ret)
9803/// * Requires stack fixups? No
9805 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
9806 MachineOutlinerTailCall, /// Only emit a branch.
9807 MachineOutlinerNoLRSave, /// Emit a call and return.
9808 MachineOutlinerThunk, /// Emit a call and tail-call.
9809 MachineOutlinerRegSave /// Same as default, but save to a register.
9810};
9811
9817
9819AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
9820 MachineFunction *MF = C.getMF();
9821 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
9822 const AArch64RegisterInfo *ARI =
9823 static_cast<const AArch64RegisterInfo *>(&TRI);
9824 // Check if there is an available register across the sequence that we can
9825 // use.
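  // For example (register choice purely illustrative), if x20 is free across
  // the sequence, the MachineOutlinerRegSave call becomes roughly:
  //   mov x20, lr ; bl OUTLINED_FUNCTION ; mov lr, x20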
9826 for (unsigned Reg : AArch64::GPR64RegClass) {
9827 if (!ARI->isReservedReg(*MF, Reg) &&
9828 Reg != AArch64::LR && // LR is not reserved, but don't use it.
9829 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
9830 Reg != AArch64::X17 && // Ditto for X17.
9831 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
9832 C.isAvailableInsideSeq(Reg, TRI))
9833 return Reg;
9834 }
9835 return Register();
9836}
9837
9838static bool
9840 const outliner::Candidate &b) {
9841 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9842 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9843
9844 return MFIa->getSignReturnAddressCondition() ==
9846}
9847
9848static bool
9850 const outliner::Candidate &b) {
9851 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9852 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9853
9854 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
9855}
9856
9858 const outliner::Candidate &b) {
9859 const AArch64Subtarget &SubtargetA =
9861 const AArch64Subtarget &SubtargetB =
9862 b.getMF()->getSubtarget<AArch64Subtarget>();
9863 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
9864}
9865
9866std::optional<std::unique_ptr<outliner::OutlinedFunction>>
9867AArch64InstrInfo::getOutliningCandidateInfo(
9868 const MachineModuleInfo &MMI,
9869 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
9870 unsigned MinRepeats) const {
9871 unsigned SequenceSize = 0;
9872 for (auto &MI : RepeatedSequenceLocs[0])
9873 SequenceSize += getInstSizeInBytes(MI);
9874
9875 unsigned NumBytesToCreateFrame = 0;
9876
9877 // Avoid splitting an ADRP/ADD or ADRP/LDR pair across an outlined function
9878 // boundary: these instructions are fused together by the scheduler.
9879 // Any candidate where the ADRP is the last instruction should be rejected,
9880 // as outlining it would split the pair.
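  // For example, a GOT access is typically materialized as a fused pair
  // (illustrative):
  //   adrp x8, :got:sym
  //   ldr  x8, [x8, :got_lo12:sym]
  // Ending a candidate on the ADRP, or starting one on the LDR, would place
  // the two halves in different functions.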
9881 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
9882 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
9883 if (LastMI.getOpcode() == AArch64::ADRP &&
9884 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
9885 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9886 return std::nullopt;
9887 }
9888
9889 // Similarly, any candidate where the first instruction is an ADD/LDR with a
9890 // page offset should be rejected, to avoid splitting an ADRP pair.
9891 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
9892 FirstMI.getOpcode() == AArch64::LDRXui) &&
9893 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
9894 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9895 return std::nullopt;
9896 }
9897
9898 // We only allow outlining for functions having exactly matching return
9899 // address signing attributes, i.e., all share the same value for the
9900 // attribute "sign-return-address" and all share the same type of key they
9901 // are signed with.
9902 // Additionally we require all functions to simultaneously either support
9903 // v8.3a features or not. Otherwise an outlined function could get signed
9904 // using dedicated v8.3 instructions and a call from a function that doesn't
9905 // support v8.3 instructions would therefore be invalid.
9906 if (std::adjacent_find(
9907 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
9908 [](const outliner::Candidate &a, const outliner::Candidate &b) {
9909 // Return true if a and b are non-equal w.r.t. return address
9910 // signing or support of v8.3a features
9911 if (outliningCandidatesSigningScopeConsensus(a, b) &&
9912 outliningCandidatesSigningKeyConsensus(a, b) &&
9913 outliningCandidatesV8_3OpsConsensus(a, b)) {
9914 return false;
9915 }
9916 return true;
9917 }) != RepeatedSequenceLocs.end()) {
9918 return std::nullopt;
9919 }
9920
9921 // Since at this point all candidates agree on their return address signing,
9922 // picking just one is fine. If the candidate functions potentially sign their
9923 // return addresses, the outlined function should do the same. Note that in
9924 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
9925 // not certainly true that the outlined function will have to sign its return
9926 // address but this decision is made later, when the decision to outline
9927 // has already been made.
9928 // The same holds for the number of additional instructions we need: On
9929 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
9930 // necessary. However, at this point we don't know if the outlined function
9931 // will have a RET instruction so we assume the worst.
9932 const TargetRegisterInfo &TRI = getRegisterInfo();
9933 // Performing a tail call may require extra checks when PAuth is enabled.
9934 // If PAuth is disabled, set it to zero for uniformity.
9935 unsigned NumBytesToCheckLRInTCEpilogue = 0;
9936 const auto RASignCondition = RepeatedSequenceLocs[0]
9937 .getMF()
9938 ->getInfo<AArch64FunctionInfo>()
9939 ->getSignReturnAddressCondition();
9940 if (RASignCondition != SignReturnAddress::None) {
9941 // One PAC and one AUT instruction (4 bytes each).
9942 NumBytesToCreateFrame += 8;
9943
9944 // PAuth is enabled - set extra tail call cost, if any.
9945 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
9946 *RepeatedSequenceLocs[0].getMF());
9947 NumBytesToCheckLRInTCEpilogue =
9949 // Checking the authenticated LR value may significantly impact
9950 // SequenceSize, so account for it for more precise results.
9951 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
9952 SequenceSize += NumBytesToCheckLRInTCEpilogue;
9953
9954 // We have to check whether SP-modifying instructions would get outlined.
9955 // If so, we only allow outlining if SP is unchanged overall: matching
9956 // sub and add instructions are okay to outline; all other SP modifications
9957 // are not.
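  // For example (illustrative), a candidate containing the matched pair
  //   sub sp, sp, #16 ... add sp, sp, #16
  // nets to zero and may be outlined, whereas an unmatched "sub sp, sp, #16"
  // may not.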
9958 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
9959 int SPValue = 0;
9960 for (auto &MI : C) {
9961 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
9962 switch (MI.getOpcode()) {
9963 case AArch64::ADDXri:
9964 case AArch64::ADDWri:
9965 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
9966 assert(MI.getOperand(2).isImm() &&
9967 "Expected operand to be immediate");
9968 assert(MI.getOperand(1).isReg() &&
9969 "Expected operand to be a register");
9970 // Check if the add just increments sp. If so, we search for
9971 // matching sub instructions that decrement sp. If not, the
9972 // modification is illegal
9973 if (MI.getOperand(1).getReg() == AArch64::SP)
9974 SPValue += MI.getOperand(2).getImm();
9975 else
9976 return true;
9977 break;
9978 case AArch64::SUBXri:
9979 case AArch64::SUBWri:
9980 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
9981 assert(MI.getOperand(2).isImm() &&
9982 "Expected operand to be immediate");
9983 assert(MI.getOperand(1).isReg() &&
9984 "Expected operand to be a register");
9985 // Check if the sub just decrements sp. If so, we search for
9986 // matching add instructions that increment sp. If not, the
9987 // modification is illegal
9988 if (MI.getOperand(1).getReg() == AArch64::SP)
9989 SPValue -= MI.getOperand(2).getImm();
9990 else
9991 return true;
9992 break;
9993 default:
9994 return true;
9995 }
9996 }
9997 }
9998 if (SPValue)
9999 return true;
10000 return false;
10001 };
10002 // Remove candidates with illegal stack modifying instructions
10003 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10004
10005 // If the sequence doesn't have enough candidates left, then we're done.
10006 if (RepeatedSequenceLocs.size() < MinRepeats)
10007 return std::nullopt;
10008 }
10009
10010 // Properties about candidate MBBs that hold for all of them.
10011 unsigned FlagsSetInAll = 0xF;
10012
10013 // Compute liveness information for each candidate, and set FlagsSetInAll.
10014 for (outliner::Candidate &C : RepeatedSequenceLocs)
10015 FlagsSetInAll &= C.Flags;
10016
10017 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10018
10019 // Helper lambda which sets call information for every candidate.
10020 auto SetCandidateCallInfo =
10021 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10022 for (outliner::Candidate &C : RepeatedSequenceLocs)
10023 C.setCallInfo(CallID, NumBytesForCall);
10024 };
10025
10026 unsigned FrameID = MachineOutlinerDefault;
10027 NumBytesToCreateFrame += 4;
10028
10029 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10030 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10031 });
10032
10033 // Check whether CFI instructions are present, and if they are,
10034 // count the CFI instructions in the candidate sequence.
10035 unsigned CFICount = 0;
10036 for (auto &I : RepeatedSequenceLocs[0]) {
10037 if (I.isCFIInstruction())
10038 CFICount++;
10039 }
10040
10041 // We compare the number of found CFI Instructions to the number of CFI
10042 // instructions in the parent function for each candidate. We must check this
10043 // since if we outline one of the CFI instructions in a function, we have to
10044 // outline them all for correctness. If we do not, the address offsets will be
10045 // incorrect between the two sections of the program.
10046 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10047 std::vector<MCCFIInstruction> CFIInstructions =
10048 C.getMF()->getFrameInstructions();
10049
10050 if (CFICount > 0 && CFICount != CFIInstructions.size())
10051 return std::nullopt;
10052 }
10053
10054 // Returns true if an instruction is safe to fix up, false otherwise.
10055 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10056 if (MI.isCall())
10057 return true;
10058
10059 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10060 !MI.readsRegister(AArch64::SP, &TRI))
10061 return true;
10062
10063 // Any modification of SP will break our code to save/restore LR.
10064 // FIXME: We could handle some instructions which add a constant
10065 // offset to SP, with a bit more work.
10066 if (MI.modifiesRegister(AArch64::SP, &TRI))
10067 return false;
10068
10069 // At this point, we have a stack instruction that we might need to
10070 // fix up. We'll handle it if it's a load or store.
10071 if (MI.mayLoadOrStore()) {
10072 const MachineOperand *Base; // Filled with the base operand of MI.
10073 int64_t Offset; // Filled with the offset of MI.
10074 bool OffsetIsScalable;
10075
10076 // Does it allow us to offset the base operand and is the base the
10077 // register SP?
10078 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10079 !Base->isReg() || Base->getReg() != AArch64::SP)
10080 return false;
10081
10082 // The fix-up code below assumes byte offsets.
10083 if (OffsetIsScalable)
10084 return false;
10085
10086 // Find the minimum/maximum offset for this instruction and check
10087 // if fixing it up would be in range.
10088 int64_t MinOffset,
10089 MaxOffset; // Unscaled offsets for the instruction.
10090 // The scale to multiply the offsets by.
10091 TypeSize Scale(0U, false), DummyWidth(0U, false);
10092 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10093
10094 Offset += 16; // Update the offset to what it would be if we outlined.
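      // Worked example (illustrative): "ldr x0, [sp, #8]" in the candidate
      // becomes an access at [sp, #24] once the 16-byte LR spill is in place;
      // 24 is still a multiple of LDRXui's scale (8) and within its unsigned
      // 12-bit range, so the instruction remains outlinable.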
10095 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10096 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10097 return false;
10098
10099 // It's in range, so we can outline it.
10100 return true;
10101 }
10102
10103 // FIXME: Add handling for instructions like "add x0, sp, #8".
10104
10105 // We can't fix it up, so don't outline it.
10106 return false;
10107 };
10108
10109 // True if it's possible to fix up each stack instruction in this sequence.
10110 // Important for frames/call variants that modify the stack.
10111 bool AllStackInstrsSafe =
10112 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10113
10114 // If the last instruction in any candidate is a terminator, then we should
10115 // tail call all of the candidates.
10116 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10117 FrameID = MachineOutlinerTailCall;
10118 NumBytesToCreateFrame = 0;
10119 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10120 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10121 }
10122
10123 else if (LastInstrOpcode == AArch64::BL ||
10124 ((LastInstrOpcode == AArch64::BLR ||
10125 LastInstrOpcode == AArch64::BLRNoIP) &&
10126 !HasBTI)) {
10127 // FIXME: Do we need to check if the code after this uses the value of LR?
10128 FrameID = MachineOutlinerThunk;
10129 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10130 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10131 }
10132
10133 else {
10134 // We need to decide how to emit calls + frames. We can always emit the same
10135 // frame if we don't need to save to the stack. If we have to save to the
10136 // stack, then we need a different frame.
10137 unsigned NumBytesNoStackCalls = 0;
10138 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10139
10140 // Check if we have to save LR.
10141 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10142 bool LRAvailable =
10144 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10145 : true;
10146 // If we have a noreturn caller, then we're going to be conservative and
10147 // say that we have to save LR. If we don't have a ret at the end of the
10148 // block, then we can't reason about liveness accurately.
10149 //
10150 // FIXME: We can probably do better than always disabling this in
10151 // noreturn functions by fixing up the liveness info.
10152 bool IsNoReturn =
10153 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10154
10155 // Is LR available? If so, we don't need a save.
10156 if (LRAvailable && !IsNoReturn) {
10157 NumBytesNoStackCalls += 4;
10158 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10159 CandidatesWithoutStackFixups.push_back(C);
10160 }
10161
10162 // Is an unused register available? If so, we won't modify the stack, so
10163 // we can outline with the same frame type as those that don't save LR.
10164 else if (findRegisterToSaveLRTo(C)) {
10165 NumBytesNoStackCalls += 12;
10166 C.setCallInfo(MachineOutlinerRegSave, 12);
10167 CandidatesWithoutStackFixups.push_back(C);
10168 }
10169
10170 // Is SP used in the sequence at all? If not, we don't have to modify
10171 // the stack, so we are guaranteed to get the same frame.
10172 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10173 NumBytesNoStackCalls += 12;
10174 C.setCallInfo(MachineOutlinerDefault, 12);
10175 CandidatesWithoutStackFixups.push_back(C);
10176 }
10177
10178 // If we outline this, we need to modify the stack. Pretend we don't
10179 // outline this by saving all of its bytes.
10180 else {
10181 NumBytesNoStackCalls += SequenceSize;
10182 }
10183 }
10184
10185 // If there are no places where we have to save LR, then note that we
10186 // don't have to update the stack. Otherwise, give every candidate the
10187 // default call type, as long as it's safe to do so.
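    // (Each stack-free candidate costs at most 12 bytes of call overhead --
    // 4 for NoLRSave, 12 for RegSave or a fixup-free default -- so comparing
    // NumBytesNoStackCalls against 12 * NumCandidates asks, roughly, whether
    // the stack-free variants are no worse than uniformly paying for the
    // default save/restore-to-stack call.)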
10188 if (!AllStackInstrsSafe ||
10189 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10190 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10191 FrameID = MachineOutlinerNoLRSave;
10192 if (RepeatedSequenceLocs.size() < MinRepeats)
10193 return std::nullopt;
10194 } else {
10195 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10196
10197 // Bugzilla ID: 46767
10198 // TODO: Check if fixing up the stack more than once is safe so we can
10199 // outline these.
10200 //
10201 // An outline resulting in a caller that requires stack fixups at the
10202 // callsite to a callee that also requires stack fixups can happen when
10203 // there are no available registers at the candidate callsite for a
10204 // candidate that itself also has calls.
10205 //
10206 // In other words, if function_containing_sequence in the following pseudo
10207 // assembly requires that we save LR at the point of the call, but there
10208 // are no available registers, we save using SP, and as a result the SP
10209 // offsets require stack fixups by multiples of 16.
10210 //
10211 // function_containing_sequence:
10212 // ...
10213 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10214 // call OUTLINED_FUNCTION_N
10215 // restore LR from SP
10216 // ...
10217 //
10218 // OUTLINED_FUNCTION_N:
10219 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10220 // ...
10221 // bl foo
10222 // restore LR from SP
10223 // ret
10224 //
10225 // Because the code to handle more than one stack fixup does not
10226 // currently have the proper checks for legality, these cases will assert
10227 // in the AArch64 MachineOutliner. This is because the code to do this
10228 // needs more hardening, testing, better checks that generated code is
10229 // legal, etc and because it is only verified to handle a single pass of
10230 // stack fixup.
10231 //
10232 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10233 // these cases until they are known to be handled. Bugzilla 46767 is
10234 // referenced in comments at the assert site.
10235 //
10236 // To avoid asserting (or generating non-legal code on noassert builds)
10237 // we remove all candidates which would need more than one stack fixup by
10238 // pruning the cases where the candidate has calls while also having no
10239 // available LR and having no available general purpose registers to copy
10240 // LR to (ie one extra stack save/restore).
10241 //
10242 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10243 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10244 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10245 return (llvm::any_of(C, IsCall)) &&
10246 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10247 !findRegisterToSaveLRTo(C));
10248 });
10249 }
10250 }
10251
10252 // If we dropped all of the candidates, bail out here.
10253 if (RepeatedSequenceLocs.size() < MinRepeats)
10254 return std::nullopt;
10255 }
10256
10257 // Does every candidate's MBB contain a call? If so, then we might have a call
10258 // in the range.
10259 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10260 // Check if the range contains a call. These require a save + restore of the
10261 // link register.
10262 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10263 bool ModStackToSaveLR = false;
10264 if (any_of(drop_end(FirstCand),
10265 [](const MachineInstr &MI) { return MI.isCall(); }))
10266 ModStackToSaveLR = true;
10267
10268 // Handle the last instruction separately. If this is a tail call, then the
10269 // last instruction is a call. We don't want to save + restore in this case.
10270 // However, it could be possible that the last instruction is a call without
10271 // it being valid to tail call this sequence. We should consider this as
10272 // well.
10273 else if (FrameID != MachineOutlinerThunk &&
10274 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10275 ModStackToSaveLR = true;
10276
10277 if (ModStackToSaveLR) {
10278 // We can't fix up the stack. Bail out.
10279 if (!AllStackInstrsSafe)
10280 return std::nullopt;
10281
10282 // Save + restore LR.
10283 NumBytesToCreateFrame += 8;
10284 }
10285 }
10286
10287 // If we have CFI instructions, we can only outline if the outlined section
10288 // can be a tail call
10289 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10290 return std::nullopt;
10291
10292 return std::make_unique<outliner::OutlinedFunction>(
10293 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10294}
10295
10296void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10297 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10298 // If a bunch of candidates reach this point, they must agree on their return
10299 // address signing. It is therefore enough to just consider the signing
10300 // behaviour of one of them.
10301 const auto &CFn = Candidates.front().getMF()->getFunction();
10302
10303 if (CFn.hasFnAttribute("ptrauth-returns"))
10304 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10305 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10306 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10307 // Since all candidates belong to the same module, just copy the
10308 // function-level attributes of an arbitrary function.
10309 if (CFn.hasFnAttribute("sign-return-address"))
10310 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10311 if (CFn.hasFnAttribute("sign-return-address-key"))
10312 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10313
10314 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10315}
10316
10317bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10318 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10319 const Function &F = MF.getFunction();
10320
10321 // Can F be deduplicated by the linker? If it can, don't outline from it.
10322 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10323 return false;
10324
10325 // Don't outline from functions with section markings; the program could
10326 // expect that all the code is in the named section.
10327 // FIXME: Allow outlining from multiple functions with the same section
10328 // marking.
10329 if (F.hasSection())
10330 return false;
10331
10332 // Outlining from functions with redzones is unsafe since the outliner may
10333 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10334 // outline from it.
10335 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10336 if (!AFI || AFI->hasRedZone().value_or(true))
10337 return false;
10338
10339 // FIXME: Determine whether it is safe to outline from functions which contain
10340 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10341 // outlined together and ensure it is safe to outline with async unwind info,
10342 // required for saving & restoring VG around calls.
10343 if (AFI->hasStreamingModeChanges())
10344 return false;
10345
10346 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10348 return false;
10349
10350 // It's safe to outline from MF.
10351 return true;
10352}
10353
10355AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10356 unsigned &Flags) const {
10358 "Must track liveness!");
10360 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10361 Ranges;
10362 // According to the AArch64 Procedure Call Standard, the following are
10363 // undefined on entry/exit from a function call:
10364 //
10365 // * Registers x16, x17, (and thus w16, w17)
10366 // * Condition codes (and thus the NZCV register)
10367 //
10368 // If any of these registers are used inside or live across an outlined
10369 // function, then they may be modified later, either by the compiler or
10370 // some other tool (like the linker).
10371 //
10372 // To avoid outlining in these situations, partition each block into ranges
10373 // where these registers are dead. We will only outline from those ranges.
10374 LiveRegUnits LRU(getRegisterInfo());
10375 auto AreAllUnsafeRegsDead = [&LRU]() {
10376 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10377 LRU.available(AArch64::NZCV);
10378 };
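  // (Querying the W16/W17 units is sufficient here: LiveRegUnits tracks
  // register units, so uses or defs of X16/X17 make the corresponding W
  // registers unavailable as well.)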
10379
10380 // We need to know if LR is live across an outlining boundary later on in
10381 // order to decide how we'll create the outlined call, frame, etc.
10382 //
10383 // It's pretty expensive to check this for *every candidate* within a block.
10384 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10385 // to compute liveness from the end of the block for O(n) candidates within
10386 // the block.
10387 //
10388 // So, to improve the average case, let's keep track of liveness from the end
10389 // of the block to the beginning of *every outlinable range*. If we know that
10390 // LR is available in every range we could outline from, then we know that
10391 // we don't need to check liveness for any candidate within that range.
10392 bool LRAvailableEverywhere = true;
10393 // Compute liveness bottom-up.
10394 LRU.addLiveOuts(MBB);
10395 // Update flags that require info about the entire MBB.
10396 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10397 if (MI.isCall() && !MI.isTerminator())
10399 };
10400 // Range: [RangeBegin, RangeEnd)
10401 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10402 unsigned RangeLen;
10403 auto CreateNewRangeStartingAt =
10404 [&RangeBegin, &RangeEnd,
10405 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10406 RangeBegin = NewBegin;
10407 RangeEnd = std::next(RangeBegin);
10408 RangeLen = 0;
10409 };
10410 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10411 // At least one unsafe register is not dead. We do not want to outline at
10412 // this point. If it is long enough to outline from and does not cross a
10413 // bundle boundary, save the range [RangeBegin, RangeEnd).
10414 if (RangeLen <= 1)
10415 return;
10416 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10417 return;
10418 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10419 return;
10420 Ranges.emplace_back(RangeBegin, RangeEnd);
10421 };
10422 // Find the first point where all unsafe registers are dead.
10423 // FIND: <safe instr> <-- end of first potential range
10424 // SKIP: <unsafe def>
10425 // SKIP: ... everything between ...
10426 // SKIP: <unsafe use>
10427 auto FirstPossibleEndPt = MBB.instr_rbegin();
10428 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10429 LRU.stepBackward(*FirstPossibleEndPt);
10430 // Update flags that impact how we outline across the entire block,
10431 // regardless of safety.
10432 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10433 if (AreAllUnsafeRegsDead())
10434 break;
10435 }
10436 // If we exhausted the entire block, we have no safe ranges to outline.
10437 if (FirstPossibleEndPt == MBB.instr_rend())
10438 return Ranges;
10439 // Current range.
10440 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10441 // FirstPossibleEndPt points to the first place (searching bottom-up) where
10442 // all unsafe registers are dead (if there is any such point). Begin
10443 // partitioning the MBB into ranges.
10444 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10445 LRU.stepBackward(MI);
10446 UpdateWholeMBBFlags(MI);
10447 if (!AreAllUnsafeRegsDead()) {
10448 SaveRangeIfNonEmpty();
10449 CreateNewRangeStartingAt(MI.getIterator());
10450 continue;
10451 }
10452 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10453 RangeBegin = MI.getIterator();
10454 ++RangeLen;
10455 }
10456 // The above loop misses the last (or only) range. If we are still safe,
10457 // save that range as well.
10458 if (AreAllUnsafeRegsDead())
10459 SaveRangeIfNonEmpty();
10460 if (Ranges.empty())
10461 return Ranges;
10462 // We found the ranges bottom-up, but the mapper expects them top-down, so
10463 // reverse the order.
10464 std::reverse(Ranges.begin(), Ranges.end());
10465 // If there is at least one outlinable range where LR is unavailable
10466 // somewhere, remember that.
10467 if (!LRAvailableEverywhere)
10469 return Ranges;
10470}
10471
10473AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10475 unsigned Flags) const {
10476 MachineInstr &MI = *MIT;
10477
10478 // Don't outline anything used for return address signing. The outlined
10479 // function will get signed later if needed
10480 switch (MI.getOpcode()) {
10481 case AArch64::PACM:
10482 case AArch64::PACIASP:
10483 case AArch64::PACIBSP:
10484 case AArch64::PACIASPPC:
10485 case AArch64::PACIBSPPC:
10486 case AArch64::AUTIASP:
10487 case AArch64::AUTIBSP:
10488 case AArch64::AUTIASPPCi:
10489 case AArch64::AUTIASPPCr:
10490 case AArch64::AUTIBSPPCi:
10491 case AArch64::AUTIBSPPCr:
10492 case AArch64::RETAA:
10493 case AArch64::RETAB:
10494 case AArch64::RETAASPPCi:
10495 case AArch64::RETAASPPCr:
10496 case AArch64::RETABSPPCi:
10497 case AArch64::RETABSPPCr:
10498 case AArch64::EMITBKEY:
10499 case AArch64::PAUTH_PROLOGUE:
10500 case AArch64::PAUTH_EPILOGUE:
10502 }
10503
10504 // We can only outline these if we will tail call the outlined function, or
10505 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10506 // in a tail call.
10507 //
10508 // FIXME: If the proper fixups for the offset are implemented, this should be
10509 // possible.
10510 if (MI.isCFIInstruction())
10512
10513 // Is this a terminator for a basic block?
10514 if (MI.isTerminator())
10515 // TargetInstrInfo::getOutliningType has already filtered out anything
10516 // that would break this, so we can allow it here.
10518
10519 // Make sure none of the operands are un-outlinable.
10520 for (const MachineOperand &MOP : MI.operands()) {
10521 // A check preventing CFI indices was here before, but only CFI
10522 // instructions should have those.
10523 assert(!MOP.isCFIIndex());
10524
10525 // If it uses LR or W30 explicitly, then don't touch it.
10526 if (MOP.isReg() && !MOP.isImplicit() &&
10527 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10529 }
10530
10531 // Special cases for instructions that can always be outlined, but will fail
10532 // the later tests. E.g. ADRPs are PC-relative, but they can always be
10533 // outlined because they don't require a *specific* value to be in LR.
10534 if (MI.getOpcode() == AArch64::ADRP)
10536
10537 // If MI is a call we might be able to outline it. We don't want to outline
10538 // any calls that rely on the position of items on the stack. When we outline
10539 // something containing a call, we have to emit a save and restore of LR in
10540 // the outlined function. Currently, this always happens by saving LR to the
10541 // stack. Thus, if we outline, say, half the parameters for a function call
10542 // plus the call, then we'll break the callee's expectations for the layout
10543 // of the stack.
10544 //
10545 // FIXME: Allow calls to functions which construct a stack frame, as long
10546 // as they don't access arguments on the stack.
10547 // FIXME: Figure out some way to analyze functions defined in other modules.
10548 // We should be able to compute the memory usage based on the IR calling
10549 // convention, even if we can't see the definition.
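  // For example (illustrative), if the caller materializes a stack-passed
  // argument at [sp] and we outline that store together with the call, the
  // outlined function's LR spill moves SP by 16, so the callee would look for
  // the argument 16 bytes away from where it actually lives.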
10550 if (MI.isCall()) {
10551 // Get the function associated with the call. Look at each operand and find
10552 // the one that represents the callee and get its name.
10553 const Function *Callee = nullptr;
10554 for (const MachineOperand &MOP : MI.operands()) {
10555 if (MOP.isGlobal()) {
10556 Callee = dyn_cast<Function>(MOP.getGlobal());
10557 break;
10558 }
10559 }
10560
10561 // Never outline calls to mcount. There isn't any rule that would require
10562 // this, but the Linux kernel's "ftrace" feature depends on it.
10563 if (Callee && Callee->getName() == "\01_mcount")
10564 return outliner::InstrType::Illegal;
10565
10566 // If we don't know anything about the callee, assume it depends on the
10567 // stack layout of the caller. In that case, it's only legal to outline
10568 // as a tail-call. Explicitly list the call instructions we know about so we
10569 // don't get unexpected results with call pseudo-instructions.
10570 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10571 if (MI.getOpcode() == AArch64::BLR ||
10572 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10573 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10574
10575 if (!Callee)
10576 return UnknownCallOutlineType;
10577
10578 // We have a function we have information about. Check if it's something we
10579 // can safely outline.
10580 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10581
10582 // We don't know what's going on with the callee at all. Don't touch it.
10583 if (!CalleeMF)
10584 return UnknownCallOutlineType;
10585
10586 // Check if we know anything about the callee saves on the function. If we
10587 // don't, then don't touch it, since that implies that we haven't
10588 // computed anything about its stack frame yet.
10589 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10590 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10591 MFI.getNumObjects() > 0)
10592 return UnknownCallOutlineType;
10593
10594 // At this point, we can say that CalleeMF ought to not pass anything on the
10595 // stack. Therefore, we can outline it.
10596 return outliner::InstrType::Legal;
10597 }
10598
10599 // Don't touch the link register or W30.
10600 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
10601 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
10602 return outliner::InstrType::Illegal;
10603
10604 // Don't outline BTI instructions, because that will prevent the outlining
10605 // site from being indirectly callable.
10606 if (hasBTISemantics(MI))
10607 return outliner::InstrType::Illegal;
10608
10609 return outliner::InstrType::Legal;
10610}
10611
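// Illustrative aside, not part of AArch64InstrInfo.cpp: the "\01" in the
// "\01_mcount" comparison inside getOutliningTypeImpl above is a single 0x01
// byte, the LLVM IR marker that suppresses further name mangling, so the check
// matches the raw ftrace entry symbol "_mcount". A minimal stand-alone sketch
// of that string shape (hypothetical names, verified with static_asserts):
constexpr char McountCalleeName[] = "\01_mcount";
static_assert(sizeof(McountCalleeName) == 9, "one 0x01 byte + \"_mcount\" + NUL");
static_assert(McountCalleeName[0] == '\x01', "leading no-mangling marker byte");
static_assert(McountCalleeName[1] == '_', "the raw symbol starts right after it");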
10612void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10613 for (MachineInstr &MI : MBB) {
10614 const MachineOperand *Base;
10615 TypeSize Width(0, false);
10616 int64_t Offset;
10617 bool OffsetIsScalable;
10618
10619 // Is this a load or store with an immediate offset with SP as the base?
10620 if (!MI.mayLoadOrStore() ||
10621 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
10622 &RI) ||
10623 (Base->isReg() && Base->getReg() != AArch64::SP))
10624 continue;
10625
10626 // It is, so we have to fix it up.
10627 TypeSize Scale(0U, false);
10628 int64_t Dummy1, Dummy2;
10629
10630 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
10631 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10632 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
10633 assert(Scale != 0 && "Unexpected opcode!");
10634 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10635
10636 // We've pushed the return address to the stack, so add 16 to the offset.
10637 // This is safe, since we already checked if it would overflow when we
10638 // checked if this instruction was legal to outline.
10639 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10640 StackOffsetOperand.setImm(NewImm);
10641 }
10642}
10643
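// Worked example, not part of this file: the offset rewrite in fixupPostOutline
// above adds the 16 bytes occupied by the saved LR and re-scales the immediate.
// fixupScaledImm below is a hypothetical stand-in that mirrors only the
// "(Offset + 16) / Scale" arithmetic for a fixed-size scaled load/store.
constexpr long long fixupScaledImm(long long ByteOffset, long long Scale) {
  return (ByteOffset + 16) / Scale; // new scaled immediate after the LR push
}
static_assert(fixupScaledImm(8, 8) == 3, "ldr x0, [sp, #8] becomes [sp, #24]");
static_assert(fixupScaledImm(0, 16) == 1, "ldr q0, [sp] becomes [sp, #16]");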
10644static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
10645 const AArch64InstrInfo *TII,
10646 bool ShouldSignReturnAddr) {
10647 if (!ShouldSignReturnAddr)
10648 return;
10649
10650 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
10651 .setMIFlag(MachineInstr::FrameSetup);
10652 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
10653 TII->get(AArch64::PAUTH_EPILOGUE))
10654 .setMIFlag(MachineInstr::FrameDestroy);
10655}
10656
10657void AArch64InstrInfo::buildOutlinedFrame(
10658 MachineBasicBlock &MBB, MachineFunction &MF,
10659 const outliner::OutlinedFunction &OF) const {
10660
10661 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
10662
10663 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10664 FI->setOutliningStyle("Tail Call");
10665 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
10666 // For thunk outlining, rewrite the last instruction from a call to a
10667 // tail-call.
10668 MachineInstr *Call = &*--MBB.instr_end();
10669 unsigned TailOpcode;
10670 if (Call->getOpcode() == AArch64::BL) {
10671 TailOpcode = AArch64::TCRETURNdi;
10672 } else {
10673 assert(Call->getOpcode() == AArch64::BLR ||
10674 Call->getOpcode() == AArch64::BLRNoIP);
10675 TailOpcode = AArch64::TCRETURNriALL;
10676 }
10677 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
10678 .add(Call->getOperand(0))
10679 .addImm(0);
10680 MBB.insert(MBB.end(), TC);
10681 Call->eraseFromParent();
10682
10683 FI->setOutliningStyle("Thunk");
10684 }
10685
10686 bool IsLeafFunction = true;
10687
10688 // Is there a call in the outlined range?
10689 auto IsNonTailCall = [](const MachineInstr &MI) {
10690 return MI.isCall() && !MI.isReturn();
10691 };
10692
10693 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
10694 // Fix up the instructions in the range, since we're going to modify the
10695 // stack.
10696
10697 // Bugzilla ID: 46767
10698 // TODO: Check if fixing up twice is safe so we can outline these.
10699 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
10700 "Can only fix up stack references once");
10701 fixupPostOutline(MBB);
10702
10703 IsLeafFunction = false;
10704
10705 // LR has to be a live in so that we can save it.
10706 if (!MBB.isLiveIn(AArch64::LR))
10707 MBB.addLiveIn(AArch64::LR);
10708
10709 MachineBasicBlock::iterator It = MBB.begin();
10710 MachineBasicBlock::iterator Et = MBB.end();
10711
10712 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10713 OF.FrameConstructionID == MachineOutlinerThunk)
10714 Et = std::prev(MBB.end());
10715
10716 // Insert a save before the outlined region
10717 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10718 .addReg(AArch64::SP, RegState::Define)
10719 .addReg(AArch64::LR)
10720 .addReg(AArch64::SP)
10721 .addImm(-16);
10722 It = MBB.insert(It, STRXpre);
10723
10724 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
10725 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
10726
10727 // Add a CFI saying the stack was moved 16 B down.
10728 CFIBuilder.buildDefCFAOffset(16);
10729
10730 // Add a CFI saying that the LR that we want to find is now 16 B higher
10731 // than before.
10732 CFIBuilder.buildOffset(AArch64::LR, -16);
10733 }
10734
10735 // Insert a restore before the terminator for the function.
10736 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10737 .addReg(AArch64::SP, RegState::Define)
10738 .addReg(AArch64::LR, RegState::Define)
10739 .addReg(AArch64::SP)
10740 .addImm(16);
10741 Et = MBB.insert(Et, LDRXpost);
10742 }
10743
10744 auto RASignCondition = FI->getSignReturnAddressCondition();
10745 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
10746 RASignCondition, !IsLeafFunction);
10747
10748 // If this is a tail call outlined function, then there's already a return.
10749 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10750 OF.FrameConstructionID == MachineOutlinerThunk) {
10751 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10752 return;
10753 }
10754
10755 // It's not a tail call, so we have to insert the return ourselves.
10756
10757 // LR has to be a live in so that we can return to it.
10758 if (!MBB.isLiveIn(AArch64::LR))
10759 MBB.addLiveIn(AArch64::LR);
10760
10761 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
10762 .addReg(AArch64::LR);
10763 MBB.insert(MBB.end(), ret);
10764
10765 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10766
10767 FI->setOutliningStyle("Function");
10768
10769 // Did we have to modify the stack by saving the link register?
10770 if (OF.FrameConstructionID != MachineOutlinerDefault)
10771 return;
10772
10773 // We modified the stack.
10774 // Walk over the basic block and fix up all the stack accesses.
10775 fixupPostOutline(MBB);
10776}
10777
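// Sketch, not LLVM code: buildOutlinedFrame above spills LR with a pre-indexed
// store (STRXpre, imm -16: move SP first, then store) and reloads it with a
// post-indexed load (LDRXpost, imm +16: load first, then move SP), so SP is
// unchanged across the pair. The tiny simulation below, with hypothetical
// names, just walks through that addressing behaviour.
struct SpSim { long long SP; };
constexpr long long strPreAddr(SpSim &S, long long Imm) {
  S.SP += Imm; // pre-index: SP is updated before the access (Imm is -16 here)
  return S.SP; // the store then targets the updated address
}
constexpr long long ldrPostAddr(SpSim &S, long long Imm) {
  long long Addr = S.SP; // post-index: the load uses the current SP first
  S.SP += Imm;           // SP is updated afterwards (Imm is +16 here)
  return Addr;
}
constexpr bool lrSpillRoundTrips() {
  SpSim S{4096};
  long long StoreAddr = strPreAddr(S, -16);
  long long LoadAddr = ldrPostAddr(S, 16);
  return StoreAddr == 4080 && LoadAddr == 4080 && S.SP == 4096;
}
static_assert(lrSpillRoundTrips(), "saving and restoring LR leaves SP unchanged");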
10778MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
10779 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10780 MachineFunction &MF, outliner::Candidate &C) const {
10781
10782 // Are we tail calling?
10783 if (C.CallConstructionID == MachineOutlinerTailCall) {
10784 // If yes, then we can just branch to the label.
10785 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
10786 .addGlobalAddress(M.getNamedValue(MF.getName()))
10787 .addImm(0));
10788 return It;
10789 }
10790
10791 // Are we saving the link register?
10792 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
10793 C.CallConstructionID == MachineOutlinerThunk) {
10794 // No, so just insert the call.
10795 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10796 .addGlobalAddress(M.getNamedValue(MF.getName())));
10797 return It;
10798 }
10799
10800 // We want to return the spot where we inserted the call.
10801 MachineBasicBlock::iterator CallPt;
10802
10803 // Instructions for saving and restoring LR around the call instruction we're
10804 // going to insert.
10805 MachineInstr *Save;
10806 MachineInstr *Restore;
10807 // Can we save to a register?
10808 if (C.CallConstructionID == MachineOutlinerRegSave) {
10809 // FIXME: This logic should be sunk into a target-specific interface so that
10810 // we don't have to recompute the register.
10811 Register Reg = findRegisterToSaveLRTo(C);
10812 assert(Reg && "No callee-saved register available?");
10813
10814 // LR has to be a live in so that we can save it.
10815 if (!MBB.isLiveIn(AArch64::LR))
10816 MBB.addLiveIn(AArch64::LR);
10817
10818 // Save and restore LR from Reg.
10819 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
10820 .addReg(AArch64::XZR)
10821 .addReg(AArch64::LR)
10822 .addImm(0);
10823 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
10824 .addReg(AArch64::XZR)
10825 .addReg(Reg)
10826 .addImm(0);
10827 } else {
10828 // We have the default case. Save and restore from SP.
10829 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10830 .addReg(AArch64::SP, RegState::Define)
10831 .addReg(AArch64::LR)
10832 .addReg(AArch64::SP)
10833 .addImm(-16);
10834 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10835 .addReg(AArch64::SP, RegState::Define)
10836 .addReg(AArch64::LR, RegState::Define)
10837 .addReg(AArch64::SP)
10838 .addImm(16);
10839 }
10840
10841 It = MBB.insert(It, Save);
10842 It++;
10843
10844 // Insert the call.
10845 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10846 .addGlobalAddress(M.getNamedValue(MF.getName())));
10847 CallPt = It;
10848 It++;
10849
10850 It = MBB.insert(It, Restore);
10851 return CallPt;
10852}
10853
10854bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
10855 MachineFunction &MF) const {
10856 return MF.getFunction().hasMinSize();
10857}
10858
10859void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10860 MachineBasicBlock::iterator Iter,
10861 DebugLoc &DL,
10862 bool AllowSideEffects) const {
10863 const MachineFunction &MF = *MBB.getParent();
10864 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
10865 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
10866
10867 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10868 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
10869 } else if (STI.isSVEorStreamingSVEAvailable()) {
10870 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
10871 .addImm(0)
10872 .addImm(0);
10873 } else if (STI.isNeonAvailable()) {
10874 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
10875 .addImm(0);
10876 } else {
10877 // This is a streaming-compatible function without SVE. We don't have full
10878 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
10879 // Since `movi v..` would be illegal here, use `fmov d..` instead.
10880 assert(STI.hasNEON() && "Expected to have NEON.");
10881 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
10882 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
10883 }
10884}
10885
10886std::optional<DestSourcePair>
10887AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
10888
10889 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
10890 // and zero immediate operands used as an alias for mov instruction.
10891 if (((MI.getOpcode() == AArch64::ORRWrs &&
10892 MI.getOperand(1).getReg() == AArch64::WZR &&
10893 MI.getOperand(3).getImm() == 0x0) ||
10894 (MI.getOpcode() == AArch64::ORRWrr &&
10895 MI.getOperand(1).getReg() == AArch64::WZR)) &&
10896 // Check that the w->w move is not a zero-extending w->x mov.
10897 (!MI.getOperand(0).getReg().isVirtual() ||
10898 MI.getOperand(0).getSubReg() == 0) &&
10899 (!MI.getOperand(0).getReg().isPhysical() ||
10900 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
10901 /*TRI=*/nullptr) == -1))
10902 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10903
10904 if (MI.getOpcode() == AArch64::ORRXrs &&
10905 MI.getOperand(1).getReg() == AArch64::XZR &&
10906 MI.getOperand(3).getImm() == 0x0)
10907 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10908
10909 return std::nullopt;
10910}
10911
10912std::optional<DestSourcePair>
10913AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
10914 if ((MI.getOpcode() == AArch64::ORRWrs &&
10915 MI.getOperand(1).getReg() == AArch64::WZR &&
10916 MI.getOperand(3).getImm() == 0x0) ||
10917 (MI.getOpcode() == AArch64::ORRWrr &&
10918 MI.getOperand(1).getReg() == AArch64::WZR))
10919 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10920 return std::nullopt;
10921}
10922
10923std::optional<RegImmPair>
10924AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
10925 int Sign = 1;
10926 int64_t Offset = 0;
10927
10928 // TODO: Handle cases where Reg is a super- or sub-register of the
10929 // destination register.
10930 const MachineOperand &Op0 = MI.getOperand(0);
10931 if (!Op0.isReg() || Reg != Op0.getReg())
10932 return std::nullopt;
10933
10934 switch (MI.getOpcode()) {
10935 default:
10936 return std::nullopt;
10937 case AArch64::SUBWri:
10938 case AArch64::SUBXri:
10939 case AArch64::SUBSWri:
10940 case AArch64::SUBSXri:
10941 Sign *= -1;
10942 [[fallthrough]];
10943 case AArch64::ADDSWri:
10944 case AArch64::ADDSXri:
10945 case AArch64::ADDWri:
10946 case AArch64::ADDXri: {
10947 // TODO: Third operand can be global address (usually some string).
10948 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
10949 !MI.getOperand(2).isImm())
10950 return std::nullopt;
10951 int Shift = MI.getOperand(3).getImm();
10952 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
10953 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
10954 }
10955 }
10956 return RegImmPair{MI.getOperand(1).getReg(), Offset};
10957}
10958
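// Worked example, not part of this file: isAddImmediate above decodes an
// ADD/SUB immediate that may carry an optional LSL #12, giving a byte offset
// of Sign * (Imm << Shift). decodeAddImm is a hypothetical helper that only
// reproduces that arithmetic.
constexpr long long decodeAddImm(long long Imm, int Shift, int Sign) {
  return Sign * (Imm << Shift);
}
static_assert(decodeAddImm(1, 12, 1) == 4096, "add x0, x1, #1, lsl #12");
static_assert(decodeAddImm(16, 0, -1) == -16, "sub sp, sp, #16");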
10959/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
10960/// the destination register then, if possible, describe the value in terms of
10961/// the source register.
10962static std::optional<ParamLoadedValue>
10963describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
10964 const TargetInstrInfo *TII,
10965 const TargetRegisterInfo *TRI) {
10966 auto DestSrc = TII->isCopyLikeInstr(MI);
10967 if (!DestSrc)
10968 return std::nullopt;
10969
10970 Register DestReg = DestSrc->Destination->getReg();
10971 Register SrcReg = DestSrc->Source->getReg();
10972
10973 if (!DestReg.isValid() || !SrcReg.isValid())
10974 return std::nullopt;
10975
10976 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10977
10978 // If the described register is the destination, just return the source.
10979 if (DestReg == DescribedReg)
10980 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10981
10982 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
10983 if (MI.getOpcode() == AArch64::ORRWrs &&
10984 TRI->isSuperRegister(DestReg, DescribedReg))
10985 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10986
10987 // We may need to describe the lower part of a ORRXrs move.
10988 if (MI.getOpcode() == AArch64::ORRXrs &&
10989 TRI->isSubRegister(DestReg, DescribedReg)) {
10990 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
10991 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
10992 }
10993
10994 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
10995 "Unhandled ORR[XW]rs copy case");
10996
10997 return std::nullopt;
10998}
10999
11000bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11001 // Functions cannot be split to different sections on AArch64 if they have
11002 // a red zone. This is because relaxing a cross-section branch may require
11003 // incrementing the stack pointer to spill a register, which would overwrite
11004 // the red zone.
11005 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11006 return false;
11007
11008 return TargetInstrInfo::isFunctionSafeToSplit(MF);
11009}
11010
11011bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11012 const MachineBasicBlock &MBB) const {
11013 // Asm Goto blocks can contain conditional branches to goto labels, which can
11014 // get moved out of range of the branch instruction.
11015 auto isAsmGoto = [](const MachineInstr &MI) {
11016 return MI.getOpcode() == AArch64::INLINEASM_BR;
11017 };
11018 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11019 return false;
11020
11021 // Because jump tables are label-relative instead of table-relative, they all
11022 // must be in the same section or relocation fixup handling will fail.
11023
11024 // Check if MBB is a jump table target
11025 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11026 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11027 return llvm::is_contained(JTE.MBBs, &MBB);
11028 };
11029 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11030 return false;
11031
11032 // Check if MBB contains a jump table lookup
11033 for (const MachineInstr &MI : MBB) {
11034 switch (MI.getOpcode()) {
11035 case TargetOpcode::G_BRJT:
11036 case AArch64::JumpTableDest32:
11037 case AArch64::JumpTableDest16:
11038 case AArch64::JumpTableDest8:
11039 return false;
11040 default:
11041 continue;
11042 }
11043 }
11044
11045 // MBB isn't a special case, so it's safe to be split to the cold section.
11046 return true;
11047}
11048
11049std::optional<ParamLoadedValue>
11050AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11051 Register Reg) const {
11052 const MachineFunction *MF = MI.getMF();
11053 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11054 switch (MI.getOpcode()) {
11055 case AArch64::MOVZWi:
11056 case AArch64::MOVZXi: {
11057 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11058 // 64-bit parameters, so we need to consider super-registers.
11059 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11060 return std::nullopt;
11061
11062 if (!MI.getOperand(1).isImm())
11063 return std::nullopt;
11064 int64_t Immediate = MI.getOperand(1).getImm();
11065 int Shift = MI.getOperand(2).getImm();
11066 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11067 nullptr);
11068 }
11069 case AArch64::ORRWrs:
11070 case AArch64::ORRXrs:
11071 return describeORRLoadedValue(MI, Reg, this, TRI);
11072 }
11073
11074 return TargetInstrInfo::describeLoadedValue(MI, Reg);
11075}
11076
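// Concrete example, illustrative only: the MOVZ case in describeLoadedValue
// above reports the value Immediate << Shift, e.g. "movz x0, #0x1234, lsl #16"
// materializes 0x12340000. movzValue is a hypothetical helper mirroring that.
constexpr unsigned long long movzValue(unsigned long long Imm, int Shift) {
  return Imm << Shift;
}
static_assert(movzValue(0x1234, 16) == 0x12340000ULL, "movz x0, #0x1234, lsl #16");
static_assert(movzValue(0x2A, 0) == 42, "movz w0, #42");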
11077bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11078 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11079 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11080 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11081 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11082
11083 // Anyexts are nops.
11084 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11085 return true;
11086
11087 Register DefReg = ExtMI.getOperand(0).getReg();
11088 if (!MRI.hasOneNonDBGUse(DefReg))
11089 return false;
11090
11091 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11092 // addressing mode.
11093 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11094 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11095}
11096
11097uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11098 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11099}
11100
11101bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11102 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11103}
11104
11105bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11106 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11107}
11108
11109unsigned int
11110AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11111 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11112}
11113
11114bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11115 unsigned Scale) const {
11116 if (Offset && Scale)
11117 return false;
11118
11119 // Check Reg + Imm
11120 if (!Scale) {
11121 // 9-bit signed offset
11122 if (isInt<9>(Offset))
11123 return true;
11124
11125 // 12-bit unsigned offset
11126 unsigned Shift = Log2_64(NumBytes);
11127 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11128 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11129 (Offset >> Shift) << Shift == Offset)
11130 return true;
11131 return false;
11132 }
11133
11134 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11135 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11136}
11137
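// Stand-alone mirror of the register+immediate rules checked above; an
// assumption-laden sketch, not the authoritative implementation. Either the
// byte offset fits a 9-bit signed immediate (LDUR/STUR style) or it is a
// positive multiple of the access size whose scaled value fits 12 unsigned
// bits (LDR/STR style).
constexpr bool fitsRegImmAddressing(long long NumBytes, long long Offset) {
  if (Offset >= -256 && Offset <= 255) // 9-bit signed unscaled offset
    return true;
  return NumBytes > 0 && Offset > 0 && Offset % NumBytes == 0 &&
         Offset / NumBytes <= 4095; // 12-bit scaled unsigned offset
}
static_assert(fitsRegImmAddressing(8, 32760), "ldr x0, [x1, #32760]");
static_assert(!fitsRegImmAddressing(8, 32768), "just past the 12-bit scaled range");
static_assert(fitsRegImmAddressing(4, -256), "ldur w0, [x1, #-256]");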
11138unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
11139 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11140 return AArch64::BLRNoIP;
11141 else
11142 return AArch64::BLR;
11143}
11144
11145MachineBasicBlock::iterator
11146AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11147 Register TargetReg, bool FrameSetup) const {
11148 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11149
11150 MachineBasicBlock &MBB = *MBBI->getParent();
11151 MachineFunction &MF = *MBB.getParent();
11152 const AArch64InstrInfo *TII =
11153 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11154 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11155 DebugLoc DL = MBB.findDebugLoc(MBBI);
11156
11157 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11158 MachineBasicBlock *LoopTestMBB =
11159 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11160 MF.insert(MBBInsertPoint, LoopTestMBB);
11161 MachineBasicBlock *LoopBodyMBB =
11162 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11163 MF.insert(MBBInsertPoint, LoopBodyMBB);
11164 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11165 MF.insert(MBBInsertPoint, ExitMBB);
11166 MachineInstr::MIFlag Flags =
11167 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
11168
11169 // LoopTest:
11170 // SUB SP, SP, #ProbeSize
11171 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11172 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11173
11174 // CMP SP, TargetReg
11175 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11176 AArch64::XZR)
11177 .addReg(AArch64::SP)
11178 .addReg(TargetReg)
11179 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
11180 .setMIFlags(Flags);
11181
11182 // B.<Cond> LoopExit
11183 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
11184 .addImm(AArch64CC::LE)
11185 .addMBB(ExitMBB)
11186 .setMIFlags(Flags);
11187
11188 // STR XZR, [SP]
11189 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
11190 .addReg(AArch64::XZR)
11191 .addReg(AArch64::SP)
11192 .addImm(0)
11193 .setMIFlags(Flags);
11194
11195 // B loop
11196 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11197 .addMBB(LoopTestMBB)
11198 .setMIFlags(Flags);
11199
11200 // LoopExit:
11201 // MOV SP, TargetReg
11202 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11203 .addReg(TargetReg)
11204 .addImm(0)
11205 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
11206 .setMIFlags(Flags);
11207
11208 // LDR XZR, [SP]
11209 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11210 .addReg(AArch64::XZR, RegState::Define)
11211 .addReg(AArch64::SP)
11212 .addImm(0)
11213 .setMIFlags(Flags);
11214
11215 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
11216 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
11217
11218 LoopTestMBB->addSuccessor(ExitMBB);
11219 LoopTestMBB->addSuccessor(LoopBodyMBB);
11220 LoopBodyMBB->addSuccessor(LoopTestMBB);
11221 MBB.addSuccessor(LoopTestMBB);
11222
11223 // Update liveins.
11224 if (MF.getRegInfo().reservedRegsFrozen())
11225 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11226
11227 return ExitMBB->begin();
11228}
11229
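// Back-of-the-envelope illustration, not LLVM code, of the loop emitted by
// probedStackAlloc above, assuming the default 4096-byte probe size: lowering
// SP by N bytes runs ceil(N / 4096) SUB+CMP rounds; every round except the
// last stores XZR to probe a page, and the exit block's trailing LDR probes
// the final page at the new SP.
constexpr long long probeLoopRounds(long long Bytes, long long ProbeSize) {
  return (Bytes + ProbeSize - 1) / ProbeSize; // ceiling division
}
static_assert(probeLoopRounds(10000, 4096) == 3, "two in-loop probes, then exit");
static_assert(probeLoopRounds(4096, 4096) == 1, "straight to the exit probe");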
11230namespace {
11231class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11232 MachineFunction *MF;
11233 const TargetInstrInfo *TII;
11234 const TargetRegisterInfo *TRI;
11235 MachineRegisterInfo &MRI;
11236
11237 /// The block of the loop
11238 MachineBasicBlock *LoopBB;
11239 /// The conditional branch of the loop
11240 MachineInstr *CondBranch;
11241 /// The compare instruction for loop control
11242 MachineInstr *Comp;
11243 /// The number of the operand of the loop counter value in Comp
11244 unsigned CompCounterOprNum;
11245 /// The instruction that updates the loop counter value
11246 MachineInstr *Update;
11247 /// The number of the operand of the loop counter value in Update
11248 unsigned UpdateCounterOprNum;
11249 /// The initial value of the loop counter
11250 Register Init;
11251 /// True iff Update is a predecessor of Comp
11252 bool IsUpdatePriorComp;
11253
11254 /// The normalized condition used by createTripCountGreaterCondition()
11255 SmallVector<MachineOperand, 4> Cond;
11256
11257public:
11258 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11259 MachineInstr *Comp, unsigned CompCounterOprNum,
11260 MachineInstr *Update, unsigned UpdateCounterOprNum,
11261 Register Init, bool IsUpdatePriorComp,
11262 const SmallVectorImpl<MachineOperand> &Cond)
11263 : MF(Comp->getParent()->getParent()),
11264 TII(MF->getSubtarget().getInstrInfo()),
11265 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11266 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11267 CompCounterOprNum(CompCounterOprNum), Update(Update),
11268 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11269 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11270
11271 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11272 // Make the instructions for loop control be placed in stage 0.
11273 // The predecessors of Comp are considered by the caller.
11274 return MI == Comp;
11275 }
11276
11277 std::optional<bool> createTripCountGreaterCondition(
11278 int TC, MachineBasicBlock &MBB,
11279 SmallVectorImpl<MachineOperand> &CondParam) override {
11280 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11281 // Cond is normalized for such use.
11282 // The predecessors of the branch are assumed to have already been inserted.
11283 CondParam = Cond;
11284 return {};
11285 }
11286
11287 void createRemainingIterationsGreaterCondition(
11288 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11289 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11290
11291 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11292
11293 void adjustTripCount(int TripCountAdjust) override {}
11294
11295 bool isMVEExpanderSupported() override { return true; }
11296};
11297} // namespace
11298
11299/// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
11300/// is replaced by ReplaceReg. The output register is newly created.
11301/// The other operands are unchanged from MI.
11302static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11303 Register ReplaceReg, MachineBasicBlock &MBB,
11304 MachineBasicBlock::iterator InsertTo) {
11305 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11306 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11307 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11308 Register Result = 0;
11309 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11310 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11311 Result = MRI.createVirtualRegister(
11312 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11313 NewMI->getOperand(I).setReg(Result);
11314 } else if (I == ReplaceOprNum) {
11315 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11316 NewMI->getOperand(I).setReg(ReplaceReg);
11317 }
11318 }
11319 MBB.insert(InsertTo, NewMI);
11320 return Result;
11321}
11322
11323void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11324 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11325 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
11326 // Create and accumulate conditions for next TC iterations.
11327 // Example:
11328 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11329 // # iteration of the kernel
11330 //
11331 // # insert the following instructions
11332 // cond = CSINCXr 0, 0, C, implicit $nzcv
11333 // counter = ADDXri counter, 1 # clone from this->Update
11334 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11335 // cond = CSINCXr cond, cond, C, implicit $nzcv
11336 // ... (repeat TC times)
11337 // SUBSXri cond, 0, implicit-def $nzcv
11338
11339 assert(CondBranch->getOpcode() == AArch64::Bcc);
11340 // CondCode to exit the loop
11341 AArch64CC::CondCode CC =
11342 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
11343 if (CondBranch->getOperand(1).getMBB() == LoopBB)
11344 CC = AArch64CC::getInvertedCondCode(CC);
11345
11346 // Accumulate conditions to exit the loop
11347 Register AccCond = AArch64::XZR;
11348
11349 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11350 auto AccumulateCond = [&](Register CurCond,
11351 AArch64CC::CondCode CC) -> Register {
11352 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11353 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11354 .addReg(NewCond, RegState::Define)
11355 .addReg(CurCond)
11356 .addReg(CurCond)
11357 .addImm(AArch64CC::getInvertedCondCode(CC));
11358 return NewCond;
11359 };
11360
11361 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11362 // Update and Comp for I == 0 already exist in MBB
11363 // (MBB is an unrolled kernel)
11364 Register Counter;
11365 for (int I = 0; I <= TC; ++I) {
11366 Register NextCounter;
11367 if (I != 0)
11368 NextCounter =
11369 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11370
11371 AccCond = AccumulateCond(AccCond, CC);
11372
11373 if (I != TC) {
11374 if (I == 0) {
11375 if (Update != Comp && IsUpdatePriorComp) {
11376 Counter =
11377 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11378 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11379 MBB.end());
11380 } else {
11381 // We can use the already-calculated value.
11382 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11383 }
11384 } else if (Update != Comp) {
11385 NextCounter =
11386 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11387 }
11388 }
11389 Counter = NextCounter;
11390 }
11391 } else {
11392 Register Counter;
11393 if (LastStage0Insts.empty()) {
11394 // Use the initial counter value (testing whether the trip count is
11395 // sufficient to be executed by the pipelined code).
11396 Counter = Init;
11397 if (IsUpdatePriorComp)
11398 Counter =
11399 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11400 } else {
11401 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11402 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11403 }
11404
11405 for (int I = 0; I <= TC; ++I) {
11406 Register NextCounter;
11407 NextCounter =
11408 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11409 AccCond = AccumulateCond(AccCond, CC);
11410 if (I != TC && Update != Comp)
11411 NextCounter =
11412 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11413 Counter = NextCounter;
11414 }
11415 }
11416
11417 // If AccCond == 0, the remainder is greater than TC.
11418 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11419 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11420 .addReg(AccCond)
11421 .addImm(0)
11422 .addImm(0);
11423 Cond.clear();
11424 Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
11425}
11426
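// Small simulation, illustrative only, of the accumulation built above: each
// CSINC bumps the running value exactly when that iteration's exit condition
// holds, so the final SUBS against zero sets Z (AccCond == 0) only when none
// of the TC+1 checks would leave the loop, i.e. more than TC iterations
// remain. All names below are hypothetical.
constexpr int accumulateExitConds(const bool *ExitConds, int N) {
  int AccCond = 0;
  for (int I = 0; I < N; ++I)
    AccCond += ExitConds[I] ? 1 : 0; // CSINC increments only when CC holds
  return AccCond;
}
constexpr bool NeverExits[3] = {false, false, false};
constexpr bool ExitsOnThird[3] = {false, false, true};
static_assert(accumulateExitConds(NeverExits, 3) == 0, "Z set: remainder > TC");
static_assert(accumulateExitConds(ExitsOnThird, 3) != 0, "Z clear: exits within TC");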
11427static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11428 Register &RegMBB, Register &RegOther) {
11429 assert(Phi.getNumOperands() == 5);
11430 if (Phi.getOperand(2).getMBB() == MBB) {
11431 RegMBB = Phi.getOperand(1).getReg();
11432 RegOther = Phi.getOperand(3).getReg();
11433 } else {
11434 assert(Phi.getOperand(4).getMBB() == MBB);
11435 RegMBB = Phi.getOperand(3).getReg();
11436 RegOther = Phi.getOperand(1).getReg();
11437 }
11438}
11439
11440static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
11441 if (!Reg.isVirtual())
11442 return false;
11443 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11444 return MRI.getVRegDef(Reg)->getParent() != BB;
11445}
11446
11447/// If Reg is an induction variable, return true and set some parameters
11448static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11449 MachineInstr *&UpdateInst,
11450 unsigned &UpdateCounterOprNum, Register &InitReg,
11451 bool &IsUpdatePriorComp) {
11452 // Example:
11453 //
11454 // Preheader:
11455 // InitReg = ...
11456 // LoopBB:
11457 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11458 // Reg = COPY Reg0 ; COPY is ignored.
11459 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11460 // ; Reg is the value calculated in the previous
11461 // ; iteration, so IsUpdatePriorComp == false.
11462
11463 if (LoopBB->pred_size() != 2)
11464 return false;
11465 if (!Reg.isVirtual())
11466 return false;
11467 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11468 UpdateInst = nullptr;
11469 UpdateCounterOprNum = 0;
11470 InitReg = 0;
11471 IsUpdatePriorComp = true;
11472 Register CurReg = Reg;
11473 while (true) {
11474 MachineInstr *Def = MRI.getVRegDef(CurReg);
11475 if (Def->getParent() != LoopBB)
11476 return false;
11477 if (Def->isCopy()) {
11478 // Ignore copy instructions unless they contain subregisters
11479 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11480 return false;
11481 CurReg = Def->getOperand(1).getReg();
11482 } else if (Def->isPHI()) {
11483 if (InitReg != 0)
11484 return false;
11485 if (!UpdateInst)
11486 IsUpdatePriorComp = false;
11487 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11488 } else {
11489 if (UpdateInst)
11490 return false;
11491 switch (Def->getOpcode()) {
11492 case AArch64::ADDSXri:
11493 case AArch64::ADDSWri:
11494 case AArch64::SUBSXri:
11495 case AArch64::SUBSWri:
11496 case AArch64::ADDXri:
11497 case AArch64::ADDWri:
11498 case AArch64::SUBXri:
11499 case AArch64::SUBWri:
11500 UpdateInst = Def;
11501 UpdateCounterOprNum = 1;
11502 break;
11503 case AArch64::ADDSXrr:
11504 case AArch64::ADDSWrr:
11505 case AArch64::SUBSXrr:
11506 case AArch64::SUBSWrr:
11507 case AArch64::ADDXrr:
11508 case AArch64::ADDWrr:
11509 case AArch64::SUBXrr:
11510 case AArch64::SUBWrr:
11511 UpdateInst = Def;
11512 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11513 UpdateCounterOprNum = 1;
11514 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11515 UpdateCounterOprNum = 2;
11516 else
11517 return false;
11518 break;
11519 default:
11520 return false;
11521 }
11522 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11523 }
11524
11525 if (!CurReg.isVirtual())
11526 return false;
11527 if (Reg == CurReg)
11528 break;
11529 }
11530
11531 if (!UpdateInst)
11532 return false;
11533
11534 return true;
11535}
11536
11537std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11538AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
11539 // Accept loops that meet the following conditions
11540 // * The conditional branch is BCC
11541 // * The compare instruction is ADDS/SUBS/WHILEXX
11542 // * One operand of the compare is an induction variable and the other is a
11543 // loop invariant value
11544 // * The induction variable is incremented/decremented by a single instruction
11545 // * Does not contain CALL or instructions which have unmodeled side effects
11546
11547 for (MachineInstr &MI : *LoopBB)
11548 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11549 // This instruction may use NZCV, which interferes with the instruction to
11550 // be inserted for loop control.
11551 return nullptr;
11552
11553 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11554 SmallVector<MachineOperand, 4> Cond;
11555 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11556 return nullptr;
11557
11558 // Infinite loops are not supported
11559 if (TBB == LoopBB && FBB == LoopBB)
11560 return nullptr;
11561
11562 // Must be conditional branch
11563 if (TBB != LoopBB && FBB == nullptr)
11564 return nullptr;
11565
11566 assert((TBB == LoopBB || FBB == LoopBB) &&
11567 "The Loop must be a single-basic-block loop");
11568
11569 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11570 const TargetRegisterInfo &TRI = getRegisterInfo();
11571
11572 if (CondBranch->getOpcode() != AArch64::Bcc)
11573 return nullptr;
11574
11575 // Normalization for createTripCountGreaterCondition()
11576 if (TBB == LoopBB)
11577 reverseBranchCondition(Cond);
11578
11579 MachineInstr *Comp = nullptr;
11580 unsigned CompCounterOprNum = 0;
11581 for (MachineInstr &MI : reverse(*LoopBB)) {
11582 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
11583 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11584 // operands is a loop invariant value
11585
11586 switch (MI.getOpcode()) {
11587 case AArch64::SUBSXri:
11588 case AArch64::SUBSWri:
11589 case AArch64::ADDSXri:
11590 case AArch64::ADDSWri:
11591 Comp = &MI;
11592 CompCounterOprNum = 1;
11593 break;
11594 case AArch64::ADDSWrr:
11595 case AArch64::ADDSXrr:
11596 case AArch64::SUBSWrr:
11597 case AArch64::SUBSXrr:
11598 Comp = &MI;
11599 break;
11600 default:
11601 if (isWhileOpcode(MI.getOpcode())) {
11602 Comp = &MI;
11603 break;
11604 }
11605 return nullptr;
11606 }
11607
11608 if (CompCounterOprNum == 0) {
11609 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
11610 CompCounterOprNum = 2;
11611 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
11612 CompCounterOprNum = 1;
11613 else
11614 return nullptr;
11615 }
11616 break;
11617 }
11618 }
11619 if (!Comp)
11620 return nullptr;
11621
11622 MachineInstr *Update = nullptr;
11623 Register Init;
11624 bool IsUpdatePriorComp;
11625 unsigned UpdateCounterOprNum;
11626 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
11627 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
11628 return nullptr;
11629
11630 return std::make_unique<AArch64PipelinerLoopInfo>(
11631 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
11632 Init, IsUpdatePriorComp, Cond);
11633}
11634
11635/// verifyInstruction - Perform target specific instruction verification.
11636bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11637 StringRef &ErrInfo) const {
11638 // Verify that immediate offsets on load/store instructions are within range.
11639 // Stack objects with an FI operand are excluded as they can be fixed up
11640 // during PEI.
11641 TypeSize Scale(0U, false), Width(0U, false);
11642 int64_t MinOffset, MaxOffset;
11643 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11644 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
11645 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
11646 int64_t Imm = MI.getOperand(ImmIdx).getImm();
11647 if (Imm < MinOffset || Imm > MaxOffset) {
11648 ErrInfo = "Unexpected immediate on load/store instruction";
11649 return false;
11650 }
11651 }
11652 }
11653
11654 const MCInstrDesc &MCID = MI.getDesc();
11655 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11656 const MachineOperand &MO = MI.getOperand(Op);
11657 switch (MCID.operands()[Op].OperandType) {
11658 case AArch64::OPERAND_IMPLICIT_IMM_0:
11659 if (!MO.isImm() || MO.getImm() != 0) {
11660 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11661 return false;
11662 }
11663 break;
11664 case AArch64::OPERAND_SHIFT_MSL:
11665 if (!MO.isImm() ||
11666 AArch64_AM::getShiftType(MO.getImm()) != AArch64_AM::MSL ||
11667 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
11668 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
11669 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11670 return false;
11671 }
11672 break;
11673 default:
11674 break;
11675 }
11676 }
11677 return true;
11678}
11679
11680#define GET_INSTRINFO_HELPERS
11681#define GET_INSTRMAP_INFO
11682#include "AArch64GenInstrInfo.inc"
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, unsigned State, const TargetRegisterInfo *TRI)
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static int findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr)
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerRegSave
Emit a call and tail-call.
@ MachineOutlinerNoLRSave
Only emit a branch.
@ MachineOutlinerThunk
Emit a call and return.
@ MachineOutlinerDefault
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation A - (B + C) ==> (A - B) - C A - (B + C) ==> (A - C) - B.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
@ Default
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Check Debug Module
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
PowerPC Reduce CR logical Operation
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file defines the SmallSet class.
This file defines the SmallVector class.
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if opcode Opc is a memory operation.
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that set flags when possible.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operand of a load/store.
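A small sketch, assuming MI is a scaled load such as "ldr x0, [x1, #16]":
const MachineOperand &Base = AArch64InstrInfo::getLdStBaseOp(MI);
const MachineOperand &Off = AArch64InstrInfo::getLdStOffsetOp(MI);
Register BaseReg = Base.getReg(); // x1
// For scaled forms the immediate is stored in units of the access size, so the
// byte offset is Off.getImm() * getMemScale(MI.getOpcode()), here 2 * 8 = 16.
int64_t ScaledImm = Off.getImm();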
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
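Sketch, assuming AArch64::LDRXui as the scaled opcode:
// Map a scaled load/store to its unscaled (9-bit signed offset) twin, if any.
if (std::optional<unsigned> Unscaled =
        AArch64InstrInfo::getUnscaledLdSt(AArch64::LDRXui)) {
  // For LDRXui this is expected to be AArch64::LDURXi; opcodes without an
  // unscaled counterpart yield std::nullopt instead.
  (void)*Unscaled;
}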
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operand of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI, Register TargetReg, bool FrameSetup) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the operand index of the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace csincr-branch sequence by simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
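For instance, assuming the usual scaled load opcodes:
int ScaleX = AArch64InstrInfo::getMemScale(AArch64::LDRXui);  // 8 for a 64-bit load
int ScaleB = AArch64InstrInfo::getMemScale(AArch64::LDRBBui); // 1 for a byte load
// Multiplying the scaled immediate operand by this value gives the byte offset.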
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operand of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:123
bool empty() const
Definition DenseMap.h:109
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:64
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:585
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:627
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:600
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:697
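Taken together, these helpers are typically used with the pattern sketched below; MF, MBB, MBBI, DL and TII are assumed to be in scope, and this is not a verbatim excerpt of this file:
// Record ".cfi_def_cfa_offset 16" and attach it to a CFI_INSTRUCTION marker.
unsigned CFIIndex =
    MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(/*L=*/nullptr, 16));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
    .addCFIIndex(CFIIndex)
    .setMIFlag(MachineInstr::FrameSetup);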
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this instruction.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
Instructions::const_iterator const_instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated with IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:199
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
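A short sketch of how the two components compose; the scalable part is only multiplied by vscale at run time:
StackOffset Off = StackOffset::getFixed(16) + StackOffset::getScalable(32);
int64_t FixedBytes = Off.getFixed();       // 16
int64_t ScalableBytes = Off.getScalable(); // 32, i.e. 32 * vscale bytes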
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:232
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
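For example, 0x00ff00ff00ff00ff is a valid 64-bit logical immediate (a repeating 16-bit element), and encoding then decoding should round-trip:
#include "MCTargetDesc/AArch64AddressingModes.h"
#include <cassert>
static void roundTripLogicalImm() {
  uint64_t Imm = 0x00ff00ff00ff00ffULL;
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
  assert(AArch64_AM::decodeLogicalImmediate(Enc, /*regSize=*/64) == Imm);
}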
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
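A minimal sketch of the encode/extract pair for an "LSL #12" shifter operand:
unsigned ShiftImm = AArch64_AM::getShifterImm(AArch64_AM::LSL, 12);
assert(AArch64_AM::getShiftType(ShiftImm) == AArch64_AM::LSL &&
       AArch64_AM::getShiftValue(ShiftImm) == 12);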
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
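A hedged sketch of a caller; the AArch64_IMM namespace and the ImmInsnModel field layout are assumptions taken from AArch64ExpandImm.h:
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(0x123456789abcdef0ULL, /*BitSize=*/64, Insn);
// Each entry names a MOVZ/MOVN/MOVK/ORR-style opcode plus raw operand fields;
// the caller materializes them with BuildMI.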
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Renamable
Register that may be renamed.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
constexpr double e
InstrType
Represents how an instruction should be mapped by the outliner.
LLVM_ABI Instruction & back() const
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
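A generic sketch of the builder interface, not specific to this file (MBB, MBBI, DL and TII are assumed in scope); it emits "add x0, x1, #16":
BuildMI(MBB, MBBI, DL, TII.get(AArch64::ADDXri), AArch64::X0)
    .addReg(AArch64::X1)
    .addImm(16)  // unsigned 12-bit immediate
    .addImm(0);  // shift field: LSL #0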
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
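A sketch of the typical query during frame-index rewriting (MI and the StackOffset Offset are assumed in scope):
bool UseUnscaledOp = false;
unsigned UnscaledOp = 0;
int64_t EmittableOffset = 0;
int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, &UnscaledOp,
                                       &EmittableOffset);
if (Status & AArch64FrameOffsetCannotUpdate) {
  // The offset cannot be folded into MI at all and must be materialized
  // separately, e.g. via emitFrameOffset into a scratch register.
}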
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2484
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
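A sketch of a prologue-style adjustment (MBB, MBBI, DL and TII are assumed in scope):
// sp = sp - 32, tagged as frame setup so unwind info is emitted consistently.
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                StackOffset::getFixed(-32), &TII, MachineInstr::FrameSetup);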
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
unsigned getUndefRegState(bool B)
static MCRegister getXRegFromWReg(MCRegister Reg)
unsigned getDefRegState(bool B)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
unsigned getKillRegState(bool B)
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2132
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1909
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-in's for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.