// Doxygen extraction residue (LLVM 23.0.0git — AArch64InstrInfo.cpp page
// header); commented out so the translation unit stays well-formed.
1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/DebugLoc.h"
45#include "llvm/IR/GlobalValue.h"
46#include "llvm/IR/Module.h"
47#include "llvm/MC/MCAsmInfo.h"
48#include "llvm/MC/MCInst.h"
50#include "llvm/MC/MCInstrDesc.h"
55#include "llvm/Support/LEB128.h"
59#include <cassert>
60#include <cstdint>
61#include <iterator>
62#include <utility>
63
64using namespace llvm;
65
66#define GET_INSTRINFO_CTOR_DTOR
67#include "AArch64GenInstrInfo.inc"
68
69#define DEBUG_TYPE "AArch64InstrInfo"
70
71STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
72STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
73 "instructions expanded from canonical COPY");
74STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
75 "instructions expanded from canonical COPY");
76STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
77 "instructions expanded from canonical COPY");
78// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
79
81 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
82 cl::desc("Restrict range of CB instructions (DEBUG)"));
83
85 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
86 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
87
89 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
90 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
91
93 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
94 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
95
97 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
98 cl::desc("Restrict range of B instructions (DEBUG)"));
99
101 "aarch64-search-limit", cl::Hidden, cl::init(2048),
102 cl::desc("Restrict range of instructions to search for the "
103 "machine-combiner gather pattern optimization"));
104
106 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
107 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
108 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
109
110/// Return the maximum number of bytes of code the specified instruction may be
111/// after LFI rewriting. If the instruction is not rewritten, std::nullopt is
112/// returned (use default sizing).
113///
114/// NOTE: the size estimates here must be kept in sync with the rewrites in
115/// AArch64MCLFIRewriter.cpp. Sizes may be overestimates of the rewritten
116/// instruction sequences.
117static std::optional<unsigned> getLFIInstSizeInBytes(const MachineInstr &MI) {
118 switch (MI.getOpcode()) {
119 case AArch64::SVC:
120 // SVC expands to 4 instructions.
121 return 16;
122 default:
123 // Default case: instructions that don't cause expansion.
124 // - TP accesses in LFI are a single load/store, so no expansion.
125 // - All remaining instructions are not rewritten.
126 return std::nullopt;
127 }
128}
129
130/// GetInstSize - Return the number of bytes of code the specified
131/// instruction may be. This returns the maximum number of bytes.
133 const MachineBasicBlock &MBB = *MI.getParent();
134 const MachineFunction *MF = MBB.getParent();
135 const Function &F = MF->getFunction();
136 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
137
138 {
139 auto Op = MI.getOpcode();
140 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
141 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
142 }
143
144 // Meta-instructions emit no code.
145 if (MI.isMetaInstruction())
146 return 0;
147
148 // FIXME: We currently only handle pseudoinstructions that don't get expanded
149 // before the assembly printer.
150 unsigned NumBytes = 0;
151 const MCInstrDesc &Desc = MI.getDesc();
152
153 // LFI rewriter expansions that supersede normal sizing.
154 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
155 if (STI.isLFI())
156 if (auto Size = getLFIInstSizeInBytes(MI))
157 return *Size;
158
159 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
160 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
161
162 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
163 if (!MFI->shouldSignReturnAddress(*MF))
164 return NumBytes;
165
166 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
167 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
168 return NumBytes;
169 }
170
171 // Size should be preferably set in
172 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
173 // Specific cases handle instructions of variable sizes
174 switch (Desc.getOpcode()) {
175 default:
176 if (Desc.getSize())
177 return Desc.getSize();
178
179 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
180 // with fixed constant size but not specified in .td file) is a normal
181 // 4-byte insn.
182 NumBytes = 4;
183 break;
184 case TargetOpcode::STACKMAP:
185 // The upper bound for a stackmap intrinsic is the full length of its shadow
186 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
187 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
188 break;
189 case TargetOpcode::PATCHPOINT:
190 // The size of the patchpoint intrinsic is the number of bytes requested
191 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
192 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
193 break;
194 case TargetOpcode::STATEPOINT:
195 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
196 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
197 // No patch bytes means a normal call inst is emitted
198 if (NumBytes == 0)
199 NumBytes = 4;
200 break;
201 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
202 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
203 // instructions are expanded to the specified number of NOPs. Otherwise,
204 // they are expanded to 36-byte XRay sleds.
205 NumBytes =
206 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
207 break;
208 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
209 case TargetOpcode::PATCHABLE_TAIL_CALL:
210 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
211 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
212 NumBytes = 36;
213 break;
214 case TargetOpcode::PATCHABLE_EVENT_CALL:
215 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
216 NumBytes = 24;
217 break;
218
219 case AArch64::SPACE:
220 NumBytes = MI.getOperand(1).getImm();
221 break;
222 case TargetOpcode::BUNDLE:
223 NumBytes = getInstBundleLength(MI);
224 break;
225 }
226
227 return NumBytes;
228}
229
230unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
231 unsigned Size = 0;
233 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
234 while (++I != E && I->isInsideBundle()) {
235 assert(!I->isBundle() && "No nested bundle!");
237 }
238 return Size;
239}
240
243 // Block ends with fall-through condbranch.
244 switch (LastInst->getOpcode()) {
245 default:
246 llvm_unreachable("Unknown branch instruction?");
247 case AArch64::Bcc:
248 Target = LastInst->getOperand(1).getMBB();
249 Cond.push_back(LastInst->getOperand(0));
250 break;
251 case AArch64::CBZW:
252 case AArch64::CBZX:
253 case AArch64::CBNZW:
254 case AArch64::CBNZX:
255 Target = LastInst->getOperand(1).getMBB();
256 Cond.push_back(MachineOperand::CreateImm(-1));
257 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
258 Cond.push_back(LastInst->getOperand(0));
259 break;
260 case AArch64::TBZW:
261 case AArch64::TBZX:
262 case AArch64::TBNZW:
263 case AArch64::TBNZX:
264 Target = LastInst->getOperand(2).getMBB();
265 Cond.push_back(MachineOperand::CreateImm(-1));
266 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
267 Cond.push_back(LastInst->getOperand(0));
268 Cond.push_back(LastInst->getOperand(1));
269 break;
270 case AArch64::CBWPri:
271 case AArch64::CBXPri:
272 case AArch64::CBWPrr:
273 case AArch64::CBXPrr:
274 Target = LastInst->getOperand(3).getMBB();
275 Cond.push_back(MachineOperand::CreateImm(-1));
276 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
277 Cond.push_back(LastInst->getOperand(0));
278 Cond.push_back(LastInst->getOperand(1));
279 Cond.push_back(LastInst->getOperand(2));
280 break;
281 case AArch64::CBBAssertExt:
282 case AArch64::CBHAssertExt:
283 Target = LastInst->getOperand(3).getMBB();
284 Cond.push_back(MachineOperand::CreateImm(-1)); // -1
285 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
286 Cond.push_back(LastInst->getOperand(0)); // Cond
287 Cond.push_back(LastInst->getOperand(1)); // Op0
288 Cond.push_back(LastInst->getOperand(2)); // Op1
289 Cond.push_back(LastInst->getOperand(4)); // Ext0
290 Cond.push_back(LastInst->getOperand(5)); // Ext1
291 break;
292 }
293}
294
295static unsigned getBranchDisplacementBits(unsigned Opc) {
296 switch (Opc) {
297 default:
298 llvm_unreachable("unexpected opcode!");
299 case AArch64::B:
300 return BDisplacementBits;
301 case AArch64::TBNZW:
302 case AArch64::TBZW:
303 case AArch64::TBNZX:
304 case AArch64::TBZX:
305 return TBZDisplacementBits;
306 case AArch64::CBNZW:
307 case AArch64::CBZW:
308 case AArch64::CBNZX:
309 case AArch64::CBZX:
310 return CBZDisplacementBits;
311 case AArch64::Bcc:
312 return BCCDisplacementBits;
313 case AArch64::CBWPri:
314 case AArch64::CBXPri:
315 case AArch64::CBBAssertExt:
316 case AArch64::CBHAssertExt:
317 case AArch64::CBWPrr:
318 case AArch64::CBXPrr:
319 return CBDisplacementBits;
320 }
321}
322
324 int64_t BrOffset) const {
325 unsigned Bits = getBranchDisplacementBits(BranchOp);
326 assert(Bits >= 3 && "max branch displacement must be enough to jump"
327 "over conditional branch expansion");
328 return isIntN(Bits, BrOffset / 4);
329}
330
333 switch (MI.getOpcode()) {
334 default:
335 llvm_unreachable("unexpected opcode!");
336 case AArch64::B:
337 return MI.getOperand(0).getMBB();
338 case AArch64::TBZW:
339 case AArch64::TBNZW:
340 case AArch64::TBZX:
341 case AArch64::TBNZX:
342 return MI.getOperand(2).getMBB();
343 case AArch64::CBZW:
344 case AArch64::CBNZW:
345 case AArch64::CBZX:
346 case AArch64::CBNZX:
347 case AArch64::Bcc:
348 return MI.getOperand(1).getMBB();
349 case AArch64::CBWPri:
350 case AArch64::CBXPri:
351 case AArch64::CBBAssertExt:
352 case AArch64::CBHAssertExt:
353 case AArch64::CBWPrr:
354 case AArch64::CBXPrr:
355 return MI.getOperand(3).getMBB();
356 }
357}
358
360 MachineBasicBlock &NewDestBB,
361 MachineBasicBlock &RestoreBB,
362 const DebugLoc &DL,
363 int64_t BrOffset,
364 RegScavenger *RS) const {
365 assert(RS && "RegScavenger required for long branching");
366 assert(MBB.empty() &&
367 "new block should be inserted for expanding unconditional branch");
368 assert(MBB.pred_size() == 1);
369 assert(RestoreBB.empty() &&
370 "restore block should be inserted for restoring clobbered registers");
371
372 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
373 // Offsets outside of the signed 33-bit range are not supported for ADRP +
374 // ADD.
375 if (!isInt<33>(BrOffset))
377 "Branch offsets outside of the signed 33-bit range not supported");
378
379 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
380 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
381 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
382 .addReg(Reg)
383 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
384 .addImm(0);
385 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
386 };
387
388 RS->enterBasicBlockEnd(MBB);
389 // If X16 is unused, we can rely on the linker to insert a range extension
390 // thunk if NewDestBB is out of range of a single B instruction.
391 constexpr Register Reg = AArch64::X16;
392 if (!RS->isRegUsed(Reg)) {
393 insertUnconditionalBranch(MBB, &NewDestBB, DL);
394 RS->setRegUsed(Reg);
395 return;
396 }
397
398 // If there's a free register and it's worth inflating the code size,
399 // manually insert the indirect branch.
400 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
401 if (Scavenged != AArch64::NoRegister &&
402 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
403 buildIndirectBranch(Scavenged, NewDestBB);
404 RS->setRegUsed(Scavenged);
405 return;
406 }
407
408 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
409 // with red zones.
410 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
411 if (!AFI || AFI->hasRedZone().value_or(true))
413 "Unable to insert indirect branch inside function that has red zone");
414
415 // Otherwise, spill X16 and defer range extension to the linker.
416 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
417 .addReg(AArch64::SP, RegState::Define)
418 .addReg(Reg)
419 .addReg(AArch64::SP)
420 .addImm(-16);
421
422 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
423
424 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
425 .addReg(AArch64::SP, RegState::Define)
427 .addReg(AArch64::SP)
428 .addImm(16);
429}
430
431// Branch analysis.
434 MachineBasicBlock *&FBB,
436 bool AllowModify) const {
437 // If the block has no terminators, it just falls into the block after it.
438 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
439 if (I == MBB.end())
440 return false;
441
442 // Skip over SpeculationBarrierEndBB terminators
443 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
444 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
445 --I;
446 }
447
448 if (!isUnpredicatedTerminator(*I))
449 return false;
450
451 // Get the last instruction in the block.
452 MachineInstr *LastInst = &*I;
453
454 // If there is only one terminator instruction, process it.
455 unsigned LastOpc = LastInst->getOpcode();
456 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
457 if (isUncondBranchOpcode(LastOpc)) {
458 TBB = LastInst->getOperand(0).getMBB();
459 return false;
460 }
461 if (isCondBranchOpcode(LastOpc)) {
462 // Block ends with fall-through condbranch.
463 parseCondBranch(LastInst, TBB, Cond);
464 return false;
465 }
466 return true; // Can't handle indirect branch.
467 }
468
469 // Get the instruction before it if it is a terminator.
470 MachineInstr *SecondLastInst = &*I;
471 unsigned SecondLastOpc = SecondLastInst->getOpcode();
472
473 // If AllowModify is true and the block ends with two or more unconditional
474 // branches, delete all but the first unconditional branch.
475 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
476 while (isUncondBranchOpcode(SecondLastOpc)) {
477 LastInst->eraseFromParent();
478 LastInst = SecondLastInst;
479 LastOpc = LastInst->getOpcode();
480 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
481 // Return now the only terminator is an unconditional branch.
482 TBB = LastInst->getOperand(0).getMBB();
483 return false;
484 }
485 SecondLastInst = &*I;
486 SecondLastOpc = SecondLastInst->getOpcode();
487 }
488 }
489
490 // If we're allowed to modify and the block ends in a unconditional branch
491 // which could simply fallthrough, remove the branch. (Note: This case only
492 // matters when we can't understand the whole sequence, otherwise it's also
493 // handled by BranchFolding.cpp.)
494 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
495 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
496 LastInst->eraseFromParent();
497 LastInst = SecondLastInst;
498 LastOpc = LastInst->getOpcode();
499 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
500 assert(!isUncondBranchOpcode(LastOpc) &&
501 "unreachable unconditional branches removed above");
502
503 if (isCondBranchOpcode(LastOpc)) {
504 // Block ends with fall-through condbranch.
505 parseCondBranch(LastInst, TBB, Cond);
506 return false;
507 }
508 return true; // Can't handle indirect branch.
509 }
510 SecondLastInst = &*I;
511 SecondLastOpc = SecondLastInst->getOpcode();
512 }
513
514 // If there are three terminators, we don't know what sort of block this is.
515 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
516 return true;
517
518 // If the block ends with a B and a Bcc, handle it.
519 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
520 parseCondBranch(SecondLastInst, TBB, Cond);
521 FBB = LastInst->getOperand(0).getMBB();
522 return false;
523 }
524
525 // If the block ends with two unconditional branches, handle it. The second
526 // one is not executed, so remove it.
527 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
528 TBB = SecondLastInst->getOperand(0).getMBB();
529 I = LastInst;
530 if (AllowModify)
531 I->eraseFromParent();
532 return false;
533 }
534
535 // ...likewise if it ends with an indirect branch followed by an unconditional
536 // branch.
537 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
538 I = LastInst;
539 if (AllowModify)
540 I->eraseFromParent();
541 return true;
542 }
543
544 // Otherwise, can't handle this.
545 return true;
546}
547
549 MachineBranchPredicate &MBP,
550 bool AllowModify) const {
551 // Use analyzeBranch to validate the branch pattern.
552 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
554 if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
555 return true;
556
557 // analyzeBranch returns success with empty Cond for unconditional branches.
558 if (Cond.empty())
559 return true;
560
561 MBP.TrueDest = TBB;
562 assert(MBP.TrueDest && "expected!");
563 MBP.FalseDest = FBB ? FBB : MBB.getNextNode();
564
565 MBP.ConditionDef = nullptr;
566 MBP.SingleUseCondition = false;
567
568 // Find the conditional branch. After analyzeBranch succeeds with non-empty
569 // Cond, there's exactly one conditional branch - either last (fallthrough)
570 // or second-to-last (followed by unconditional B).
571 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
572 if (I == MBB.end())
573 return true;
574
575 if (isUncondBranchOpcode(I->getOpcode())) {
576 if (I == MBB.begin())
577 return true;
578 --I;
579 }
580
581 MachineInstr *CondBranch = &*I;
582 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
583
584 switch (CondBranch->getOpcode()) {
585 default:
586 return true;
587
588 case AArch64::Bcc:
589 // Bcc takes the NZCV flag as the operand to branch on, walk up the
590 // instruction stream to find the last instruction to define NZCV.
592 if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
593 MBP.ConditionDef = &MI;
594 break;
595 }
596 }
597 return false;
598
599 case AArch64::CBZW:
600 case AArch64::CBZX:
601 case AArch64::CBNZW:
602 case AArch64::CBNZX: {
603 MBP.LHS = CondBranch->getOperand(0);
604 MBP.RHS = MachineOperand::CreateImm(0);
605 unsigned Opc = CondBranch->getOpcode();
606 MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
607 ? MachineBranchPredicate::PRED_NE
608 : MachineBranchPredicate::PRED_EQ;
609 Register CondReg = MBP.LHS.getReg();
610 if (CondReg.isVirtual())
611 MBP.ConditionDef = MRI.getVRegDef(CondReg);
612 return false;
613 }
614
615 case AArch64::TBZW:
616 case AArch64::TBZX:
617 case AArch64::TBNZW:
618 case AArch64::TBNZX: {
619 Register CondReg = CondBranch->getOperand(0).getReg();
620 if (CondReg.isVirtual())
621 MBP.ConditionDef = MRI.getVRegDef(CondReg);
622 return false;
623 }
624 }
625}
626
629 if (Cond[0].getImm() != -1) {
630 // Regular Bcc
631 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
633 } else {
634 // Folded compare-and-branch
635 switch (Cond[1].getImm()) {
636 default:
637 llvm_unreachable("Unknown conditional branch!");
638 case AArch64::CBZW:
639 Cond[1].setImm(AArch64::CBNZW);
640 break;
641 case AArch64::CBNZW:
642 Cond[1].setImm(AArch64::CBZW);
643 break;
644 case AArch64::CBZX:
645 Cond[1].setImm(AArch64::CBNZX);
646 break;
647 case AArch64::CBNZX:
648 Cond[1].setImm(AArch64::CBZX);
649 break;
650 case AArch64::TBZW:
651 Cond[1].setImm(AArch64::TBNZW);
652 break;
653 case AArch64::TBNZW:
654 Cond[1].setImm(AArch64::TBZW);
655 break;
656 case AArch64::TBZX:
657 Cond[1].setImm(AArch64::TBNZX);
658 break;
659 case AArch64::TBNZX:
660 Cond[1].setImm(AArch64::TBZX);
661 break;
662
663 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
664 case AArch64::CBWPri:
665 case AArch64::CBXPri:
666 case AArch64::CBBAssertExt:
667 case AArch64::CBHAssertExt:
668 case AArch64::CBWPrr:
669 case AArch64::CBXPrr: {
670 // Pseudos using standard 4bit Arm condition codes
672 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
674 }
675 }
676 }
677
678 return false;
679}
680
682 int *BytesRemoved) const {
683 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
684 if (I == MBB.end())
685 return 0;
686
687 if (!isUncondBranchOpcode(I->getOpcode()) &&
688 !isCondBranchOpcode(I->getOpcode()))
689 return 0;
690
691 // Remove the branch.
692 I->eraseFromParent();
693
694 I = MBB.end();
695
696 if (I == MBB.begin()) {
697 if (BytesRemoved)
698 *BytesRemoved = 4;
699 return 1;
700 }
701 --I;
702 if (!isCondBranchOpcode(I->getOpcode())) {
703 if (BytesRemoved)
704 *BytesRemoved = 4;
705 return 1;
706 }
707
708 // Remove the branch.
709 I->eraseFromParent();
710 if (BytesRemoved)
711 *BytesRemoved = 8;
712
713 return 2;
714}
715
716void AArch64InstrInfo::instantiateCondBranch(
719 if (Cond[0].getImm() != -1) {
720 // Regular Bcc
721 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
722 } else {
723 // Folded compare-and-branch
724 // Note that we use addOperand instead of addReg to keep the flags.
725
726 // cbz, cbnz
727 const MachineInstrBuilder MIB =
728 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
729
730 // tbz/tbnz
731 if (Cond.size() > 3)
732 MIB.add(Cond[3]);
733
734 // cb
735 if (Cond.size() > 4)
736 MIB.add(Cond[4]);
737
738 MIB.addMBB(TBB);
739
740 // cb[b,h]
741 if (Cond.size() > 5) {
742 MIB.addImm(Cond[5].getImm());
743 MIB.addImm(Cond[6].getImm());
744 }
745 }
746}
747
750 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
751 // Shouldn't be a fall through.
752 assert(TBB && "insertBranch must not be told to insert a fallthrough");
753
754 if (!FBB) {
755 if (Cond.empty()) // Unconditional branch?
756 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
757 else
758 instantiateCondBranch(MBB, DL, TBB, Cond);
759
760 if (BytesAdded)
761 *BytesAdded = 4;
762
763 return 1;
764 }
765
766 // Two-way conditional branch.
767 instantiateCondBranch(MBB, DL, TBB, Cond);
768 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
769
770 if (BytesAdded)
771 *BytesAdded = 8;
772
773 return 2;
774}
775
777 const TargetInstrInfo &TII) {
778 for (MachineInstr &MI : MBB->terminators()) {
779 unsigned Opc = MI.getOpcode();
780 switch (Opc) {
781 case AArch64::CBZW:
782 case AArch64::CBZX:
783 case AArch64::TBZW:
784 case AArch64::TBZX:
785 // CBZ/TBZ with WZR/XZR -> unconditional B
786 if (MI.getOperand(0).getReg() == AArch64::WZR ||
787 MI.getOperand(0).getReg() == AArch64::XZR) {
788 DEBUG_WITH_TYPE("optimizeTerminators",
789 dbgs() << "Removing always taken branch: " << MI);
790 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
791 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
792 for (auto *S : Succs)
793 if (S != Target)
794 MBB->removeSuccessor(S);
795 DebugLoc DL = MI.getDebugLoc();
796 while (MBB->rbegin() != &MI)
797 MBB->rbegin()->eraseFromParent();
798 MI.eraseFromParent();
799 BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
800 return true;
801 }
802 break;
803 case AArch64::CBNZW:
804 case AArch64::CBNZX:
805 case AArch64::TBNZW:
806 case AArch64::TBNZX:
807 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
808 if (MI.getOperand(0).getReg() == AArch64::WZR ||
809 MI.getOperand(0).getReg() == AArch64::XZR) {
810 DEBUG_WITH_TYPE("optimizeTerminators",
811 dbgs() << "Removing never taken branch: " << MI);
812 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
813 MI.getParent()->removeSuccessor(Target);
814 MI.eraseFromParent();
815 return true;
816 }
817 break;
818 }
819 }
820 return false;
821}
822
823// Find the original register that VReg is copied from.
824static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
825 while (Register::isVirtualRegister(VReg)) {
826 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
827 if (!DefMI->isFullCopy())
828 return VReg;
829 VReg = DefMI->getOperand(1).getReg();
830 }
831 return VReg;
832}
833
834// Determine if VReg is defined by an instruction that can be folded into a
835// csel instruction. If so, return the folded opcode, and the replacement
836// register.
837static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
838 unsigned *NewReg = nullptr) {
839 VReg = removeCopies(MRI, VReg);
841 return 0;
842
843 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
844 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
845 unsigned Opc = 0;
846 unsigned SrcReg = 0;
847 switch (DefMI->getOpcode()) {
848 case AArch64::SUBREG_TO_REG:
849 // Check for the following way to define an 64-bit immediate:
850 // %0:gpr32 = MOVi32imm 1
851 // %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
852 if (!DefMI->getOperand(1).isReg())
853 return 0;
854 if (!DefMI->getOperand(2).isImm() ||
855 DefMI->getOperand(2).getImm() != AArch64::sub_32)
856 return 0;
857 DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
858 if (DefMI->getOpcode() != AArch64::MOVi32imm)
859 return 0;
860 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
861 return 0;
862 assert(Is64Bit);
863 SrcReg = AArch64::XZR;
864 Opc = AArch64::CSINCXr;
865 break;
866
867 case AArch64::MOVi32imm:
868 case AArch64::MOVi64imm:
869 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
870 return 0;
871 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
872 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
873 break;
874
875 case AArch64::ADDSXri:
876 case AArch64::ADDSWri:
877 // if NZCV is used, do not fold.
878 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
879 true) == -1)
880 return 0;
881 // fall-through to ADDXri and ADDWri.
882 [[fallthrough]];
883 case AArch64::ADDXri:
884 case AArch64::ADDWri:
885 // add x, 1 -> csinc.
886 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
887 DefMI->getOperand(3).getImm() != 0)
888 return 0;
889 SrcReg = DefMI->getOperand(1).getReg();
890 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
891 break;
892
893 case AArch64::ORNXrr:
894 case AArch64::ORNWrr: {
895 // not x -> csinv, represented as orn dst, xzr, src.
896 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
897 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
898 return 0;
899 SrcReg = DefMI->getOperand(2).getReg();
900 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
901 break;
902 }
903
904 case AArch64::SUBSXrr:
905 case AArch64::SUBSWrr:
906 // if NZCV is used, do not fold.
907 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
908 true) == -1)
909 return 0;
910 // fall-through to SUBXrr and SUBWrr.
911 [[fallthrough]];
912 case AArch64::SUBXrr:
913 case AArch64::SUBWrr: {
914 // neg x -> csneg, represented as sub dst, xzr, src.
915 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
916 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
917 return 0;
918 SrcReg = DefMI->getOperand(2).getReg();
919 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
920 break;
921 }
922 default:
923 return 0;
924 }
925 assert(Opc && SrcReg && "Missing parameters");
926
927 if (NewReg)
928 *NewReg = SrcReg;
929 return Opc;
930}
931
934 Register DstReg, Register TrueReg,
935 Register FalseReg, int &CondCycles,
936 int &TrueCycles,
937 int &FalseCycles) const {
938 // Check register classes.
939 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
940 const TargetRegisterClass *RC =
941 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
942 if (!RC)
943 return false;
944
945 // Also need to check the dest regclass, in case we're trying to optimize
946 // something like:
947 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
948 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
949 return false;
950
951 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
952 unsigned ExtraCondLat = Cond.size() != 1;
953
954 // GPRs are handled by csel.
955 // FIXME: Fold in x+1, -x, and ~x when applicable.
956 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
957 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
958 // Single-cycle csel, csinc, csinv, and csneg.
959 CondCycles = 1 + ExtraCondLat;
960 TrueCycles = FalseCycles = 1;
961 if (canFoldIntoCSel(MRI, TrueReg))
962 TrueCycles = 0;
963 else if (canFoldIntoCSel(MRI, FalseReg))
964 FalseCycles = 0;
965 return true;
966 }
967
968 // Scalar floating point is handled by fcsel.
969 // FIXME: Form fabs, fmin, and fmax when applicable.
970 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
971 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
972 CondCycles = 5 + ExtraCondLat;
973 TrueCycles = FalseCycles = 2;
974 return true;
975 }
976
977 // Can't do vectors.
978 return false;
979}
980
983 const DebugLoc &DL, Register DstReg,
985 Register TrueReg, Register FalseReg) const {
986 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
987
988 // Parse the condition code, see parseCondBranch() above.
990 switch (Cond.size()) {
991 default:
992 llvm_unreachable("Unknown condition opcode in Cond");
993 case 1: // b.cc
995 break;
996 case 3: { // cbz/cbnz
997 // We must insert a compare against 0.
998 bool Is64Bit;
999 switch (Cond[1].getImm()) {
1000 default:
1001 llvm_unreachable("Unknown branch opcode in Cond");
1002 case AArch64::CBZW:
1003 Is64Bit = false;
1004 CC = AArch64CC::EQ;
1005 break;
1006 case AArch64::CBZX:
1007 Is64Bit = true;
1008 CC = AArch64CC::EQ;
1009 break;
1010 case AArch64::CBNZW:
1011 Is64Bit = false;
1012 CC = AArch64CC::NE;
1013 break;
1014 case AArch64::CBNZX:
1015 Is64Bit = true;
1016 CC = AArch64CC::NE;
1017 break;
1018 }
1019 Register SrcReg = Cond[2].getReg();
1020 if (Is64Bit) {
1021 // cmp reg, #0 is actually subs xzr, reg, #0.
1022 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
1023 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
1024 .addReg(SrcReg)
1025 .addImm(0)
1026 .addImm(0);
1027 } else {
1028 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
1029 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
1030 .addReg(SrcReg)
1031 .addImm(0)
1032 .addImm(0);
1033 }
1034 break;
1035 }
1036 case 4: { // tbz/tbnz
1037 // We must insert a tst instruction.
1038 switch (Cond[1].getImm()) {
1039 default:
1040 llvm_unreachable("Unknown branch opcode in Cond");
1041 case AArch64::TBZW:
1042 case AArch64::TBZX:
1043 CC = AArch64CC::EQ;
1044 break;
1045 case AArch64::TBNZW:
1046 case AArch64::TBNZX:
1047 CC = AArch64CC::NE;
1048 break;
1049 }
1050 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
1051 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
1052 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
1053 .addReg(Cond[2].getReg())
1054 .addImm(
1056 else
1057 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
1058 .addReg(Cond[2].getReg())
1059 .addImm(
1061 break;
1062 }
1063 case 5: { // cb
1064 // We must insert a cmp, that is a subs
1065 // 0 1 2 3 4
1066 // Cond is { -1, Opcode, CC, Op0, Op1 }
1067
1068 unsigned SubsOpc, SubsDestReg;
1069 bool IsImm = false;
1070 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1071 switch (Cond[1].getImm()) {
1072 default:
1073 llvm_unreachable("Unknown branch opcode in Cond");
1074 case AArch64::CBWPri:
1075 SubsOpc = AArch64::SUBSWri;
1076 SubsDestReg = AArch64::WZR;
1077 IsImm = true;
1078 break;
1079 case AArch64::CBXPri:
1080 SubsOpc = AArch64::SUBSXri;
1081 SubsDestReg = AArch64::XZR;
1082 IsImm = true;
1083 break;
1084 case AArch64::CBWPrr:
1085 SubsOpc = AArch64::SUBSWrr;
1086 SubsDestReg = AArch64::WZR;
1087 IsImm = false;
1088 break;
1089 case AArch64::CBXPrr:
1090 SubsOpc = AArch64::SUBSXrr;
1091 SubsDestReg = AArch64::XZR;
1092 IsImm = false;
1093 break;
1094 }
1095
1096 if (IsImm)
1097 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1098 .addReg(Cond[3].getReg())
1099 .addImm(Cond[4].getImm())
1100 .addImm(0);
1101 else
1102 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1103 .addReg(Cond[3].getReg())
1104 .addReg(Cond[4].getReg());
1105 } break;
1106 case 7: { // cb[b,h]
1107 // We must insert a cmp, that is a subs, but also zero- or sign-extensions
1108 // that have been folded. For the first operand we codegen an explicit
1109 // extension, for the second operand we fold the extension into cmp.
1110 // 0 1 2 3 4 5 6
1111 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1112
1113 // We need a new register for the now explicitly extended register
1114 Register Reg = Cond[4].getReg();
1116 unsigned ExtOpc;
1117 unsigned ExtBits;
1118 AArch64_AM::ShiftExtendType ExtendType =
1120 switch (ExtendType) {
1121 default:
1122 llvm_unreachable("Unknown shift-extend for CB instruction");
1123 case AArch64_AM::SXTB:
1124 assert(
1125 Cond[1].getImm() == AArch64::CBBAssertExt &&
1126 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1127 ExtOpc = AArch64::SBFMWri;
1128 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1129 break;
1130 case AArch64_AM::SXTH:
1131 assert(
1132 Cond[1].getImm() == AArch64::CBHAssertExt &&
1133 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1134 ExtOpc = AArch64::SBFMWri;
1135 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1136 break;
1137 case AArch64_AM::UXTB:
1138 assert(
1139 Cond[1].getImm() == AArch64::CBBAssertExt &&
1140 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1141 ExtOpc = AArch64::ANDWri;
1142 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1143 break;
1144 case AArch64_AM::UXTH:
1145 assert(
1146 Cond[1].getImm() == AArch64::CBHAssertExt &&
1147 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1148 ExtOpc = AArch64::ANDWri;
1149 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1150 break;
1151 }
1152
1153 // Build the explicit extension of the first operand
1154 Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
1156 BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
1157 if (ExtOpc != AArch64::ANDWri)
1158 MBBI.addImm(0);
1159 MBBI.addImm(ExtBits);
1160 }
1161
1162 // Now, subs with an extended second operand
1164 AArch64_AM::ShiftExtendType ExtendType =
1166 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1167 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1168 BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
1169 .addReg(Cond[3].getReg())
1170 .addReg(Reg)
1171 .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
1172 } // If no extension is needed, just a regular subs
1173 else {
1174 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1175 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1176 BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
1177 .addReg(Cond[3].getReg())
1178 .addReg(Reg);
1179 }
1180
1181 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1182 } break;
1183 }
1184
1185 unsigned Opc = 0;
1186 const TargetRegisterClass *RC = nullptr;
1187 bool TryFold = false;
1188 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
1189 RC = &AArch64::GPR64RegClass;
1190 Opc = AArch64::CSELXr;
1191 TryFold = true;
1192 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
1193 RC = &AArch64::GPR32RegClass;
1194 Opc = AArch64::CSELWr;
1195 TryFold = true;
1196 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
1197 RC = &AArch64::FPR64RegClass;
1198 Opc = AArch64::FCSELDrrr;
1199 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
1200 RC = &AArch64::FPR32RegClass;
1201 Opc = AArch64::FCSELSrrr;
1202 }
1203 assert(RC && "Unsupported regclass");
1204
1205 // Try folding simple instructions into the csel.
1206 if (TryFold) {
1207 unsigned NewReg = 0;
1208 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
1209 if (FoldedOpc) {
1210 // The folded opcodes csinc, csinc and csneg apply the operation to
1211 // FalseReg, so we need to invert the condition.
1213 TrueReg = FalseReg;
1214 } else
1215 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
1216
1217 // Fold the operation. Leave any dead instructions for DCE to clean up.
1218 if (FoldedOpc) {
1219 FalseReg = NewReg;
1220 Opc = FoldedOpc;
1221 // Extend the live range of NewReg.
1222 MRI.clearKillFlags(NewReg);
1223 }
1224 }
1225
1226 // Pull all virtual register into the appropriate class.
1227 MRI.constrainRegClass(TrueReg, RC);
1228 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1229 assert(
1230 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1231 FalseReg == AArch64::XZR) &&
1232 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1233 if (FalseReg.isVirtual())
1234 MRI.constrainRegClass(FalseReg, RC);
1235
1236 // Insert the csel.
1237 BuildMI(MBB, I, DL, get(Opc), DstReg)
1238 .addReg(TrueReg)
1239 .addReg(FalseReg)
1240 .addImm(CC);
1241}
1242
// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  // Any 32-bit immediate is materializable in at most two instructions.
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  // Expand the immediate into its MOV/ORR/MOVK instruction sequence and count
  // how many instructions it takes.
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}
1256
1257// Check if a COPY instruction is cheap.
1258static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1259 assert(MI.isCopy() && "Expected COPY instruction");
1260 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1261
1262 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1263 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1264 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1265 if (Reg.isVirtual())
1266 return MRI.getRegClass(Reg);
1267 if (Reg.isPhysical())
1268 return RI.getMinimalPhysRegClass(Reg);
1269 return nullptr;
1270 };
1271 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1272 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1273 if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1274 return false;
1275
1276 return MI.isAsCheapAsAMove();
1277}
1278
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
  // Exynos cores have their own cheap-as-move table; consult it first.
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    // Fall back to the generic per-instruction flag.
    return MI.isAsCheapAsAMove();

  case TargetOpcode::COPY:
    return isCheapCopy(MI, RI);

  // Shifted add/sub is cheap only when the core has a fast shifted ALU and
  // the encoded shift operand (operand 3) is small, i.e. a short LSL.
  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}
1310
// Returns true when MI's shift/extend (or register-offset addressing mode)
// is one of the forms the Falkor micro-architecture executes fast.
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  // Shifted-register adds: fast when unshifted or LSL by at most 5.
  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  // Extended-register adds: fast only for zero-extends shifted by at most 4.
  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  // Shifted-register 32-bit subs: fast when unshifted or ASR #31
  // (sign-bit extraction).
  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  // Shifted-register 64-bit subs: fast when unshifted or ASR #63.
  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  // Extended-register subs: fast only for unshifted zero-extends.
  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  // Register-offset loads/stores/prefetch: fast when the offset register is
  // not sign-extended (operand 3 is the sign-extend flag).
  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}
1432
1433bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1434 unsigned Opc = MI.getOpcode();
1435 switch (Opc) {
1436 default:
1437 return false;
1438 case AArch64::SEH_StackAlloc:
1439 case AArch64::SEH_SaveFPLR:
1440 case AArch64::SEH_SaveFPLR_X:
1441 case AArch64::SEH_SaveReg:
1442 case AArch64::SEH_SaveReg_X:
1443 case AArch64::SEH_SaveRegP:
1444 case AArch64::SEH_SaveRegP_X:
1445 case AArch64::SEH_SaveFReg:
1446 case AArch64::SEH_SaveFReg_X:
1447 case AArch64::SEH_SaveFRegP:
1448 case AArch64::SEH_SaveFRegP_X:
1449 case AArch64::SEH_SetFP:
1450 case AArch64::SEH_AddFP:
1451 case AArch64::SEH_Nop:
1452 case AArch64::SEH_PrologEnd:
1453 case AArch64::SEH_EpilogStart:
1454 case AArch64::SEH_EpilogEnd:
1455 case AArch64::SEH_PACSignLR:
1456 case AArch64::SEH_SaveAnyRegI:
1457 case AArch64::SEH_SaveAnyRegIP:
1458 case AArch64::SEH_SaveAnyRegQP:
1459 case AArch64::SEH_SaveAnyRegQPX:
1460 case AArch64::SEH_AllocZ:
1461 case AArch64::SEH_SaveZReg:
1462 case AArch64::SEH_SavePReg:
1463 return true;
1464 }
1465}
1466
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more. Only immr == 0 && imms == 31 selects exactly the low 32
    // bits, i.e. the plain sxtw/uxtw form.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    // Report it as a coalescable copy of the low 32-bit subregister.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}
1486
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
  // base are identical, and the offset of a lower memory access +
  // the width doesn't overlap the offset of a higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      // Sort the two accesses by offset so the overlap test only needs to
      // consider one direction.
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  // Conservatively assume the accesses may alias.
  return false;
}
1525
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  // SEH unwind pseudos are also treated as boundaries.
  if (isSEHInstruction(MI))
    return true;
  // Keep an instruction that immediately precedes a CFI instruction pinned,
  // so the CFI directive stays attached to it.
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}
1556
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  // FIXME: Pass subregisters out of analyzeCompare
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  // SVE predicate tests: report the governing mask and tested predicate as
  // the two source registers.
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
  case AArch64::PTEST_PP_FIRST:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    if (MI.getOperand(2).getSubReg())
      return false;

    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  // Register-register flag-setting add/sub: compare of two registers.
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();

    // FIXME: Pass subregisters out of analyzeCompare
    if (MI.getOperand(2).getSubReg())
      return false;

    CmpMask = ~0;
    CmpValue = 0;
    return true;
  // Register-immediate flag-setting add/sub: compare against an immediate.
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the others xxxS
    // instructions: the immediate is a logical-immediate encoding and must
    // be decoded first.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}
1632
  // Constrain every register operand of Instr to the class reported by
  // getRegClassConstraint; returns false if any operand cannot be made to
  // satisfy its constraint.
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      // A physical register must already be a member of the required class.
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}
1670
/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
      MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
    MIDefinesZeroReg = true;

  // For the ri/rs forms, keep the flag-setting opcode whenever the zero
  // register is written, since the non-flag-setting encodings would
  // reinterpret that register as SP.
  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}
1719
// Bit flags selecting which kinds of NZCV (condition-flag) accesses
// areCFlagsAccessedBetweenInstrs should look for: writes, reads, or both.
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1721
/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From, reporting
  // any instruction that writes and/or reads NZCV, as selected by
  // AccessToCheck.
  for (const MachineInstr &Instr :
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}
1754
// Decide whether the PTEST \p PTest, with governing mask \p Mask and tested
// predicate defined by \p Pred, is removable. Returns the opcode \p Pred
// should use afterwards (possibly a flag-setting variant of its current
// opcode), or std::nullopt when the PTEST must stay.
std::optional<unsigned>
AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
                                      MachineInstr *Pred,
                                      const MachineRegisterInfo *MRI) const {
  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (PredIsWhileLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any" since WHILcc does an implicit
    // PTEST(ALL, PG) check and PG is always a subset of ALL.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask.
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode))
      return PredOpcode;

    // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
    // WHILEcc performs an implicit PTEST with an all active mask, setting
    // the N flag as the PTEST_FIRST would.
    if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
        isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
      return PredOpcode;

    return {};
  }

  if (PredIsPTestLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would and the condition is
    // "any" since PG is always a subset of the governing predicate of the
    // ptest-like instruction.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());

    // If the PTEST like instruction's general predicate is not `Mask`, attempt
    // to look through a copy and try again. This is because some instructions
    // take a predicate whose register class is a subset of its result class.
    if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
        PTestLikeMask->getOperand(1).getReg().isVirtual())
      PTestLikeMask =
          MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // the element size matches and either the PTEST_LIKE instruction uses
    // the same all active mask or the condition is "any".
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode)) {
      if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
        return PredOpcode;
    }

    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
                                  PTest->getOpcode() == AArch64::PTEST_PP_ANY))
      return PredOpcode;

    return {};
  }

  // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
  // opcode so the PTEST becomes redundant.
  switch (PredOpcode) {
  case AArch64::AND_PPzPP:
  case AArch64::BIC_PPzPP:
  case AArch64::EOR_PPzPP:
  case AArch64::NAND_PPzPP:
  case AArch64::NOR_PPzPP:
  case AArch64::ORN_PPzPP:
  case AArch64::ORR_PPzPP:
  case AArch64::BRKA_PPzP:
  case AArch64::BRKPA_PPzPP:
  case AArch64::BRKB_PPzP:
  case AArch64::BRKPB_PPzPP:
  case AArch64::RDFFR_PPz: {
    // Check to see if our mask is the same. If not the resulting flag bits
    // may be different and we can't remove the ptest.
    auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    if (Mask != PredMask)
      return {};
    break;
  }
  case AArch64::BRKN_PPzP: {
    // BRKN uses an all active implicit mask to set flags unlike the other
    // flag-setting instructions.
    // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
    if ((MaskOpcode != AArch64::PTRUE_B) ||
        (Mask->getOperand(1).getImm() != 31))
      return {};
    break;
  }
  case AArch64::PTRUE_B:
    // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
    break;
  default:
    // Bail out if we don't recognize the input
    return {};
  }

  return convertToFlagSettingOpc(PredOpcode);
}
1888
1889/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1890/// operation which could set the flags in an identical manner
1891bool AArch64InstrInfo::optimizePTestInstr(
1892 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1893 const MachineRegisterInfo *MRI) const {
1894 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1895 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1896
1897 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1898 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1899 // before the branch to extract each subregister.
1900 auto Op = Pred->getOperand(1);
1901 if (Op.isReg() && Op.getReg().isVirtual() &&
1902 Op.getSubReg() == AArch64::psub0)
1903 Pred = MRI->getUniqueVRegDef(Op.getReg());
1904 }
1905
1906 unsigned PredOpcode = Pred->getOpcode();
1907 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1908 if (!NewOp)
1909 return false;
1910
1911 const TargetRegisterInfo *TRI = &getRegisterInfo();
1912
1913 // If another instruction between Pred and PTest accesses flags, don't remove
1914 // the ptest or update the earlier instruction to modify them.
1915 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1916 return false;
1917
1918 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1919 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1920 // operand to be replaced with an equivalent instruction that also sets the
1921 // flags.
1922 PTest->eraseFromParent();
1923 if (*NewOp != PredOpcode) {
1924 Pred->setDesc(get(*NewOp));
1925 bool succeeded = UpdateOperandRegClass(*Pred);
1926 (void)succeeded;
1927 assert(succeeded && "Operands have incompatible register classes!");
1928 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1929 }
1930
1931 // Ensure that the flags def is live.
1932 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1933 unsigned i = 0, e = Pred->getNumOperands();
1934 for (; i != e; ++i) {
1935 MachineOperand &MO = Pred->getOperand(i);
1936 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1937 MO.setIsDead(false);
1938 break;
1939 }
1940 }
1941 }
1942 return true;
1943}
1944
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be truly compare
/// instruction
/// when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
    int64_t CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx =
      CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
  if (DeadNZCVIdx != -1) {
    // If the flags are dead and the result goes to the zero register, the
    // whole compare has no observable effect and can simply be erased.
    if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
        CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    // Otherwise rewrite to the non-flag-setting form, dropping the dead
    // NZCV operand and re-constraining the remaining register operands.
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.removeOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  // SVE predicate tests have their own removal logic.
  if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  // Only register-immediate compares are handled below.
  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}
2001
/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
/// or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  // Already a flag-setting (S) form: return the opcode unchanged.
  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
  case AArch64::SUBSXrx:
  case AArch64::ANDSWri:
  case AArch64::ANDSWrr:
  case AArch64::ANDSWrs:
  case AArch64::ANDSXri:
  case AArch64::ANDSXrr:
  case AArch64::ANDSXrs:
  case AArch64::BICSWrr:
  case AArch64::BICSXrr:
  case AArch64::BICSWrs:
  case AArch64::BICSXrs:
    return Instr.getOpcode();

  // Non-flag-setting forms: map each to its flag-setting counterpart.
  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADDWrx:
    return AArch64::ADDSWrx;
  case AArch64::ADDXrx:
    return AArch64::ADDSXrx;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SUBWrx:
    return AArch64::SUBSWrx;
  case AArch64::SUBXrx:
    return AArch64::SUBSXrx;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  case AArch64::ANDWrr:
    return AArch64::ANDSWrr;
  case AArch64::ANDWrs:
    return AArch64::ANDSWrs;
  case AArch64::ANDXrr:
    return AArch64::ANDSXrr;
  case AArch64::ANDXrs:
    return AArch64::ANDSXrs;
  case AArch64::BICWrr:
    return AArch64::BICSWrr;
  case AArch64::BICXrr:
    return AArch64::BICSXrr;
  case AArch64::BICWrs:
    return AArch64::BICSWrs;
  case AArch64::BICXrs:
    return AArch64::BICSXrs;
  }
}
2089
/// Check if AArch64::NZCV should be alive in successors of MBB.
  // NZCV must be considered live if any successor lists it as a live-in.
  for (auto *BB : MBB->successors())
    if (BB->isLiveIn(AArch64::NZCV))
      return true;
  return false;
}
2097
2098/// \returns The condition code operand index for \p Instr if it is a branch
2099/// or select and -1 otherwise.
2100int AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(
2101 const MachineInstr &Instr) {
2102 switch (Instr.getOpcode()) {
2103 default:
2104 return -1;
2105
2106 case AArch64::Bcc: {
2107 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2108 assert(Idx >= 2);
2109 return Idx - 2;
2110 }
2111
2112 case AArch64::CSINVWr:
2113 case AArch64::CSINVXr:
2114 case AArch64::CSINCWr:
2115 case AArch64::CSINCXr:
2116 case AArch64::CSELWr:
2117 case AArch64::CSELXr:
2118 case AArch64::CSNEGWr:
2119 case AArch64::CSNEGXr:
2120 case AArch64::FCSELSrrr:
2121 case AArch64::FCSELDrrr: {
2122 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2123 assert(Idx >= 1);
2124 return Idx - 1;
2125 }
2126 }
2127}
2128
/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
  int CCIdx =
      AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(Instr);
  // A non-negative index means Instr is a supported branch/select; decode the
  // immediate condition-code operand at that index.
  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
                          Instr.getOperand(CCIdx).getImm())
}
2139
  // Accumulate which of the N/Z/C/V flags the condition code reads.
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    // Condition codes not listed below are treated as reading no flags here.
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set   or C clear
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set,   N and V differ
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}
2183
/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV
/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
/// \returns std::nullopt otherwise.
///
/// Collect instructions using that flags in \p CCUseInstrs if provided.
std::optional<UsedNZCV>
                       const TargetRegisterInfo &TRI,
                       SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
  MachineBasicBlock *CmpParent = CmpInstr.getParent();
  // Only reason about flags within a single basic block.
  if (MI.getParent() != CmpParent)
    return std::nullopt;

  // If a successor needs NZCV, its value escapes this block's analysis.
  if (areCFlagsAliveInSuccessors(CmpParent))
    return std::nullopt;

  UsedNZCV NZCVUsedAfterCmp;
      std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
    if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
        return std::nullopt;
      NZCVUsedAfterCmp |= getUsedNZCV(CC);
      if (CCUseInstrs)
        CCUseInstrs->push_back(&Instr);
    }
    // A later redefinition of NZCV ends the range of interest.
    if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
      break;
  }
  return NZCVUsedAfterCmp;
}
2216
2217static bool isADDSRegImm(unsigned Opcode) {
2218 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2219}
2220
2221static bool isSUBSRegImm(unsigned Opcode) {
2222 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2223}
2224
  // True when the S-form of MI is one of the ANDS/BICS opcodes. Callers rely
  // on the fact that these bitwise instructions always clear the V flag.
  unsigned Opc = sForm(MI);
  switch (Opc) {
  case AArch64::ANDSWri:
  case AArch64::ANDSWrr:
  case AArch64::ANDSWrs:
  case AArch64::ANDSXri:
  case AArch64::ANDSXrr:
  case AArch64::ANDSXrs:
  case AArch64::BICSWrr:
  case AArch64::BICSXrr:
  case AArch64::BICSWrs:
  case AArch64::BICSXrs:
    return true;
  default:
    return false;
  }
}
2243
/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
///        MI and CmpInstr
///        or if MI opcode is not the S form there must be neither defs of flags
///        nor uses of flags between MI and CmpInstr.
/// - and, if C/V flags are not used after CmpInstr
///        or if N flag is used but MI produces poison value if signed overflow
///        occurs.
                                       const TargetRegisterInfo &TRI) {
  // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction
  // that may or may not set flags.
  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);

  const unsigned CmpOpcode = CmpInstr.getOpcode();
  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
    return false;

  assert((CmpInstr.getOperand(2).isImm() &&
          CmpInstr.getOperand(2).getImm() == 0) &&
         "Caller guarantees that CmpInstr compares with constant 0");

  // The C flag cannot be reproduced by MI's result, so any C use blocks us.
  std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
  if (!NZVCUsed || NZVCUsed->C)
    return false;

  // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
  // '%vreg = add ...' or '%vreg = sub ...'.
  // Condition flag V is used to indicate signed overflow.
  // 1) MI and CmpInstr set N and V to the same value.
  // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
  //    signed overflow occurs, so CmpInstr could still be simplified away.
  // Note that Ands and Bics instructions always clear the V flag.
  if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
    return false;

  // If MI is already the S form, only flag *writes* in between matter; for a
  // non-S form MI we must also rule out flag reads in between.
  AccessKind AccessToCheck = AK_Write;
  if (sForm(MI) != MI.getOpcode())
    AccessToCheck = AK_All;
  return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
}
2290
/// Substitute an instruction comparing to zero with another instruction
/// which produces needed condition flags.
///
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
    MachineInstr &CmpInstr, unsigned SrcReg,
    const MachineRegisterInfo &MRI) const {
  // Get the unique definition of SrcReg.
  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;

  const TargetRegisterInfo &TRI = getRegisterInfo();

  // The defining instruction must have a flag-setting (S) form.
  unsigned NewOpc = sForm(*MI);
  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
    return false;

  if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
    return false;

  // Update the instruction to set NZCV.
  MI->setDesc(get(NewOpc));
  CmpInstr.eraseFromParent();
  (void)succeeded;
  assert(succeeded && "Some operands reg class are incompatible!");
  MI->addRegisterDefined(AArch64::NZCV, &TRI);
  return true;
}
2321
/// \returns True if \p CmpInstr can be removed.
///
/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
/// codes used in \p CCUseInstrs must be inverted.
                                 int CmpValue, const TargetRegisterInfo &TRI,
                                 bool &IsInvertCC) {
  assert((CmpValue == 0 || CmpValue == 1) &&
         "Only comparisons to 0 or 1 considered for removal!");

  // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
  unsigned MIOpc = MI.getOpcode();
  if (MIOpc == AArch64::CSINCWr) {
    if (MI.getOperand(1).getReg() != AArch64::WZR ||
        MI.getOperand(2).getReg() != AArch64::WZR)
      return false;
  } else if (MIOpc == AArch64::CSINCXr) {
    if (MI.getOperand(1).getReg() != AArch64::XZR ||
        MI.getOperand(2).getReg() != AArch64::XZR)
      return false;
  } else {
    return false;
  }
  if (MICC == AArch64CC::Invalid)
    return false;

  // NZCV needs to be defined
  if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
    return false;

  // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
  const unsigned CmpOpcode = CmpInstr.getOpcode();
  bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
  if (CmpValue && !IsSubsRegImm)
    return false;
  if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
    return false;

  // MI conditions allowed: eq, ne, mi, pl
  UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
  if (MIUsedNZCV.C || MIUsedNZCV.V)
    return false;

  std::optional<UsedNZCV> NZCVUsedAfterCmp =
      examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
  // Condition flags are not used in CmpInstr basic block successors and only
  // Z or N flags allowed to be used after CmpInstr within its basic block
  if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
    return false;
  // Z or N flag used after CmpInstr must correspond to the flag used in MI
  if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
      (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
    return false;
  // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
  if (MIUsedNZCV.N && !CmpValue)
    return false;

  // There must be no defs of flags between MI and CmpInstr
  if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
    return false;

  // Condition code is inverted in the following cases:
  // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
  // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
  IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
               (!CmpValue && MICC == AArch64CC::NE);
  return true;
}
2392
/// Remove comparison in csinc-cmp sequence
///
/// Examples:
/// 1. \code
///   csinc w9, wzr, wzr, ne
///   cmp   w9, #0
///   b.eq
///    \endcode
/// to
///    \code
///   csinc w9, wzr, wzr, ne
///   b.ne
///    \endcode
///
/// 2. \code
///   csinc x2, xzr, xzr, mi
///   cmp   x2, #1
///   b.pl
///    \endcode
/// to
///    \code
///   csinc x2, xzr, xzr, mi
///   b.pl
///    \endcode
///
/// \param  CmpInstr comparison instruction
/// \return True when comparison removed
bool AArch64InstrInfo::removeCmpToZeroOrOne(
    MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
    const MachineRegisterInfo &MRI) const {
  // SrcReg must have a single (unique) defining instruction.
  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;
  const TargetRegisterInfo &TRI = getRegisterInfo();
  SmallVector<MachineInstr *, 4> CCUseInstrs;
  bool IsInvertCC = false;
  if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
                            IsInvertCC))
    return false;
  // Make transformation
  CmpInstr.eraseFromParent();
  if (IsInvertCC) {
    // Invert condition codes in CmpInstr CC users
    for (MachineInstr *CCUseInstr : CCUseInstrs) {
      int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
      assert(Idx >= 0 && "Unexpected instruction using CC.");
      MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
          static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
      CCOperand.setImm(CCUse);
    }
  }
  return true;
}
2447
bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  // Only LOAD_STACK_GUARD and CATCHRET are expanded here.
  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
      MI.getOpcode() != AArch64::CATCHRET)
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
  auto TRI = Subtarget.getRegisterInfo();
  DebugLoc DL = MI.getDebugLoc();

  if (MI.getOpcode() == AArch64::CATCHRET) {
    // Skip to the first instruction before the epilog.
    const TargetInstrInfo *TII =
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    // Walk backwards over the FrameDestroy (epilogue) instructions.
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    // Materialize the address of TargetMBB in X0 with an ADRP/ADDXri pair.
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    TargetMBB->setMachineBlockAddressTaken();
    return true;
  }

  // From here on MI is LOAD_STACK_GUARD; Reg receives the guard value/address.
  Register Reg = MI.getOperand(0).getReg();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      // Fold the offset into the base register, then load with offset 0.
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addImm(0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 23760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert a AArch64::MOVi32imm before register allocation so that we
      // didn't need to scavenge for a scratch register.
      report_fatal_error("Unable to encode Stack Protector Guard Offset");
    }
    MBB.erase(MI);
    return true;
  }

  // Global-variable stack guard: materialize &GV according to code model.
  const GlobalValue *GV =
      cast<GlobalValue>((*MI.memoperands_begin())->getValue());
  const TargetMachine &TM = MBB.getParent()->getTarget();
  unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
  const unsigned char MO_NC = AArch64II::MO_NC;

  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    // GOT-relative reference: load the address from the GOT entry.
    BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
        .addGlobalAddress(GV, 0, OpFlags);
    if (Subtarget.isTargetILP32()) {
      // ILP32: pointers are 32 bits, so load into the sub_32 half of Reg.
      unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
      BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
          .addDef(Reg32, RegState::Dead)
          .addImm(0)
          .addMemOperand(*MI.memoperands_begin())
    } else {
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
          .addImm(0)
          .addMemOperand(*MI.memoperands_begin());
    }
  } else if (TM.getCodeModel() == CodeModel::Large) {
    assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
    // Large code model: build the full 64-bit address with MOVZ + MOVK chain.
    BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
        .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
        .addImm(0);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
        .addImm(16);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
        .addImm(32);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addImm(48);
    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
        .addImm(0)
        .addMemOperand(*MI.memoperands_begin());
  } else if (TM.getCodeModel() == CodeModel::Tiny) {
    // Tiny code model: the symbol is within plain ADR range.
    BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
        .addGlobalAddress(GV, 0, OpFlags);
  } else {
    // Default: ADRP page address plus page-offset load.
    BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
        .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
    if (Subtarget.isTargetILP32()) {
      unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
      BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
          .addDef(Reg32, RegState::Dead)
          .addGlobalAddress(GV, 0, LoFlags)
          .addMemOperand(*MI.memoperands_begin())
    } else {
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
          .addGlobalAddress(GV, 0, LoFlags)
          .addMemOperand(*MI.memoperands_begin());
    }
  }

  MBB.erase(MI);

  return true;
}
2616
// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    // A COPY whose source is the 32-bit zero register.
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}
2640
// Return true if this instruction simply renames a general register without
// modifying bits.
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    // Adding zero with no shift leaves the source value unchanged.
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}
2670
// Return true if this instruction simply renames a floating-point/vector
// register without modifying bits.
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    Register DstReg = MI.getOperand(0).getReg();
    return AArch64::FPR128RegClass.contains(DstReg);
  }
  case AArch64::ORRv16i8:
    // orr Vd.16b, Vn.16b, Vn.16b with identical sources is a plain move.
    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
      assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
             "invalid ORRv16i8 operands");
      return true;
    }
    break;
  }
  return false;
}
2691
2692static bool isFrameLoadOpcode(int Opcode) {
2693 switch (Opcode) {
2694 default:
2695 return false;
2696 case AArch64::LDRWui:
2697 case AArch64::LDRXui:
2698 case AArch64::LDRBui:
2699 case AArch64::LDRHui:
2700 case AArch64::LDRSui:
2701 case AArch64::LDRDui:
2702 case AArch64::LDRQui:
2703 case AArch64::LDR_PXI:
2704 return true;
2705 }
2706}
2707
                                               int &FrameIndex) const {
  // Bail out unless this is one of the recognized frame-load opcodes.
  if (!isFrameLoadOpcode(MI.getOpcode()))
    return Register();

  // Match a plain "load reg from [FI + 0]" with no subregister.
  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
      MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
    FrameIndex = MI.getOperand(1).getIndex();
    return MI.getOperand(0).getReg();
  }
  return Register();
}
2720
2721static bool isFrameStoreOpcode(int Opcode) {
2722 switch (Opcode) {
2723 default:
2724 return false;
2725 case AArch64::STRWui:
2726 case AArch64::STRXui:
2727 case AArch64::STRBui:
2728 case AArch64::STRHui:
2729 case AArch64::STRSui:
2730 case AArch64::STRDui:
2731 case AArch64::STRQui:
2732 case AArch64::STR_PXI:
2733 return true;
2734 }
2735}
2736
                                              int &FrameIndex) const {
  // Bail out unless this is one of the recognized frame-store opcodes.
  if (!isFrameStoreOpcode(MI.getOpcode()))
    return Register();

  // Match a plain "store reg to [FI + 0]" with no subregister.
  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
      MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
    FrameIndex = MI.getOperand(1).getIndex();
    return MI.getOperand(0).getReg();
  }
  return Register();
}
2749
                                                    int &FrameIndex) const {
  if (!isFrameStoreOpcode(MI.getOpcode()))
    return Register();

  // First try the simple direct-match form.
  if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
    return Reg;

  // Otherwise look through the instruction's memory operands; exactly one
  // stack access is required to identify the frame index unambiguously.
  if (hasStoreToStackSlot(MI, Accesses)) {
    if (Accesses.size() > 1)
      return Register();

    FrameIndex =
        cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
            ->getFrameIndex();
    return MI.getOperand(0).getReg();
  }
  return Register();
}
2770
                                                     int &FrameIndex) const {
  if (!isFrameLoadOpcode(MI.getOpcode()))
    return Register();

  // First try the simple direct-match form.
  if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
    return Reg;

  // Otherwise look through the instruction's memory operands; exactly one
  // stack access is required to identify the frame index unambiguously.
  if (hasLoadFromStackSlot(MI, Accesses)) {
    if (Accesses.size() > 1)
      return Register();

    FrameIndex =
        cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
            ->getFrameIndex();
    return MI.getOperand(0).getReg();
  }
  return Register();
}
2791
/// Check all MachineMemOperands for a hint to suppress pairing.
  // Any memory operand carrying the MOSuppressPair flag disables pairing.
  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
    return MMO->getFlags() & MOSuppressPair;
  });
}
2798
/// Set a flag on the first MachineMemOperand to suppress pairing.
  // Nothing to tag if the instruction carries no memory operands.
  if (MI.memoperands_empty())
    return;
  (*MI.memoperands_begin())->setFlags(MOSuppressPair);
}
2805
/// Check all MachineMemOperands for a hint that the load/store is strided.
  // The hint is the MOStridedAccess flag on any attached memory operand.
  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
    return MMO->getFlags() & MOStridedAccess;
  });
}
2812
  // True for the unscaled (LDUR/STUR) and pre-indexed load/store opcodes
  // listed below; everything else is reported as scaled.
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STURBBi:
  case AArch64::STURHHi:
  case AArch64::STURWi:
  case AArch64::STRWpre:
  case AArch64::STURXi:
  case AArch64::STRXpre:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::LDRSWpre:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSHWi:
    return true;
  }
}
2848
2849std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2850 switch (Opc) {
2851 default: return {};
2852 case AArch64::PRFMui: return AArch64::PRFUMi;
2853 case AArch64::LDRXui: return AArch64::LDURXi;
2854 case AArch64::LDRWui: return AArch64::LDURWi;
2855 case AArch64::LDRBui: return AArch64::LDURBi;
2856 case AArch64::LDRHui: return AArch64::LDURHi;
2857 case AArch64::LDRSui: return AArch64::LDURSi;
2858 case AArch64::LDRDui: return AArch64::LDURDi;
2859 case AArch64::LDRQui: return AArch64::LDURQi;
2860 case AArch64::LDRBBui: return AArch64::LDURBBi;
2861 case AArch64::LDRHHui: return AArch64::LDURHHi;
2862 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2863 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2864 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2865 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2866 case AArch64::LDRSWui: return AArch64::LDURSWi;
2867 case AArch64::STRXui: return AArch64::STURXi;
2868 case AArch64::STRWui: return AArch64::STURWi;
2869 case AArch64::STRBui: return AArch64::STURBi;
2870 case AArch64::STRHui: return AArch64::STURHi;
2871 case AArch64::STRSui: return AArch64::STURSi;
2872 case AArch64::STRDui: return AArch64::STURDi;
2873 case AArch64::STRQui: return AArch64::STURQi;
2874 case AArch64::STRBBui: return AArch64::STURBBi;
2875 case AArch64::STRHHui: return AArch64::STURHHi;
2876 }
2877}
2878
2880 switch (Opc) {
2881 default:
2882 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2883 case AArch64::ADDG:
2884 case AArch64::LDAPURBi:
2885 case AArch64::LDAPURHi:
2886 case AArch64::LDAPURi:
2887 case AArch64::LDAPURSBWi:
2888 case AArch64::LDAPURSBXi:
2889 case AArch64::LDAPURSHWi:
2890 case AArch64::LDAPURSHXi:
2891 case AArch64::LDAPURSWi:
2892 case AArch64::LDAPURXi:
2893 case AArch64::LDR_PPXI:
2894 case AArch64::LDR_PXI:
2895 case AArch64::LDR_ZXI:
2896 case AArch64::LDR_ZZXI:
2897 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2898 case AArch64::LDR_ZZZXI:
2899 case AArch64::LDR_ZZZZXI:
2900 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2901 case AArch64::LDRBBui:
2902 case AArch64::LDRBui:
2903 case AArch64::LDRDui:
2904 case AArch64::LDRHHui:
2905 case AArch64::LDRHui:
2906 case AArch64::LDRQui:
2907 case AArch64::LDRSBWui:
2908 case AArch64::LDRSBXui:
2909 case AArch64::LDRSHWui:
2910 case AArch64::LDRSHXui:
2911 case AArch64::LDRSui:
2912 case AArch64::LDRSWui:
2913 case AArch64::LDRWui:
2914 case AArch64::LDRXui:
2915 case AArch64::LDURBBi:
2916 case AArch64::LDURBi:
2917 case AArch64::LDURDi:
2918 case AArch64::LDURHHi:
2919 case AArch64::LDURHi:
2920 case AArch64::LDURQi:
2921 case AArch64::LDURSBWi:
2922 case AArch64::LDURSBXi:
2923 case AArch64::LDURSHWi:
2924 case AArch64::LDURSHXi:
2925 case AArch64::LDURSi:
2926 case AArch64::LDURSWi:
2927 case AArch64::LDURWi:
2928 case AArch64::LDURXi:
2929 case AArch64::PRFMui:
2930 case AArch64::PRFUMi:
2931 case AArch64::ST2Gi:
2932 case AArch64::STGi:
2933 case AArch64::STLURBi:
2934 case AArch64::STLURHi:
2935 case AArch64::STLURWi:
2936 case AArch64::STLURXi:
2937 case AArch64::StoreSwiftAsyncContext:
2938 case AArch64::STR_PPXI:
2939 case AArch64::STR_PXI:
2940 case AArch64::STR_ZXI:
2941 case AArch64::STR_ZZXI:
2942 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2943 case AArch64::STR_ZZZXI:
2944 case AArch64::STR_ZZZZXI:
2945 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2946 case AArch64::STRBBui:
2947 case AArch64::STRBui:
2948 case AArch64::STRDui:
2949 case AArch64::STRHHui:
2950 case AArch64::STRHui:
2951 case AArch64::STRQui:
2952 case AArch64::STRSui:
2953 case AArch64::STRWui:
2954 case AArch64::STRXui:
2955 case AArch64::STURBBi:
2956 case AArch64::STURBi:
2957 case AArch64::STURDi:
2958 case AArch64::STURHHi:
2959 case AArch64::STURHi:
2960 case AArch64::STURQi:
2961 case AArch64::STURSi:
2962 case AArch64::STURWi:
2963 case AArch64::STURXi:
2964 case AArch64::STZ2Gi:
2965 case AArch64::STZGi:
2966 case AArch64::TAGPstack:
2967 return 2;
2968 case AArch64::LD1B_D_IMM:
2969 case AArch64::LD1B_H_IMM:
2970 case AArch64::LD1B_IMM:
2971 case AArch64::LD1B_S_IMM:
2972 case AArch64::LD1D_IMM:
2973 case AArch64::LD1H_D_IMM:
2974 case AArch64::LD1H_IMM:
2975 case AArch64::LD1H_S_IMM:
2976 case AArch64::LD1RB_D_IMM:
2977 case AArch64::LD1RB_H_IMM:
2978 case AArch64::LD1RB_IMM:
2979 case AArch64::LD1RB_S_IMM:
2980 case AArch64::LD1RD_IMM:
2981 case AArch64::LD1RH_D_IMM:
2982 case AArch64::LD1RH_IMM:
2983 case AArch64::LD1RH_S_IMM:
2984 case AArch64::LD1RSB_D_IMM:
2985 case AArch64::LD1RSB_H_IMM:
2986 case AArch64::LD1RSB_S_IMM:
2987 case AArch64::LD1RSH_D_IMM:
2988 case AArch64::LD1RSH_S_IMM:
2989 case AArch64::LD1RSW_IMM:
2990 case AArch64::LD1RW_D_IMM:
2991 case AArch64::LD1RW_IMM:
2992 case AArch64::LD1SB_D_IMM:
2993 case AArch64::LD1SB_H_IMM:
2994 case AArch64::LD1SB_S_IMM:
2995 case AArch64::LD1SH_D_IMM:
2996 case AArch64::LD1SH_S_IMM:
2997 case AArch64::LD1SW_D_IMM:
2998 case AArch64::LD1W_D_IMM:
2999 case AArch64::LD1W_IMM:
3000 case AArch64::LD2B_IMM:
3001 case AArch64::LD2D_IMM:
3002 case AArch64::LD2H_IMM:
3003 case AArch64::LD2W_IMM:
3004 case AArch64::LD3B_IMM:
3005 case AArch64::LD3D_IMM:
3006 case AArch64::LD3H_IMM:
3007 case AArch64::LD3W_IMM:
3008 case AArch64::LD4B_IMM:
3009 case AArch64::LD4D_IMM:
3010 case AArch64::LD4H_IMM:
3011 case AArch64::LD4W_IMM:
3012 case AArch64::LDG:
3013 case AArch64::LDNF1B_D_IMM:
3014 case AArch64::LDNF1B_H_IMM:
3015 case AArch64::LDNF1B_IMM:
3016 case AArch64::LDNF1B_S_IMM:
3017 case AArch64::LDNF1D_IMM:
3018 case AArch64::LDNF1H_D_IMM:
3019 case AArch64::LDNF1H_IMM:
3020 case AArch64::LDNF1H_S_IMM:
3021 case AArch64::LDNF1SB_D_IMM:
3022 case AArch64::LDNF1SB_H_IMM:
3023 case AArch64::LDNF1SB_S_IMM:
3024 case AArch64::LDNF1SH_D_IMM:
3025 case AArch64::LDNF1SH_S_IMM:
3026 case AArch64::LDNF1SW_D_IMM:
3027 case AArch64::LDNF1W_D_IMM:
3028 case AArch64::LDNF1W_IMM:
3029 case AArch64::LDNPDi:
3030 case AArch64::LDNPQi:
3031 case AArch64::LDNPSi:
3032 case AArch64::LDNPWi:
3033 case AArch64::LDNPXi:
3034 case AArch64::LDNT1B_ZRI:
3035 case AArch64::LDNT1D_ZRI:
3036 case AArch64::LDNT1H_ZRI:
3037 case AArch64::LDNT1W_ZRI:
3038 case AArch64::LDPDi:
3039 case AArch64::LDPQi:
3040 case AArch64::LDPSi:
3041 case AArch64::LDPWi:
3042 case AArch64::LDPXi:
3043 case AArch64::LDRBBpost:
3044 case AArch64::LDRBBpre:
3045 case AArch64::LDRBpost:
3046 case AArch64::LDRBpre:
3047 case AArch64::LDRDpost:
3048 case AArch64::LDRDpre:
3049 case AArch64::LDRHHpost:
3050 case AArch64::LDRHHpre:
3051 case AArch64::LDRHpost:
3052 case AArch64::LDRHpre:
3053 case AArch64::LDRQpost:
3054 case AArch64::LDRQpre:
3055 case AArch64::LDRSpost:
3056 case AArch64::LDRSpre:
3057 case AArch64::LDRWpost:
3058 case AArch64::LDRWpre:
3059 case AArch64::LDRXpost:
3060 case AArch64::LDRXpre:
3061 case AArch64::ST1B_D_IMM:
3062 case AArch64::ST1B_H_IMM:
3063 case AArch64::ST1B_IMM:
3064 case AArch64::ST1B_S_IMM:
3065 case AArch64::ST1D_IMM:
3066 case AArch64::ST1H_D_IMM:
3067 case AArch64::ST1H_IMM:
3068 case AArch64::ST1H_S_IMM:
3069 case AArch64::ST1W_D_IMM:
3070 case AArch64::ST1W_IMM:
3071 case AArch64::ST2B_IMM:
3072 case AArch64::ST2D_IMM:
3073 case AArch64::ST2H_IMM:
3074 case AArch64::ST2W_IMM:
3075 case AArch64::ST3B_IMM:
3076 case AArch64::ST3D_IMM:
3077 case AArch64::ST3H_IMM:
3078 case AArch64::ST3W_IMM:
3079 case AArch64::ST4B_IMM:
3080 case AArch64::ST4D_IMM:
3081 case AArch64::ST4H_IMM:
3082 case AArch64::ST4W_IMM:
3083 case AArch64::STGPi:
3084 case AArch64::STGPreIndex:
3085 case AArch64::STZGPreIndex:
3086 case AArch64::ST2GPreIndex:
3087 case AArch64::STZ2GPreIndex:
3088 case AArch64::STGPostIndex:
3089 case AArch64::STZGPostIndex:
3090 case AArch64::ST2GPostIndex:
3091 case AArch64::STZ2GPostIndex:
3092 case AArch64::STNPDi:
3093 case AArch64::STNPQi:
3094 case AArch64::STNPSi:
3095 case AArch64::STNPWi:
3096 case AArch64::STNPXi:
3097 case AArch64::STNT1B_ZRI:
3098 case AArch64::STNT1D_ZRI:
3099 case AArch64::STNT1H_ZRI:
3100 case AArch64::STNT1W_ZRI:
3101 case AArch64::STPDi:
3102 case AArch64::STPQi:
3103 case AArch64::STPSi:
3104 case AArch64::STPWi:
3105 case AArch64::STPXi:
3106 case AArch64::STRBBpost:
3107 case AArch64::STRBBpre:
3108 case AArch64::STRBpost:
3109 case AArch64::STRBpre:
3110 case AArch64::STRDpost:
3111 case AArch64::STRDpre:
3112 case AArch64::STRHHpost:
3113 case AArch64::STRHHpre:
3114 case AArch64::STRHpost:
3115 case AArch64::STRHpre:
3116 case AArch64::STRQpost:
3117 case AArch64::STRQpre:
3118 case AArch64::STRSpost:
3119 case AArch64::STRSpre:
3120 case AArch64::STRWpost:
3121 case AArch64::STRWpre:
3122 case AArch64::STRXpost:
3123 case AArch64::STRXpre:
3124 return 3;
3125 case AArch64::LDPDpost:
3126 case AArch64::LDPDpre:
3127 case AArch64::LDPQpost:
3128 case AArch64::LDPQpre:
3129 case AArch64::LDPSpost:
3130 case AArch64::LDPSpre:
3131 case AArch64::LDPWpost:
3132 case AArch64::LDPWpre:
3133 case AArch64::LDPXpost:
3134 case AArch64::LDPXpre:
3135 case AArch64::STGPpre:
3136 case AArch64::STGPpost:
3137 case AArch64::STPDpost:
3138 case AArch64::STPDpre:
3139 case AArch64::STPQpost:
3140 case AArch64::STPQpre:
3141 case AArch64::STPSpost:
3142 case AArch64::STPSpre:
3143 case AArch64::STPWpost:
3144 case AArch64::STPWpre:
3145 case AArch64::STPXpost:
3146 case AArch64::STPXpre:
3147 return 4;
3148 }
3149}
3150
// Classifies load/store opcodes that are candidates for ld/st pairing by the
// load/store optimizer: scaled unsigned-offset forms, their unscaled and
// pre-index counterparts, and SVE fill/spill.
// NOTE(review): the function signature line is missing from this extract —
// presumably AArch64InstrInfo::isPairableLdStInst; confirm against the header.
3152 switch (MI.getOpcode()) {
3153 default:
3154 return false;
3155 // Scaled instructions.
3156 case AArch64::STRSui:
3157 case AArch64::STRDui:
3158 case AArch64::STRQui:
3159 case AArch64::STRXui:
3160 case AArch64::STRWui:
3161 case AArch64::LDRSui:
3162 case AArch64::LDRDui:
3163 case AArch64::LDRQui:
3164 case AArch64::LDRXui:
3165 case AArch64::LDRWui:
3166 case AArch64::LDRSWui:
3167 // Unscaled instructions.
3168 case AArch64::STURSi:
3169 case AArch64::STRSpre:
3170 case AArch64::STURDi:
3171 case AArch64::STRDpre:
3172 case AArch64::STURQi:
3173 case AArch64::STRQpre:
3174 case AArch64::STURWi:
3175 case AArch64::STRWpre:
3176 case AArch64::STURXi:
3177 case AArch64::STRXpre:
3178 case AArch64::LDURSi:
3179 case AArch64::LDRSpre:
3180 case AArch64::LDURDi:
3181 case AArch64::LDRDpre:
3182 case AArch64::LDURQi:
3183 case AArch64::LDRQpre:
3184 case AArch64::LDURWi:
3185 case AArch64::LDRWpre:
3186 case AArch64::LDURXi:
3187 case AArch64::LDRXpre:
3188 case AArch64::LDURSWi:
3189 case AArch64::LDRSWpre:
3190 // SVE instructions.
3191 case AArch64::LDR_ZXI:
3192 case AArch64::STR_ZXI:
3193 return true;
3194 }
3195 }
3196
// Returns true for the pseudo tail-call return opcodes (plain, register,
// register-restricted, and pointer-authenticated variants). The assert in the
// default case is a tripwire: any instruction that is both a call and a
// return must be listed here explicitly.
// NOTE(review): the function signature line is missing from this extract.
3198 switch (MI.getOpcode()) {
3199 default:
3200 assert((!MI.isCall() || !MI.isReturn()) &&
3201 "Unexpected instruction - was a new tail call opcode introduced?");
3202 return false;
3203 case AArch64::TCRETURNdi:
3204 case AArch64::TCRETURNri:
3205 case AArch64::TCRETURNrix16x17:
3206 case AArch64::TCRETURNrix17:
3207 case AArch64::TCRETURNrinotx16:
3208 case AArch64::TCRETURNriALL:
3209 case AArch64::AUTH_TCRETURN:
3210 case AArch64::AUTH_TCRETURN_BTI:
3211 return true;
3212 }
3213 }
3214
// Maps an opcode to its NZCV-flag-setting equivalent: base ADD/SUB/AND/BIC
// forms to their "S" variants (32- and 64-bit register, immediate, shifted
// and extended forms), and SVE predicate-generating instructions to their
// flag-setting counterparts. Aborts for opcodes with no such equivalent.
// NOTE(review): the function signature line is missing from this extract.
3216 switch (Opc) {
3217 default:
3218 llvm_unreachable("Opcode has no flag setting equivalent!");
3219 // 32-bit cases:
3220 case AArch64::ADDWri:
3221 return AArch64::ADDSWri;
3222 case AArch64::ADDWrr:
3223 return AArch64::ADDSWrr;
3224 case AArch64::ADDWrs:
3225 return AArch64::ADDSWrs;
3226 case AArch64::ADDWrx:
3227 return AArch64::ADDSWrx;
3228 case AArch64::ANDWri:
3229 return AArch64::ANDSWri;
3230 case AArch64::ANDWrr:
3231 return AArch64::ANDSWrr;
3232 case AArch64::ANDWrs:
3233 return AArch64::ANDSWrs;
3234 case AArch64::BICWrr:
3235 return AArch64::BICSWrr;
3236 case AArch64::BICWrs:
3237 return AArch64::BICSWrs;
3238 case AArch64::SUBWri:
3239 return AArch64::SUBSWri;
3240 case AArch64::SUBWrr:
3241 return AArch64::SUBSWrr;
3242 case AArch64::SUBWrs:
3243 return AArch64::SUBSWrs;
3244 case AArch64::SUBWrx:
3245 return AArch64::SUBSWrx;
3246 // 64-bit cases:
3247 case AArch64::ADDXri:
3248 return AArch64::ADDSXri;
3249 case AArch64::ADDXrr:
3250 return AArch64::ADDSXrr;
3251 case AArch64::ADDXrs:
3252 return AArch64::ADDSXrs;
3253 case AArch64::ADDXrx:
3254 return AArch64::ADDSXrx;
3255 case AArch64::ANDXri:
3256 return AArch64::ANDSXri;
3257 case AArch64::ANDXrr:
3258 return AArch64::ANDSXrr;
3259 case AArch64::ANDXrs:
3260 return AArch64::ANDSXrs;
3261 case AArch64::BICXrr:
3262 return AArch64::BICSXrr;
3263 case AArch64::BICXrs:
3264 return AArch64::BICSXrs;
3265 case AArch64::SUBXri:
3266 return AArch64::SUBSXri;
3267 case AArch64::SUBXrr:
3268 return AArch64::SUBSXrr;
3269 case AArch64::SUBXrs:
3270 return AArch64::SUBSXrs;
3271 case AArch64::SUBXrx:
3272 return AArch64::SUBSXrx;
3273 // SVE instructions:
3274 case AArch64::AND_PPzPP:
3275 return AArch64::ANDS_PPzPP;
3276 case AArch64::BIC_PPzPP:
3277 return AArch64::BICS_PPzPP;
3278 case AArch64::EOR_PPzPP:
3279 return AArch64::EORS_PPzPP;
3280 case AArch64::NAND_PPzPP:
3281 return AArch64::NANDS_PPzPP;
3282 case AArch64::NOR_PPzPP:
3283 return AArch64::NORS_PPzPP;
3284 case AArch64::ORN_PPzPP:
3285 return AArch64::ORNS_PPzPP;
3286 case AArch64::ORR_PPzPP:
3287 return AArch64::ORRS_PPzPP;
3288 case AArch64::BRKA_PPzP:
3289 return AArch64::BRKAS_PPzP;
3290 case AArch64::BRKPA_PPzPP:
3291 return AArch64::BRKPAS_PPzPP;
3292 case AArch64::BRKB_PPzP:
3293 return AArch64::BRKBS_PPzP;
3294 case AArch64::BRKPB_PPzPP:
3295 return AArch64::BRKPBS_PPzPP;
3296 case AArch64::BRKN_PPzP:
3297 return AArch64::BRKNS_PPzP;
3298 case AArch64::RDFFR_PPz:
3299 return AArch64::RDFFRS_PPz;
3300 case AArch64::PTRUE_B:
3301 return AArch64::PTRUES_B;
3302 }
3303 }
3304
3305 // Is this a candidate for ld/st merging or pairing? For example, we don't
3306 // touch volatiles or load/stores that have a hint to avoid pair formation.
// Rejects, in order: ordered/volatile accesses, non-immediate offsets,
// instructions that modify their own base register (except pre-index forms),
// SVE fill/spill outside little-endian VLS-128, suppressed pairs,
// prologue/epilogue instructions under Windows CFI, and 128-bit pairs on CPUs
// where they are slow.
// NOTE(review): the signature line and a few interior lines (e.g. the TRI
// declaration before modifiesRegister) are missing from this extract.
3308
3309 bool IsPreLdSt = isPreLdSt(MI);
3310
3311 // If this is a volatile load/store, don't mess with it.
3312 if (MI.hasOrderedMemoryRef())
3313 return false;
3314
3315 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3316 // For Pre-inc LD/ST, the operand is shifted by one.
3317 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3318 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3319 "Expected a reg or frame index operand.");
3320
3321 // For Pre-indexed addressing quadword instructions, the third operand is the
3322 // immediate value.
3323 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3324
3325 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3326 return false;
3327
3328 // Can't merge/pair if the instruction modifies the base register.
3329 // e.g., ldr x0, [x0]
3330 // This case will never occur with an FI base.
3331 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3332 // STR<S,D,Q,W,X>pre, it can be merged.
3333 // For example:
3334 // ldr q0, [x11, #32]!
3335 // ldr q1, [x11, #16]
3336 // to
3337 // ldp q0, q1, [x11, #32]!
3338 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3339 Register BaseReg = MI.getOperand(1).getReg();
3341 if (MI.modifiesRegister(BaseReg, TRI))
3342 return false;
3343 }
3344
3345 // Pairing SVE fills/spills is only valid for little-endian targets that
3346 // implement VLS 128.
3347 switch (MI.getOpcode()) {
3348 default:
3349 break;
3350 case AArch64::LDR_ZXI:
3351 case AArch64::STR_ZXI:
3352 if (!Subtarget.isLittleEndian() ||
3353 Subtarget.getSVEVectorSizeInBits() != 128)
3354 return false;
3355 }
3356
3357 // Check if this load/store has a hint to avoid pair formation.
3358 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3360 return false;
3361
3362 // Do not pair any callee-save store/reload instructions in the
3363 // prologue/epilogue if the CFI information encoded the operations as separate
3364 // instructions, as that will cause the size of the actual prologue to mismatch
3365 // with the prologue size recorded in the Windows CFI.
3366 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
3367 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
3368 MI.getMF()->getFunction().needsUnwindTableEntry();
3369 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3371 return false;
3372
3373 // On some CPUs quad load/store pairs are slower than two single load/stores.
3374 if (Subtarget.isPaired128Slow()) {
3375 switch (MI.getOpcode()) {
3376 default:
3377 break;
3378 case AArch64::LDURQi:
3379 case AArch64::STURQi:
3380 case AArch64::LDRQui:
3381 case AArch64::STRQui:
3382 return false;
3383 }
3384 }
3385
3386 return true;
3387 }
3388
// Continuation of a getMemOperandsWithOffsetWidth-style query (the start of
// the signature is missing from this extract): delegates to
// getMemOperandWithOffsetWidth and, on success, publishes the precise access
// width and appends the single base operand to BaseOps.
3391 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3392 const TargetRegisterInfo *TRI) const {
3393 if (!LdSt.mayLoadOrStore())
3394 return false;
3395
3396 const MachineOperand *BaseOp;
3397 TypeSize WidthN(0, false);
3398 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3399 WidthN, TRI))
3400 return false;
3401 // The maximum vscale is 16 under AArch64, return the maximal extent for the
3402 // vector.
3403 Width = LocationSize::precise(WidthN);
3404 BaseOps.push_back(BaseOp);
3405 return true;
3406 }
3407
// Extracts a base-register + displacement addressing mode from a memory
// instruction; returns std::nullopt when the base is not a register (e.g. a
// frame index) or the offset query fails. Never produces a scaled-register
// form (ScaledReg/Scale are always zero).
3408 std::optional<ExtAddrMode>
3410 const TargetRegisterInfo *TRI) const {
3411 const MachineOperand *Base; // Filled with the base operand of MI.
3412 int64_t Offset; // Filled with the offset of MI.
3413 bool OffsetIsScalable;
3414 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3415 return std::nullopt;
3416
3417 if (!Base->isReg())
3418 return std::nullopt;
3419 ExtAddrMode AM;
3420 AM.BaseReg = Base->getReg();
3421 AM.Displacement = Offset;
3422 AM.ScaledReg = 0;
3423 AM.Scale = 0;
3424 return AM;
3425 }
3426
// Decides whether the address-computing instruction AddrI (defining Reg) can
// be folded into the addressing mode of the memory instruction MemI, and if
// so fills AM with the resulting mode. Handles [Reg, Reg] modes (folding
// sxtw/uxtw of the offset register) and [Reg, #Imm] modes (folding
// ADD/SUB-immediate and ADD-register/shifted/extended address computations).
// NOTE(review): the signature line and several interior lines (AM.Form
// assignments, the shift-type check in the ADDXrs case, the extend-type
// computation in the ADDXrx case) are missing from this extract.
3428 Register Reg,
3429 const MachineInstr &AddrI,
3430 ExtAddrMode &AM) const {
3431 // Filter out instructions into which we cannot fold.
3432 unsigned NumBytes;
3433 int64_t OffsetScale = 1;
3434 switch (MemI.getOpcode()) {
3435 default:
3436 return false;
3437
3438 case AArch64::LDURQi:
3439 case AArch64::STURQi:
3440 NumBytes = 16;
3441 break;
3442
3443 case AArch64::LDURDi:
3444 case AArch64::STURDi:
3445 case AArch64::LDURXi:
3446 case AArch64::STURXi:
3447 NumBytes = 8;
3448 break;
3449
3450 case AArch64::LDURWi:
3451 case AArch64::LDURSWi:
3452 case AArch64::STURWi:
3453 NumBytes = 4;
3454 break;
3455
3456 case AArch64::LDURHi:
3457 case AArch64::STURHi:
3458 case AArch64::LDURHHi:
3459 case AArch64::STURHHi:
3460 case AArch64::LDURSHXi:
3461 case AArch64::LDURSHWi:
3462 NumBytes = 2;
3463 break;
3464
3465 case AArch64::LDRBroX:
3466 case AArch64::LDRBBroX:
3467 case AArch64::LDRSBXroX:
3468 case AArch64::LDRSBWroX:
3469 case AArch64::STRBroX:
3470 case AArch64::STRBBroX:
3471 case AArch64::LDURBi:
3472 case AArch64::LDURBBi:
3473 case AArch64::LDURSBXi:
3474 case AArch64::LDURSBWi:
3475 case AArch64::STURBi:
3476 case AArch64::STURBBi:
3477 case AArch64::LDRBui:
3478 case AArch64::LDRBBui:
3479 case AArch64::LDRSBXui:
3480 case AArch64::LDRSBWui:
3481 case AArch64::STRBui:
3482 case AArch64::STRBBui:
3483 NumBytes = 1;
3484 break;
3485
3486 case AArch64::LDRQroX:
3487 case AArch64::STRQroX:
3488 case AArch64::LDRQui:
3489 case AArch64::STRQui:
3490 NumBytes = 16;
3491 OffsetScale = 16;
3492 break;
3493
3494 case AArch64::LDRDroX:
3495 case AArch64::STRDroX:
3496 case AArch64::LDRXroX:
3497 case AArch64::STRXroX:
3498 case AArch64::LDRDui:
3499 case AArch64::STRDui:
3500 case AArch64::LDRXui:
3501 case AArch64::STRXui:
3502 NumBytes = 8;
3503 OffsetScale = 8;
3504 break;
3505
3506 case AArch64::LDRWroX:
3507 case AArch64::LDRSWroX:
3508 case AArch64::STRWroX:
3509 case AArch64::LDRWui:
3510 case AArch64::LDRSWui:
3511 case AArch64::STRWui:
3512 NumBytes = 4;
3513 OffsetScale = 4;
3514 break;
3515
3516 case AArch64::LDRHroX:
3517 case AArch64::STRHroX:
3518 case AArch64::LDRHHroX:
3519 case AArch64::STRHHroX:
3520 case AArch64::LDRSHXroX:
3521 case AArch64::LDRSHWroX:
3522 case AArch64::LDRHui:
3523 case AArch64::STRHui:
3524 case AArch64::LDRHHui:
3525 case AArch64::STRHHui:
3526 case AArch64::LDRSHXui:
3527 case AArch64::LDRSHWui:
3528 NumBytes = 2;
3529 OffsetScale = 2;
3530 break;
3531 }
3532
3533 // Check the fold operand is not the loaded/stored value.
3534 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3535 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3536 return false;
3537
3538 // Handle memory instructions with a [Reg, Reg] addressing mode.
3539 if (MemI.getOperand(2).isReg()) {
3540 // Bail if the addressing mode already includes extension of the offset
3541 // register.
3542 if (MemI.getOperand(3).getImm())
3543 return false;
3544
3545 // Check if we actually have a scaled offset.
3546 if (MemI.getOperand(4).getImm() == 0)
3547 OffsetScale = 1;
3548
3549 // If the address instructions is folded into the base register, then the
3550 // addressing mode must not have a scale. Then we can swap the base and the
3551 // scaled registers.
3552 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3553 return false;
3554
3555 switch (AddrI.getOpcode()) {
3556 default:
3557 return false;
3558
3559 case AArch64::SBFMXri:
3560 // sxtw Xa, Wm
3561 // ldr Xd, [Xn, Xa, lsl #N]
3562 // ->
3563 // ldr Xd, [Xn, Wm, sxtw #N]
3564 if (AddrI.getOperand(2).getImm() != 0 ||
3565 AddrI.getOperand(3).getImm() != 31)
3566 return false;
3567
3568 AM.BaseReg = MemI.getOperand(1).getReg();
3569 if (AM.BaseReg == Reg)
3570 AM.BaseReg = MemI.getOperand(2).getReg();
3571 AM.ScaledReg = AddrI.getOperand(1).getReg();
3572 AM.Scale = OffsetScale;
3573 AM.Displacement = 0;
3575 return true;
3576
3577 case TargetOpcode::SUBREG_TO_REG: {
3578 // mov Wa, Wm
3579 // ldr Xd, [Xn, Xa, lsl #N]
3580 // ->
3581 // ldr Xd, [Xn, Wm, uxtw #N]
3582
3583 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3584 if (AddrI.getOperand(2).getImm() != AArch64::sub_32)
3585 return false;
3586
3587 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3588 Register OffsetReg = AddrI.getOperand(1).getReg();
3589 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3590 return false;
3591
3592 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3593 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3594 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3595 DefMI.getOperand(3).getImm() != 0)
3596 return false;
3597
3598 AM.BaseReg = MemI.getOperand(1).getReg();
3599 if (AM.BaseReg == Reg)
3600 AM.BaseReg = MemI.getOperand(2).getReg();
3601 AM.ScaledReg = DefMI.getOperand(2).getReg();
3602 AM.Scale = OffsetScale;
3603 AM.Displacement = 0;
3605 return true;
3606 }
3607 }
3608 }
3609
3610 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3611
3612 // Check we are not breaking a potential conversion to an LDP.
// Lambda: true when either the old offset was already out of LDP range, or
// the new offset remains within it — i.e. the fold does not destroy an LDP
// opportunity that previously existed.
3613 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3614 int64_t NewOffset) -> bool {
3615 int64_t MinOffset, MaxOffset;
3616 switch (NumBytes) {
3617 default:
3618 return true;
3619 case 4:
3620 MinOffset = -256;
3621 MaxOffset = 252;
3622 break;
3623 case 8:
3624 MinOffset = -512;
3625 MaxOffset = 504;
3626 break;
3627 case 16:
3628 MinOffset = -1024;
3629 MaxOffset = 1008;
3630 break;
3631 }
3632 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3633 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3634 };
3635 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3636 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3637 int64_t NewOffset = OldOffset + Disp;
3638 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3639 return false;
3640 // If the old offset would fit into an LDP, but the new offset wouldn't,
3641 // bail out.
3642 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3643 return false;
3644 AM.BaseReg = AddrI.getOperand(1).getReg();
3645 AM.ScaledReg = 0;
3646 AM.Scale = 0;
3647 AM.Displacement = NewOffset;
3649 return true;
3650 };
3651
3652 auto canFoldAddRegIntoAddrMode =
3653 [&](int64_t Scale,
3655 if (MemI.getOperand(2).getImm() != 0)
3656 return false;
3657 if ((unsigned)Scale != Scale)
3658 return false;
3659 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3660 return false;
3661 AM.BaseReg = AddrI.getOperand(1).getReg();
3662 AM.ScaledReg = AddrI.getOperand(2).getReg();
3663 AM.Scale = Scale;
3664 AM.Displacement = 0;
3665 AM.Form = Form;
3666 return true;
3667 };
3668
// Lambda: on targets where register-offset STR of a Q register is slow,
// refuse to introduce one.
3669 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3670 unsigned Opcode = MemI.getOpcode();
3671 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3672 Subtarget.isSTRQroSlow();
3673 };
3674
3675 int64_t Disp = 0;
3676 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3677 switch (AddrI.getOpcode()) {
3678 default:
3679 return false;
3680
3681 case AArch64::ADDXri:
3682 // add Xa, Xn, #N
3683 // ldr Xd, [Xa, #M]
3684 // ->
3685 // ldr Xd, [Xn, #N'+M]
3686 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3687 return canFoldAddSubImmIntoAddrMode(Disp);
3688
3689 case AArch64::SUBXri:
3690 // sub Xa, Xn, #N
3691 // ldr Xd, [Xa, #M]
3692 // ->
3693 // ldr Xd, [Xn, #N'+M]
3694 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3695 return canFoldAddSubImmIntoAddrMode(-Disp);
3696
3697 case AArch64::ADDXrs: {
3698 // add Xa, Xn, Xm, lsl #N
3699 // ldr Xd, [Xa]
3700 // ->
3701 // ldr Xd, [Xn, Xm, lsl #N]
3702
3703 // Don't fold the add if the result would be slower, unless optimising for
3704 // size.
3705 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3707 return false;
3708 Shift = AArch64_AM::getShiftValue(Shift);
3709 if (!OptSize) {
3710 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3711 return false;
3712 if (avoidSlowSTRQ(MemI))
3713 return false;
3714 }
3715 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3716 }
3717
3718 case AArch64::ADDXrr:
3719 // add Xa, Xn, Xm
3720 // ldr Xd, [Xa]
3721 // ->
3722 // ldr Xd, [Xn, Xm, lsl #0]
3723
3724 // Don't fold the add if the result would be slower, unless optimising for
3725 // size.
3726 if (!OptSize && avoidSlowSTRQ(MemI))
3727 return false;
3728 return canFoldAddRegIntoAddrMode(1);
3729
3730 case AArch64::ADDXrx:
3731 // add Xa, Xn, Wm, {s,u}xtw #N
3732 // ldr Xd, [Xa]
3733 // ->
3734 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3735
3736 // Don't fold the add if the result would be slower, unless optimising for
3737 // size.
3738 if (!OptSize && avoidSlowSTRQ(MemI))
3739 return false;
3740
3741 // Can fold only sign-/zero-extend of a word.
3742 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3744 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3745 return false;
3746
3747 return canFoldAddRegIntoAddrMode(
3748 1ULL << AArch64_AM::getArithShiftValue(Imm),
3751 }
3752 }
3753
3754// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3755// return the opcode of an instruction performing the same operation, but using
3756// the [Reg, Reg] addressing mode.
3757static unsigned regOffsetOpcode(unsigned Opcode) {
3758 switch (Opcode) {
3759 default:
3760 llvm_unreachable("Address folding not implemented for instruction");
3761
3762 case AArch64::LDURQi:
3763 case AArch64::LDRQui:
3764 return AArch64::LDRQroX;
3765 case AArch64::STURQi:
3766 case AArch64::STRQui:
3767 return AArch64::STRQroX;
3768 case AArch64::LDURDi:
3769 case AArch64::LDRDui:
3770 return AArch64::LDRDroX;
3771 case AArch64::STURDi:
3772 case AArch64::STRDui:
3773 return AArch64::STRDroX;
3774 case AArch64::LDURXi:
3775 case AArch64::LDRXui:
3776 return AArch64::LDRXroX;
3777 case AArch64::STURXi:
3778 case AArch64::STRXui:
3779 return AArch64::STRXroX;
3780 case AArch64::LDURWi:
3781 case AArch64::LDRWui:
3782 return AArch64::LDRWroX;
3783 case AArch64::LDURSWi:
3784 case AArch64::LDRSWui:
3785 return AArch64::LDRSWroX;
3786 case AArch64::STURWi:
3787 case AArch64::STRWui:
3788 return AArch64::STRWroX;
3789 case AArch64::LDURHi:
3790 case AArch64::LDRHui:
3791 return AArch64::LDRHroX;
3792 case AArch64::STURHi:
3793 case AArch64::STRHui:
3794 return AArch64::STRHroX;
3795 case AArch64::LDURHHi:
3796 case AArch64::LDRHHui:
3797 return AArch64::LDRHHroX;
3798 case AArch64::STURHHi:
3799 case AArch64::STRHHui:
3800 return AArch64::STRHHroX;
3801 case AArch64::LDURSHXi:
3802 case AArch64::LDRSHXui:
3803 return AArch64::LDRSHXroX;
3804 case AArch64::LDURSHWi:
3805 case AArch64::LDRSHWui:
3806 return AArch64::LDRSHWroX;
3807 case AArch64::LDURBi:
3808 case AArch64::LDRBui:
3809 return AArch64::LDRBroX;
3810 case AArch64::LDURBBi:
3811 case AArch64::LDRBBui:
3812 return AArch64::LDRBBroX;
3813 case AArch64::LDURSBXi:
3814 case AArch64::LDRSBXui:
3815 return AArch64::LDRSBXroX;
3816 case AArch64::LDURSBWi:
3817 case AArch64::LDRSBWui:
3818 return AArch64::LDRSBWroX;
3819 case AArch64::STURBi:
3820 case AArch64::STRBui:
3821 return AArch64::STRBroX;
3822 case AArch64::STURBBi:
3823 case AArch64::STRBBui:
3824 return AArch64::STRBBroX;
3825 }
3826}
3827
3828// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3829// the opcode of an instruction performing the same operation, but using the
3830// [Reg, #Imm] addressing mode with scaled offset.
3831unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3832 switch (Opcode) {
3833 default:
3834 llvm_unreachable("Address folding not implemented for instruction");
3835
3836 case AArch64::LDURQi:
3837 Scale = 16;
3838 return AArch64::LDRQui;
3839 case AArch64::STURQi:
3840 Scale = 16;
3841 return AArch64::STRQui;
3842 case AArch64::LDURDi:
3843 Scale = 8;
3844 return AArch64::LDRDui;
3845 case AArch64::STURDi:
3846 Scale = 8;
3847 return AArch64::STRDui;
3848 case AArch64::LDURXi:
3849 Scale = 8;
3850 return AArch64::LDRXui;
3851 case AArch64::STURXi:
3852 Scale = 8;
3853 return AArch64::STRXui;
3854 case AArch64::LDURWi:
3855 Scale = 4;
3856 return AArch64::LDRWui;
3857 case AArch64::LDURSWi:
3858 Scale = 4;
3859 return AArch64::LDRSWui;
3860 case AArch64::STURWi:
3861 Scale = 4;
3862 return AArch64::STRWui;
3863 case AArch64::LDURHi:
3864 Scale = 2;
3865 return AArch64::LDRHui;
3866 case AArch64::STURHi:
3867 Scale = 2;
3868 return AArch64::STRHui;
3869 case AArch64::LDURHHi:
3870 Scale = 2;
3871 return AArch64::LDRHHui;
3872 case AArch64::STURHHi:
3873 Scale = 2;
3874 return AArch64::STRHHui;
3875 case AArch64::LDURSHXi:
3876 Scale = 2;
3877 return AArch64::LDRSHXui;
3878 case AArch64::LDURSHWi:
3879 Scale = 2;
3880 return AArch64::LDRSHWui;
3881 case AArch64::LDURBi:
3882 Scale = 1;
3883 return AArch64::LDRBui;
3884 case AArch64::LDURBBi:
3885 Scale = 1;
3886 return AArch64::LDRBBui;
3887 case AArch64::LDURSBXi:
3888 Scale = 1;
3889 return AArch64::LDRSBXui;
3890 case AArch64::LDURSBWi:
3891 Scale = 1;
3892 return AArch64::LDRSBWui;
3893 case AArch64::STURBi:
3894 Scale = 1;
3895 return AArch64::STRBui;
3896 case AArch64::STURBBi:
3897 Scale = 1;
3898 return AArch64::STRBBui;
3899 case AArch64::LDRQui:
3900 case AArch64::STRQui:
3901 Scale = 16;
3902 return Opcode;
3903 case AArch64::LDRDui:
3904 case AArch64::STRDui:
3905 case AArch64::LDRXui:
3906 case AArch64::STRXui:
3907 Scale = 8;
3908 return Opcode;
3909 case AArch64::LDRWui:
3910 case AArch64::LDRSWui:
3911 case AArch64::STRWui:
3912 Scale = 4;
3913 return Opcode;
3914 case AArch64::LDRHui:
3915 case AArch64::STRHui:
3916 case AArch64::LDRHHui:
3917 case AArch64::STRHHui:
3918 case AArch64::LDRSHXui:
3919 case AArch64::LDRSHWui:
3920 Scale = 2;
3921 return Opcode;
3922 case AArch64::LDRBui:
3923 case AArch64::LDRBBui:
3924 case AArch64::LDRSBXui:
3925 case AArch64::LDRSBWui:
3926 case AArch64::STRBui:
3927 case AArch64::STRBBui:
3928 Scale = 1;
3929 return Opcode;
3930 }
3931}
3932
3933// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3934// the opcode of an instruction performing the same operation, but using the
3935// [Reg, #Imm] addressing mode with unscaled offset.
3936unsigned unscaledOffsetOpcode(unsigned Opcode) {
3937 switch (Opcode) {
3938 default:
3939 llvm_unreachable("Address folding not implemented for instruction");
3940
3941 case AArch64::LDURQi:
3942 case AArch64::STURQi:
3943 case AArch64::LDURDi:
3944 case AArch64::STURDi:
3945 case AArch64::LDURXi:
3946 case AArch64::STURXi:
3947 case AArch64::LDURWi:
3948 case AArch64::LDURSWi:
3949 case AArch64::STURWi:
3950 case AArch64::LDURHi:
3951 case AArch64::STURHi:
3952 case AArch64::LDURHHi:
3953 case AArch64::STURHHi:
3954 case AArch64::LDURSHXi:
3955 case AArch64::LDURSHWi:
3956 case AArch64::LDURBi:
3957 case AArch64::STURBi:
3958 case AArch64::LDURBBi:
3959 case AArch64::STURBBi:
3960 case AArch64::LDURSBWi:
3961 case AArch64::LDURSBXi:
3962 return Opcode;
3963 case AArch64::LDRQui:
3964 return AArch64::LDURQi;
3965 case AArch64::STRQui:
3966 return AArch64::STURQi;
3967 case AArch64::LDRDui:
3968 return AArch64::LDURDi;
3969 case AArch64::STRDui:
3970 return AArch64::STURDi;
3971 case AArch64::LDRXui:
3972 return AArch64::LDURXi;
3973 case AArch64::STRXui:
3974 return AArch64::STURXi;
3975 case AArch64::LDRWui:
3976 return AArch64::LDURWi;
3977 case AArch64::LDRSWui:
3978 return AArch64::LDURSWi;
3979 case AArch64::STRWui:
3980 return AArch64::STURWi;
3981 case AArch64::LDRHui:
3982 return AArch64::LDURHi;
3983 case AArch64::STRHui:
3984 return AArch64::STURHi;
3985 case AArch64::LDRHHui:
3986 return AArch64::LDURHHi;
3987 case AArch64::STRHHui:
3988 return AArch64::STURHHi;
3989 case AArch64::LDRSHXui:
3990 return AArch64::LDURSHXi;
3991 case AArch64::LDRSHWui:
3992 return AArch64::LDURSHWi;
3993 case AArch64::LDRBBui:
3994 return AArch64::LDURBBi;
3995 case AArch64::LDRBui:
3996 return AArch64::LDURBi;
3997 case AArch64::STRBBui:
3998 return AArch64::STURBBi;
3999 case AArch64::STRBui:
4000 return AArch64::STURBi;
4001 case AArch64::LDRSBWui:
4002 return AArch64::LDURSBWi;
4003 case AArch64::LDRSBXui:
4004 return AArch64::LDURSBXi;
4005 }
4006}
4007
4008// Given the opcode of a memory load/store instruction, return the opcode of an
4009// instruction performing the same operation, but using
4010// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
4011// offset register.
4012static unsigned offsetExtendOpcode(unsigned Opcode) {
4013 switch (Opcode) {
4014 default:
4015 llvm_unreachable("Address folding not implemented for instruction");
4016
4017 case AArch64::LDRQroX:
4018 case AArch64::LDURQi:
4019 case AArch64::LDRQui:
4020 return AArch64::LDRQroW;
4021 case AArch64::STRQroX:
4022 case AArch64::STURQi:
4023 case AArch64::STRQui:
4024 return AArch64::STRQroW;
4025 case AArch64::LDRDroX:
4026 case AArch64::LDURDi:
4027 case AArch64::LDRDui:
4028 return AArch64::LDRDroW;
4029 case AArch64::STRDroX:
4030 case AArch64::STURDi:
4031 case AArch64::STRDui:
4032 return AArch64::STRDroW;
4033 case AArch64::LDRXroX:
4034 case AArch64::LDURXi:
4035 case AArch64::LDRXui:
4036 return AArch64::LDRXroW;
4037 case AArch64::STRXroX:
4038 case AArch64::STURXi:
4039 case AArch64::STRXui:
4040 return AArch64::STRXroW;
4041 case AArch64::LDRWroX:
4042 case AArch64::LDURWi:
4043 case AArch64::LDRWui:
4044 return AArch64::LDRWroW;
4045 case AArch64::LDRSWroX:
4046 case AArch64::LDURSWi:
4047 case AArch64::LDRSWui:
4048 return AArch64::LDRSWroW;
4049 case AArch64::STRWroX:
4050 case AArch64::STURWi:
4051 case AArch64::STRWui:
4052 return AArch64::STRWroW;
4053 case AArch64::LDRHroX:
4054 case AArch64::LDURHi:
4055 case AArch64::LDRHui:
4056 return AArch64::LDRHroW;
4057 case AArch64::STRHroX:
4058 case AArch64::STURHi:
4059 case AArch64::STRHui:
4060 return AArch64::STRHroW;
4061 case AArch64::LDRHHroX:
4062 case AArch64::LDURHHi:
4063 case AArch64::LDRHHui:
4064 return AArch64::LDRHHroW;
4065 case AArch64::STRHHroX:
4066 case AArch64::STURHHi:
4067 case AArch64::STRHHui:
4068 return AArch64::STRHHroW;
4069 case AArch64::LDRSHXroX:
4070 case AArch64::LDURSHXi:
4071 case AArch64::LDRSHXui:
4072 return AArch64::LDRSHXroW;
4073 case AArch64::LDRSHWroX:
4074 case AArch64::LDURSHWi:
4075 case AArch64::LDRSHWui:
4076 return AArch64::LDRSHWroW;
4077 case AArch64::LDRBroX:
4078 case AArch64::LDURBi:
4079 case AArch64::LDRBui:
4080 return AArch64::LDRBroW;
4081 case AArch64::LDRBBroX:
4082 case AArch64::LDURBBi:
4083 case AArch64::LDRBBui:
4084 return AArch64::LDRBBroW;
4085 case AArch64::LDRSBXroX:
4086 case AArch64::LDURSBXi:
4087 case AArch64::LDRSBXui:
4088 return AArch64::LDRSBXroW;
4089 case AArch64::LDRSBWroX:
4090 case AArch64::LDURSBWi:
4091 case AArch64::LDRSBWui:
4092 return AArch64::LDRSBWroW;
4093 case AArch64::STRBroX:
4094 case AArch64::STURBi:
4095 case AArch64::STRBui:
4096 return AArch64::STRBroW;
4097 case AArch64::STRBBroX:
4098 case AArch64::STURBBi:
4099 case AArch64::STRBBui:
4100 return AArch64::STRBBroW;
4101 }
4102}
4103
// Materializes a replacement load/store for MemI using the folded addressing
// mode AM: either a [Reg, Reg, lsl] form, a [Reg, #imm] form (unscaled when
// the displacement fits in 9 bits, scaled otherwise), or a
// [Reg, Wm, {s,u}xtw] form with the offset register narrowed to 32 bits.
// NOTE(review): the signature line and several interior lines (the AM.Form
// dispatch conditions and the extend-selection operand) are missing from
// this extract.
4105 const ExtAddrMode &AM) const {
4106
4107 const DebugLoc &DL = MemI.getDebugLoc();
4108 MachineBasicBlock &MBB = *MemI.getParent();
4109 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4110
4112 if (AM.ScaledReg) {
4113 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4114 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4115 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4116 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4117 .addReg(MemI.getOperand(0).getReg(),
4118 getDefRegState(MemI.mayLoad()))
4119 .addReg(AM.BaseReg)
4120 .addReg(AM.ScaledReg)
4121 .addImm(0)
4122 .addImm(AM.Scale > 1)
4123 .setMemRefs(MemI.memoperands())
4124 .setMIFlags(MemI.getFlags());
4125 return B.getInstr();
4126 }
4127
4128 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4129 "Addressing mode not supported for folding");
4130
4131 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4132 unsigned Scale = 1;
4133 unsigned Opcode = MemI.getOpcode();
4134 if (isInt<9>(AM.Displacement))
4135 Opcode = unscaledOffsetOpcode(Opcode);
4136 else
4137 Opcode = scaledOffsetOpcode(Opcode, Scale);
4138
4139 auto B =
4140 BuildMI(MBB, MemI, DL, get(Opcode))
4141 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4142 .addReg(AM.BaseReg)
4143 .addImm(AM.Displacement / Scale)
4144 .setMemRefs(MemI.memoperands())
4145 .setMIFlags(MemI.getFlags());
4146 return B.getInstr();
4147 }
4148
4151 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4152 assert(AM.ScaledReg && !AM.Displacement &&
4153 "Address offset can be a register or an immediate, but not both");
4154 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4155 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4156 // Make sure the offset register is in the correct register class.
4157 Register OffsetReg = AM.ScaledReg;
4158 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4159 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4160 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4161 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4162 .addReg(AM.ScaledReg, {}, AArch64::sub_32);
4163 }
4164 auto B =
4165 BuildMI(MBB, MemI, DL, get(Opcode))
4166 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4167 .addReg(AM.BaseReg)
4168 .addReg(OffsetReg)
4170 .addImm(AM.Scale != 1)
4171 .setMemRefs(MemI.memoperands())
4172 .setMIFlags(MemI.getFlags());
4173
4174 return B.getInstr();
4175 }
4176
4178 "Function must not be called with an addressing mode it can't handle");
4179 }
4180
4181/// Return true if the opcode is a post-index ld/st instruction, which really
4182/// loads from base+0.
4183static bool isPostIndexLdStOpcode(unsigned Opcode) {
4184 switch (Opcode) {
4185 default:
4186 return false;
4187 case AArch64::LD1Fourv16b_POST:
4188 case AArch64::LD1Fourv1d_POST:
4189 case AArch64::LD1Fourv2d_POST:
4190 case AArch64::LD1Fourv2s_POST:
4191 case AArch64::LD1Fourv4h_POST:
4192 case AArch64::LD1Fourv4s_POST:
4193 case AArch64::LD1Fourv8b_POST:
4194 case AArch64::LD1Fourv8h_POST:
4195 case AArch64::LD1Onev16b_POST:
4196 case AArch64::LD1Onev1d_POST:
4197 case AArch64::LD1Onev2d_POST:
4198 case AArch64::LD1Onev2s_POST:
4199 case AArch64::LD1Onev4h_POST:
4200 case AArch64::LD1Onev4s_POST:
4201 case AArch64::LD1Onev8b_POST:
4202 case AArch64::LD1Onev8h_POST:
4203 case AArch64::LD1Rv16b_POST:
4204 case AArch64::LD1Rv1d_POST:
4205 case AArch64::LD1Rv2d_POST:
4206 case AArch64::LD1Rv2s_POST:
4207 case AArch64::LD1Rv4h_POST:
4208 case AArch64::LD1Rv4s_POST:
4209 case AArch64::LD1Rv8b_POST:
4210 case AArch64::LD1Rv8h_POST:
4211 case AArch64::LD1Threev16b_POST:
4212 case AArch64::LD1Threev1d_POST:
4213 case AArch64::LD1Threev2d_POST:
4214 case AArch64::LD1Threev2s_POST:
4215 case AArch64::LD1Threev4h_POST:
4216 case AArch64::LD1Threev4s_POST:
4217 case AArch64::LD1Threev8b_POST:
4218 case AArch64::LD1Threev8h_POST:
4219 case AArch64::LD1Twov16b_POST:
4220 case AArch64::LD1Twov1d_POST:
4221 case AArch64::LD1Twov2d_POST:
4222 case AArch64::LD1Twov2s_POST:
4223 case AArch64::LD1Twov4h_POST:
4224 case AArch64::LD1Twov4s_POST:
4225 case AArch64::LD1Twov8b_POST:
4226 case AArch64::LD1Twov8h_POST:
4227 case AArch64::LD1i16_POST:
4228 case AArch64::LD1i32_POST:
4229 case AArch64::LD1i64_POST:
4230 case AArch64::LD1i8_POST:
4231 case AArch64::LD2Rv16b_POST:
4232 case AArch64::LD2Rv1d_POST:
4233 case AArch64::LD2Rv2d_POST:
4234 case AArch64::LD2Rv2s_POST:
4235 case AArch64::LD2Rv4h_POST:
4236 case AArch64::LD2Rv4s_POST:
4237 case AArch64::LD2Rv8b_POST:
4238 case AArch64::LD2Rv8h_POST:
4239 case AArch64::LD2Twov16b_POST:
4240 case AArch64::LD2Twov2d_POST:
4241 case AArch64::LD2Twov2s_POST:
4242 case AArch64::LD2Twov4h_POST:
4243 case AArch64::LD2Twov4s_POST:
4244 case AArch64::LD2Twov8b_POST:
4245 case AArch64::LD2Twov8h_POST:
4246 case AArch64::LD2i16_POST:
4247 case AArch64::LD2i32_POST:
4248 case AArch64::LD2i64_POST:
4249 case AArch64::LD2i8_POST:
4250 case AArch64::LD3Rv16b_POST:
4251 case AArch64::LD3Rv1d_POST:
4252 case AArch64::LD3Rv2d_POST:
4253 case AArch64::LD3Rv2s_POST:
4254 case AArch64::LD3Rv4h_POST:
4255 case AArch64::LD3Rv4s_POST:
4256 case AArch64::LD3Rv8b_POST:
4257 case AArch64::LD3Rv8h_POST:
4258 case AArch64::LD3Threev16b_POST:
4259 case AArch64::LD3Threev2d_POST:
4260 case AArch64::LD3Threev2s_POST:
4261 case AArch64::LD3Threev4h_POST:
4262 case AArch64::LD3Threev4s_POST:
4263 case AArch64::LD3Threev8b_POST:
4264 case AArch64::LD3Threev8h_POST:
4265 case AArch64::LD3i16_POST:
4266 case AArch64::LD3i32_POST:
4267 case AArch64::LD3i64_POST:
4268 case AArch64::LD3i8_POST:
4269 case AArch64::LD4Fourv16b_POST:
4270 case AArch64::LD4Fourv2d_POST:
4271 case AArch64::LD4Fourv2s_POST:
4272 case AArch64::LD4Fourv4h_POST:
4273 case AArch64::LD4Fourv4s_POST:
4274 case AArch64::LD4Fourv8b_POST:
4275 case AArch64::LD4Fourv8h_POST:
4276 case AArch64::LD4Rv16b_POST:
4277 case AArch64::LD4Rv1d_POST:
4278 case AArch64::LD4Rv2d_POST:
4279 case AArch64::LD4Rv2s_POST:
4280 case AArch64::LD4Rv4h_POST:
4281 case AArch64::LD4Rv4s_POST:
4282 case AArch64::LD4Rv8b_POST:
4283 case AArch64::LD4Rv8h_POST:
4284 case AArch64::LD4i16_POST:
4285 case AArch64::LD4i32_POST:
4286 case AArch64::LD4i64_POST:
4287 case AArch64::LD4i8_POST:
4288 case AArch64::LDAPRWpost:
4289 case AArch64::LDAPRXpost:
4290 case AArch64::LDIAPPWpost:
4291 case AArch64::LDIAPPXpost:
4292 case AArch64::LDPDpost:
4293 case AArch64::LDPQpost:
4294 case AArch64::LDPSWpost:
4295 case AArch64::LDPSpost:
4296 case AArch64::LDPWpost:
4297 case AArch64::LDPXpost:
4298 case AArch64::LDRBBpost:
4299 case AArch64::LDRBpost:
4300 case AArch64::LDRDpost:
4301 case AArch64::LDRHHpost:
4302 case AArch64::LDRHpost:
4303 case AArch64::LDRQpost:
4304 case AArch64::LDRSBWpost:
4305 case AArch64::LDRSBXpost:
4306 case AArch64::LDRSHWpost:
4307 case AArch64::LDRSHXpost:
4308 case AArch64::LDRSWpost:
4309 case AArch64::LDRSpost:
4310 case AArch64::LDRWpost:
4311 case AArch64::LDRXpost:
4312 case AArch64::ST1Fourv16b_POST:
4313 case AArch64::ST1Fourv1d_POST:
4314 case AArch64::ST1Fourv2d_POST:
4315 case AArch64::ST1Fourv2s_POST:
4316 case AArch64::ST1Fourv4h_POST:
4317 case AArch64::ST1Fourv4s_POST:
4318 case AArch64::ST1Fourv8b_POST:
4319 case AArch64::ST1Fourv8h_POST:
4320 case AArch64::ST1Onev16b_POST:
4321 case AArch64::ST1Onev1d_POST:
4322 case AArch64::ST1Onev2d_POST:
4323 case AArch64::ST1Onev2s_POST:
4324 case AArch64::ST1Onev4h_POST:
4325 case AArch64::ST1Onev4s_POST:
4326 case AArch64::ST1Onev8b_POST:
4327 case AArch64::ST1Onev8h_POST:
4328 case AArch64::ST1Threev16b_POST:
4329 case AArch64::ST1Threev1d_POST:
4330 case AArch64::ST1Threev2d_POST:
4331 case AArch64::ST1Threev2s_POST:
4332 case AArch64::ST1Threev4h_POST:
4333 case AArch64::ST1Threev4s_POST:
4334 case AArch64::ST1Threev8b_POST:
4335 case AArch64::ST1Threev8h_POST:
4336 case AArch64::ST1Twov16b_POST:
4337 case AArch64::ST1Twov1d_POST:
4338 case AArch64::ST1Twov2d_POST:
4339 case AArch64::ST1Twov2s_POST:
4340 case AArch64::ST1Twov4h_POST:
4341 case AArch64::ST1Twov4s_POST:
4342 case AArch64::ST1Twov8b_POST:
4343 case AArch64::ST1Twov8h_POST:
4344 case AArch64::ST1i16_POST:
4345 case AArch64::ST1i32_POST:
4346 case AArch64::ST1i64_POST:
4347 case AArch64::ST1i8_POST:
4348 case AArch64::ST2GPostIndex:
4349 case AArch64::ST2Twov16b_POST:
4350 case AArch64::ST2Twov2d_POST:
4351 case AArch64::ST2Twov2s_POST:
4352 case AArch64::ST2Twov4h_POST:
4353 case AArch64::ST2Twov4s_POST:
4354 case AArch64::ST2Twov8b_POST:
4355 case AArch64::ST2Twov8h_POST:
4356 case AArch64::ST2i16_POST:
4357 case AArch64::ST2i32_POST:
4358 case AArch64::ST2i64_POST:
4359 case AArch64::ST2i8_POST:
4360 case AArch64::ST3Threev16b_POST:
4361 case AArch64::ST3Threev2d_POST:
4362 case AArch64::ST3Threev2s_POST:
4363 case AArch64::ST3Threev4h_POST:
4364 case AArch64::ST3Threev4s_POST:
4365 case AArch64::ST3Threev8b_POST:
4366 case AArch64::ST3Threev8h_POST:
4367 case AArch64::ST3i16_POST:
4368 case AArch64::ST3i32_POST:
4369 case AArch64::ST3i64_POST:
4370 case AArch64::ST3i8_POST:
4371 case AArch64::ST4Fourv16b_POST:
4372 case AArch64::ST4Fourv2d_POST:
4373 case AArch64::ST4Fourv2s_POST:
4374 case AArch64::ST4Fourv4h_POST:
4375 case AArch64::ST4Fourv4s_POST:
4376 case AArch64::ST4Fourv8b_POST:
4377 case AArch64::ST4Fourv8h_POST:
4378 case AArch64::ST4i16_POST:
4379 case AArch64::ST4i32_POST:
4380 case AArch64::ST4i64_POST:
4381 case AArch64::ST4i8_POST:
4382 case AArch64::STGPostIndex:
4383 case AArch64::STGPpost:
4384 case AArch64::STPDpost:
4385 case AArch64::STPQpost:
4386 case AArch64::STPSpost:
4387 case AArch64::STPWpost:
4388 case AArch64::STPXpost:
4389 case AArch64::STRBBpost:
4390 case AArch64::STRBpost:
4391 case AArch64::STRDpost:
4392 case AArch64::STRHHpost:
4393 case AArch64::STRHpost:
4394 case AArch64::STRQpost:
4395 case AArch64::STRSpost:
4396 case AArch64::STRWpost:
4397 case AArch64::STRXpost:
4398 case AArch64::STZ2GPostIndex:
4399 case AArch64::STZGPostIndex:
4400 return true;
4401 }
4402}
4403
4405 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4406 bool &OffsetIsScalable, TypeSize &Width,
4407 const TargetRegisterInfo *TRI) const {
4408 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4409 // Handle only loads/stores with base register followed by immediate offset.
4410 if (LdSt.getNumExplicitOperands() == 3) {
4411 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4412 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4413 !LdSt.getOperand(2).isImm())
4414 return false;
4415 } else if (LdSt.getNumExplicitOperands() == 4) {
4416 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4417 if (!LdSt.getOperand(1).isReg() ||
4418 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4419 !LdSt.getOperand(3).isImm())
4420 return false;
4421 } else
4422 return false;
4423
4424 // Get the scaling factor for the instruction and set the width for the
4425 // instruction.
4426 TypeSize Scale(0U, false);
4427 int64_t Dummy1, Dummy2;
4428
4429 // If this returns false, then it's an instruction we don't want to handle.
4430 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4431 return false;
4432
4433 // Compute the offset. Offset is calculated as the immediate operand
4434 // multiplied by the scaling factor. Unscaled instructions have scaling factor
4435 // set to 1. Postindex are a special case which have an offset of 0.
4436 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4437 BaseOp = &LdSt.getOperand(2);
4438 Offset = 0;
4439 } else if (LdSt.getNumExplicitOperands() == 3) {
4440 BaseOp = &LdSt.getOperand(1);
4441 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4442 } else {
4443 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4444 BaseOp = &LdSt.getOperand(2);
4445 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4446 }
4447 OffsetIsScalable = Scale.isScalable();
4448
4449 return BaseOp->isReg() || BaseOp->isFI();
4450}
4451
// NOTE(review): the enclosing function's signature line (return type and
// name) was lost during extraction — restore it from upstream
// AArch64InstrInfo.cpp before relying on this block. The visible body
// returns the last explicit operand of a load/store, asserting that it is
// an immediate (the offset operand).
4454 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4455 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4456 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4457 return OfsOp;
4458}
4459
4460bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4461 TypeSize &Width, int64_t &MinOffset,
4462 int64_t &MaxOffset) {
4463 switch (Opcode) {
4464 // Not a memory operation or something we want to handle.
4465 default:
4466 Scale = TypeSize::getFixed(0);
4467 Width = TypeSize::getFixed(0);
4468 MinOffset = MaxOffset = 0;
4469 return false;
4470 // LDR / STR
4471 case AArch64::LDRQui:
4472 case AArch64::STRQui:
4473 Scale = TypeSize::getFixed(16);
4474 Width = TypeSize::getFixed(16);
4475 MinOffset = 0;
4476 MaxOffset = 4095;
4477 break;
4478 case AArch64::LDRXui:
4479 case AArch64::LDRDui:
4480 case AArch64::STRXui:
4481 case AArch64::STRDui:
4482 case AArch64::PRFMui:
4483 Scale = TypeSize::getFixed(8);
4484 Width = TypeSize::getFixed(8);
4485 MinOffset = 0;
4486 MaxOffset = 4095;
4487 break;
4488 case AArch64::LDRWui:
4489 case AArch64::LDRSui:
4490 case AArch64::LDRSWui:
4491 case AArch64::STRWui:
4492 case AArch64::STRSui:
4493 Scale = TypeSize::getFixed(4);
4494 Width = TypeSize::getFixed(4);
4495 MinOffset = 0;
4496 MaxOffset = 4095;
4497 break;
4498 case AArch64::LDRHui:
4499 case AArch64::LDRHHui:
4500 case AArch64::LDRSHWui:
4501 case AArch64::LDRSHXui:
4502 case AArch64::STRHui:
4503 case AArch64::STRHHui:
4504 Scale = TypeSize::getFixed(2);
4505 Width = TypeSize::getFixed(2);
4506 MinOffset = 0;
4507 MaxOffset = 4095;
4508 break;
4509 case AArch64::LDRBui:
4510 case AArch64::LDRBBui:
4511 case AArch64::LDRSBWui:
4512 case AArch64::LDRSBXui:
4513 case AArch64::STRBui:
4514 case AArch64::STRBBui:
4515 Scale = TypeSize::getFixed(1);
4516 Width = TypeSize::getFixed(1);
4517 MinOffset = 0;
4518 MaxOffset = 4095;
4519 break;
4520 // post/pre inc
4521 case AArch64::STRQpre:
4522 case AArch64::LDRQpost:
4523 Scale = TypeSize::getFixed(1);
4524 Width = TypeSize::getFixed(16);
4525 MinOffset = -256;
4526 MaxOffset = 255;
4527 break;
4528 case AArch64::LDRDpost:
4529 case AArch64::LDRDpre:
4530 case AArch64::LDRXpost:
4531 case AArch64::LDRXpre:
4532 case AArch64::STRDpost:
4533 case AArch64::STRDpre:
4534 case AArch64::STRXpost:
4535 case AArch64::STRXpre:
4536 Scale = TypeSize::getFixed(1);
4537 Width = TypeSize::getFixed(8);
4538 MinOffset = -256;
4539 MaxOffset = 255;
4540 break;
4541 case AArch64::STRWpost:
4542 case AArch64::STRWpre:
4543 case AArch64::LDRWpost:
4544 case AArch64::LDRWpre:
4545 case AArch64::STRSpost:
4546 case AArch64::STRSpre:
4547 case AArch64::LDRSpost:
4548 case AArch64::LDRSpre:
4549 Scale = TypeSize::getFixed(1);
4550 Width = TypeSize::getFixed(4);
4551 MinOffset = -256;
4552 MaxOffset = 255;
4553 break;
4554 case AArch64::LDRHpost:
4555 case AArch64::LDRHpre:
4556 case AArch64::STRHpost:
4557 case AArch64::STRHpre:
4558 case AArch64::LDRHHpost:
4559 case AArch64::LDRHHpre:
4560 case AArch64::STRHHpost:
4561 case AArch64::STRHHpre:
4562 Scale = TypeSize::getFixed(1);
4563 Width = TypeSize::getFixed(2);
4564 MinOffset = -256;
4565 MaxOffset = 255;
4566 break;
4567 case AArch64::LDRBpost:
4568 case AArch64::LDRBpre:
4569 case AArch64::STRBpost:
4570 case AArch64::STRBpre:
4571 case AArch64::LDRBBpost:
4572 case AArch64::LDRBBpre:
4573 case AArch64::STRBBpost:
4574 case AArch64::STRBBpre:
4575 Scale = TypeSize::getFixed(1);
4576 Width = TypeSize::getFixed(1);
4577 MinOffset = -256;
4578 MaxOffset = 255;
4579 break;
4580 // Unscaled
4581 case AArch64::LDURQi:
4582 case AArch64::STURQi:
4583 Scale = TypeSize::getFixed(1);
4584 Width = TypeSize::getFixed(16);
4585 MinOffset = -256;
4586 MaxOffset = 255;
4587 break;
4588 case AArch64::LDURXi:
4589 case AArch64::LDURDi:
4590 case AArch64::LDAPURXi:
4591 case AArch64::STURXi:
4592 case AArch64::STURDi:
4593 case AArch64::STLURXi:
4594 case AArch64::PRFUMi:
4595 Scale = TypeSize::getFixed(1);
4596 Width = TypeSize::getFixed(8);
4597 MinOffset = -256;
4598 MaxOffset = 255;
4599 break;
4600 case AArch64::LDURWi:
4601 case AArch64::LDURSi:
4602 case AArch64::LDURSWi:
4603 case AArch64::LDAPURi:
4604 case AArch64::LDAPURSWi:
4605 case AArch64::STURWi:
4606 case AArch64::STURSi:
4607 case AArch64::STLURWi:
4608 Scale = TypeSize::getFixed(1);
4609 Width = TypeSize::getFixed(4);
4610 MinOffset = -256;
4611 MaxOffset = 255;
4612 break;
4613 case AArch64::LDURHi:
4614 case AArch64::LDURHHi:
4615 case AArch64::LDURSHXi:
4616 case AArch64::LDURSHWi:
4617 case AArch64::LDAPURHi:
4618 case AArch64::LDAPURSHWi:
4619 case AArch64::LDAPURSHXi:
4620 case AArch64::STURHi:
4621 case AArch64::STURHHi:
4622 case AArch64::STLURHi:
4623 Scale = TypeSize::getFixed(1);
4624 Width = TypeSize::getFixed(2);
4625 MinOffset = -256;
4626 MaxOffset = 255;
4627 break;
4628 case AArch64::LDURBi:
4629 case AArch64::LDURBBi:
4630 case AArch64::LDURSBXi:
4631 case AArch64::LDURSBWi:
4632 case AArch64::LDAPURBi:
4633 case AArch64::LDAPURSBWi:
4634 case AArch64::LDAPURSBXi:
4635 case AArch64::STURBi:
4636 case AArch64::STURBBi:
4637 case AArch64::STLURBi:
4638 Scale = TypeSize::getFixed(1);
4639 Width = TypeSize::getFixed(1);
4640 MinOffset = -256;
4641 MaxOffset = 255;
4642 break;
4643 // LDP / STP (including pre/post inc)
4644 case AArch64::LDPQi:
4645 case AArch64::LDNPQi:
4646 case AArch64::STPQi:
4647 case AArch64::STNPQi:
4648 case AArch64::LDPQpost:
4649 case AArch64::LDPQpre:
4650 case AArch64::STPQpost:
4651 case AArch64::STPQpre:
4652 Scale = TypeSize::getFixed(16);
4653 Width = TypeSize::getFixed(16 * 2);
4654 MinOffset = -64;
4655 MaxOffset = 63;
4656 break;
4657 case AArch64::LDPXi:
4658 case AArch64::LDPDi:
4659 case AArch64::LDNPXi:
4660 case AArch64::LDNPDi:
4661 case AArch64::STPXi:
4662 case AArch64::STPDi:
4663 case AArch64::STNPXi:
4664 case AArch64::STNPDi:
4665 case AArch64::LDPDpost:
4666 case AArch64::LDPDpre:
4667 case AArch64::LDPXpost:
4668 case AArch64::LDPXpre:
4669 case AArch64::STPDpost:
4670 case AArch64::STPDpre:
4671 case AArch64::STPXpost:
4672 case AArch64::STPXpre:
4673 Scale = TypeSize::getFixed(8);
4674 Width = TypeSize::getFixed(8 * 2);
4675 MinOffset = -64;
4676 MaxOffset = 63;
4677 break;
4678 case AArch64::LDPWi:
4679 case AArch64::LDPSi:
4680 case AArch64::LDNPWi:
4681 case AArch64::LDNPSi:
4682 case AArch64::STPWi:
4683 case AArch64::STPSi:
4684 case AArch64::STNPWi:
4685 case AArch64::STNPSi:
4686 case AArch64::LDPSpost:
4687 case AArch64::LDPSpre:
4688 case AArch64::LDPWpost:
4689 case AArch64::LDPWpre:
4690 case AArch64::STPSpost:
4691 case AArch64::STPSpre:
4692 case AArch64::STPWpost:
4693 case AArch64::STPWpre:
4694 Scale = TypeSize::getFixed(4);
4695 Width = TypeSize::getFixed(4 * 2);
4696 MinOffset = -64;
4697 MaxOffset = 63;
4698 break;
4699 case AArch64::StoreSwiftAsyncContext:
4700 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4701 Scale = TypeSize::getFixed(1);
4702 Width = TypeSize::getFixed(8);
4703 MinOffset = 0;
4704 MaxOffset = 4095;
4705 break;
4706 case AArch64::ADDG:
4707 Scale = TypeSize::getFixed(16);
4708 Width = TypeSize::getFixed(0);
4709 MinOffset = 0;
4710 MaxOffset = 63;
4711 break;
4712 case AArch64::TAGPstack:
4713 Scale = TypeSize::getFixed(16);
4714 Width = TypeSize::getFixed(0);
4715 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4716 // of 63 (not 64!).
4717 MinOffset = -63;
4718 MaxOffset = 63;
4719 break;
4720 case AArch64::LDG:
4721 case AArch64::STGi:
4722 case AArch64::STGPreIndex:
4723 case AArch64::STGPostIndex:
4724 case AArch64::STZGi:
4725 case AArch64::STZGPreIndex:
4726 case AArch64::STZGPostIndex:
4727 Scale = TypeSize::getFixed(16);
4728 Width = TypeSize::getFixed(16);
4729 MinOffset = -256;
4730 MaxOffset = 255;
4731 break;
4732 // SVE
4733 case AArch64::STR_ZZZZXI:
4734 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4735 case AArch64::LDR_ZZZZXI:
4736 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4737 Scale = TypeSize::getScalable(16);
4738 Width = TypeSize::getScalable(16 * 4);
4739 MinOffset = -256;
4740 MaxOffset = 252;
4741 break;
4742 case AArch64::STR_ZZZXI:
4743 case AArch64::LDR_ZZZXI:
4744 Scale = TypeSize::getScalable(16);
4745 Width = TypeSize::getScalable(16 * 3);
4746 MinOffset = -256;
4747 MaxOffset = 253;
4748 break;
4749 case AArch64::STR_ZZXI:
4750 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4751 case AArch64::LDR_ZZXI:
4752 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4753 Scale = TypeSize::getScalable(16);
4754 Width = TypeSize::getScalable(16 * 2);
4755 MinOffset = -256;
4756 MaxOffset = 254;
4757 break;
4758 case AArch64::LDR_PXI:
4759 case AArch64::STR_PXI:
4760 Scale = TypeSize::getScalable(2);
4761 Width = TypeSize::getScalable(2);
4762 MinOffset = -256;
4763 MaxOffset = 255;
4764 break;
4765 case AArch64::LDR_PPXI:
4766 case AArch64::STR_PPXI:
4767 Scale = TypeSize::getScalable(2);
4768 Width = TypeSize::getScalable(2 * 2);
4769 MinOffset = -256;
4770 MaxOffset = 254;
4771 break;
4772 case AArch64::LDR_ZXI:
4773 case AArch64::STR_ZXI:
4774 Scale = TypeSize::getScalable(16);
4775 Width = TypeSize::getScalable(16);
4776 MinOffset = -256;
4777 MaxOffset = 255;
4778 break;
4779 case AArch64::LD1B_IMM:
4780 case AArch64::LD1H_IMM:
4781 case AArch64::LD1W_IMM:
4782 case AArch64::LD1D_IMM:
4783 case AArch64::LDNT1B_ZRI:
4784 case AArch64::LDNT1H_ZRI:
4785 case AArch64::LDNT1W_ZRI:
4786 case AArch64::LDNT1D_ZRI:
4787 case AArch64::ST1B_IMM:
4788 case AArch64::ST1H_IMM:
4789 case AArch64::ST1W_IMM:
4790 case AArch64::ST1D_IMM:
4791 case AArch64::STNT1B_ZRI:
4792 case AArch64::STNT1H_ZRI:
4793 case AArch64::STNT1W_ZRI:
4794 case AArch64::STNT1D_ZRI:
4795 case AArch64::LDNF1B_IMM:
4796 case AArch64::LDNF1H_IMM:
4797 case AArch64::LDNF1W_IMM:
4798 case AArch64::LDNF1D_IMM:
4799 // A full vectors worth of data
4800 // Width = mbytes * elements
4801 Scale = TypeSize::getScalable(16);
4802 Width = TypeSize::getScalable(16);
4803 MinOffset = -8;
4804 MaxOffset = 7;
4805 break;
4806 case AArch64::LD2B_IMM:
4807 case AArch64::LD2H_IMM:
4808 case AArch64::LD2W_IMM:
4809 case AArch64::LD2D_IMM:
4810 case AArch64::ST2B_IMM:
4811 case AArch64::ST2H_IMM:
4812 case AArch64::ST2W_IMM:
4813 case AArch64::ST2D_IMM:
4814 Scale = TypeSize::getScalable(32);
4815 Width = TypeSize::getScalable(16 * 2);
4816 MinOffset = -8;
4817 MaxOffset = 7;
4818 break;
4819 case AArch64::LD3B_IMM:
4820 case AArch64::LD3H_IMM:
4821 case AArch64::LD3W_IMM:
4822 case AArch64::LD3D_IMM:
4823 case AArch64::ST3B_IMM:
4824 case AArch64::ST3H_IMM:
4825 case AArch64::ST3W_IMM:
4826 case AArch64::ST3D_IMM:
4827 Scale = TypeSize::getScalable(48);
4828 Width = TypeSize::getScalable(16 * 3);
4829 MinOffset = -8;
4830 MaxOffset = 7;
4831 break;
4832 case AArch64::LD4B_IMM:
4833 case AArch64::LD4H_IMM:
4834 case AArch64::LD4W_IMM:
4835 case AArch64::LD4D_IMM:
4836 case AArch64::ST4B_IMM:
4837 case AArch64::ST4H_IMM:
4838 case AArch64::ST4W_IMM:
4839 case AArch64::ST4D_IMM:
4840 Scale = TypeSize::getScalable(64);
4841 Width = TypeSize::getScalable(16 * 4);
4842 MinOffset = -8;
4843 MaxOffset = 7;
4844 break;
4845 case AArch64::LD1B_H_IMM:
4846 case AArch64::LD1SB_H_IMM:
4847 case AArch64::LD1H_S_IMM:
4848 case AArch64::LD1SH_S_IMM:
4849 case AArch64::LD1W_D_IMM:
4850 case AArch64::LD1SW_D_IMM:
4851 case AArch64::ST1B_H_IMM:
4852 case AArch64::ST1H_S_IMM:
4853 case AArch64::ST1W_D_IMM:
4854 case AArch64::LDNF1B_H_IMM:
4855 case AArch64::LDNF1SB_H_IMM:
4856 case AArch64::LDNF1H_S_IMM:
4857 case AArch64::LDNF1SH_S_IMM:
4858 case AArch64::LDNF1W_D_IMM:
4859 case AArch64::LDNF1SW_D_IMM:
4860 // A half vector worth of data
4861 // Width = mbytes * elements
4862 Scale = TypeSize::getScalable(8);
4863 Width = TypeSize::getScalable(8);
4864 MinOffset = -8;
4865 MaxOffset = 7;
4866 break;
4867 case AArch64::LD1B_S_IMM:
4868 case AArch64::LD1SB_S_IMM:
4869 case AArch64::LD1H_D_IMM:
4870 case AArch64::LD1SH_D_IMM:
4871 case AArch64::ST1B_S_IMM:
4872 case AArch64::ST1H_D_IMM:
4873 case AArch64::LDNF1B_S_IMM:
4874 case AArch64::LDNF1SB_S_IMM:
4875 case AArch64::LDNF1H_D_IMM:
4876 case AArch64::LDNF1SH_D_IMM:
4877 // A quarter vector worth of data
4878 // Width = mbytes * elements
4879 Scale = TypeSize::getScalable(4);
4880 Width = TypeSize::getScalable(4);
4881 MinOffset = -8;
4882 MaxOffset = 7;
4883 break;
4884 case AArch64::LD1B_D_IMM:
4885 case AArch64::LD1SB_D_IMM:
4886 case AArch64::ST1B_D_IMM:
4887 case AArch64::LDNF1B_D_IMM:
4888 case AArch64::LDNF1SB_D_IMM:
4889 // A eighth vector worth of data
4890 // Width = mbytes * elements
4891 Scale = TypeSize::getScalable(2);
4892 Width = TypeSize::getScalable(2);
4893 MinOffset = -8;
4894 MaxOffset = 7;
4895 break;
4896 case AArch64::ST2Gi:
4897 case AArch64::ST2GPreIndex:
4898 case AArch64::ST2GPostIndex:
4899 case AArch64::STZ2Gi:
4900 case AArch64::STZ2GPreIndex:
4901 case AArch64::STZ2GPostIndex:
4902 Scale = TypeSize::getFixed(16);
4903 Width = TypeSize::getFixed(32);
4904 MinOffset = -256;
4905 MaxOffset = 255;
4906 break;
4907 case AArch64::STGPi:
4908 case AArch64::STGPpost:
4909 case AArch64::STGPpre:
4910 Scale = TypeSize::getFixed(16);
4911 Width = TypeSize::getFixed(16);
4912 MinOffset = -64;
4913 MaxOffset = 63;
4914 break;
4915 case AArch64::LD1RB_IMM:
4916 case AArch64::LD1RB_H_IMM:
4917 case AArch64::LD1RB_S_IMM:
4918 case AArch64::LD1RB_D_IMM:
4919 case AArch64::LD1RSB_H_IMM:
4920 case AArch64::LD1RSB_S_IMM:
4921 case AArch64::LD1RSB_D_IMM:
4922 Scale = TypeSize::getFixed(1);
4923 Width = TypeSize::getFixed(1);
4924 MinOffset = 0;
4925 MaxOffset = 63;
4926 break;
4927 case AArch64::LD1RH_IMM:
4928 case AArch64::LD1RH_S_IMM:
4929 case AArch64::LD1RH_D_IMM:
4930 case AArch64::LD1RSH_S_IMM:
4931 case AArch64::LD1RSH_D_IMM:
4932 Scale = TypeSize::getFixed(2);
4933 Width = TypeSize::getFixed(2);
4934 MinOffset = 0;
4935 MaxOffset = 63;
4936 break;
4937 case AArch64::LD1RW_IMM:
4938 case AArch64::LD1RW_D_IMM:
4939 case AArch64::LD1RSW_IMM:
4940 Scale = TypeSize::getFixed(4);
4941 Width = TypeSize::getFixed(4);
4942 MinOffset = 0;
4943 MaxOffset = 63;
4944 break;
4945 case AArch64::LD1RD_IMM:
4946 Scale = TypeSize::getFixed(8);
4947 Width = TypeSize::getFixed(8);
4948 MinOffset = 0;
4949 MaxOffset = 63;
4950 break;
4951 }
4952
4953 return true;
4954}
4955
4956// Scaling factor for unscaled load or store.
4958 switch (Opc) {
4959 default:
4960 llvm_unreachable("Opcode has unknown scale!");
4961 case AArch64::LDRBui:
4962 case AArch64::LDRBBui:
4963 case AArch64::LDURBBi:
4964 case AArch64::LDRSBWui:
4965 case AArch64::LDURSBWi:
4966 case AArch64::STRBui:
4967 case AArch64::STRBBui:
4968 case AArch64::STURBBi:
4969 return 1;
4970 case AArch64::LDRHui:
4971 case AArch64::LDRHHui:
4972 case AArch64::LDURHHi:
4973 case AArch64::LDRSHWui:
4974 case AArch64::LDURSHWi:
4975 case AArch64::STRHui:
4976 case AArch64::STRHHui:
4977 case AArch64::STURHHi:
4978 return 2;
4979 case AArch64::LDRSui:
4980 case AArch64::LDURSi:
4981 case AArch64::LDRSpre:
4982 case AArch64::LDRSWui:
4983 case AArch64::LDURSWi:
4984 case AArch64::LDRSWpre:
4985 case AArch64::LDRWpre:
4986 case AArch64::LDRWui:
4987 case AArch64::LDURWi:
4988 case AArch64::STRSui:
4989 case AArch64::STURSi:
4990 case AArch64::STRSpre:
4991 case AArch64::STRWui:
4992 case AArch64::STURWi:
4993 case AArch64::STRWpre:
4994 case AArch64::LDPSi:
4995 case AArch64::LDPSWi:
4996 case AArch64::LDPWi:
4997 case AArch64::STPSi:
4998 case AArch64::STPWi:
4999 return 4;
5000 case AArch64::LDRDui:
5001 case AArch64::LDURDi:
5002 case AArch64::LDRDpre:
5003 case AArch64::LDRXui:
5004 case AArch64::LDURXi:
5005 case AArch64::LDRXpre:
5006 case AArch64::STRDui:
5007 case AArch64::STURDi:
5008 case AArch64::STRDpre:
5009 case AArch64::STRXui:
5010 case AArch64::STURXi:
5011 case AArch64::STRXpre:
5012 case AArch64::LDPDi:
5013 case AArch64::LDPXi:
5014 case AArch64::STPDi:
5015 case AArch64::STPXi:
5016 return 8;
5017 case AArch64::LDRQui:
5018 case AArch64::LDURQi:
5019 case AArch64::STRQui:
5020 case AArch64::STURQi:
5021 case AArch64::STRQpre:
5022 case AArch64::LDPQi:
5023 case AArch64::LDRQpre:
5024 case AArch64::STPQi:
5025 case AArch64::STGi:
5026 case AArch64::STZGi:
5027 case AArch64::ST2Gi:
5028 case AArch64::STZ2Gi:
5029 case AArch64::STGPi:
5030 return 16;
5031 }
5032}
5033
5035 switch (MI.getOpcode()) {
5036 default:
5037 return false;
5038 case AArch64::LDRWpre:
5039 case AArch64::LDRXpre:
5040 case AArch64::LDRSWpre:
5041 case AArch64::LDRSpre:
5042 case AArch64::LDRDpre:
5043 case AArch64::LDRQpre:
5044 return true;
5045 }
5046}
5047
5049 switch (MI.getOpcode()) {
5050 default:
5051 return false;
5052 case AArch64::STRWpre:
5053 case AArch64::STRXpre:
5054 case AArch64::STRSpre:
5055 case AArch64::STRDpre:
5056 case AArch64::STRQpre:
5057 return true;
5058 }
5059}
5060
5062 return isPreLd(MI) || isPreSt(MI);
5063}
5064
// NOTE(review): the enclosing predicate's signature line (return type and
// name) was lost during extraction — restore it from upstream
// AArch64InstrInfo.cpp before relying on this block. The switch matches
// the 32-bit-and-narrower unsigned load forms (unscaled LDUR*, scaled
// LDR*ui, and register-offset LDR*roX/roW); presumably these are the
// zero-extending loads — confirm against the upstream source.
5066 switch (MI.getOpcode()) {
5067 default:
5068 return false;
5069 case AArch64::LDURBBi:
5070 case AArch64::LDURHHi:
5071 case AArch64::LDURWi:
5072 case AArch64::LDRBBui:
5073 case AArch64::LDRHHui:
5074 case AArch64::LDRWui:
5075 case AArch64::LDRBBroX:
5076 case AArch64::LDRHHroX:
5077 case AArch64::LDRWroX:
5078 case AArch64::LDRBBroW:
5079 case AArch64::LDRHHroW:
5080 case AArch64::LDRWroW:
5081 return true;
5082 }
5083}
5084
// NOTE(review): the enclosing predicate's signature line (return type and
// name) was lost during extraction — restore it from upstream
// AArch64InstrInfo.cpp before relying on this block. The switch matches
// the LDRS* (sign-extending) load forms in their unscaled, scaled, and
// register-offset variants.
5086 switch (MI.getOpcode()) {
5087 default:
5088 return false;
5089 case AArch64::LDURSBWi:
5090 case AArch64::LDURSHWi:
5091 case AArch64::LDURSBXi:
5092 case AArch64::LDURSHXi:
5093 case AArch64::LDURSWi:
5094 case AArch64::LDRSBWui:
5095 case AArch64::LDRSHWui:
5096 case AArch64::LDRSBXui:
5097 case AArch64::LDRSHXui:
5098 case AArch64::LDRSWui:
5099 case AArch64::LDRSBWroX:
5100 case AArch64::LDRSHWroX:
5101 case AArch64::LDRSBXroX:
5102 case AArch64::LDRSHXroX:
5103 case AArch64::LDRSWroX:
5104 case AArch64::LDRSBWroW:
5105 case AArch64::LDRSHWroW:
5106 case AArch64::LDRSBXroW:
5107 case AArch64::LDRSHXroW:
5108 case AArch64::LDRSWroW:
5109 return true;
5110 }
5111}
5112
5114 switch (MI.getOpcode()) {
5115 default:
5116 return false;
5117 case AArch64::LDPSi:
5118 case AArch64::LDPSWi:
5119 case AArch64::LDPDi:
5120 case AArch64::LDPQi:
5121 case AArch64::LDPWi:
5122 case AArch64::LDPXi:
5123 case AArch64::STPSi:
5124 case AArch64::STPDi:
5125 case AArch64::STPQi:
5126 case AArch64::STPWi:
5127 case AArch64::STPXi:
5128 case AArch64::STGPi:
5129 return true;
5130 }
5131}
5132
5134 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5135 unsigned Idx =
5137 : 1;
5138 return MI.getOperand(Idx);
5139}
5140
5141const MachineOperand &
5143 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5144 unsigned Idx =
5146 : 2;
5147 return MI.getOperand(Idx);
5148}
5149
5150const MachineOperand &
5152 switch (MI.getOpcode()) {
5153 default:
5154 llvm_unreachable("Unexpected opcode");
5155 case AArch64::LDRBroX:
5156 case AArch64::LDRBBroX:
5157 case AArch64::LDRSBXroX:
5158 case AArch64::LDRSBWroX:
5159 case AArch64::LDRHroX:
5160 case AArch64::LDRHHroX:
5161 case AArch64::LDRSHXroX:
5162 case AArch64::LDRSHWroX:
5163 case AArch64::LDRWroX:
5164 case AArch64::LDRSroX:
5165 case AArch64::LDRSWroX:
5166 case AArch64::LDRDroX:
5167 case AArch64::LDRXroX:
5168 case AArch64::LDRQroX:
5169 return MI.getOperand(4);
5170 }
5171}
5172
5174 Register Reg) {
5175 if (MI.getParent() == nullptr)
5176 return nullptr;
5177 const MachineFunction *MF = MI.getParent()->getParent();
5178 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5179}
5180
5182 auto IsHFPR = [&](const MachineOperand &Op) {
5183 if (!Op.isReg())
5184 return false;
5185 auto Reg = Op.getReg();
5186 if (Reg.isPhysical())
5187 return AArch64::FPR16RegClass.contains(Reg);
5188 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5189 return TRC == &AArch64::FPR16RegClass ||
5190 TRC == &AArch64::FPR16_loRegClass;
5191 };
5192 return llvm::any_of(MI.operands(), IsHFPR);
5193}
5194
5196 auto IsQFPR = [&](const MachineOperand &Op) {
5197 if (!Op.isReg())
5198 return false;
5199 auto Reg = Op.getReg();
5200 if (Reg.isPhysical())
5201 return AArch64::FPR128RegClass.contains(Reg);
5202 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5203 return TRC == &AArch64::FPR128RegClass ||
5204 TRC == &AArch64::FPR128_loRegClass;
5205 };
5206 return llvm::any_of(MI.operands(), IsQFPR);
5207}
5208
5210 switch (MI.getOpcode()) {
5211 case AArch64::BRK:
5212 case AArch64::HLT:
5213 case AArch64::PACIASP:
5214 case AArch64::PACIBSP:
5215 // Implicit BTI behavior.
5216 return true;
5217 case AArch64::PAUTH_PROLOGUE:
5218 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5219 return true;
5220 case AArch64::HINT: {
5221 unsigned Imm = MI.getOperand(0).getImm();
5222 // Explicit BTI instruction.
5223 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5224 return true;
5225 // PACI(A|B)SP instructions.
5226 if (Imm == 25 || Imm == 27)
5227 return true;
5228 return false;
5229 }
5230 default:
5231 return false;
5232 }
5233}
5234
5236 if (Reg == 0)
5237 return false;
5238 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5239 return AArch64::FPR128RegClass.contains(Reg) ||
5240 AArch64::FPR64RegClass.contains(Reg) ||
5241 AArch64::FPR32RegClass.contains(Reg) ||
5242 AArch64::FPR16RegClass.contains(Reg) ||
5243 AArch64::FPR8RegClass.contains(Reg);
5244}
5245
5247 auto IsFPR = [&](const MachineOperand &Op) {
5248 if (!Op.isReg())
5249 return false;
5250 auto Reg = Op.getReg();
5251 if (Reg.isPhysical())
5252 return isFpOrNEON(Reg);
5253
5254 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5255 return TRC == &AArch64::FPR128RegClass ||
5256 TRC == &AArch64::FPR128_loRegClass ||
5257 TRC == &AArch64::FPR64RegClass ||
5258 TRC == &AArch64::FPR64_loRegClass ||
5259 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5260 TRC == &AArch64::FPR8RegClass;
5261 };
5262 return llvm::any_of(MI.operands(), IsFPR);
5263}
5264
5265// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5266// scaled.
5267static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5269
5270 // If the byte-offset isn't a multiple of the stride, we can't scale this
5271 // offset.
5272 if (Offset % Scale != 0)
5273 return false;
5274
5275 // Convert the byte-offset used by unscaled into an "element" offset used
5276 // by the scaled pair load/store instructions.
5277 Offset /= Scale;
5278 return true;
5279}
5280
5281static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5282 if (FirstOpc == SecondOpc)
5283 return true;
5284 // We can also pair sign-ext and zero-ext instructions.
5285 switch (FirstOpc) {
5286 default:
5287 return false;
5288 case AArch64::STRSui:
5289 case AArch64::STURSi:
5290 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5291 case AArch64::STRDui:
5292 case AArch64::STURDi:
5293 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5294 case AArch64::STRQui:
5295 case AArch64::STURQi:
5296 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5297 case AArch64::STRWui:
5298 case AArch64::STURWi:
5299 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5300 case AArch64::STRXui:
5301 case AArch64::STURXi:
5302 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5303 case AArch64::LDRSui:
5304 case AArch64::LDURSi:
5305 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5306 case AArch64::LDRDui:
5307 case AArch64::LDURDi:
5308 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5309 case AArch64::LDRQui:
5310 case AArch64::LDURQi:
5311 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5312 case AArch64::LDRWui:
5313 case AArch64::LDURWi:
5314 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5315 case AArch64::LDRSWui:
5316 case AArch64::LDURSWi:
5317 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5318 case AArch64::LDRXui:
5319 case AArch64::LDURXi:
5320 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5321 }
5322 // These instructions can't be paired based on their opcodes.
5323 return false;
5324}
5325
5326static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5327 int64_t Offset1, unsigned Opcode1, int FI2,
5328 int64_t Offset2, unsigned Opcode2) {
5329 // Accesses through fixed stack object frame indices may access a different
5330 // fixed stack slot. Check that the object offsets + offsets match.
5331 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5332 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5333 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5334 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5335 // Convert to scaled object offsets.
5336 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5337 if (ObjectOffset1 % Scale1 != 0)
5338 return false;
5339 ObjectOffset1 /= Scale1;
5340 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5341 if (ObjectOffset2 % Scale2 != 0)
5342 return false;
5343 ObjectOffset2 /= Scale2;
5344 ObjectOffset1 += Offset1;
5345 ObjectOffset2 += Offset2;
5346 return ObjectOffset1 + 1 == ObjectOffset2;
5347 }
5348
5349 return FI1 == FI2;
5350}
5351
5352/// Detect opportunities for ldp/stp formation.
5353///
5354/// Only called for LdSt for which getMemOperandWithOffset returns true.
5356 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5357 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5358 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5359 unsigned NumBytes) const {
5360 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5361 const MachineOperand &BaseOp1 = *BaseOps1.front();
5362 const MachineOperand &BaseOp2 = *BaseOps2.front();
5363 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5364 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5365 if (BaseOp1.getType() != BaseOp2.getType())
5366 return false;
5367
5368 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5369 "Only base registers and frame indices are supported.");
5370
5371 // Check for both base regs and base FI.
5372 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5373 return false;
5374
5375 // Only cluster up to a single pair.
5376 if (ClusterSize > 2)
5377 return false;
5378
5379 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
5380 return false;
5381
5382 // Can we pair these instructions based on their opcodes?
5383 unsigned FirstOpc = FirstLdSt.getOpcode();
5384 unsigned SecondOpc = SecondLdSt.getOpcode();
5385 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5386 return false;
5387
5388 // Can't merge volatiles or load/stores that have a hint to avoid pair
5389 // formation, for example.
5390 if (!isCandidateToMergeOrPair(FirstLdSt) ||
5391 !isCandidateToMergeOrPair(SecondLdSt))
5392 return false;
5393
5394 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
5395 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
5396 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
5397 return false;
5398
5399 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
5400 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
5401 return false;
5402
5403 // Pairwise instructions have a 7-bit signed offset field.
5404 if (Offset1 > 63 || Offset1 < -64)
5405 return false;
5406
5407 // The caller should already have ordered First/SecondLdSt by offset.
5408 // Note: except for non-equal frame index bases
5409 if (BaseOp1.isFI()) {
5410 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5411 "Caller should have ordered offsets.");
5412
5413 const MachineFrameInfo &MFI =
5414 FirstLdSt.getParent()->getParent()->getFrameInfo();
5415 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
5416 BaseOp2.getIndex(), Offset2, SecondOpc);
5417 }
5418
5419 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5420
5421 return Offset1 + 1 == Offset2;
5422}
5423
5425 MCRegister Reg, unsigned SubIdx,
5426 RegState State,
5427 const TargetRegisterInfo *TRI) {
5428 if (!SubIdx)
5429 return MIB.addReg(Reg, State);
5430
5431 if (Reg.isPhysical())
5432 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
5433 return MIB.addReg(Reg, State, SubIdx);
5434}
5435
5436static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
5437 unsigned NumRegs) {
5438 // We really want the positive remainder mod 32 here, that happens to be
5439 // easily obtainable with a mask.
5440 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
5441}
5442
5445 const DebugLoc &DL, MCRegister DestReg,
5446 MCRegister SrcReg, bool KillSrc,
5447 unsigned Opcode,
5448 ArrayRef<unsigned> Indices) const {
5449 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5451 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5452 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5453 unsigned NumRegs = Indices.size();
5454
5455 int SubReg = 0, End = NumRegs, Incr = 1;
5456 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
5457 SubReg = NumRegs - 1;
5458 End = -1;
5459 Incr = -1;
5460 }
5461
5462 for (; SubReg != End; SubReg += Incr) {
5463 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5464 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5465 AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
5466 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5467 }
5468}
5469
5472 const DebugLoc &DL, MCRegister DestReg,
5473 MCRegister SrcReg, bool KillSrc,
5474 unsigned Opcode, unsigned ZeroReg,
5475 llvm::ArrayRef<unsigned> Indices) const {
5477 unsigned NumRegs = Indices.size();
5478
5479#ifndef NDEBUG
5480 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5481 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5482 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5483 "GPR reg sequences should not be able to overlap");
5484#endif
5485
5486 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5487 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5488 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5489 MIB.addReg(ZeroReg);
5490 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5491 MIB.addImm(0);
5492 }
5493}
5494
5495/// Returns true if the instruction at I is in a streaming call site region,
5496/// within a single basic block.
5497/// A "call site streaming region" starts after smstart and ends at smstop
5498/// around a call to a streaming function. This walks backward from I.
5501 MachineFunction &MF = *MBB.getParent();
5503 if (!AFI->hasStreamingModeChanges())
5504 return false;
5505 // Walk backwards to find smstart/smstop
5506 for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
5507 unsigned Opc = MI.getOpcode();
5508 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
5509 // Check if this is SM change (not ZA)
5510 int64_t PState = MI.getOperand(0).getImm();
5511 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
5512 // Operand 1 is 1 for start, 0 for stop
5513 return MI.getOperand(1).getImm() == 1;
5514 }
5515 }
5516 }
5517 return false;
5518}
5519
5520/// Returns true if in a streaming call site region without SME-FA64.
5521static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
5524 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
5525}
5526
5529 const DebugLoc &DL, Register DestReg,
5530 Register SrcReg, bool KillSrc,
5531 bool RenamableDest,
5532 bool RenamableSrc) const {
5533 ++NumCopyInstrs;
5534 if (AArch64::GPR32spRegClass.contains(DestReg) &&
5535 AArch64::GPR32spRegClass.contains(SrcReg)) {
5536 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5537 // If either operand is WSP, expand to ADD #0.
5538 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5539 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5540 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5541 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5542 &AArch64::GPR64spRegClass);
5543 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5544 &AArch64::GPR64spRegClass);
5545 // This instruction is reading and writing X registers. This may upset
5546 // the register scavenger and machine verifier, so we need to indicate
5547 // that we are reading an undefined value from SrcRegX, but a proper
5548 // value from SrcReg.
5549 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
5550 .addReg(SrcRegX, RegState::Undef)
5551 .addImm(0)
5553 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5554 ++NumZCRegMoveInstrsGPR;
5555 } else {
5556 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
5557 .addReg(SrcReg, getKillRegState(KillSrc))
5558 .addImm(0)
5560 if (Subtarget.hasZeroCycleRegMoveGPR32())
5561 ++NumZCRegMoveInstrsGPR;
5562 }
5563 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5564 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5565 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5566 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5567 &AArch64::GPR64spRegClass);
5568 assert(DestRegX.isValid() && "Destination super-reg not valid");
5569 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5570 &AArch64::GPR64spRegClass);
5571 assert(SrcRegX.isValid() && "Source super-reg not valid");
5572 // This instruction is reading and writing X registers. This may upset
5573 // the register scavenger and machine verifier, so we need to indicate
5574 // that we are reading an undefined value from SrcRegX, but a proper
5575 // value from SrcReg.
5576 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
5577 .addReg(AArch64::XZR)
5578 .addReg(SrcRegX, RegState::Undef)
5579 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5580 ++NumZCRegMoveInstrsGPR;
5581 } else {
5582 // Otherwise, expand to ORR WZR.
5583 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5584 .addReg(AArch64::WZR)
5585 .addReg(SrcReg, getKillRegState(KillSrc));
5586 if (Subtarget.hasZeroCycleRegMoveGPR32())
5587 ++NumZCRegMoveInstrsGPR;
5588 }
5589 return;
5590 }
5591
5592 // GPR32 zeroing
5593 if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
5594 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5595 !Subtarget.hasZeroCycleZeroingGPR32()) {
5596 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5597 &AArch64::GPR64spRegClass);
5598 assert(DestRegX.isValid() && "Destination super-reg not valid");
5599 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
5600 .addImm(0)
5602 ++NumZCZeroingInstrsGPR;
5603 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5604 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
5605 .addImm(0)
5607 ++NumZCZeroingInstrsGPR;
5608 } else {
5609 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5610 .addReg(AArch64::WZR)
5611 .addReg(AArch64::WZR);
5612 }
5613 return;
5614 }
5615
5616 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5617 AArch64::GPR64spRegClass.contains(SrcReg)) {
5618 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5619 // If either operand is SP, expand to ADD #0.
5620 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5621 .addReg(SrcReg, getKillRegState(KillSrc))
5622 .addImm(0)
5624 if (Subtarget.hasZeroCycleRegMoveGPR64())
5625 ++NumZCRegMoveInstrsGPR;
5626 } else {
5627 // Otherwise, expand to ORR XZR.
5628 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5629 .addReg(AArch64::XZR)
5630 .addReg(SrcReg, getKillRegState(KillSrc));
5631 if (Subtarget.hasZeroCycleRegMoveGPR64())
5632 ++NumZCRegMoveInstrsGPR;
5633 }
5634 return;
5635 }
5636
5637 // GPR64 zeroing
5638 if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
5639 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5640 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5641 .addImm(0)
5643 ++NumZCZeroingInstrsGPR;
5644 } else {
5645 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5646 .addReg(AArch64::XZR)
5647 .addReg(AArch64::XZR);
5648 }
5649 return;
5650 }
5651
5652 // Copy a Predicate register by ORRing with itself.
5653 if (AArch64::PPRRegClass.contains(DestReg) &&
5654 AArch64::PPRRegClass.contains(SrcReg)) {
5655 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5656 "Unexpected SVE register.");
5657 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
5658 .addReg(SrcReg) // Pg
5659 .addReg(SrcReg)
5660 .addReg(SrcReg, getKillRegState(KillSrc));
5661 return;
5662 }
5663
5664 // Copy a predicate-as-counter register by ORRing with itself as if it
5665 // were a regular predicate (mask) register.
5666 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
5667 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
5668 if (DestIsPNR || SrcIsPNR) {
5669 auto ToPPR = [](MCRegister R) -> MCRegister {
5670 return (R - AArch64::PN0) + AArch64::P0;
5671 };
5672 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5673 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5674
5675 if (PPRSrcReg != PPRDestReg) {
5676 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
5677 .addReg(PPRSrcReg) // Pg
5678 .addReg(PPRSrcReg)
5679 .addReg(PPRSrcReg, getKillRegState(KillSrc));
5680 if (DestIsPNR)
5681 NewMI.addDef(DestReg, RegState::Implicit);
5682 }
5683 return;
5684 }
5685
5686 // Copy a Z register by ORRing with itself.
5687 if (AArch64::ZPRRegClass.contains(DestReg) &&
5688 AArch64::ZPRRegClass.contains(SrcReg)) {
5689 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5690 "Unexpected SVE register.");
5691 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
5692 .addReg(SrcReg)
5693 .addReg(SrcReg, getKillRegState(KillSrc));
5694 return;
5695 }
5696
5697 // Copy a Z register pair by copying the individual sub-registers.
5698 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5699 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5700 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5701 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5702 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5703 "Unexpected SVE register.");
5704 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5705 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5706 Indices);
5707 return;
5708 }
5709
5710 // Copy a Z register triple by copying the individual sub-registers.
5711 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5712 AArch64::ZPR3RegClass.contains(SrcReg)) {
5713 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5714 "Unexpected SVE register.");
5715 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5716 AArch64::zsub2};
5717 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5718 Indices);
5719 return;
5720 }
5721
5722 // Copy a Z register quad by copying the individual sub-registers.
5723 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5724 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5725 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5726 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5727 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5728 "Unexpected SVE register.");
5729 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5730 AArch64::zsub2, AArch64::zsub3};
5731 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5732 Indices);
5733 return;
5734 }
5735
5736 // Copy a DDDD register quad by copying the individual sub-registers.
5737 if (AArch64::DDDDRegClass.contains(DestReg) &&
5738 AArch64::DDDDRegClass.contains(SrcReg)) {
5739 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5740 AArch64::dsub2, AArch64::dsub3};
5741 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5742 Indices);
5743 return;
5744 }
5745
5746 // Copy a DDD register triple by copying the individual sub-registers.
5747 if (AArch64::DDDRegClass.contains(DestReg) &&
5748 AArch64::DDDRegClass.contains(SrcReg)) {
5749 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5750 AArch64::dsub2};
5751 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5752 Indices);
5753 return;
5754 }
5755
5756 // Copy a DD register pair by copying the individual sub-registers.
5757 if (AArch64::DDRegClass.contains(DestReg) &&
5758 AArch64::DDRegClass.contains(SrcReg)) {
5759 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5760 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5761 Indices);
5762 return;
5763 }
5764
5765 // Copy a QQQQ register quad by copying the individual sub-registers.
5766 if (AArch64::QQQQRegClass.contains(DestReg) &&
5767 AArch64::QQQQRegClass.contains(SrcReg)) {
5768 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5769 AArch64::qsub2, AArch64::qsub3};
5770 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5771 Indices);
5772 return;
5773 }
5774
5775 // Copy a QQQ register triple by copying the individual sub-registers.
5776 if (AArch64::QQQRegClass.contains(DestReg) &&
5777 AArch64::QQQRegClass.contains(SrcReg)) {
5778 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5779 AArch64::qsub2};
5780 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5781 Indices);
5782 return;
5783 }
5784
5785 // Copy a QQ register pair by copying the individual sub-registers.
5786 if (AArch64::QQRegClass.contains(DestReg) &&
5787 AArch64::QQRegClass.contains(SrcReg)) {
5788 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5789 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5790 Indices);
5791 return;
5792 }
5793
5794 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
5795 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
5796 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5797 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
5798 AArch64::XZR, Indices);
5799 return;
5800 }
5801
5802 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
5803 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
5804 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5805 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
5806 AArch64::WZR, Indices);
5807 return;
5808 }
5809
5810 if (AArch64::FPR128RegClass.contains(DestReg) &&
5811 AArch64::FPR128RegClass.contains(SrcReg)) {
5812 // In streaming regions, NEON is illegal but streaming-SVE is available.
5813 // Use SVE for copies if we're in a streaming region and SME is available.
5814 // With +sme-fa64, NEON is legal in streaming mode so we can use it.
5815 if ((Subtarget.isSVEorStreamingSVEAvailable() &&
5816 !Subtarget.isNeonAvailable()) ||
5817 mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5818 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
5819 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
5820 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
5821 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
5822 } else if (Subtarget.isNeonAvailable()) {
5823 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
5824 .addReg(SrcReg)
5825 .addReg(SrcReg, getKillRegState(KillSrc));
5826 if (Subtarget.hasZeroCycleRegMoveFPR128())
5827 ++NumZCRegMoveInstrsFPR;
5828 } else {
5829 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
5830 .addReg(AArch64::SP, RegState::Define)
5831 .addReg(SrcReg, getKillRegState(KillSrc))
5832 .addReg(AArch64::SP)
5833 .addImm(-16);
5834 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
5835 .addReg(AArch64::SP, RegState::Define)
5836 .addReg(DestReg, RegState::Define)
5837 .addReg(AArch64::SP)
5838 .addImm(16);
5839 }
5840 return;
5841 }
5842
5843 if (AArch64::FPR64RegClass.contains(DestReg) &&
5844 AArch64::FPR64RegClass.contains(SrcReg)) {
5845 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5846 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5847 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5848 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5849 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
5850 &AArch64::FPR128RegClass);
5851 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
5852 &AArch64::FPR128RegClass);
5853 // This instruction is reading and writing Q registers. This may upset
5854 // the register scavenger and machine verifier, so we need to indicate
5855 // that we are reading an undefined value from SrcRegQ, but a proper
5856 // value from SrcReg.
5857 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5858 .addReg(SrcRegQ, RegState::Undef)
5859 .addReg(SrcRegQ, RegState::Undef)
5860 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5861 ++NumZCRegMoveInstrsFPR;
5862 } else {
5863 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
5864 .addReg(SrcReg, getKillRegState(KillSrc));
5865 if (Subtarget.hasZeroCycleRegMoveFPR64())
5866 ++NumZCRegMoveInstrsFPR;
5867 }
5868 return;
5869 }
5870
5871 if (AArch64::FPR32RegClass.contains(DestReg) &&
5872 AArch64::FPR32RegClass.contains(SrcReg)) {
5873 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5874 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5875 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5876 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5877 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5878 &AArch64::FPR128RegClass);
5879 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5880 &AArch64::FPR128RegClass);
5881 // This instruction is reading and writing Q registers. This may upset
5882 // the register scavenger and machine verifier, so we need to indicate
5883 // that we are reading an undefined value from SrcRegQ, but a proper
5884 // value from SrcReg.
5885 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5886 .addReg(SrcRegQ, RegState::Undef)
5887 .addReg(SrcRegQ, RegState::Undef)
5888 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5889 ++NumZCRegMoveInstrsFPR;
5890 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5891 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5892 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5893 &AArch64::FPR64RegClass);
5894 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5895 &AArch64::FPR64RegClass);
5896 // This instruction is reading and writing D registers. This may upset
5897 // the register scavenger and machine verifier, so we need to indicate
5898 // that we are reading an undefined value from SrcRegD, but a proper
5899 // value from SrcReg.
5900 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5901 .addReg(SrcRegD, RegState::Undef)
5902 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5903 ++NumZCRegMoveInstrsFPR;
5904 } else {
5905 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5906 .addReg(SrcReg, getKillRegState(KillSrc));
5907 if (Subtarget.hasZeroCycleRegMoveFPR32())
5908 ++NumZCRegMoveInstrsFPR;
5909 }
5910 return;
5911 }
5912
5913 if (AArch64::FPR16RegClass.contains(DestReg) &&
5914 AArch64::FPR16RegClass.contains(SrcReg)) {
5915 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5916 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5917 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5918 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5919 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5920 &AArch64::FPR128RegClass);
5921 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5922 &AArch64::FPR128RegClass);
5923 // This instruction is reading and writing Q registers. This may upset
5924 // the register scavenger and machine verifier, so we need to indicate
5925 // that we are reading an undefined value from SrcRegQ, but a proper
5926 // value from SrcReg.
5927 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5928 .addReg(SrcRegQ, RegState::Undef)
5929 .addReg(SrcRegQ, RegState::Undef)
5930 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5931 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5932 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5933 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5934 &AArch64::FPR64RegClass);
5935 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5936 &AArch64::FPR64RegClass);
5937 // This instruction is reading and writing D registers. This may upset
5938 // the register scavenger and machine verifier, so we need to indicate
5939 // that we are reading an undefined value from SrcRegD, but a proper
5940 // value from SrcReg.
5941 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5942 .addReg(SrcRegD, RegState::Undef)
5943 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5944 } else {
5945 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5946 &AArch64::FPR32RegClass);
5947 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5948 &AArch64::FPR32RegClass);
5949 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5950 .addReg(SrcReg, getKillRegState(KillSrc));
5951 }
5952 return;
5953 }
5954
5955 if (AArch64::FPR8RegClass.contains(DestReg) &&
5956 AArch64::FPR8RegClass.contains(SrcReg)) {
5957 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5958 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5959 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5960 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5961 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5962 &AArch64::FPR128RegClass);
5963 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5964 &AArch64::FPR128RegClass);
5965 // This instruction is reading and writing Q registers. This may upset
5966 // the register scavenger and machine verifier, so we need to indicate
5967 // that we are reading an undefined value from SrcRegQ, but a proper
5968 // value from SrcReg.
5969 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5970 .addReg(SrcRegQ, RegState::Undef)
5971 .addReg(SrcRegQ, RegState::Undef)
5972 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5973 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5974 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5975 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5976 &AArch64::FPR64RegClass);
5977 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5978 &AArch64::FPR64RegClass);
5979 // This instruction is reading and writing D registers. This may upset
5980 // the register scavenger and machine verifier, so we need to indicate
5981 // that we are reading an undefined value from SrcRegD, but a proper
5982 // value from SrcReg.
5983 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5984 .addReg(SrcRegD, RegState::Undef)
5985 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5986 } else {
5987 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5988 &AArch64::FPR32RegClass);
5989 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5990 &AArch64::FPR32RegClass);
5991 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5992 .addReg(SrcReg, getKillRegState(KillSrc));
5993 }
5994 return;
5995 }
5996
5997 // Copies between GPR64 and FPR64.
5998 if (AArch64::FPR64RegClass.contains(DestReg) &&
5999 AArch64::GPR64RegClass.contains(SrcReg)) {
6000 if (AArch64::XZR == SrcReg) {
6001 BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
6002 } else {
6003 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
6004 .addReg(SrcReg, getKillRegState(KillSrc));
6005 }
6006 return;
6007 }
6008 if (AArch64::GPR64RegClass.contains(DestReg) &&
6009 AArch64::FPR64RegClass.contains(SrcReg)) {
6010 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
6011 .addReg(SrcReg, getKillRegState(KillSrc));
6012 return;
6013 }
6014 // Copies between GPR32 and FPR32.
6015 if (AArch64::FPR32RegClass.contains(DestReg) &&
6016 AArch64::GPR32RegClass.contains(SrcReg)) {
6017 if (AArch64::WZR == SrcReg) {
6018 BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
6019 } else {
6020 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
6021 .addReg(SrcReg, getKillRegState(KillSrc));
6022 }
6023 return;
6024 }
6025 if (AArch64::GPR32RegClass.contains(DestReg) &&
6026 AArch64::FPR32RegClass.contains(SrcReg)) {
6027 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
6028 .addReg(SrcReg, getKillRegState(KillSrc));
6029 return;
6030 }
6031
6032 if (DestReg == AArch64::NZCV) {
6033 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
6034 BuildMI(MBB, I, DL, get(AArch64::MSR))
6035 .addImm(AArch64SysReg::NZCV)
6036 .addReg(SrcReg, getKillRegState(KillSrc))
6037 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
6038 return;
6039 }
6040
6041 if (SrcReg == AArch64::NZCV) {
6042 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
6043 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
6044 .addImm(AArch64SysReg::NZCV)
6045 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
6046 return;
6047 }
6048
6049#ifndef NDEBUG
6050 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
6051 << "\n";
6052#endif
6053 llvm_unreachable("unimplemented reg-to-reg copy");
6054}
6055
6058 MachineBasicBlock::iterator InsertBefore,
6059 const MCInstrDesc &MCID,
6060 Register SrcReg, bool IsKill,
6061 unsigned SubIdx0, unsigned SubIdx1, int FI,
6062 MachineMemOperand *MMO) {
      // Spill a register pair: store the two sub-registers of SrcReg
      // (selected by SubIdx0/SubIdx1) to frame index FI with the given
      // store-pair opcode (MCID), attaching the memory operand MMO.
6063 Register SrcReg0 = SrcReg;
6064 Register SrcReg1 = SrcReg;
      // For a physical register, resolve the sub-registers up front and
      // clear the sub-indices: sub-register indices on machine operands
      // are only meaningful for virtual registers.
6065 if (SrcReg.isPhysical()) {
6066 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
6067 SubIdx0 = 0;
6068 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
6069 SubIdx1 = 0;
6070 }
      // Build: STP <lo>, <hi>, [FI, #0] with the memory operand attached.
6071 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6072 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
6073 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
6074 .addFrameIndex(FI)
6075 .addImm(0)
6076 .addMemOperand(MMO);
6077 }
6078
6081 Register SrcReg, bool isKill, int FI,
6082 const TargetRegisterClass *RC,
6083 Register VReg,
6084 MachineInstr::MIFlag Flags) const {
      // Spill SrcReg (of register class RC) to frame index FI. The store
      // opcode is selected from the spill size of RC in bytes; sequential
      // register pairs are delegated to storeRegPairToStackSlot, and the
      // ST1* multi-vector forms take no immediate offset operand.
      // NOTE(review): this listing appears to be missing a few lines (e.g.
      // the getMachineMemOperand() call, the PNRReg declaration, and the
      // scalable StackID assignments in the SVE cases) -- verify against
      // the upstream sources.
6085 MachineFunction &MF = *MBB.getParent();
6086 MachineFrameInfo &MFI = MF.getFrameInfo();
6087
6089 MachineMemOperand *MMO =
6091 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6092 unsigned Opc = 0;
6093 bool Offset = true;
6095 unsigned StackID = TargetStackID::Default;
      // Select the store opcode from the register class' spill size (bytes).
6096 switch (RI.getSpillSize(*RC)) {
6097 case 1:
6098 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6099 Opc = AArch64::STRBui;
6100 break;
6101 case 2: {
6102 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6103 Opc = AArch64::STRHui;
6104 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6105 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6106 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6107 "Unexpected register store without SVE store instructions");
6108 Opc = AArch64::STR_PXI;
6110 }
6111 break;
6112 }
6113 case 4:
6114 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6115 Opc = AArch64::STRWui;
        // Virtual registers are constrained to GPR32 (STRWui cannot take
        // WSP); physical WSP is rejected outright.
6116 if (SrcReg.isVirtual())
6117 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
6118 else
6119 assert(SrcReg != AArch64::WSP);
6120 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6121 Opc = AArch64::STRSui;
6122 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6123 Opc = AArch64::STR_PPXI;
6125 }
6126 break;
6127 case 8:
6128 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6129 Opc = AArch64::STRXui;
6130 if (SrcReg.isVirtual())
6131 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6132 else
6133 assert(SrcReg != AArch64::SP);
6134 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6135 Opc = AArch64::STRDui;
6136 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
        // 32-bit sequential pair: emit STPWi via the pair helper.
6138 get(AArch64::STPWi), SrcReg, isKill,
6139 AArch64::sube32, AArch64::subo32, FI, MMO);
6140 return;
6141 }
6142 break;
6143 case 16:
6144 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6145 Opc = AArch64::STRQui;
6146 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6147 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6148 Opc = AArch64::ST1Twov1d;
6149 Offset = false;
6150 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
        // 64-bit sequential pair: emit STPXi via the pair helper.
6152 get(AArch64::STPXi), SrcReg, isKill,
6153 AArch64::sube64, AArch64::subo64, FI, MMO);
6154 return;
6155 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6156 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6157 "Unexpected register store without SVE store instructions");
6158 Opc = AArch64::STR_ZXI;
6160 }
6161 break;
6162 case 24:
6163 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6164 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6165 Opc = AArch64::ST1Threev1d;
6166 Offset = false;
6167 }
6168 break;
6169 case 32:
6170 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6171 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6172 Opc = AArch64::ST1Fourv1d;
6173 Offset = false;
6174 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6175 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6176 Opc = AArch64::ST1Twov2d;
6177 Offset = false;
6178 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6179 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6180 "Unexpected register store without SVE store instructions");
6181 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6183 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6184 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6185 "Unexpected register store without SVE store instructions");
6186 Opc = AArch64::STR_ZZXI;
6188 }
6189 break;
6190 case 48:
6191 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6192 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6193 Opc = AArch64::ST1Threev2d;
6194 Offset = false;
6195 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6196 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6197 "Unexpected register store without SVE store instructions");
6198 Opc = AArch64::STR_ZZZXI;
6200 }
6201 break;
6202 case 64:
6203 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6204 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6205 Opc = AArch64::ST1Fourv2d;
6206 Offset = false;
6207 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6208 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6209 "Unexpected register store without SVE store instructions");
6210 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6212 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6213 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6214 "Unexpected register store without SVE store instructions");
6215 Opc = AArch64::STR_ZZZZXI;
6217 }
6218 break;
6219 }
6220 assert(Opc && "Unknown register class");
6221 MFI.setStackID(FI, StackID);
6222
      // NOTE(review): the BuildMI(...) line creating `MI` appears to have
      // been dropped from this listing; verify against upstream.
6224 .addReg(SrcReg, getKillRegState(isKill))
6225 .addFrameIndex(FI);
6226
      // ST1* opcodes have no immediate-offset operand; only add #0 for the
      // scaled-immediate store forms.
6227 if (Offset)
6228 MI.addImm(0);
6229 if (PNRReg.isValid())
6230 MI.addDef(PNRReg, RegState::Implicit);
6231 MI.addMemOperand(MMO);
6232 }
6233
6236 MachineBasicBlock::iterator InsertBefore,
6237 const MCInstrDesc &MCID,
6238 Register DestReg, unsigned SubIdx0,
6239 unsigned SubIdx1, int FI,
6240 MachineMemOperand *MMO) {
      // Reload a register pair: load frame index FI into the two
      // sub-registers of DestReg (SubIdx0/SubIdx1) using the given
      // load-pair opcode (MCID).
6241 Register DestReg0 = DestReg;
6242 Register DestReg1 = DestReg;
6243 bool IsUndef = true;
      // For a physical destination, resolve the sub-registers and clear the
      // sub-indices (operand sub-indices only apply to virtual registers).
      // The undef flag is only needed on virtual destinations, where each
      // sub-register def must mark the untouched lanes read-undef.
6244 if (DestReg.isPhysical()) {
6245 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6246 SubIdx0 = 0;
6247 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6248 SubIdx1 = 0;
6249 IsUndef = false;
6250 }
      // Build: LDP <lo>, <hi>, [FI, #0] with the memory operand attached.
6251 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6252 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6253 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6254 .addFrameIndex(FI)
6255 .addImm(0)
6256 .addMemOperand(MMO);
6257 }
6258
6261 Register DestReg, int FI,
6262 const TargetRegisterClass *RC,
6263 Register VReg, unsigned SubReg,
6264 MachineInstr::MIFlag Flags) const {
      // Reload DestReg (of register class RC) from frame index FI. The load
      // opcode is selected from the spill size of RC in bytes; sequential
      // register pairs are delegated to loadRegPairFromStackSlot, and the
      // LD1* multi-vector forms take no immediate offset operand.
      // NOTE(review): this listing appears to be missing a few lines (e.g.
      // the getMachineMemOperand() call, the PNRReg declaration, and the
      // scalable StackID assignments in the SVE cases) -- verify against
      // the upstream sources.
6265 MachineFunction &MF = *MBB.getParent();
6266 MachineFrameInfo &MFI = MF.getFrameInfo();
6268 MachineMemOperand *MMO =
6270 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6271
6272 unsigned Opc = 0;
6273 bool Offset = true;
6274 unsigned StackID = TargetStackID::Default;
      // Select the load opcode from the register class' spill size (bytes).
6276 switch (TRI.getSpillSize(*RC)) {
6277 case 1:
6278 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6279 Opc = AArch64::LDRBui;
6280 break;
6281 case 2: {
6282 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6283 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6284 Opc = AArch64::LDRHui;
6285 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6286 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6287 "Unexpected register load without SVE load instructions");
        // Remember the predicate-as-counter destination so the PNR view can
        // be attached as an implicit def below.
6288 if (IsPNR)
6289 PNRReg = DestReg;
6290 Opc = AArch64::LDR_PXI;
6292 }
6293 break;
6294 }
6295 case 4:
6296 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6297 Opc = AArch64::LDRWui;
        // Virtual registers are constrained to GPR32 (LDRWui cannot write
        // WSP); physical WSP is rejected outright.
6298 if (DestReg.isVirtual())
6299 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6300 else
6301 assert(DestReg != AArch64::WSP);
6302 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6303 Opc = AArch64::LDRSui;
6304 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6305 Opc = AArch64::LDR_PPXI;
6307 }
6308 break;
6309 case 8:
6310 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6311 Opc = AArch64::LDRXui;
6312 if (DestReg.isVirtual())
6313 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6314 else
6315 assert(DestReg != AArch64::SP);
6316 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6317 Opc = AArch64::LDRDui;
6318 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
        // 32-bit sequential pair: emit LDPWi via the pair helper.
6320 get(AArch64::LDPWi), DestReg, AArch64::sube32,
6321 AArch64::subo32, FI, MMO);
6322 return;
6323 }
6324 break;
6325 case 16:
6326 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6327 Opc = AArch64::LDRQui;
6328 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6329 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6330 Opc = AArch64::LD1Twov1d;
6331 Offset = false;
6332 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
        // 64-bit sequential pair: emit LDPXi via the pair helper.
6334 get(AArch64::LDPXi), DestReg, AArch64::sube64,
6335 AArch64::subo64, FI, MMO);
6336 return;
6337 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6338 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6339 "Unexpected register load without SVE load instructions");
6340 Opc = AArch64::LDR_ZXI;
6342 }
6343 break;
6344 case 24:
6345 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6346 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6347 Opc = AArch64::LD1Threev1d;
6348 Offset = false;
6349 }
6350 break;
6351 case 32:
6352 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6353 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6354 Opc = AArch64::LD1Fourv1d;
6355 Offset = false;
6356 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6357 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6358 Opc = AArch64::LD1Twov2d;
6359 Offset = false;
6360 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6361 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6362 "Unexpected register load without SVE load instructions");
6363 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6365 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6366 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6367 "Unexpected register load without SVE load instructions");
6368 Opc = AArch64::LDR_ZZXI;
6370 }
6371 break;
6372 case 48:
6373 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6374 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6375 Opc = AArch64::LD1Threev2d;
6376 Offset = false;
6377 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6378 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6379 "Unexpected register load without SVE load instructions");
6380 Opc = AArch64::LDR_ZZZXI;
6382 }
6383 break;
6384 case 64:
6385 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6386 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6387 Opc = AArch64::LD1Fourv2d;
6388 Offset = false;
6389 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6390 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6391 "Unexpected register load without SVE load instructions");
6392 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6394 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6395 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6396 "Unexpected register load without SVE load instructions");
6397 Opc = AArch64::LDR_ZZZZXI;
6399 }
6400 break;
6401 }
6402
6403 assert(Opc && "Unknown register class");
6404 MFI.setStackID(FI, StackID);
6405
      // NOTE(review): the BuildMI(...) line creating `MI` appears to have
      // been dropped from this listing; verify against upstream.
6407 .addReg(DestReg, getDefRegState(true))
6408 .addFrameIndex(FI);
      // LD1* opcodes have no immediate-offset operand; only add #0 for the
      // scaled-immediate load forms.
6409 if (Offset)
6410 MI.addImm(0);
      // Physical PNR destinations get the predicate-as-counter view as an
      // implicit def (virtual registers are handled through PNRReg above).
6411 if (PNRReg.isValid() && !PNRReg.isVirtual())
6412 MI.addDef(PNRReg, RegState::Implicit);
6413 MI.addMemOperand(MMO);
6414 }
6415
6417 const MachineInstr &UseMI,
6418 const TargetRegisterInfo *TRI) {
      // Scan the instructions strictly between DefMI and UseMI (skipping
      // debug instructions) and report whether any of them reads or writes
      // the NZCV flags.
6419 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6420 UseMI.getIterator()),
6421 [TRI](const MachineInstr &I) {
6422 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6423 I.readsRegister(AArch64::NZCV, TRI);
6424 });
6425 }
6426
6427void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6428 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6429 // The smallest scalable element supported by scaled SVE addressing
6430 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6431 // byte offset must always be a multiple of 2.
6432 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6433
6434 // VGSized offsets are divided by '2', because the VG register is the
6435 // the number of 64bit granules as opposed to 128bit vector chunks,
6436 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6437 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6438 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6439 ByteSized = Offset.getFixed();
6440 VGSized = Offset.getScalable() / 2;
6441}
6442
6443/// Returns the offset in parts to which this frame offset can be
6444/// decomposed for the purpose of describing a frame offset.
6445/// For non-scalable offsets this is simply its byte size.
6446void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6447 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6448 int64_t &NumDataVectors) {
6449 // The smallest scalable element supported by scaled SVE addressing
6450 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6451 // byte offset must always be a multiple of 2.
6452 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6453
6454 NumBytes = Offset.getFixed();
6455 NumDataVectors = 0;
6456 NumPredicateVectors = Offset.getScalable() / 2;
6457 // This method is used to get the offsets to adjust the frame offset.
6458 // If the function requires ADDPL to be used and needs more than two ADDPL
6459 // instructions, part of the offset is folded into NumDataVectors so that it
6460 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6461 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6462 NumPredicateVectors > 62) {
6463 NumDataVectors = NumPredicateVectors / 8;
6464 NumPredicateVectors -= NumDataVectors * 8;
6465 }
6466}
6467
6468 // Convenience function to create a DWARF expression for: Constant `Operation`.
6469 // This helper emits compact sequences for common cases. For example, for`-15
6470 // DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
      // Negative addend in literal range: encode as "lit(-Constant) minus"
      // to avoid a multi-byte signed constant.
6473 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6474 // -Constant (1 to 31)
6475 Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6476 Operation = dwarf::DW_OP_minus;
6477 } else if (Constant >= 0 && Constant <= 31) {
6478 // Literal value 0 to 31
6479 Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6480 } else {
6481 // Signed constant
6482 Expr.push_back(dwarf::DW_OP_consts);
      // NOTE(review): the SLEB128 append of Constant after DW_OP_consts
      // appears to be missing from this listing; verify against upstream.
6484 }
      // Finally push the operation itself (push_back returns void; this is a
      // void return).
6485 return Expr.push_back(Operation);
6486 }
6487
6488 // Convenience function to create a DWARF expression for a register.
6489 static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
      // DW_OP_bregx pushes (register value + offset); the register number is
      // ULEB128-encoded and the offset is SLEB128-encoded (0 here).
6490 Expr.push_back((char)dwarf::DW_OP_bregx);
      // NOTE(review): the ULEB128 append of RegNum appears to be missing
      // from this listing; verify against upstream.
6492 Expr.push_back(0);
6493 }
6494
6495 // Convenience function to create a DWARF expression for loading a register from
6496 // a CFA offset.
6498 int64_t OffsetFromDefCFA) {
6499 // This assumes the top of the DWARF stack contains the CFA.
      // Duplicate the CFA so the original stays on the stack for the caller.
6500 Expr.push_back(dwarf::DW_OP_dup);
6501 // Add the offset to the register.
6502 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6503 // Dereference the address (loads a 64 bit value)..
6504 Expr.push_back(dwarf::DW_OP_deref);
6505 }
6506
6507// Convenience function to create a comment for
6508// (+/-) NumBytes (* RegScale)?
6509static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6510 StringRef RegScale = {}) {
6511 if (NumBytes) {
6512 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6513 if (!RegScale.empty())
6514 Comment << ' ' << RegScale;
6515 }
6516}
6517
6518 // Creates an MCCFIInstruction:
6519 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
6521 unsigned Reg,
6522 const StackOffset &Offset) {
      // Build the expression Reg + NumBytes [+ VG * NumVGScaledBytes] and
      // wrap it into a DW_CFA_def_cfa_expression escape, with a
      // human-readable comment mirroring the expression.
6523 int64_t NumBytes, NumVGScaledBytes;
6524 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6525 NumVGScaledBytes);
6526 std::string CommentBuffer;
6527 llvm::raw_string_ostream Comment(CommentBuffer);
6528
      // Use the friendly names for SP/FP in the comment; other registers get
      // their generic printed form.
6529 if (Reg == AArch64::SP)
6530 Comment << "sp";
6531 else if (Reg == AArch64::FP)
6532 Comment << "fp";
6533 else
6534 Comment << printReg(Reg, &TRI);
6535
6536 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6537 SmallString<64> Expr;
6538 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
      // DW_OP_breg0..DW_OP_breg31 are single-opcode encodings, hence the
      // 0..31 bound on the DWARF register number.
6539 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6540 // Reg + NumBytes
6541 Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6542 appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6543 appendOffsetComment(NumBytes, Comment);
6544 if (NumVGScaledBytes) {
6545 // + VG * NumVGScaledBytes
6546 appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6547 appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6548 appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6549 Expr.push_back(dwarf::DW_OP_plus);
6550 }
6551
6552 // Wrap this into DW_CFA_def_cfa.
6553 SmallString<64> DefCfaExpr;
6554 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6555 appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6556 DefCfaExpr.append(Expr.str());
6557 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6558 Comment.str());
6559 }
6560
6562 unsigned FrameReg, unsigned Reg,
6563 const StackOffset &Offset,
6564 bool LastAdjustmentWasScalable) {
      // Pick the cheapest CFI encoding for a CFA definition:
      // - any scalable component needs a full DWARF expression;
      // - same register and a purely fixed adjustment only needs the offset;
      // - otherwise redefine both register and offset.
6565 if (Offset.getScalable())
6566 return createDefCFAExpression(TRI, Reg, Offset);
6567
6568 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6569 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6570
6571 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6572 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6573 }
6574
6577 const StackOffset &OffsetFromDefCFA,
6578 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
      // Describe where Reg is saved relative to the CFA. Fixed-only offsets
      // use the compact DW_CFA_offset form; offsets with a scalable part are
      // expressed as CFA + VG * NumVGScaledBytes + NumBytes, where VG is
      // either read live or loaded from its own CFA slot.
6579 int64_t NumBytes, NumVGScaledBytes;
6580 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6581 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6582
6583 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6584
6585 // Non-scalable offsets can use DW_CFA_offset directly.
6586 if (!NumVGScaledBytes)
6587 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6588
6589 std::string CommentBuffer;
6590 llvm::raw_string_ostream Comment(CommentBuffer);
6591 Comment << printReg(Reg, &TRI) << " @ cfa";
6592
6593 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6594 assert(NumVGScaledBytes && "Expected scalable offset");
6595 SmallString<64> OffsetExpr;
6596 // + VG * NumVGScaledBytes
6597 StringRef VGRegScale;
      // When the incoming VG was itself spilled (streaming-mode changes), it
      // must be reloaded from its CFA slot rather than read from the live VG
      // register.
6598 if (IncomingVGOffsetFromDefCFA) {
6599 appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6600 VGRegScale = "* IncomingVG";
6601 } else {
6602 appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6603 VGRegScale = "* VG";
6604 }
6605 appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6606 appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6607 OffsetExpr.push_back(dwarf::DW_OP_plus);
6608 if (NumBytes) {
6609 // + NumBytes
6610 appendOffsetComment(NumBytes, Comment);
6611 appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6612 }
6613
6614 // Wrap this into DW_CFA_expression
6615 SmallString<64> CfaExpr;
6616 CfaExpr.push_back(dwarf::DW_CFA_expression);
6617 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6618 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6619 CfaExpr.append(OffsetExpr.str());
6620
6621 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6622 Comment.str());
6623 }
6624
6625 // Helper function to emit a frame offset adjustment from a given
6626 // pointer (SrcReg), stored into DestReg. This function is explicit
6627 // in that it requires the opcode.
6630 const DebugLoc &DL, unsigned DestReg,
6631 unsigned SrcReg, int64_t Offset, unsigned Opc,
6632 const TargetInstrInfo *TII,
6633 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6634 bool *HasWinCFI, bool EmitCFAOffset,
6635 StackOffset CFAOffset, unsigned FrameReg) {
      // Emits a chain of one or more add/sub instructions that materialize
      // Offset on top of SrcReg into DestReg, splitting the value into
      // immediates the chosen opcode can encode, and emits the requested
      // CFI / Windows SEH directives alongside.
6636 int Sign = 1;
6637 unsigned MaxEncoding, ShiftSize;
      // Immediate encoding limits per opcode family: 12-bit (optionally
      // LSL #12-shifted) immediates for ADD/SUB, and the small signed
      // multiplier range for the scalable ADDVL/ADDPL forms.
6638 switch (Opc) {
6639 case AArch64::ADDXri:
6640 case AArch64::ADDSXri:
6641 case AArch64::SUBXri:
6642 case AArch64::SUBSXri:
6643 MaxEncoding = 0xfff;
6644 ShiftSize = 12;
6645 break;
6646 case AArch64::ADDVL_XXI:
6647 case AArch64::ADDPL_XXI:
6648 case AArch64::ADDSVL_XXI:
6649 case AArch64::ADDSPL_XXI:
6650 MaxEncoding = 31;
6651 ShiftSize = 0;
        // Scalable adds encode a signed multiplier; work with a positive
        // magnitude and remember the sign (negative range reaches -32).
6652 if (Offset < 0) {
6653 MaxEncoding = 32;
6654 Sign = -1;
6655 Offset = -Offset;
6656 }
6657 break;
6658 default:
6659 llvm_unreachable("Unsupported opcode");
6660 }
6661
6662 // `Offset` can be in bytes or in "scalable bytes".
6663 int VScale = 1;
6664 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6665 VScale = 16;
6666 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6667 VScale = 2;
6668
6669 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6670 // scratch register. If DestReg is a virtual register, use it as the
6671 // scratch register; otherwise, create a new virtual register (to be
6672 // replaced by the scavenger at the end of PEI). That case can be optimized
6673 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6674 // register can be loaded with offset%8 and the add/sub can use an extending
6675 // instruction with LSL#3.
6676 // Currently the function handles any offsets but generates a poor sequence
6677 // of code.
6678 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6679
6680 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
      // An XZR destination (e.g. a compare) cannot serve as the intermediate
      // accumulator, so chain through a fresh virtual register instead.
6681 Register TmpReg = DestReg;
6682 if (TmpReg == AArch64::XZR)
6683 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6684 &AArch64::GPR64RegClass);
      // Peel off the largest encodable chunk per iteration until the whole
      // offset has been materialized.
6685 do {
6686 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6687 unsigned LocalShiftSize = 0;
6688 if (ThisVal > MaxEncoding) {
6689 ThisVal = ThisVal >> ShiftSize;
6690 LocalShiftSize = ShiftSize;
6691 }
6692 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6693 "Encoding cannot handle value that big");
6694
6695 Offset -= ThisVal << LocalShiftSize;
      // The final instruction of the chain writes the real destination.
6696 if (Offset == 0)
6697 TmpReg = DestReg;
6698 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6699 .addReg(SrcReg)
6700 .addImm(Sign * (int)ThisVal);
6701 if (ShiftSize)
6702 MBI = MBI.addImm(
      // NOTE(review): the shifter-immediate argument (getShifterImm with
      // LocalShiftSize) appears to be missing from this listing; verify
      // against upstream.
6704 MBI = MBI.setMIFlag(Flag);
6705
      // Track how this step moves the CFA so an accurate def_cfa can be
      // emitted after each instruction that writes the real destination.
6706 auto Change =
6707 VScale == 1
6708 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6709 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6710 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6711 CFAOffset += Change;
6712 else
6713 CFAOffset -= Change;
6714 if (EmitCFAOffset && DestReg == TmpReg) {
6715 MachineFunction &MF = *MBB.getParent();
6716 const TargetSubtargetInfo &STI = MF.getSubtarget();
6717 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6718
6719 unsigned CFIIndex = MF.addFrameInst(
6720 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6721 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6722 .addCFIIndex(CFIIndex)
6723 .setMIFlags(Flag);
6724 }
6725
      // Windows SEH: mirror the adjustment with the matching unwind opcode
      // (scalable alloc, FP set/add, or plain stack alloc).
6726 if (NeedsWinCFI) {
6727 int Imm = (int)(ThisVal << LocalShiftSize);
6728 if (VScale != 1 && DestReg == AArch64::SP) {
6729 if (HasWinCFI)
6730 *HasWinCFI = true;
6731 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6732 .addImm(ThisVal)
6733 .setMIFlag(Flag);
6734 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6735 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6736 assert(VScale == 1 && "Expected non-scalable operation");
6737 if (HasWinCFI)
6738 *HasWinCFI = true;
6739 if (Imm == 0)
6740 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6741 else
6742 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6743 .addImm(Imm)
6744 .setMIFlag(Flag);
6745 assert(Offset == 0 && "Expected remaining offset to be zero to "
6746 "emit a single SEH directive");
6747 } else if (DestReg == AArch64::SP) {
6748 assert(VScale == 1 && "Expected non-scalable operation");
6749 if (HasWinCFI)
6750 *HasWinCFI = true;
6751 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6752 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6753 .addImm(Imm)
6754 .setMIFlag(Flag);
6755 }
6756 }
6757
      // Subsequent chunks build on the partial sum just written.
6758 SrcReg = TmpReg;
6759 } while (Offset);
6760 }
6761
6764 unsigned DestReg, unsigned SrcReg,
6766 MachineInstr::MIFlag Flag, bool SetNZCV,
6767 bool NeedsWinCFI, bool *HasWinCFI,
6768 bool EmitCFAOffset, StackOffset CFAOffset,
6769 unsigned FrameReg) {
      // Materialize DestReg = SrcReg + Offset, decomposing the offset into a
      // fixed byte part, whole SVE data vectors (ADDVL) and predicate-sized
      // units (ADDPL), delegating each part to emitFrameOffsetAdj.
6770 // If a function is marked as arm_locally_streaming, then the runtime value of
6771 // vscale in the prologue/epilogue is different the runtime value of vscale
6772 // in the function's body. To avoid having to consider multiple vscales,
6773 // we can use `addsvl` to allocate any scalable stack-slots, which under
6774 // most circumstances will be only locals, not callee-save slots.
6775 const Function &F = MBB.getParent()->getFunction();
6776 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6777
6778 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6779 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6780 Offset, Bytes, NumPredicateVectors, NumDataVectors);
6781
6782 // Insert ADDSXri for scalable offset at the end.
      // The scalable add forms do not set flags, so when flags are requested
      // alongside a scalable offset, defer NZCV to a trailing ADDS #0.
6783 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6784 if (NeedsFinalDefNZCV)
6785 SetNZCV = false;
6786
6787 // First emit non-scalable frame offsets, or a simple 'mov'.
      // The `!Offset && SrcReg != DestReg` case degenerates to ADD #0, i.e.
      // a register move.
6788 if (Bytes || (!Offset && SrcReg != DestReg)) {
6789 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6790 "SP increment/decrement not 8-byte aligned");
6791 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6792 if (Bytes < 0) {
6793 Bytes = -Bytes;
6794 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6795 }
6796 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
6797 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6798 FrameReg);
      // Keep the running CFA in sync so the scalable steps below describe
      // the frame correctly.
6799 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6800 ? StackOffset::getFixed(-Bytes)
6801 : StackOffset::getFixed(Bytes);
6802 SrcReg = DestReg;
6803 FrameReg = DestReg;
6804 }
6805
6806 assert(!(NeedsWinCFI && NumPredicateVectors) &&
6807 "WinCFI can't allocate fractions of an SVE data vector");
6808
6809 if (NumDataVectors) {
6810 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
6811 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6812 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6813 FrameReg);
6814 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
6815 SrcReg = DestReg;
6816 }
6817
6818 if (NumPredicateVectors) {
6819 assert(DestReg != AArch64::SP && "Unaligned access to SP");
6820 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
6821 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6822 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6823 FrameReg);
6824 }
6825
      // Deferred flag-setting: ADDS DestReg, DestReg, #0 defines NZCV from
      // the final value without changing it.
6826 if (NeedsFinalDefNZCV)
6827 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
6828 .addReg(DestReg)
6829 .addImm(0)
6830 .addImm(0);
6831 }
6832
6835 MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr *&CopyMI,
6836 LiveIntervals *LIS, VirtRegMap *VRM) const {
      // Try to fold a spill/fill of a COPY directly into a store/load of the
      // stack slot, avoiding the intermediate register. Returns the folded
      // instruction, or nullptr when folding is not possible.
6837 // This is a bit of a hack. Consider this instruction:
6838 //
6839 // %0 = COPY %sp; GPR64all:%0
6840 //
6841 // We explicitly chose GPR64all for the virtual register so such a copy might
6842 // be eliminated by RegisterCoalescer. However, that may not be possible, and
6843 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6844 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6845 //
6846 // To prevent that, we are going to constrain the %0 register class here.
6847 if (MI.isFullCopy()) {
6848 Register DstReg = MI.getOperand(0).getReg();
6849 Register SrcReg = MI.getOperand(1).getReg();
6850 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6851 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
6852 return nullptr;
6853 }
6854 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6855 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6856 return nullptr;
6857 }
6858 // Nothing can folded with copy from/to NZCV.
6859 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6860 return nullptr;
6861 }
6862
6863 // Handle the case where a copy is being spilled or filled but the source
6864 // and destination register class don't match. For example:
6865 //
6866 // %0 = COPY %xzr; GPR64common:%0
6867 //
6868 // In this case we can still safely fold away the COPY and generate the
6869 // following spill code:
6870 //
6871 // STRXui %xzr, %stack.0
6872 //
6873 // This also eliminates spilled cross register class COPYs (e.g. between x and
6874 // d regs) of the same size. For example:
6875 //
6876 // %0 = COPY %1; GPR64:%0, FPR64:%1
6877 //
6878 // will be filled as
6879 //
6880 // LDRDui %0, fi<#0>
6881 //
6882 // instead of
6883 //
6884 // LDRXui %Temp, fi<#0>
6885 // %0 = FMOV %Temp
6886 //
6887 if (MI.isCopy() && Ops.size() == 1 &&
6888 // Make sure we're only folding the explicit COPY defs/uses.
6889 (Ops[0] == 0 || Ops[0] == 1)) {
      // Operand 0 is the def (spilling the copied value); operand 1 is the
      // use (filling into the copy's destination).
6890 bool IsSpill = Ops[0] == 0;
6891 bool IsFill = !IsSpill;
      // NOTE(review): a line appears to be missing from this listing here
      // (likely the local TRI binding used below); verify against upstream.
6893 const MachineRegisterInfo &MRI = MF.getRegInfo();
6894 MachineBasicBlock &MBB = *MI.getParent();
6895 const MachineOperand &DstMO = MI.getOperand(0);
6896 const MachineOperand &SrcMO = MI.getOperand(1);
6897 Register DstReg = DstMO.getReg();
6898 Register SrcReg = SrcMO.getReg();
6899 // This is slightly expensive to compute for physical regs since
6900 // getMinimalPhysRegClass is slow.
6901 auto getRegClass = [&](unsigned Reg) {
6902 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6903 : TRI.getMinimalPhysRegClass(Reg);
6904 };
6905
      // Full-width copy (no subregs on either side): spill the source or
      // fill the destination directly with its own register class.
6906 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6907 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6908 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6909 "Mismatched register size in non subreg COPY");
6910 if (IsSpill)
6911 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6912 getRegClass(SrcReg), Register());
6913 else
6914 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6915 getRegClass(DstReg), Register());
      // The helper inserted exactly one instruction before InsertPt.
6916 return &*--InsertPt;
6917 }
6918
6919 // Handle cases like spilling def of:
6920 //
6921 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6922 //
6923 // where the physical register source can be widened and stored to the full
6924 // virtual reg destination stack slot, in this case producing:
6925 //
6926 // STRXui %xzr, %stack.0
6927 //
6928 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6929 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6930 assert(SrcMO.getSubReg() == 0 &&
6931 "Unexpected subreg on physical register");
      // WZR widens to XZR: the upper 32 bits of zero are still zero.
6932 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6933 FrameIndex, &AArch64::GPR64RegClass, Register());
6934 return &*--InsertPt;
6935 }
6936
6937 // Handle cases like filling use of:
6938 //
6939 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6940 //
6941 // where we can load the full virtual reg source stack slot, into the subreg
6942 // destination, in this case producing:
6943 //
6944 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6945 //
6946 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
      // Map the destination sub-index to the narrow class whose load writes
      // exactly that lane.
6947 const TargetRegisterClass *FillRC = nullptr;
6948 switch (DstMO.getSubReg()) {
6949 default:
6950 break;
6951 case AArch64::sub_32:
6952 if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
6953 FillRC = &AArch64::GPR32RegClass;
6954 break;
6955 case AArch64::ssub:
6956 FillRC = &AArch64::FPR32RegClass;
6957 break;
6958 case AArch64::dsub:
6959 FillRC = &AArch64::FPR64RegClass;
6960 break;
6961 }
6962
6963 if (FillRC) {
6964 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6965 TRI.getRegSizeInBits(*FillRC) &&
6966 "Mismatched regclass size on folded subreg COPY");
6967 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
6968 Register());
      // Retarget the load's def at the original sub-register, preserving
      // the read-undef semantics of the COPY it replaces.
6969 MachineInstr &LoadMI = *--InsertPt;
6970 MachineOperand &LoadDst = LoadMI.getOperand(0);
6971 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6972 LoadDst.setSubReg(DstMO.getSubReg());
6973 LoadDst.setIsUndef();
6974 return &LoadMI;
6975 }
6976 }
6977 }
6978
6979 // Cannot fold.
6980 return nullptr;
6981 }
6982
6984 StackOffset &SOffset,
6985 bool *OutUseUnscaledOp,
6986 unsigned *OutUnscaledOp,
6987 int64_t *EmittableOffset) {
6988 // Set output values in case of early exit.
6989 if (EmittableOffset)
6990 *EmittableOffset = 0;
6991 if (OutUseUnscaledOp)
6992 *OutUseUnscaledOp = false;
6993 if (OutUnscaledOp)
6994 *OutUnscaledOp = 0;
6995
6996 // Exit early for structured vector spills/fills as they can't take an
6997 // immediate offset.
6998 switch (MI.getOpcode()) {
6999 default:
7000 break;
7001 case AArch64::LD1Rv1d:
7002 case AArch64::LD1Rv2s:
7003 case AArch64::LD1Rv2d:
7004 case AArch64::LD1Rv4h:
7005 case AArch64::LD1Rv4s:
7006 case AArch64::LD1Rv8b:
7007 case AArch64::LD1Rv8h:
7008 case AArch64::LD1Rv16b:
7009 case AArch64::LD1Twov2d:
7010 case AArch64::LD1Threev2d:
7011 case AArch64::LD1Fourv2d:
7012 case AArch64::LD1Twov1d:
7013 case AArch64::LD1Threev1d:
7014 case AArch64::LD1Fourv1d:
7015 case AArch64::ST1Twov2d:
7016 case AArch64::ST1Threev2d:
7017 case AArch64::ST1Fourv2d:
7018 case AArch64::ST1Twov1d:
7019 case AArch64::ST1Threev1d:
7020 case AArch64::ST1Fourv1d:
7021 case AArch64::ST1i8:
7022 case AArch64::ST1i16:
7023 case AArch64::ST1i32:
7024 case AArch64::ST1i64:
7025 case AArch64::IRG:
7026 case AArch64::IRGstack:
7027 case AArch64::STGloop:
7028 case AArch64::STZGloop:
7030 }
7031
7032 // Get the min/max offset and the scale.
7033 TypeSize ScaleValue(0U, false), Width(0U, false);
7034 int64_t MinOff, MaxOff;
7035 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
7036 MaxOff))
7037 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7038
7039 // Construct the complete offset.
7040 bool IsMulVL = ScaleValue.isScalable();
7041 unsigned Scale = ScaleValue.getKnownMinValue();
7042 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
7043
7044 const MachineOperand &ImmOpnd =
7045 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
7046 Offset += ImmOpnd.getImm() * Scale;
7047
7048 // If the offset doesn't match the scale, we rewrite the instruction to
7049 // use the unscaled instruction instead. Likewise, if we have a negative
7050 // offset and there is an unscaled op to use.
7051 std::optional<unsigned> UnscaledOp =
7053 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
7054 if (useUnscaledOp &&
7055 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
7056 MaxOff))
7057 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7058
7059 Scale = ScaleValue.getKnownMinValue();
7060 assert(IsMulVL == ScaleValue.isScalable() &&
7061 "Unscaled opcode has different value for scalable");
7062
7063 int64_t Remainder = Offset % Scale;
7064 assert(!(Remainder && useUnscaledOp) &&
7065 "Cannot have remainder when using unscaled op");
7066
7067 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
7068 int64_t NewOffset = Offset / Scale;
7069 if (MinOff <= NewOffset && NewOffset <= MaxOff)
7070 Offset = Remainder;
7071 else {
7072 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
7073 Offset = Offset - (NewOffset * Scale);
7074 }
7075
7076 if (EmittableOffset)
7077 *EmittableOffset = NewOffset;
7078 if (OutUseUnscaledOp)
7079 *OutUseUnscaledOp = useUnscaledOp;
7080 if (OutUnscaledOp && UnscaledOp)
7081 *OutUnscaledOp = *UnscaledOp;
7082
7083 if (IsMulVL)
7084 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
7085 else
7086 SOffset = StackOffset::get(Offset, SOffset.getScalable());
7088 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7089}
7090
7092 unsigned FrameReg, StackOffset &Offset,
7093 const AArch64InstrInfo *TII) {
7094 unsigned Opcode = MI.getOpcode();
7095 unsigned ImmIdx = FrameRegIdx + 1;
7096
7097 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7098 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
7099 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
7100 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
7101 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
7102 MI.eraseFromParent();
7103 Offset = StackOffset();
7104 return true;
7105 }
7106
7107 int64_t NewOffset;
7108 unsigned UnscaledOp;
7109 bool UseUnscaledOp;
7110 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
7111 &UnscaledOp, &NewOffset);
7114 // Replace the FrameIndex with FrameReg.
7115 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
7116 if (UseUnscaledOp)
7117 MI.setDesc(TII->get(UnscaledOp));
7118
7119 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
7120 return !Offset;
7121 }
7122
7123 return false;
7124}
7125
7131
7132MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7133
7134// AArch64 supports MachineCombiner.
7135bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7136
7137// True when Opc sets flag
7138static bool isCombineInstrSettingFlag(unsigned Opc) {
7139 switch (Opc) {
7140 case AArch64::ADDSWrr:
7141 case AArch64::ADDSWri:
7142 case AArch64::ADDSXrr:
7143 case AArch64::ADDSXri:
7144 case AArch64::SUBSWrr:
7145 case AArch64::SUBSXrr:
7146 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7147 case AArch64::SUBSWri:
7148 case AArch64::SUBSXri:
7149 return true;
7150 default:
7151 break;
7152 }
7153 return false;
7154}
7155
7156// 32b Opcodes that can be combined with a MUL
7157static bool isCombineInstrCandidate32(unsigned Opc) {
7158 switch (Opc) {
7159 case AArch64::ADDWrr:
7160 case AArch64::ADDWri:
7161 case AArch64::SUBWrr:
7162 case AArch64::ADDSWrr:
7163 case AArch64::ADDSWri:
7164 case AArch64::SUBSWrr:
7165 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7166 case AArch64::SUBWri:
7167 case AArch64::SUBSWri:
7168 return true;
7169 default:
7170 break;
7171 }
7172 return false;
7173}
7174
7175// 64b Opcodes that can be combined with a MUL
7176static bool isCombineInstrCandidate64(unsigned Opc) {
7177 switch (Opc) {
7178 case AArch64::ADDXrr:
7179 case AArch64::ADDXri:
7180 case AArch64::SUBXrr:
7181 case AArch64::ADDSXrr:
7182 case AArch64::ADDSXri:
7183 case AArch64::SUBSXrr:
7184 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7185 case AArch64::SUBXri:
7186 case AArch64::SUBSXri:
7187 case AArch64::ADDv8i8:
7188 case AArch64::ADDv16i8:
7189 case AArch64::ADDv4i16:
7190 case AArch64::ADDv8i16:
7191 case AArch64::ADDv2i32:
7192 case AArch64::ADDv4i32:
7193 case AArch64::SUBv8i8:
7194 case AArch64::SUBv16i8:
7195 case AArch64::SUBv4i16:
7196 case AArch64::SUBv8i16:
7197 case AArch64::SUBv2i32:
7198 case AArch64::SUBv4i32:
7199 return true;
7200 default:
7201 break;
7202 }
7203 return false;
7204}
7205
7206// FP Opcodes that can be combined with a FMUL.
7207static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7208  switch (Inst.getOpcode()) {
7209  default:
7210    break;
7211  case AArch64::FADDHrr:
7212  case AArch64::FADDSrr:
7213  case AArch64::FADDDrr:
7214  case AArch64::FADDv4f16:
7215  case AArch64::FADDv8f16:
7216  case AArch64::FADDv2f32:
7217  case AArch64::FADDv2f64:
7218  case AArch64::FADDv4f32:
7219  case AArch64::FSUBHrr:
7220  case AArch64::FSUBSrr:
7221  case AArch64::FSUBDrr:
7222  case AArch64::FSUBv4f16:
7223  case AArch64::FSUBv8f16:
7224  case AArch64::FSUBv2f32:
7225  case AArch64::FSUBv2f64:
7226  case AArch64::FSUBv4f32:
// NOTE(review): extraction line 7227 is missing here — presumably the
// declaration of `Options` (the target options referenced below) and/or the
// opening brace of this case block; confirm against the upstream source.
7228    // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7229    // the target options or if FADD/FSUB has the contract fast-math flag.
7230    return Options.AllowFPOpFusion == FPOpFusion::Fast ||
// NOTE(review): the second operand of this `||` (extraction line 7231) is
// missing — presumably a fast-math-flag test on Inst; confirm upstream.
7232  }
7233  return false;
7234}
7235
7236// Opcodes that can be combined with a MUL
7240
7241//
7242// Utility routine that checks if \param MO is defined by an
7243// \param CombineOpc instruction in the basic block \param MBB
// Returns true only when the defining instruction lives in the same block,
// has a single non-debug use, optionally has ZeroReg as its third source
// (the MADD/MSUB accumulator), and — for flag-setting opcodes — has a dead
// NZCV def.
// NOTE(review): the opening line of this definition (the function name and
// first parameters, extraction line 7244) is missing from this listing.
7245                       unsigned CombineOpc, unsigned ZeroReg = 0,
7246                       bool CheckZeroReg = false) {
7247  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7248  MachineInstr *MI = nullptr;
7249
7250  if (MO.isReg() && MO.getReg().isVirtual())
7251    MI = MRI.getUniqueVRegDef(MO.getReg());
7252  // And it needs to be in the trace (otherwise, it won't have a depth).
7253  if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7254    return false;
7255  // Must only used by the user we combine with.
7256  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7257    return false;
7258
7259  if (CheckZeroReg) {
7260    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7261           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7262           MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
7263    // The third input reg must be zero.
7264    if (MI->getOperand(3).getReg() != ZeroReg)
7265      return false;
7266  }
7267
7268  // Flag-setting producers are only combinable when their NZCV def is dead.
7268  if (isCombineInstrSettingFlag(CombineOpc) &&
7269      MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7270    return false;
7271
7272  return true;
7273}
7274
7275//
7276// Is \param MO defined by an integer multiply and can be combined?
// Thin wrapper over canCombine with the zero-register (accumulator) check
// enabled, as required for MADD/MSUB formation.
// NOTE(review): the opening line of this definition (extraction line 7277)
// is missing from this listing.
7278                              unsigned MulOpc, unsigned ZeroReg) {
7279  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7280}
7281
7282//
7283// Is \param MO defined by a floating-point multiply and can be combined?
// Wrapper over canCombine with no zero-register check (FP fused multiply-add
// has no zero accumulator requirement).
// NOTE(review): the opening line of this definition (extraction line 7284)
// is missing from this listing.
7285                               unsigned MulOpc) {
7286  return canCombine(MBB, MO, MulOpc);
7287}
7288
7289// TODO: There are many more machine instruction opcodes to match:
7290//       1. Other data types (integer, vectors)
7291//       2. Other math / logic operations (xor, or)
7292//       3. Other forms of the same operation (intrinsics and other variants)
// Tells the machine combiner which opcodes it may freely reassociate.
// Invert (reassociation via the inverse operation) is never supported here.
7293bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7294                                                   bool Invert) const {
7295  if (Invert)
7296    return false;
7297  switch (Inst.getOpcode()) {
7298  // == Floating-point types ==
7299  // -- Floating-point instructions --
7300  case AArch64::FADDHrr:
7301  case AArch64::FADDSrr:
7302  case AArch64::FADDDrr:
7303  case AArch64::FMULHrr:
7304  case AArch64::FMULSrr:
7305  case AArch64::FMULDrr:
7306  case AArch64::FMULX16:
7307  case AArch64::FMULX32:
7308  case AArch64::FMULX64:
7309  // -- Advanced SIMD instructions --
7310  case AArch64::FADDv4f16:
7311  case AArch64::FADDv8f16:
7312  case AArch64::FADDv2f32:
7313  case AArch64::FADDv4f32:
7314  case AArch64::FADDv2f64:
7315  case AArch64::FMULv4f16:
7316  case AArch64::FMULv8f16:
7317  case AArch64::FMULv2f32:
7318  case AArch64::FMULv4f32:
7319  case AArch64::FMULv2f64:
7320  case AArch64::FMULXv4f16:
7321  case AArch64::FMULXv8f16:
7322  case AArch64::FMULXv2f32:
7323  case AArch64::FMULXv4f32:
7324  case AArch64::FMULXv2f64:
7325  // -- SVE instructions --
7326  // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7327  // in the SVE instruction set (though there are predicated ones).
7328  case AArch64::FADD_ZZZ_H:
7329  case AArch64::FADD_ZZZ_S:
7330  case AArch64::FADD_ZZZ_D:
7331  case AArch64::FMUL_ZZZ_H:
7332  case AArch64::FMUL_ZZZ_S:
7333  case AArch64::FMUL_ZZZ_D:
// NOTE(review): the return statement for the floating-point cases above
// (extraction lines 7334-7335) is missing from this listing — presumably a
// fast-math-flag-gated result rather than an unconditional `return true`;
// confirm against the upstream source.
7336
7337  // == Integer types ==
7338  // -- Base instructions --
7339  // Opcodes MULWrr and MULXrr don't exist because
7340  // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7341  // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7342  // The machine-combiner does not support three-source-operands machine
7343  // instruction. So we cannot reassociate MULs.
7344  case AArch64::ADDWrr:
7345  case AArch64::ADDXrr:
7346  case AArch64::ANDWrr:
7347  case AArch64::ANDXrr:
7348  case AArch64::ORRWrr:
7349  case AArch64::ORRXrr:
7350  case AArch64::EORWrr:
7351  case AArch64::EORXrr:
7352  case AArch64::EONWrr:
7353  case AArch64::EONXrr:
7354  // -- Advanced SIMD instructions --
7355  // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7356  // in the Advanced SIMD instruction set.
7357  case AArch64::ADDv8i8:
7358  case AArch64::ADDv16i8:
7359  case AArch64::ADDv4i16:
7360  case AArch64::ADDv8i16:
7361  case AArch64::ADDv2i32:
7362  case AArch64::ADDv4i32:
7363  case AArch64::ADDv1i64:
7364  case AArch64::ADDv2i64:
7365  case AArch64::MULv8i8:
7366  case AArch64::MULv16i8:
7367  case AArch64::MULv4i16:
7368  case AArch64::MULv8i16:
7369  case AArch64::MULv2i32:
7370  case AArch64::MULv4i32:
7371  case AArch64::ANDv8i8:
7372  case AArch64::ANDv16i8:
7373  case AArch64::ORRv8i8:
7374  case AArch64::ORRv16i8:
7375  case AArch64::EORv8i8:
7376  case AArch64::EORv16i8:
7377  // -- SVE instructions --
7378  case AArch64::ADD_ZZZ_B:
7379  case AArch64::ADD_ZZZ_H:
7380  case AArch64::ADD_ZZZ_S:
7381  case AArch64::ADD_ZZZ_D:
7382  case AArch64::MUL_ZZZ_B:
7383  case AArch64::MUL_ZZZ_H:
7384  case AArch64::MUL_ZZZ_S:
7385  case AArch64::MUL_ZZZ_D:
7386  case AArch64::AND_ZZZ:
7387  case AArch64::ORR_ZZZ:
7388  case AArch64::EOR_ZZZ:
7389    return true;
7390
7391  default:
7392    return false;
7393  }
7394}
7395
7396/// Find instructions that can be turned into madd.
// Scans the operands of \p Root (an add/sub candidate) for single-use MUL
// producers in the same block and records the applicable combiner patterns.
// NOTE(review): the opening line of this definition (the function name and
// first parameter, extraction line 7397) is missing from this listing.
7398                            SmallVectorImpl<unsigned> &Patterns) {
7399  unsigned Opc = Root.getOpcode();
7400  MachineBasicBlock &MBB = *Root.getParent();
7401  bool Found = false;
7402
// NOTE(review): the guard preceding this return (extraction line 7403) is
// missing — presumably an early bail-out for non-candidate opcodes.
7404    return false;
// NOTE(review): extraction line 7405 is missing — presumably the
// flag-setting-opcode conditional whose body follows (closed at line 7417).
7406    int Cmp_NZCV =
7407        Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
7408    // When NZCV is live bail out.
7409    if (Cmp_NZCV == -1)
7410      return false;
7411    unsigned NewOpc = convertToNonFlagSettingOpc(Root);
7412    // When opcode can't change bail out.
7413    // CHECKME: do we miss any cases for opcode conversion?
7414    if (NewOpc == Opc)
7415      return false;
7416    Opc = NewOpc;
7417  }
7418
7419  auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7420                      unsigned Pattern) {
7421    if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7422      Patterns.push_back(Pattern);
7423      Found = true;
7424    }
7425  };
7426
7427  auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7428    if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7429      Patterns.push_back(Pattern);
7430      Found = true;
7431    }
7432  };
7433
// NOTE(review): extraction line 7434 is missing — presumably the alias that
// introduces the `MCP` shorthand used throughout the switch below.
7435
7436  switch (Opc) {
7437  default:
7438    break;
7439  case AArch64::ADDWrr:
7440    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7441           "ADDWrr does not have register operands");
7442    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7443    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7444    break;
7445  case AArch64::ADDXrr:
7446    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7447    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7448    break;
7449  case AArch64::SUBWrr:
7450    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7451    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7452    break;
7453  case AArch64::SUBXrr:
7454    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7455    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7456    break;
7457  case AArch64::ADDWri:
7458    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7459    break;
7460  case AArch64::ADDXri:
7461    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7462    break;
7463  case AArch64::SUBWri:
7464    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7465    break;
7466  case AArch64::SUBXri:
7467    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7468    break;
7469  case AArch64::ADDv8i8:
7470    setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7471    setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7472    break;
7473  case AArch64::ADDv16i8:
7474    setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7475    setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7476    break;
7477  case AArch64::ADDv4i16:
7478    setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7479    setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7480    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7481    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7482    break;
7483  case AArch64::ADDv8i16:
7484    setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7485    setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7486    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7487    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7488    break;
7489  case AArch64::ADDv2i32:
7490    setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7491    setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7492    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7493    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7494    break;
7495  case AArch64::ADDv4i32:
7496    setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7497    setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7498    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7499    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7500    break;
7501  case AArch64::SUBv8i8:
7502    setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7503    setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7504    break;
7505  case AArch64::SUBv16i8:
7506    setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7507    setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7508    break;
7509  case AArch64::SUBv4i16:
7510    setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7511    setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7512    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7513    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7514    break;
7515  case AArch64::SUBv8i16:
7516    setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7517    setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7518    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7519    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7520    break;
7521  case AArch64::SUBv2i32:
7522    setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7523    setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7524    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7525    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7526    break;
7527  case AArch64::SUBv4i32:
7528    setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7529    setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7530    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7531    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7532    break;
7533  }
7534  return Found;
7535}
7536
7537bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7538 switch (Opcode) {
7539 default:
7540 break;
7541 case AArch64::UABALB_ZZZ_D:
7542 case AArch64::UABALB_ZZZ_H:
7543 case AArch64::UABALB_ZZZ_S:
7544 case AArch64::UABALT_ZZZ_D:
7545 case AArch64::UABALT_ZZZ_H:
7546 case AArch64::UABALT_ZZZ_S:
7547 case AArch64::SABALB_ZZZ_D:
7548 case AArch64::SABALB_ZZZ_S:
7549 case AArch64::SABALB_ZZZ_H:
7550 case AArch64::SABALT_ZZZ_D:
7551 case AArch64::SABALT_ZZZ_S:
7552 case AArch64::SABALT_ZZZ_H:
7553 case AArch64::UABALv16i8_v8i16:
7554 case AArch64::UABALv2i32_v2i64:
7555 case AArch64::UABALv4i16_v4i32:
7556 case AArch64::UABALv4i32_v2i64:
7557 case AArch64::UABALv8i16_v4i32:
7558 case AArch64::UABALv8i8_v8i16:
7559 case AArch64::UABAv16i8:
7560 case AArch64::UABAv2i32:
7561 case AArch64::UABAv4i16:
7562 case AArch64::UABAv4i32:
7563 case AArch64::UABAv8i16:
7564 case AArch64::UABAv8i8:
7565 case AArch64::SABALv16i8_v8i16:
7566 case AArch64::SABALv2i32_v2i64:
7567 case AArch64::SABALv4i16_v4i32:
7568 case AArch64::SABALv4i32_v2i64:
7569 case AArch64::SABALv8i16_v4i32:
7570 case AArch64::SABALv8i8_v8i16:
7571 case AArch64::SABAv16i8:
7572 case AArch64::SABAv2i32:
7573 case AArch64::SABAv4i16:
7574 case AArch64::SABAv4i32:
7575 case AArch64::SABAv8i16:
7576 case AArch64::SABAv8i8:
7577 return true;
7578 }
7579
7580 return false;
7581}
7582
7583unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7584 unsigned AccumulationOpcode) const {
7585 switch (AccumulationOpcode) {
7586 default:
7587 llvm_unreachable("Unsupported accumulation Opcode!");
7588 case AArch64::UABALB_ZZZ_D:
7589 return AArch64::UABDLB_ZZZ_D;
7590 case AArch64::UABALB_ZZZ_H:
7591 return AArch64::UABDLB_ZZZ_H;
7592 case AArch64::UABALB_ZZZ_S:
7593 return AArch64::UABDLB_ZZZ_S;
7594 case AArch64::UABALT_ZZZ_D:
7595 return AArch64::UABDLT_ZZZ_D;
7596 case AArch64::UABALT_ZZZ_H:
7597 return AArch64::UABDLT_ZZZ_H;
7598 case AArch64::UABALT_ZZZ_S:
7599 return AArch64::UABDLT_ZZZ_S;
7600 case AArch64::UABALv16i8_v8i16:
7601 return AArch64::UABDLv16i8_v8i16;
7602 case AArch64::UABALv2i32_v2i64:
7603 return AArch64::UABDLv2i32_v2i64;
7604 case AArch64::UABALv4i16_v4i32:
7605 return AArch64::UABDLv4i16_v4i32;
7606 case AArch64::UABALv4i32_v2i64:
7607 return AArch64::UABDLv4i32_v2i64;
7608 case AArch64::UABALv8i16_v4i32:
7609 return AArch64::UABDLv8i16_v4i32;
7610 case AArch64::UABALv8i8_v8i16:
7611 return AArch64::UABDLv8i8_v8i16;
7612 case AArch64::UABAv16i8:
7613 return AArch64::UABDv16i8;
7614 case AArch64::UABAv2i32:
7615 return AArch64::UABDv2i32;
7616 case AArch64::UABAv4i16:
7617 return AArch64::UABDv4i16;
7618 case AArch64::UABAv4i32:
7619 return AArch64::UABDv4i32;
7620 case AArch64::UABAv8i16:
7621 return AArch64::UABDv8i16;
7622 case AArch64::UABAv8i8:
7623 return AArch64::UABDv8i8;
7624 case AArch64::SABALB_ZZZ_D:
7625 return AArch64::SABDLB_ZZZ_D;
7626 case AArch64::SABALB_ZZZ_S:
7627 return AArch64::SABDLB_ZZZ_S;
7628 case AArch64::SABALB_ZZZ_H:
7629 return AArch64::SABDLB_ZZZ_H;
7630 case AArch64::SABALT_ZZZ_D:
7631 return AArch64::SABDLT_ZZZ_D;
7632 case AArch64::SABALT_ZZZ_S:
7633 return AArch64::SABDLT_ZZZ_S;
7634 case AArch64::SABALT_ZZZ_H:
7635 return AArch64::SABDLT_ZZZ_H;
7636 case AArch64::SABALv16i8_v8i16:
7637 return AArch64::SABDLv16i8_v8i16;
7638 case AArch64::SABALv2i32_v2i64:
7639 return AArch64::SABDLv2i32_v2i64;
7640 case AArch64::SABALv4i16_v4i32:
7641 return AArch64::SABDLv4i16_v4i32;
7642 case AArch64::SABALv4i32_v2i64:
7643 return AArch64::SABDLv4i32_v2i64;
7644 case AArch64::SABALv8i16_v4i32:
7645 return AArch64::SABDLv8i16_v4i32;
7646 case AArch64::SABALv8i8_v8i16:
7647 return AArch64::SABDLv8i8_v8i16;
7648 case AArch64::SABAv16i8:
7649 return AArch64::SABDv16i8;
7650 case AArch64::SABAv2i32:
7651 return AArch64::SABAv2i32;
7652 case AArch64::SABAv4i16:
7653 return AArch64::SABDv4i16;
7654 case AArch64::SABAv4i32:
7655 return AArch64::SABDv4i32;
7656 case AArch64::SABAv8i16:
7657 return AArch64::SABDv8i16;
7658 case AArch64::SABAv8i8:
7659 return AArch64::SABDv8i8;
7660 }
7661}
7662
7663/// Floating-Point Support
7664
7665/// Find instructions that can be turned into madd.
// FP variant: scans the operands of \p Root (an FADD/FSUB candidate, as
// accepted by isCombineInstrCandidateFP) for FMUL producers and records the
// applicable FMLA/FMLS combiner patterns.
// NOTE(review): the opening line of this definition (the function name and
// first parameter, extraction line 7666) is missing from this listing.
7667                            SmallVectorImpl<unsigned> &Patterns) {
7668
7669  if (!isCombineInstrCandidateFP(Root))
7670    return false;
7671
7672  MachineBasicBlock &MBB = *Root.getParent();
7673  bool Found = false;
7674
7675  auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7676    if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7677      Patterns.push_back(Pattern);
7678      return true;
7679    }
7680    return false;
7681  };
7682
// NOTE(review): extraction line 7683 is missing — presumably the alias that
// introduces the `MCP` shorthand used throughout the switch below.
7684
7685  switch (Root.getOpcode()) {
7686  default:
7687    assert(false && "Unsupported FP instruction in combiner\n");
7688    break;
7689  case AArch64::FADDHrr:
7690    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7691           "FADDHrr does not have register operands");
7692
7693    Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7694    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7695    break;
7696  case AArch64::FADDSrr:
7697    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7698           "FADDSrr does not have register operands");
7699
7700    Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7701             Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7702
7703    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7704             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7705    break;
7706  case AArch64::FADDDrr:
7707    Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7708             Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7709
7710    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7711             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7712    break;
7713  case AArch64::FADDv4f16:
7714    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7715             Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7716
7717    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7718             Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7719    break;
7720  case AArch64::FADDv8f16:
7721    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7722             Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7723
7724    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7725             Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7726    break;
7727  case AArch64::FADDv2f32:
7728    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7729             Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7730
7731    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7732             Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7733    break;
7734  case AArch64::FADDv2f64:
7735    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7736             Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7737
7738    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7739             Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7740    break;
7741  case AArch64::FADDv4f32:
7742    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7743             Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7744
7745    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7746             Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7747    break;
7748  case AArch64::FSUBHrr:
7749    Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7750    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7751    Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7752    break;
7753  case AArch64::FSUBSrr:
7754    Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7755
7756    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7757             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7758
7759    Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7760    break;
7761  case AArch64::FSUBDrr:
7762    Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7763
7764    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7765             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7766
7767    Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7768    break;
7769  case AArch64::FSUBv4f16:
7770    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7771             Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7772
7773    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7774             Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7775    break;
7776  case AArch64::FSUBv8f16:
7777    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7778             Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7779
7780    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7781             Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7782    break;
7783  case AArch64::FSUBv2f32:
7784    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7785             Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7786
7787    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7788             Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7789    break;
7790  case AArch64::FSUBv2f64:
7791    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7792             Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7793
7794    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7795             Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7796    break;
7797  case AArch64::FSUBv4f32:
7798    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7799             Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7800
7801    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7802             Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7803    break;
7804  }
7805  return Found;
7806}
7807
// getFMULPatterns: match FMUL(..., COPY(DUP(lane))) shapes so the FMUL can
// be rewritten as an indexed (by-lane) multiply; records the applicable
// combiner patterns.
// NOTE(review): the opening line of this definition (the function name and
// first parameter, extraction line 7808) is missing from this listing.
7809                            SmallVectorImpl<unsigned> &Patterns) {
7810  MachineBasicBlock &MBB = *Root.getParent();
7811  bool Found = false;
7812
7813  auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7814    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7815    MachineOperand &MO = Root.getOperand(Operand);
7816    MachineInstr *MI = nullptr;
7817    if (MO.isReg() && MO.getReg().isVirtual())
7818      MI = MRI.getUniqueVRegDef(MO.getReg());
7819    // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7820    if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7821        MI->getOperand(1).getReg().isVirtual())
7822      MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
7823    if (MI && MI->getOpcode() == Opcode) {
7824      Patterns.push_back(Pattern);
7825      return true;
7826    }
7827    return false;
7828  };
7829
// NOTE(review): extraction line 7830 is missing — presumably the alias that
// introduces the `MCP` shorthand used in the switch below.
7831
7832  switch (Root.getOpcode()) {
7833  default:
7834    return false;
7835  case AArch64::FMULv2f32:
7836    Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7837    Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7838    break;
7839  case AArch64::FMULv2f64:
7840    Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7841    Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7842    break;
7843  case AArch64::FMULv4f16:
7844    Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7845    Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7846    break;
7847  case AArch64::FMULv4f32:
7848    Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7849    Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7850    break;
7851  case AArch64::FMULv8f16:
7852    Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7853    Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7854    break;
7855  }
7856
7857  return Found;
7858}
7859
// getFNEGPatterns: match FNEG(FMADD(...)) so the pair can be folded into a
// single FNMADD; records the FNMADD combiner pattern on success.
// NOTE(review): the opening line of this definition (the function name and
// first parameter, extraction line 7860) is missing from this listing.
7861                             SmallVectorImpl<unsigned> &Patterns) {
7862  unsigned Opc = Root.getOpcode();
7863  MachineBasicBlock &MBB = *Root.getParent();
7864  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7865
7866  auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7867    MachineOperand &MO = Root.getOperand(1);
// NOTE(review): extraction line 7868 is missing — presumably the lookup of
// MO's defining instruction into `MI`, which is tested below.
7869    if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7870        MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
// NOTE(review): extraction lines 7871-7873 are missing — additional
// conditions of this `if` (the final visible one is a fast-math-flag
// check); confirm against the upstream source.
7874        MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
7875      Patterns.push_back(Pattern);
7876      return true;
7877    }
7878    return false;
7879  };
7880
7881  switch (Opc) {
7882  default:
7883    break;
7884  case AArch64::FNEGDr:
7885    return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7886  case AArch64::FNEGSr:
7887    return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7888  }
7889
7890  return false;
7891}
7892
7893/// Return true when a code sequence can improve throughput. It
7894/// should be called only for instructions in loops.
7895/// \param Pattern - combiner pattern
// NOTE(review): the opening line of this definition (extraction line 7896)
// and the entire list of throughput-improving pattern case labels
// (extraction lines 7900-8004) are missing from this listing; only the
// default/return skeleton of the switch is visible.
7897  switch (Pattern) {
7898  default:
7899    break;
8005    return true;
8006  } // end switch (Pattern)
8007  return false;
8008}
8009
8010/// Find other MI combine patterns.
// Recognizes A - (B + C) so it can later be rewritten as (A - B) - C or
// (A - C) - B, exposing more instruction-level parallelism.  The root must
// be a 32- or 64-bit SUB (optionally flag-setting) whose second operand is
// produced by a combinable ADD.
// NOTE(review): original lines 8011 (signature), 8028 (first half of the
// NZCV-def condition) and 8037-8038 (the Patterns.push_back calls for the
// two SUBADD patterns) are missing from this extract.
 8012                            SmallVectorImpl<unsigned> &Patterns) {
 8013   // A - (B + C)  ==> (A - B) - C  or  (A - C) - B
 8014   unsigned Opc = Root.getOpcode();
 8015   MachineBasicBlock &MBB = *Root.getParent();
 8016
 8017   switch (Opc) {
 8018   case AArch64::SUBWrr:
 8019   case AArch64::SUBSWrr:
 8020   case AArch64::SUBXrr:
 8021   case AArch64::SUBSXrr:
 8022     // Found candidate root.
 8023     break;
 8024   default:
 8025     return false;
 8026   }
 8027
 // Bail out when the flag result of a flag-setting SUB is live: the split
 // form would compute different NZCV flags.
 8029       Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
 8030           -1)
 8031     return false;
 8032
 // Operand 2 must come from an ADD (any width, optionally flag-setting)
 // that canCombine deems safe to fold into the root.
 8033   if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
 8034       canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
 8035       canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
 8036       canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
 8039     return true;
 8040   }
 8041
 8042   return false;
 8043 }
8044
8045/// Check if the given instruction forms a gather load pattern that can be
8046/// optimized for better Memory-Level Parallelism (MLP). This function
8047/// identifies chains of NEON lane load instructions that load data from
8048/// different memory addresses into individual lanes of a 128-bit vector
8049/// register, then attempts to split the pattern into parallel loads to break
8050/// the serial dependency between instructions.
8051///
8052/// Pattern Matched:
8053/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
8054/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
8055///
8056/// Transformed Into:
8057/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
8058/// to combine the results, enabling better memory-level parallelism.
8059///
8060/// Supported Element Types:
8061/// - 32-bit elements (LD1i32, 4 lanes total)
8062/// - 16-bit elements (LD1i16, 8 lanes total)
8063/// - 8-bit elements (LD1i8, 16 lanes total)
// Matches the chain described in the block comment above: an initial scalar
// load inserted via SUBREG_TO_REG into lane 0, followed by LD1i* loads into
// lanes 1 .. NumLanes-1, ending at Root (the last-lane load).  On success a
// NumLanes-specific gather pattern is recorded for later splitting.
// NOTE(review): this extract is missing original lines 8064 (signature),
// 8074 (presumably the TRI lookup used at line 8109), 8088 (the declaration
// of LoadInstrs) and 8149/8152/8155 (the Patterns.push_back calls) —
// confirm against the full source.
 8065                                  SmallVectorImpl<unsigned> &Patterns,
 8066                                  unsigned LoadLaneOpCode, unsigned NumLanes) {
 8067   const MachineFunction *MF = Root.getMF();
 8068
 8069   // Early exit if optimizing for size.
 8070   if (MF->getFunction().hasMinSize())
 8071     return false;
 8072
 8073   const MachineRegisterInfo &MRI = MF->getRegInfo();
 8075
 8076   // The root of the pattern must load into the last lane of the vector.
 8077   if (Root.getOperand(2).getImm() != NumLanes - 1)
 8078     return false;
 8079
 8080   // Check that we have load into all lanes except lane 0.
 8081   // For each load we also want to check that:
 8082   // 1. It has a single non-debug use (since we will be replacing the virtual
 8083   // register)
 8084   // 2. That the addressing mode only uses a single pointer operand
 8085   auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
 8086   auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
 8087   SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
 // Walk the use-def chain lane by lane, ticking off each lane index seen.
 8089   while (!RemainingLanes.empty() && CurrInstr &&
 8090          CurrInstr->getOpcode() == LoadLaneOpCode &&
 8091          MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
 8092          CurrInstr->getNumOperands() == 4) {
 8093     RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
 8094     LoadInstrs.push_back(CurrInstr);
 8095     CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
 8096   }
 8097
 8098   // Check that we have found a match for lanes N-1.. 1.
 8099   if (!RemainingLanes.empty())
 8100     return false;
 8101
 8102   // Match the SUBREG_TO_REG sequence.
 8103   if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
 8104     return false;
 8105
 8106   // Verify that the subreg to reg loads an integer into the first lane.
 8107   auto Lane0LoadReg = CurrInstr->getOperand(1).getReg();
 8108   unsigned SingleLaneSizeInBits = 128 / NumLanes;
 8109   if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
 8110     return false;
 8111
 8112   // Verify that it also has a single non debug use.
 8113   if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
 8114     return false;
 8115
 8116   LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
 8117
 8118   // If there is any chance of aliasing, do not apply the pattern.
 8119   // Walk backward through the MBB starting from Root.
 8120   // Exit early if we've encountered all load instructions or hit the search
 8121   // limit.
 8122   auto MBBItr = Root.getIterator();
 8123   unsigned RemainingSteps = GatherOptSearchLimit;
 8124   SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
 8125   RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
 8126   const MachineBasicBlock *MBB = Root.getParent();
 8127
 8128   for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
 8129          !RemainingLoadInstrs.empty();
 8130        --MBBItr, --RemainingSteps) {
 8131     const MachineInstr &CurrInstr = *MBBItr;
 8132
 8133     // Remove this instruction from remaining loads if it's one we're tracking.
 8134     RemainingLoadInstrs.erase(&CurrInstr);
 8135
 8136     // Check for potential aliasing with any of the load instructions to
 8137     // optimize.
 8138     if (CurrInstr.isLoadFoldBarrier())
 8139       return false;
 8140   }
 8141
 8142   // If we hit the search limit without finding all load instructions,
 8143   // don't match the pattern.
 8144   if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
 8145     return false;
 8146
 // Record the lane-count-specific pattern (push_back lines elided by the
 // extraction — see NOTE above).
 8147   switch (NumLanes) {
 8148   case 4:
 8150     break;
 8151   case 8:
 8153     break;
 8154   case 16:
 8156     break;
 8157   default:
 8158     llvm_unreachable("Got bad number of lanes for gather pattern.");
 8159   }
 8160
 8161   return true;
 8162 }
8163
8164/// Search for patterns of LD instructions we can optimize.
// Dispatches on the root's lane-load opcode: a 128-bit vector holds 4 x i32,
// 8 x i16 or 16 x i8 lanes, so the lane count passed to
// getGatherLanePattern is fixed by the opcode.
// NOTE(review): original line 8165 (the function signature) is missing from
// this extract.
 8166                             SmallVectorImpl<unsigned> &Patterns) {
 8167
 8168   // The pattern searches for loads into single lanes.
 8169   switch (Root.getOpcode()) {
 8170   case AArch64::LD1i32:
 8171     return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
 8172   case AArch64::LD1i16:
 8173     return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
 8174   case AArch64::LD1i8:
 8175     return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
 8176   default:
 8177     return false;
 8178   }
 8179 }
8180
8181/// Generate optimized instruction sequence for gather load patterns to improve
8182/// Memory-Level Parallelism (MLP). This function transforms a chain of
8183/// sequential NEON lane loads into parallel vector loads that can execute
8184/// concurrently.
// Rewrites the matched serial gather chain into two half-width lane chains
// built in parallel (register 0 keeps the original lane-0 value; register 1
// starts from a fresh scalar load of the middle lane), then combines them
// with ZIP1v2i64.  New instructions go to InsInstrs, replaced ones to
// DelInstrs, per the machine-combiner contract.
// NOTE(review): this extract is missing original lines 8186-8188 (function
// name and leading parameters), 8193 (presumably the TII lookup used by the
// BuildMI calls), 8251 and 8308 (the llvm_unreachable openers whose string
// arguments appear at 8252/8309).
8185 static void
 8189                         DenseMap<Register, unsigned> &InstrIdxForVirtReg,
 8190                         unsigned Pattern, unsigned NumLanes) {
 8191   MachineFunction &MF = *Root.getParent()->getParent();
 8192   MachineRegisterInfo &MRI = MF.getRegInfo();
 8194
 8195   // Gather the initial load instructions to build the pattern.
 8196   SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
 8197   MachineInstr *CurrInstr = &Root;
 8198   for (unsigned i = 0; i < NumLanes - 1; ++i) {
 8199     LoadToLaneInstrs.push_back(CurrInstr);
 8200     CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
 8201   }
 8202
 8203   // Sort the load instructions according to the lane.
 8204   llvm::sort(LoadToLaneInstrs,
 8205              [](const MachineInstr *A, const MachineInstr *B) {
 8206                return A->getOperand(2).getImm() > B->getOperand(2).getImm();
 8207              });
 8208
 8209   MachineInstr *SubregToReg = CurrInstr;
 8210   LoadToLaneInstrs.push_back(
 8211       MRI.getUniqueVRegDef(SubregToReg->getOperand(1).getReg()));
 8212   auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
 8213
 8214   const TargetRegisterClass *FPR128RegClass =
 8215       MRI.getRegClass(Root.getOperand(0).getReg());
 8216
 8217   // Helper lambda to create a LD1 instruction.
 8218   auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
 8219                                   Register SrcRegister, unsigned Lane,
 8220                                   Register OffsetRegister,
 8221                                   bool OffsetRegisterKillState) {
 8222     auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
 8223     MachineInstrBuilder LoadIndexIntoRegister =
 8224         BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
 8225                 NewRegister)
 8226             .addReg(SrcRegister)
 8227             .addImm(Lane)
 8228             .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState));
 8229     InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
 8230     InsInstrs.push_back(LoadIndexIntoRegister);
 8231     return NewRegister;
 8232   };
 8233
 8234   // Helper to create load instruction based on the NumLanes in the NEON
 8235   // register we are rewriting.
 8236   auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
 8237                                   Register OffsetReg,
 8238                                   bool KillState) -> MachineInstrBuilder {
 8239     unsigned Opcode;
 8240     switch (NumLanes) {
 8241     case 4:
 8242       Opcode = AArch64::LDRSui;
 8243       break;
 8244     case 8:
 8245       Opcode = AArch64::LDRHui;
 8246       break;
 8247     case 16:
 8248       Opcode = AArch64::LDRBui;
 8249       break;
 8250     default:
 8252           "Got unsupported number of lanes in machine-combiner gather pattern");
 8253     }
 8254     // Immediate offset load
 8255     return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
 8256         .addReg(OffsetReg)
 8257         .addImm(0);
 8258   };
 8259
 8260   // Load the remaining lanes into register 0.
 8261   auto LanesToLoadToReg0 =
 8262       llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
 8263                        LoadToLaneInstrsAscending.begin() + NumLanes / 2);
 8264   Register PrevReg = SubregToReg->getOperand(0).getReg();
 8265   for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
 8266     const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
 8267     PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
 8268                                    OffsetRegOperand.getReg(),
 8269                                    OffsetRegOperand.isKill());
 8270     DelInstrs.push_back(LoadInstr);
 8271   }
 8272   Register LastLoadReg0 = PrevReg;
 8273
 8274   // First load into register 1. Perform an integer load to zero out the upper
 8275   // lanes in a single instruction.
 8276   MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
 8277   MachineInstr *OriginalSplitLoad =
 8278       *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
 8279   Register DestRegForMiddleIndex = MRI.createVirtualRegister(
 8280       MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
 8281
 8282   const MachineOperand &OriginalSplitToLoadOffsetOperand =
 8283       OriginalSplitLoad->getOperand(3);
 8284   MachineInstrBuilder MiddleIndexLoadInstr =
 8285       CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
 8286                            OriginalSplitToLoadOffsetOperand.getReg(),
 8287                            OriginalSplitToLoadOffsetOperand.isKill());
 8288
 8289   InstrIdxForVirtReg.insert(
 8290       std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
 8291   InsInstrs.push_back(MiddleIndexLoadInstr);
 8292   DelInstrs.push_back(OriginalSplitLoad);
 8293
 8294   // Subreg To Reg instruction for register 1.
 8295   Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
 8296   unsigned SubregType;
 8297   switch (NumLanes) {
 8298   case 4:
 8299     SubregType = AArch64::ssub;
 8300     break;
 8301   case 8:
 8302     SubregType = AArch64::hsub;
 8303     break;
 8304   case 16:
 8305     SubregType = AArch64::bsub;
 8306     break;
 8307   default:
 8309         "Got invalid NumLanes for machine-combiner gather pattern");
 8310   }
 8311
 8312   auto SubRegToRegInstr =
 8313       BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
 8314               DestRegForSubregToReg)
 8315           .addReg(DestRegForMiddleIndex, getKillRegState(true))
 8316           .addImm(SubregType);
 8317   InstrIdxForVirtReg.insert(
 8318       std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
 8319   InsInstrs.push_back(SubRegToRegInstr);
 8320
 8321   // Load remaining lanes into register 1.
 8322   auto LanesToLoadToReg1 =
 8323       llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
 8324                        LoadToLaneInstrsAscending.end());
 8325   PrevReg = SubRegToRegInstr->getOperand(0).getReg();
 8326   for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
 8327     const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
 8328     PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
 8329                                    OffsetRegOperand.getReg(),
 8330                                    OffsetRegOperand.isKill());
 8331
 8332     // Do not add the last reg to DelInstrs - it will be removed later.
 8333     if (Index == NumLanes / 2 - 2) {
 8334       break;
 8335     }
 8336     DelInstrs.push_back(LoadInstr);
 8337   }
 8338   Register LastLoadReg1 = PrevReg;
 8339
 8340   // Create the final zip instruction to combine the results.
 8341   MachineInstrBuilder ZipInstr =
 8342       BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
 8343               Root.getOperand(0).getReg())
 8344           .addReg(LastLoadReg0)
 8345           .addReg(LastLoadReg1);
 8346   InsInstrs.push_back(ZipInstr);
 8347 }
8348
8362
8363/// Return true when there is potentially a faster code sequence for an
8364/// instruction chain ending in \p Root. All potential patterns are listed in
8365/// the \p Pattern vector. Pattern should be sorted in priority order since the
8366/// pattern evaluator stops checking as soon as it finds a faster sequence.
8367
8368bool AArch64InstrInfo::getMachineCombinerPatterns(
8369 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8370 bool DoRegPressureReduce) const {
8371 // Integer patterns
8372 if (getMaddPatterns(Root, Patterns))
8373 return true;
8374 // Floating point patterns
8375 if (getFMULPatterns(Root, Patterns))
8376 return true;
8377 if (getFMAPatterns(Root, Patterns))
8378 return true;
8379 if (getFNEGPatterns(Root, Patterns))
8380 return true;
8381
8382 // Other patterns
8383 if (getMiscPatterns(Root, Patterns))
8384 return true;
8385
8386 // Load patterns
8387 if (getLoadPatterns(Root, Patterns))
8388 return true;
8389
8390 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8391 DoRegPressureReduce);
8392}
8393
8395/// genFusedMultiply - Generate fused multiply instructions.
8396/// This function supports both integer and floating point instructions.
8397/// A typical example:
8398/// F|MUL I=A,B,0
8399/// F|ADD R,I,C
8400/// ==> F|MADD R,A,B,C
8401/// \param MF Containing MachineFunction
8402/// \param MRI Register information
8403/// \param TII Target information
8404/// \param Root is the F|ADD instruction
8405/// \param [out] InsInstrs is a vector of machine instructions and will
8406/// contain the generated madd instruction
8407/// \param IdxMulOpd is index of operand in Root that is the result of
8408/// the F|MUL. In the example above IdxMulOpd is 1.
8409/// \param MaddOpc the opcode fo the f|madd instruction
8410/// \param RC Register class of operands
8411/// \param kind of fma instruction (addressing mode) to be generated
8412/// \param ReplacedAddend is the result register from the instruction
8413/// replacing the non-combined operand, if any.
// Builds the fused multiply (MADD/FMADD/MLA/... per MaddOpc) replacing the
// MUL+ADD pair rooted at Root, constraining all registers to RC, and
// appends it to InsInstrs.  Returns the matched MUL so the caller can
// schedule it for deletion.  See the doc comment above for parameter
// details.
// NOTE(review): the extract is missing original lines 8415 (first line of
// the signature), 8419 (presumably the `kind` parameter with its default,
// given the uses below) and 8451 (the declaration of MIB).
8414 static MachineInstr *
 8416                  const TargetInstrInfo *TII, MachineInstr &Root,
 8417                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
 8418                  unsigned MaddOpc, const TargetRegisterClass *RC,
 8420                  const Register *ReplacedAddend = nullptr) {
 8421   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
 8422
 8423   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
 8424   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
 8425   Register ResultReg = Root.getOperand(0).getReg();
 8426   Register SrcReg0 = MUL->getOperand(1).getReg();
 8427   bool Src0IsKill = MUL->getOperand(1).isKill();
 8428   Register SrcReg1 = MUL->getOperand(2).getReg();
 8429   bool Src1IsKill = MUL->getOperand(2).isKill();
 8430
 8431   Register SrcReg2;
 8432   bool Src2IsKill;
 8433   if (ReplacedAddend) {
 8434     // If we just generated a new addend, we must be it's only use.
 8435     SrcReg2 = *ReplacedAddend;
 8436     Src2IsKill = true;
 8437   } else {
 8438     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
 8439     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
 8440   }
 8441
 // Only virtual registers may be re-constrained; physical registers keep
 // their class.
 8442   if (ResultReg.isVirtual())
 8443     MRI.constrainRegClass(ResultReg, RC);
 8444   if (SrcReg0.isVirtual())
 8445     MRI.constrainRegClass(SrcReg0, RC);
 8446   if (SrcReg1.isVirtual())
 8447     MRI.constrainRegClass(SrcReg1, RC);
 8448   if (SrcReg2.isVirtual())
 8449     MRI.constrainRegClass(SrcReg2, RC);
 8450
 // The three addressing-mode kinds differ in operand order; Indexed also
 // forwards the lane immediate from the original MUL.
 8452   if (kind == FMAInstKind::Default)
 8453     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
 8454               .addReg(SrcReg0, getKillRegState(Src0IsKill))
 8455               .addReg(SrcReg1, getKillRegState(Src1IsKill))
 8456               .addReg(SrcReg2, getKillRegState(Src2IsKill));
 8457   else if (kind == FMAInstKind::Indexed)
 8458     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
 8459               .addReg(SrcReg2, getKillRegState(Src2IsKill))
 8460               .addReg(SrcReg0, getKillRegState(Src0IsKill))
 8461               .addReg(SrcReg1, getKillRegState(Src1IsKill))
 8462               .addImm(MUL->getOperand(3).getImm());
 8463   else if (kind == FMAInstKind::Accumulator)
 8464     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
 8465               .addReg(SrcReg2, getKillRegState(Src2IsKill))
 8466               .addReg(SrcReg0, getKillRegState(Src0IsKill))
 8467               .addReg(SrcReg1, getKillRegState(Src1IsKill));
 8468   else
 8469     assert(false && "Invalid FMA instruction kind \n");
 8470   // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
 8471   InsInstrs.push_back(MIB);
 8472   return MUL;
 8473 }
8474
// Replaces FNEG(FMADD) with a single FNMADD.  Picks FNMADDSrrr/FNMADDDrrr
// from the FMADD's result register class, constrains all registers to that
// class, emits the new instruction into InsInstrs, and returns the matched
// FMADD (nullptr if the register class is neither FPR32 nor FPR64).
// NOTE(review): original lines 8476 (function name line), 8478 (presumably
// the InsInstrs parameter) and 8506 (the declaration of MIB) are missing
// from this extract.
8475 static MachineInstr *
 8477                const TargetInstrInfo *TII, MachineInstr &Root,
 8479   MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
 8480
 8481   unsigned Opc = 0;
 8482   const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
 8483   if (AArch64::FPR32RegClass.hasSubClassEq(RC))
 8484     Opc = AArch64::FNMADDSrrr;
 8485   else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
 8486     Opc = AArch64::FNMADDDrrr;
 8487   else
 8488     return nullptr;
 8489
 8490   Register ResultReg = Root.getOperand(0).getReg();
 8491   Register SrcReg0 = MAD->getOperand(1).getReg();
 8492   Register SrcReg1 = MAD->getOperand(2).getReg();
 8493   Register SrcReg2 = MAD->getOperand(3).getReg();
 8494   bool Src0IsKill = MAD->getOperand(1).isKill();
 8495   bool Src1IsKill = MAD->getOperand(2).isKill();
 8496   bool Src2IsKill = MAD->getOperand(3).isKill();
 8497   if (ResultReg.isVirtual())
 8498     MRI.constrainRegClass(ResultReg, RC);
 8499   if (SrcReg0.isVirtual())
 8500     MRI.constrainRegClass(SrcReg0, RC);
 8501   if (SrcReg1.isVirtual())
 8502     MRI.constrainRegClass(SrcReg1, RC);
 8503   if (SrcReg2.isVirtual())
 8504     MRI.constrainRegClass(SrcReg2, RC);
 8505
 8507       BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
 8508           .addReg(SrcReg0, getKillRegState(Src0IsKill))
 8509           .addReg(SrcReg1, getKillRegState(Src1IsKill))
 8510           .addReg(SrcReg2, getKillRegState(Src2IsKill));
 8511   InsInstrs.push_back(MIB);
 8512
 8513   return MAD;
 8514 }
8515
8516/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
// Implements the fold documented above: rewrites FMUL x, (DUP y, lane) as
// the lane-indexed FMUL variant MulOpc, reusing the DUP's source register
// and lane immediate directly.  Returns &Root so the caller deletes the
// original FMUL (and the combiner drops the now-unused DUP).
// NOTE(review): original lines 8519 (function name and first parameters),
// 8526 (presumably the TII lookup used below) and 8545 (the declaration of
// MIB) are missing from this extract.
8518 static MachineInstr *
 8520                    unsigned IdxDupOp, unsigned MulOpc,
 8521                    const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
 8522   assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
 8523          "Invalid index of FMUL operand");
 8524
 8525   MachineFunction &MF = *Root.getMF();
 8527
 8528   MachineInstr *Dup =
 8529       MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
 8530
 // Look through a COPY to reach the actual DUP.
 8531   if (Dup->getOpcode() == TargetOpcode::COPY)
 8532     Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
 8533
 // The DUP source outlives the DUP itself now, so stale kill flags on it
 // must be dropped before reuse.
 8534   Register DupSrcReg = Dup->getOperand(1).getReg();
 8535   MRI.clearKillFlags(DupSrcReg);
 8536   MRI.constrainRegClass(DupSrcReg, RC);
 8537
 8538   unsigned DupSrcLane = Dup->getOperand(2).getImm();
 8539
 8540   unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
 8541   MachineOperand &MulOp = Root.getOperand(IdxMulOp);
 8542
 8543   Register ResultReg = Root.getOperand(0).getReg();
 8544
 8546   MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
 8547             .add(MulOp)
 8548             .addReg(DupSrcReg)
 8549             .addImm(DupSrcLane);
 8550
 8551   InsInstrs.push_back(MIB);
 8552   return &Root;
 8553 }
8554
8555/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8556/// instructions.
8557///
8558/// \see genFusedMultiply
// Thin wrapper that forwards to genFusedMultiply.
// NOTE(review): original lines 8559-8561 (signature) and 8564 (the final
// argument — presumably FMAInstKind::Accumulator, per the doc comment
// above and by analogy with genFusedMultiplyIdx) are missing from this
// extract.
 8562     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
 8563   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
 8565 }
8566
8567/// genNeg - Helper to generate an intermediate negation of the second operand
8568/// of Root
// Emits MnegOpc (a negate) of Root's operand 2 into a fresh virtual
// register of class RC, records the new instruction in InsInstrs and the
// register in InstrIdxForVirtReg (which must be empty — this is the first
// generated instruction), and returns the new register.
// NOTE(review): original lines 8569 (function name/leading parameters),
// 8571 (presumably the InsInstrs parameter) and 8575 (the declaration of
// MIB) are missing from this extract.
 8570                    const TargetInstrInfo *TII, MachineInstr &Root,
 8572                    DenseMap<Register, unsigned> &InstrIdxForVirtReg,
 8573                    unsigned MnegOpc, const TargetRegisterClass *RC) {
 8574   Register NewVR = MRI.createVirtualRegister(RC);
 8576       BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
 8577           .add(Root.getOperand(2));
 8578   InsInstrs.push_back(MIB);
 8579
 8580   assert(InstrIdxForVirtReg.empty());
 8581   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
 8582
 8583   return NewVR;
 8584 }
8585
8586/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8587/// instructions with an additional negation of the accumulator
// Negates the accumulator via genNeg, then emits the accumulator-form
// fused multiply with the negated value as the replaced addend.
// NOTE(review): original lines 8588-8590 (function name and leading
// parameters) are missing from this extract.
 8591     DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
 8592     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
 // Only the IdxMulOpd == 1 form is supported by this helper.
 8593   assert(IdxMulOpd == 1);
 8594
 8595   Register NewVR =
 8596       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
 8597   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
 8598                           FMAInstKind::Accumulator, &NewVR);
 8599 }
8600
8601/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8602/// instructions.
8603///
8604/// \see genFusedMultiply
// Thin wrapper that forwards to genFusedMultiply.
// NOTE(review): original lines 8605-8607 (signature) and 8610 (the final
// argument — presumably FMAInstKind::Indexed, matching the explicit use in
// the *IdxNeg variant below) are missing from this extract.
 8608     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
 8609   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
 8611 }
8612
8613/// genFusedMultiplyIdxNeg - Helper to generate fused multiply, lane-indexed
8614/// instructions with an additional negation of the accumulator
// Negates the accumulator via genNeg, then emits the lane-indexed form of
// the fused multiply with the negated value as the replaced addend.
// NOTE(review): original lines 8615-8617 (function name and leading
// parameters) are missing from this extract.
 8618     DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
 8619     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
 // Only the IdxMulOpd == 1 form is supported by this helper.
 8620   assert(IdxMulOpd == 1);
 8621
 8622   Register NewVR =
 8623       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
 8624
 8625   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
 8626                           FMAInstKind::Indexed, &NewVR);
 8627 }
8628
8629/// genMaddR - Generate madd instruction and combine mul and add using
8630/// an extra virtual register
8631/// Example - an ADD intermediate needs to be stored in a register:
8632/// MUL I=A,B,0
8633/// ADD R,I,Imm
8634/// ==> ORR V, ZR, Imm
8635/// ==> MADD R,A,B,V
8636/// \param MF Containing MachineFunction
8637/// \param MRI Register information
8638/// \param TII Target information
8639/// \param Root is the ADD instruction
8640/// \param [out] InsInstrs is a vector of machine instructions and will
8641/// contain the generated madd instruction
8642/// \param IdxMulOpd is index of operand in Root that is the result of
8643/// the MUL. In the example above IdxMulOpd is 1.
8644/// \param MaddOpc the opcode fo the madd instruction
8645/// \param VR is a virtual register that holds the value of an ADD operand
8646/// (V in the example above).
8647/// \param RC Register class of operands
// Builds MaddOpc with the MUL's two sources plus the pre-materialized
// addend register VR (see the doc comment above for the ORR/MADD example),
// constrains the registers to RC, appends the new instruction to InsInstrs
// and returns the matched MUL for deletion.
// NOTE(review): original lines 8648 (function name line), 8650 (presumably
// the InsInstrs parameter), 8668 (presumably the virtual-register guard
// for VR, matching the guards above) and 8671 (the declaration of MIB) are
// missing from this extract.
 8649                             const TargetInstrInfo *TII, MachineInstr &Root,
 8651                             unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
 8652                             const TargetRegisterClass *RC) {
 8653   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
 8654
 8655   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
 8656   Register ResultReg = Root.getOperand(0).getReg();
 8657   Register SrcReg0 = MUL->getOperand(1).getReg();
 8658   bool Src0IsKill = MUL->getOperand(1).isKill();
 8659   Register SrcReg1 = MUL->getOperand(2).getReg();
 8660   bool Src1IsKill = MUL->getOperand(2).isKill();
 8661
 8662   if (ResultReg.isVirtual())
 8663     MRI.constrainRegClass(ResultReg, RC);
 8664   if (SrcReg0.isVirtual())
 8665     MRI.constrainRegClass(SrcReg0, RC);
 8666   if (SrcReg1.isVirtual())
 8667     MRI.constrainRegClass(SrcReg1, RC);
 8669     MRI.constrainRegClass(VR, RC);
 8670
 8672       BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
 8673           .addReg(SrcReg0, getKillRegState(Src0IsKill))
 8674           .addReg(SrcReg1, getKillRegState(Src1IsKill))
 8675           .addReg(VR);
 8676   // Insert the MADD
 8677   InsInstrs.push_back(MIB);
 8678   return MUL;
 8679 }
8680
8681/// Do the following transformation
8682/// A - (B + C) ==> (A - B) - C
8683/// A - (B + C) ==> (A - C) - B
// Implements the transformation documented above: splits A - (B + C) into
// two chained SUBs, choosing which ADD operand is subtracted first via
// IdxOpd1.  The intermediate result lives in a fresh virtual register; the
// original SUB and ADD are scheduled for deletion.
// NOTE(review): original lines 8684 (function name line), 8686-8687
// (presumably the InsInstrs/DelInstrs parameters used below) and 8702 (the
// createVirtualRegister call initializing NewVR) are missing from this
// extract.
 8685                              const TargetInstrInfo *TII, MachineInstr &Root,
 8688                              unsigned IdxOpd1,
 8689                              DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
 8690   assert(IdxOpd1 == 1 || IdxOpd1 == 2);
 8691   unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
 8692   MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
 8693
 8694   Register ResultReg = Root.getOperand(0).getReg();
 8695   Register RegA = Root.getOperand(1).getReg();
 8696   bool RegAIsKill = Root.getOperand(1).isKill();
 8697   Register RegB = AddMI->getOperand(IdxOpd1).getReg();
 8698   bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
 8699   Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
 8700   bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
 8701   Register NewVR =
 8703
 // A flag-setting root is only matched when NZCV is dead (see
 // getMiscPatterns), so the replacement uses the non-flag-setting SUB.
 8704   unsigned Opcode = Root.getOpcode();
 8705   if (Opcode == AArch64::SUBSWrr)
 8706     Opcode = AArch64::SUBWrr;
 8707   else if (Opcode == AArch64::SUBSXrr)
 8708     Opcode = AArch64::SUBXrr;
 8709   else
 8710     assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
 8711            "Unexpected instruction opcode.");
 8712
 // The re-associated form can wrap where the original could not, so the
 // no-wrap flags must be dropped from the merged flag set.
 8713   uint32_t Flags = Root.mergeFlagsWith(*AddMI);
 8714   Flags &= ~MachineInstr::NoSWrap;
 8715   Flags &= ~MachineInstr::NoUWrap;
 8716
 8717   MachineInstrBuilder MIB1 =
 8718       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
 8719           .addReg(RegA, getKillRegState(RegAIsKill))
 8720           .addReg(RegB, getKillRegState(RegBIsKill))
 8721           .setMIFlags(Flags);
 8722   MachineInstrBuilder MIB2 =
 8723       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
 8724           .addReg(NewVR, getKillRegState(true))
 8725           .addReg(RegC, getKillRegState(RegCIsKill))
 8726           .setMIFlags(Flags);
 8727
 8728   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
 8729   InsInstrs.push_back(MIB1);
 8730   InsInstrs.push_back(MIB2);
 8731   DelInstrs.push_back(AddMI);
 8732   DelInstrs.push_back(&Root);
 8733 }
8734
8735unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8736 unsigned int AccumulatorOpCode) const {
8737 switch (AccumulatorOpCode) {
8738 case AArch64::UABALB_ZZZ_D:
8739 case AArch64::SABALB_ZZZ_D:
8740 case AArch64::UABALT_ZZZ_D:
8741 case AArch64::SABALT_ZZZ_D:
8742 return AArch64::ADD_ZZZ_D;
8743 case AArch64::UABALB_ZZZ_H:
8744 case AArch64::SABALB_ZZZ_H:
8745 case AArch64::UABALT_ZZZ_H:
8746 case AArch64::SABALT_ZZZ_H:
8747 return AArch64::ADD_ZZZ_H;
8748 case AArch64::UABALB_ZZZ_S:
8749 case AArch64::SABALB_ZZZ_S:
8750 case AArch64::UABALT_ZZZ_S:
8751 case AArch64::SABALT_ZZZ_S:
8752 return AArch64::ADD_ZZZ_S;
8753 case AArch64::UABALv16i8_v8i16:
8754 case AArch64::SABALv8i8_v8i16:
8755 case AArch64::SABAv8i16:
8756 case AArch64::UABAv8i16:
8757 return AArch64::ADDv8i16;
8758 case AArch64::SABALv2i32_v2i64:
8759 case AArch64::UABALv2i32_v2i64:
8760 case AArch64::SABALv4i32_v2i64:
8761 return AArch64::ADDv2i64;
8762 case AArch64::UABALv4i16_v4i32:
8763 case AArch64::SABALv4i16_v4i32:
8764 case AArch64::SABALv8i16_v4i32:
8765 case AArch64::SABAv4i32:
8766 case AArch64::UABAv4i32:
8767 return AArch64::ADDv4i32;
8768 case AArch64::UABALv4i32_v2i64:
8769 return AArch64::ADDv2i64;
8770 case AArch64::UABALv8i16_v4i32:
8771 return AArch64::ADDv4i32;
8772 case AArch64::UABALv8i8_v8i16:
8773 case AArch64::SABALv16i8_v8i16:
8774 return AArch64::ADDv8i16;
8775 case AArch64::UABAv16i8:
8776 case AArch64::SABAv16i8:
8777 return AArch64::ADDv16i8;
8778 case AArch64::UABAv4i16:
8779 case AArch64::SABAv4i16:
8780 return AArch64::ADDv4i16;
8781 case AArch64::UABAv2i32:
8782 case AArch64::SABAv2i32:
8783 return AArch64::ADDv2i32;
8784 case AArch64::UABAv8i8:
8785 case AArch64::SABAv8i8:
8786 return AArch64::ADDv8i8;
8787 default:
8788 llvm_unreachable("Unknown accumulator opcode");
8789 }
8790}
8791
8792/// When getMachineCombinerPatterns() finds potential patterns,
8793/// this function generates the instructions that could replace the
8794/// original code sequence
8795void AArch64InstrInfo::genAlternativeCodeSequence(
8796 MachineInstr &Root, unsigned Pattern,
8799 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8800 MachineBasicBlock &MBB = *Root.getParent();
8801 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8802 MachineFunction &MF = *MBB.getParent();
8803 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8804
8805 MachineInstr *MUL = nullptr;
8806 const TargetRegisterClass *RC;
8807 unsigned Opc;
8808 switch (Pattern) {
8809 default:
8810 // Reassociate instructions.
8811 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8812 DelInstrs, InstrIdxForVirtReg);
8813 return;
8815 // A - (B + C)
8816 // ==> (A - B) - C
8817 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
8818 InstrIdxForVirtReg);
8819 return;
8821 // A - (B + C)
8822 // ==> (A - C) - B
8823 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
8824 InstrIdxForVirtReg);
8825 return;
8828 // MUL I=A,B,0
8829 // ADD R,I,C
8830 // ==> MADD R,A,B,C
8831 // --- Create(MADD);
8833 Opc = AArch64::MADDWrrr;
8834 RC = &AArch64::GPR32RegClass;
8835 } else {
8836 Opc = AArch64::MADDXrrr;
8837 RC = &AArch64::GPR64RegClass;
8838 }
8839 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8840 break;
8843 // MUL I=A,B,0
8844 // ADD R,C,I
8845 // ==> MADD R,A,B,C
8846 // --- Create(MADD);
8848 Opc = AArch64::MADDWrrr;
8849 RC = &AArch64::GPR32RegClass;
8850 } else {
8851 Opc = AArch64::MADDXrrr;
8852 RC = &AArch64::GPR64RegClass;
8853 }
8854 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8855 break;
8860 // MUL I=A,B,0
8861 // ADD/SUB R,I,Imm
8862 // ==> MOV V, Imm/-Imm
8863 // ==> MADD R,A,B,V
8864 // --- Create(MADD);
8865 const TargetRegisterClass *RC;
8866 unsigned BitSize, MovImm;
8869 MovImm = AArch64::MOVi32imm;
8870 RC = &AArch64::GPR32spRegClass;
8871 BitSize = 32;
8872 Opc = AArch64::MADDWrrr;
8873 RC = &AArch64::GPR32RegClass;
8874 } else {
8875 MovImm = AArch64::MOVi64imm;
8876 RC = &AArch64::GPR64spRegClass;
8877 BitSize = 64;
8878 Opc = AArch64::MADDXrrr;
8879 RC = &AArch64::GPR64RegClass;
8880 }
8881 Register NewVR = MRI.createVirtualRegister(RC);
8882 uint64_t Imm = Root.getOperand(2).getImm();
8883
8884 if (Root.getOperand(3).isImm()) {
8885 unsigned Val = Root.getOperand(3).getImm();
8886 Imm = Imm << Val;
8887 }
8888 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8890 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
8891 // Check that the immediate can be composed via a single instruction.
8893 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
8894 if (Insn.size() != 1)
8895 return;
8896 MachineInstrBuilder MIB1 =
8897 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
8898 .addImm(IsSub ? -Imm : Imm);
8899 InsInstrs.push_back(MIB1);
8900 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8901 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8902 break;
8903 }
8906 // MUL I=A,B,0
8907 // SUB R,I, C
8908 // ==> SUB V, 0, C
8909 // ==> MADD R,A,B,V // = -C + A*B
8910 // --- Create(MADD);
8911 const TargetRegisterClass *SubRC;
8912 unsigned SubOpc, ZeroReg;
8914 SubOpc = AArch64::SUBWrr;
8915 SubRC = &AArch64::GPR32spRegClass;
8916 ZeroReg = AArch64::WZR;
8917 Opc = AArch64::MADDWrrr;
8918 RC = &AArch64::GPR32RegClass;
8919 } else {
8920 SubOpc = AArch64::SUBXrr;
8921 SubRC = &AArch64::GPR64spRegClass;
8922 ZeroReg = AArch64::XZR;
8923 Opc = AArch64::MADDXrrr;
8924 RC = &AArch64::GPR64RegClass;
8925 }
8926 Register NewVR = MRI.createVirtualRegister(SubRC);
8927 // SUB NewVR, 0, C
8928 MachineInstrBuilder MIB1 =
8929 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
8930 .addReg(ZeroReg)
8931 .add(Root.getOperand(2));
8932 InsInstrs.push_back(MIB1);
8933 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8934 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8935 break;
8936 }
8939 // MUL I=A,B,0
8940 // SUB R,C,I
8941 // ==> MSUB R,A,B,C (computes C - A*B)
8942 // --- Create(MSUB);
8944 Opc = AArch64::MSUBWrrr;
8945 RC = &AArch64::GPR32RegClass;
8946 } else {
8947 Opc = AArch64::MSUBXrrr;
8948 RC = &AArch64::GPR64RegClass;
8949 }
8950 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8951 break;
8953 Opc = AArch64::MLAv8i8;
8954 RC = &AArch64::FPR64RegClass;
8955 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8956 break;
8958 Opc = AArch64::MLAv8i8;
8959 RC = &AArch64::FPR64RegClass;
8960 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8961 break;
8963 Opc = AArch64::MLAv16i8;
8964 RC = &AArch64::FPR128RegClass;
8965 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8966 break;
8968 Opc = AArch64::MLAv16i8;
8969 RC = &AArch64::FPR128RegClass;
8970 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8971 break;
8973 Opc = AArch64::MLAv4i16;
8974 RC = &AArch64::FPR64RegClass;
8975 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8976 break;
8978 Opc = AArch64::MLAv4i16;
8979 RC = &AArch64::FPR64RegClass;
8980 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8981 break;
8983 Opc = AArch64::MLAv8i16;
8984 RC = &AArch64::FPR128RegClass;
8985 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8986 break;
8988 Opc = AArch64::MLAv8i16;
8989 RC = &AArch64::FPR128RegClass;
8990 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8991 break;
8993 Opc = AArch64::MLAv2i32;
8994 RC = &AArch64::FPR64RegClass;
8995 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8996 break;
8998 Opc = AArch64::MLAv2i32;
8999 RC = &AArch64::FPR64RegClass;
9000 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9001 break;
9003 Opc = AArch64::MLAv4i32;
9004 RC = &AArch64::FPR128RegClass;
9005 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9006 break;
9008 Opc = AArch64::MLAv4i32;
9009 RC = &AArch64::FPR128RegClass;
9010 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9011 break;
9012
9014 Opc = AArch64::MLAv8i8;
9015 RC = &AArch64::FPR64RegClass;
9016 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9017 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
9018 RC);
9019 break;
9021 Opc = AArch64::MLSv8i8;
9022 RC = &AArch64::FPR64RegClass;
9023 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9024 break;
9026 Opc = AArch64::MLAv16i8;
9027 RC = &AArch64::FPR128RegClass;
9028 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9029 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
9030 RC);
9031 break;
9033 Opc = AArch64::MLSv16i8;
9034 RC = &AArch64::FPR128RegClass;
9035 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9036 break;
9038 Opc = AArch64::MLAv4i16;
9039 RC = &AArch64::FPR64RegClass;
9040 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9041 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9042 RC);
9043 break;
9045 Opc = AArch64::MLSv4i16;
9046 RC = &AArch64::FPR64RegClass;
9047 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9048 break;
9050 Opc = AArch64::MLAv8i16;
9051 RC = &AArch64::FPR128RegClass;
9052 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9053 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9054 RC);
9055 break;
9057 Opc = AArch64::MLSv8i16;
9058 RC = &AArch64::FPR128RegClass;
9059 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9060 break;
9062 Opc = AArch64::MLAv2i32;
9063 RC = &AArch64::FPR64RegClass;
9064 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9065 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9066 RC);
9067 break;
9069 Opc = AArch64::MLSv2i32;
9070 RC = &AArch64::FPR64RegClass;
9071 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9072 break;
9074 Opc = AArch64::MLAv4i32;
9075 RC = &AArch64::FPR128RegClass;
9076 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9077 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9078 RC);
9079 break;
9081 Opc = AArch64::MLSv4i32;
9082 RC = &AArch64::FPR128RegClass;
9083 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9084 break;
9085
9087 Opc = AArch64::MLAv4i16_indexed;
9088 RC = &AArch64::FPR64RegClass;
9089 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9090 break;
9092 Opc = AArch64::MLAv4i16_indexed;
9093 RC = &AArch64::FPR64RegClass;
9094 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9095 break;
9097 Opc = AArch64::MLAv8i16_indexed;
9098 RC = &AArch64::FPR128RegClass;
9099 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9100 break;
9102 Opc = AArch64::MLAv8i16_indexed;
9103 RC = &AArch64::FPR128RegClass;
9104 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9105 break;
9107 Opc = AArch64::MLAv2i32_indexed;
9108 RC = &AArch64::FPR64RegClass;
9109 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9110 break;
9112 Opc = AArch64::MLAv2i32_indexed;
9113 RC = &AArch64::FPR64RegClass;
9114 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9115 break;
9117 Opc = AArch64::MLAv4i32_indexed;
9118 RC = &AArch64::FPR128RegClass;
9119 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9120 break;
9122 Opc = AArch64::MLAv4i32_indexed;
9123 RC = &AArch64::FPR128RegClass;
9124 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9125 break;
9126
9128 Opc = AArch64::MLAv4i16_indexed;
9129 RC = &AArch64::FPR64RegClass;
9130 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9131 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9132 RC);
9133 break;
9135 Opc = AArch64::MLSv4i16_indexed;
9136 RC = &AArch64::FPR64RegClass;
9137 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9138 break;
9140 Opc = AArch64::MLAv8i16_indexed;
9141 RC = &AArch64::FPR128RegClass;
9142 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9143 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9144 RC);
9145 break;
9147 Opc = AArch64::MLSv8i16_indexed;
9148 RC = &AArch64::FPR128RegClass;
9149 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9150 break;
9152 Opc = AArch64::MLAv2i32_indexed;
9153 RC = &AArch64::FPR64RegClass;
9154 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9155 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9156 RC);
9157 break;
9159 Opc = AArch64::MLSv2i32_indexed;
9160 RC = &AArch64::FPR64RegClass;
9161 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9162 break;
9164 Opc = AArch64::MLAv4i32_indexed;
9165 RC = &AArch64::FPR128RegClass;
9166 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9167 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9168 RC);
9169 break;
9171 Opc = AArch64::MLSv4i32_indexed;
9172 RC = &AArch64::FPR128RegClass;
9173 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9174 break;
9175
9176 // Floating Point Support
9178 Opc = AArch64::FMADDHrrr;
9179 RC = &AArch64::FPR16RegClass;
9180 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9181 break;
9183 Opc = AArch64::FMADDSrrr;
9184 RC = &AArch64::FPR32RegClass;
9185 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9186 break;
9188 Opc = AArch64::FMADDDrrr;
9189 RC = &AArch64::FPR64RegClass;
9190 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9191 break;
9192
9194 Opc = AArch64::FMADDHrrr;
9195 RC = &AArch64::FPR16RegClass;
9196 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9197 break;
9199 Opc = AArch64::FMADDSrrr;
9200 RC = &AArch64::FPR32RegClass;
9201 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9202 break;
9204 Opc = AArch64::FMADDDrrr;
9205 RC = &AArch64::FPR64RegClass;
9206 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9207 break;
9208
9210 Opc = AArch64::FMLAv1i32_indexed;
9211 RC = &AArch64::FPR32RegClass;
9212 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9214 break;
9216 Opc = AArch64::FMLAv1i32_indexed;
9217 RC = &AArch64::FPR32RegClass;
9218 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9220 break;
9221
9223 Opc = AArch64::FMLAv1i64_indexed;
9224 RC = &AArch64::FPR64RegClass;
9225 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9227 break;
9229 Opc = AArch64::FMLAv1i64_indexed;
9230 RC = &AArch64::FPR64RegClass;
9231 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9233 break;
9234
9236 RC = &AArch64::FPR64RegClass;
9237 Opc = AArch64::FMLAv4i16_indexed;
9238 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9240 break;
9242 RC = &AArch64::FPR64RegClass;
9243 Opc = AArch64::FMLAv4f16;
9244 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9246 break;
9248 RC = &AArch64::FPR64RegClass;
9249 Opc = AArch64::FMLAv4i16_indexed;
9250 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9252 break;
9254 RC = &AArch64::FPR64RegClass;
9255 Opc = AArch64::FMLAv4f16;
9256 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9258 break;
9259
9262 RC = &AArch64::FPR64RegClass;
9264 Opc = AArch64::FMLAv2i32_indexed;
9265 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9267 } else {
9268 Opc = AArch64::FMLAv2f32;
9269 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9271 }
9272 break;
9275 RC = &AArch64::FPR64RegClass;
9277 Opc = AArch64::FMLAv2i32_indexed;
9278 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9280 } else {
9281 Opc = AArch64::FMLAv2f32;
9282 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9284 }
9285 break;
9286
9288 RC = &AArch64::FPR128RegClass;
9289 Opc = AArch64::FMLAv8i16_indexed;
9290 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9292 break;
9294 RC = &AArch64::FPR128RegClass;
9295 Opc = AArch64::FMLAv8f16;
9296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9298 break;
9300 RC = &AArch64::FPR128RegClass;
9301 Opc = AArch64::FMLAv8i16_indexed;
9302 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9304 break;
9306 RC = &AArch64::FPR128RegClass;
9307 Opc = AArch64::FMLAv8f16;
9308 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9310 break;
9311
9314 RC = &AArch64::FPR128RegClass;
9316 Opc = AArch64::FMLAv2i64_indexed;
9317 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9319 } else {
9320 Opc = AArch64::FMLAv2f64;
9321 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9323 }
9324 break;
9327 RC = &AArch64::FPR128RegClass;
9329 Opc = AArch64::FMLAv2i64_indexed;
9330 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9332 } else {
9333 Opc = AArch64::FMLAv2f64;
9334 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9336 }
9337 break;
9338
9341 RC = &AArch64::FPR128RegClass;
9343 Opc = AArch64::FMLAv4i32_indexed;
9344 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9346 } else {
9347 Opc = AArch64::FMLAv4f32;
9348 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9350 }
9351 break;
9352
9355 RC = &AArch64::FPR128RegClass;
9357 Opc = AArch64::FMLAv4i32_indexed;
9358 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9360 } else {
9361 Opc = AArch64::FMLAv4f32;
9362 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9364 }
9365 break;
9366
9368 Opc = AArch64::FNMSUBHrrr;
9369 RC = &AArch64::FPR16RegClass;
9370 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9371 break;
9373 Opc = AArch64::FNMSUBSrrr;
9374 RC = &AArch64::FPR32RegClass;
9375 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9376 break;
9378 Opc = AArch64::FNMSUBDrrr;
9379 RC = &AArch64::FPR64RegClass;
9380 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9381 break;
9382
9384 Opc = AArch64::FNMADDHrrr;
9385 RC = &AArch64::FPR16RegClass;
9386 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9387 break;
9389 Opc = AArch64::FNMADDSrrr;
9390 RC = &AArch64::FPR32RegClass;
9391 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9392 break;
9394 Opc = AArch64::FNMADDDrrr;
9395 RC = &AArch64::FPR64RegClass;
9396 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9397 break;
9398
9400 Opc = AArch64::FMSUBHrrr;
9401 RC = &AArch64::FPR16RegClass;
9402 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9403 break;
9405 Opc = AArch64::FMSUBSrrr;
9406 RC = &AArch64::FPR32RegClass;
9407 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9408 break;
9410 Opc = AArch64::FMSUBDrrr;
9411 RC = &AArch64::FPR64RegClass;
9412 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9413 break;
9414
9416 Opc = AArch64::FMLSv1i32_indexed;
9417 RC = &AArch64::FPR32RegClass;
9418 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9420 break;
9421
9423 Opc = AArch64::FMLSv1i64_indexed;
9424 RC = &AArch64::FPR64RegClass;
9425 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9427 break;
9428
9431 RC = &AArch64::FPR64RegClass;
9432 Register NewVR = MRI.createVirtualRegister(RC);
9433 MachineInstrBuilder MIB1 =
9434 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9435 .add(Root.getOperand(2));
9436 InsInstrs.push_back(MIB1);
9437 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9439 Opc = AArch64::FMLAv4f16;
9440 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9441 FMAInstKind::Accumulator, &NewVR);
9442 } else {
9443 Opc = AArch64::FMLAv4i16_indexed;
9444 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9445 FMAInstKind::Indexed, &NewVR);
9446 }
9447 break;
9448 }
9450 RC = &AArch64::FPR64RegClass;
9451 Opc = AArch64::FMLSv4f16;
9452 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9454 break;
9456 RC = &AArch64::FPR64RegClass;
9457 Opc = AArch64::FMLSv4i16_indexed;
9458 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9460 break;
9461
9464 RC = &AArch64::FPR64RegClass;
9466 Opc = AArch64::FMLSv2i32_indexed;
9467 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9469 } else {
9470 Opc = AArch64::FMLSv2f32;
9471 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9473 }
9474 break;
9475
9478 RC = &AArch64::FPR128RegClass;
9479 Register NewVR = MRI.createVirtualRegister(RC);
9480 MachineInstrBuilder MIB1 =
9481 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9482 .add(Root.getOperand(2));
9483 InsInstrs.push_back(MIB1);
9484 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9486 Opc = AArch64::FMLAv8f16;
9487 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9488 FMAInstKind::Accumulator, &NewVR);
9489 } else {
9490 Opc = AArch64::FMLAv8i16_indexed;
9491 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9492 FMAInstKind::Indexed, &NewVR);
9493 }
9494 break;
9495 }
9497 RC = &AArch64::FPR128RegClass;
9498 Opc = AArch64::FMLSv8f16;
9499 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9501 break;
9503 RC = &AArch64::FPR128RegClass;
9504 Opc = AArch64::FMLSv8i16_indexed;
9505 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9507 break;
9508
9511 RC = &AArch64::FPR128RegClass;
9513 Opc = AArch64::FMLSv2i64_indexed;
9514 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9516 } else {
9517 Opc = AArch64::FMLSv2f64;
9518 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9520 }
9521 break;
9522
9525 RC = &AArch64::FPR128RegClass;
9527 Opc = AArch64::FMLSv4i32_indexed;
9528 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9530 } else {
9531 Opc = AArch64::FMLSv4f32;
9532 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9534 }
9535 break;
9538 RC = &AArch64::FPR64RegClass;
9539 Register NewVR = MRI.createVirtualRegister(RC);
9540 MachineInstrBuilder MIB1 =
9541 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9542 .add(Root.getOperand(2));
9543 InsInstrs.push_back(MIB1);
9544 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9546 Opc = AArch64::FMLAv2i32_indexed;
9547 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9548 FMAInstKind::Indexed, &NewVR);
9549 } else {
9550 Opc = AArch64::FMLAv2f32;
9551 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9552 FMAInstKind::Accumulator, &NewVR);
9553 }
9554 break;
9555 }
9558 RC = &AArch64::FPR128RegClass;
9559 Register NewVR = MRI.createVirtualRegister(RC);
9560 MachineInstrBuilder MIB1 =
9561 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9562 .add(Root.getOperand(2));
9563 InsInstrs.push_back(MIB1);
9564 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9566 Opc = AArch64::FMLAv4i32_indexed;
9567 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9568 FMAInstKind::Indexed, &NewVR);
9569 } else {
9570 Opc = AArch64::FMLAv4f32;
9571 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9572 FMAInstKind::Accumulator, &NewVR);
9573 }
9574 break;
9575 }
9578 RC = &AArch64::FPR128RegClass;
9579 Register NewVR = MRI.createVirtualRegister(RC);
9580 MachineInstrBuilder MIB1 =
9581 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9582 .add(Root.getOperand(2));
9583 InsInstrs.push_back(MIB1);
9584 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9586 Opc = AArch64::FMLAv2i64_indexed;
9587 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9588 FMAInstKind::Indexed, &NewVR);
9589 } else {
9590 Opc = AArch64::FMLAv2f64;
9591 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9592 FMAInstKind::Accumulator, &NewVR);
9593 }
9594 break;
9595 }
9598 unsigned IdxDupOp =
9600 : 2;
9601 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9602 &AArch64::FPR128RegClass, MRI);
9603 break;
9604 }
9607 unsigned IdxDupOp =
9609 : 2;
9610 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9611 &AArch64::FPR128RegClass, MRI);
9612 break;
9613 }
9616 unsigned IdxDupOp =
9618 : 2;
9619 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9620 &AArch64::FPR128_loRegClass, MRI);
9621 break;
9622 }
9625 unsigned IdxDupOp =
9627 : 2;
9628 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9629 &AArch64::FPR128RegClass, MRI);
9630 break;
9631 }
9634 unsigned IdxDupOp =
9636 : 2;
9637 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9638 &AArch64::FPR128_loRegClass, MRI);
9639 break;
9640 }
9642 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9643 break;
9644 }
9646 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9647 Pattern, 4);
9648 break;
9649 }
9651 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9652 Pattern, 8);
9653 break;
9654 }
9656 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9657 Pattern, 16);
9658 break;
9659 }
9660
9661 } // end switch (Pattern)
9662 // Record MUL and ADD/SUB for deletion
9663 if (MUL)
9664 DelInstrs.push_back(MUL);
9665 DelInstrs.push_back(&Root);
9666
9667 // Set the flags on the inserted instructions to be the merged flags of the
9668 // instructions that we have combined.
9669 uint32_t Flags = Root.getFlags();
9670 if (MUL)
9671 Flags = Root.mergeFlagsWith(*MUL);
9672 for (auto *MI : InsInstrs)
9673 MI->setFlags(Flags);
9674}
9675
9676/// Replace csincr-branch sequence by simple conditional branch
9677///
9678/// Examples:
9679/// 1. \code
9680/// csinc w9, wzr, wzr, <condition code>
9681/// tbnz w9, #0, 0x44
9682/// \endcode
9683/// to
9684/// \code
9685/// b.<inverted condition code>
9686/// \endcode
9687///
9688/// 2. \code
9689/// csinc w9, wzr, wzr, <condition code>
9690/// tbz w9, #0, 0x44
9691/// \endcode
9692/// to
9693/// \code
9694/// b.<condition code>
9695/// \endcode
9696///
9697/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
9698/// compare's constant operand is power of 2.
9699///
9700/// Examples:
9701/// \code
9702/// and w8, w8, #0x400
9703/// cbnz w8, L1
9704/// \endcode
9705/// to
9706/// \code
9707/// tbnz w8, #10, L1
9708/// \endcode
9709///
9710/// \param MI Conditional Branch
9711/// \return True when the simple conditional branch is generated
9712///
// Classify the branch: Bcc and the CB* compare-and-branch family are not
// optimized here; CBZ/CBNZ carry the target block as operand 1, TBZ/TBNZ as
// operand 2 (operand 1 is the bit number).
9714 bool IsNegativeBranch = false;
9715 bool IsTestAndBranch = false;
9716 unsigned TargetBBInMI = 0;
9717 switch (MI.getOpcode()) {
9718 default:
9719 llvm_unreachable("Unknown branch instruction?");
9720 case AArch64::Bcc:
9721 case AArch64::CBWPri:
9722 case AArch64::CBXPri:
9723 case AArch64::CBBAssertExt:
9724 case AArch64::CBHAssertExt:
9725 case AArch64::CBWPrr:
9726 case AArch64::CBXPrr:
9727 return false;
9728 case AArch64::CBZW:
9729 case AArch64::CBZX:
9730 TargetBBInMI = 1;
9731 break;
9732 case AArch64::CBNZW:
9733 case AArch64::CBNZX:
9734 TargetBBInMI = 1;
9735 IsNegativeBranch = true;
9736 break;
9737 case AArch64::TBZW:
9738 case AArch64::TBZX:
9739 TargetBBInMI = 2;
9740 IsTestAndBranch = true;
9741 break;
9742 case AArch64::TBNZW:
9743 case AArch64::TBNZX:
9744 TargetBBInMI = 2;
9745 IsNegativeBranch = true;
9746 IsTestAndBranch = true;
9747 break;
9748 }
9749 // So we increment a zero register and test for bits other
9750 // than bit 0? Conservatively bail out in case the verifier
9751 // missed this case.
9752 if (IsTestAndBranch && MI.getOperand(1).getImm())
9753 return false;
9754
9755 // Find Definition.
9756 assert(MI.getParent() && "Incomplete machine instruction\n");
9757 MachineBasicBlock *MBB = MI.getParent();
9758 MachineFunction *MF = MBB->getParent();
9759 MachineRegisterInfo *MRI = &MF->getRegInfo();
// Only virtual registers can be traced back to a unique defining instruction.
9760 Register VReg = MI.getOperand(0).getReg();
9761 if (!VReg.isVirtual())
9762 return false;
9763
9764 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9765
9766 // Look through COPY instructions to find definition.
// Each copied vreg must have a single def and a single non-debug use so the
// chain can safely be bypassed.
9767 while (DefMI->isCopy()) {
9768 Register CopyVReg = DefMI->getOperand(1).getReg();
9769 if (!MRI->hasOneNonDBGUse(CopyVReg))
9770 return false;
9771 if (!MRI->hasOneDef(CopyVReg))
9772 return false;
9773 DefMI = MRI->getVRegDef(CopyVReg);
9774 }
9775
9776 switch (DefMI->getOpcode()) {
9777 default:
9778 return false;
9779 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9780 case AArch64::ANDWri:
9781 case AArch64::ANDXri: {
9782 if (IsTestAndBranch)
9783 return false;
9784 if (DefMI->getParent() != MBB)
9785 return false;
9786 if (!MRI->hasOneNonDBGUse(VReg))
9787 return false;
9788
9789 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
// NOTE(review): this listing dropped the preceding line, which computes the
// decoded logical-immediate `Mask` from DefMI's operand 2 — confirm against
// upstream before editing.
9791 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
9792 if (!isPowerOf2_64(Mask))
9793 return false;
9794
9795 MachineOperand &MO = DefMI->getOperand(1);
9796 Register NewReg = MO.getReg();
9797 if (!NewReg.isVirtual())
9798 return false;
9799
9800 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9801
9802 MachineBasicBlock &RefToMBB = *MBB;
9803 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
9804 DebugLoc DL = MI.getDebugLoc();
// The tested bit is the log2 of the single-bit mask; bits < 32 must use the
// W-form of TBZ/TBNZ (see comment below).
9805 unsigned Imm = Log2_64(Mask);
9806 unsigned Opc = (Imm < 32)
9807 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9808 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9809 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
9810 .addReg(NewReg)
9811 .addImm(Imm)
9812 .addMBB(TBB);
9813 // Register lives on to the CBZ now.
9814 MO.setIsKill(false);
9815
9816 // For immediate smaller than 32, we need to use the 32-bit
9817 // variant (W) in all cases. Indeed the 64-bit variant does not
9818 // allow to encode them.
9819 // Therefore, if the input register is 64-bit, we need to take the
9820 // 32-bit sub-part.
9821 if (!Is32Bit && Imm < 32)
9822 NewMI->getOperand(0).setSubReg(AArch64::sub_32)
9823 MI.eraseFromParent();
9824 return true;
9825 }
9826 // Look for CSINC
9827 case AArch64::CSINCWr:
9828 case AArch64::CSINCXr: {
// Only the materialize-condition idiom CSINC Rd, ZR, ZR, CC is foldable.
9829 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
9830 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
9831 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
9832 DefMI->getOperand(2).getReg() == AArch64::XZR))
9833 return false;
9834
// Bail out if the CSINC itself also defines NZCV.
9835 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
9836 true) != -1)
9837 return false;
9838
9839 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
9840 // Convert only when the condition code is not modified between
9841 // the CSINC and the branch. The CC may be used by other
9842 // instructions in between.
// NOTE(review): the guard line checking that NZCV is unmodified between DefMI
// and MI was dropped from this listing; this `return false` is its body.
9844 return false;
9845 MachineBasicBlock &RefToMBB = *MBB;
9846 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
9847 DebugLoc DL = MI.getDebugLoc();
9848 if (IsNegativeBranch)
// NOTE(review): a dropped line here inverts CC for the negative-branch case
// (see the "b.<inverted condition code>" example in the header comment).
9850 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
9851 MI.eraseFromParent();
9852 return true;
9853 }
9854 }
9855 }
9856
9857std::pair<unsigned, unsigned>
9858AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9859 const unsigned Mask = AArch64II::MO_FRAGMENT;
9860 return std::make_pair(TF & Mask, TF & ~Mask);
9861}
9862
9864AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9865 using namespace AArch64II;
9866
9867 static const std::pair<unsigned, const char *> TargetFlags[] = {
9868 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9869 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9870 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9871 {MO_HI12, "aarch64-hi12"}};
9872 return ArrayRef(TargetFlags);
9873}
9874
9876AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9877 using namespace AArch64II;
9878
9879 static const std::pair<unsigned, const char *> TargetFlags[] = {
9880 {MO_COFFSTUB, "aarch64-coffstub"},
9881 {MO_GOT, "aarch64-got"},
9882 {MO_NC, "aarch64-nc"},
9883 {MO_S, "aarch64-s"},
9884 {MO_TLS, "aarch64-tls"},
9885 {MO_DLLIMPORT, "aarch64-dllimport"},
9886 {MO_PREL, "aarch64-prel"},
9887 {MO_TAGGED, "aarch64-tagged"},
9888 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9889 };
9890 return ArrayRef(TargetFlags);
9891}
9892
9894AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9895 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9896 {{MOSuppressPair, "aarch64-suppress-pair"},
9897 {MOStridedAccess, "aarch64-strided-access"}};
9898 return ArrayRef(TargetFlags);
9899}
9900
9901/// Constants defining how certain sequences should be outlined.
9902/// This encompasses how an outlined function should be called, and what kind of
9903/// frame should be emitted for that outlined function.
9904///
9905/// \p MachineOutlinerDefault implies that the function should be called with
9906/// a save and restore of LR to the stack.
9907///
9908/// That is,
9909///
9910/// I1 Save LR OUTLINED_FUNCTION:
9911/// I2 --> BL OUTLINED_FUNCTION I1
9912/// I3 Restore LR I2
9913/// I3
9914/// RET
9915///
9916/// * Call construction overhead: 3 (save + BL + restore)
9917/// * Frame construction overhead: 1 (ret)
9918/// * Requires stack fixups? Yes
9919///
9920/// \p MachineOutlinerTailCall implies that the function is being created from
9921/// a sequence of instructions ending in a return.
9922///
9923/// That is,
9924///
9925/// I1 OUTLINED_FUNCTION:
9926/// I2 --> B OUTLINED_FUNCTION I1
9927/// RET I2
9928/// RET
9929///
9930/// * Call construction overhead: 1 (B)
9931/// * Frame construction overhead: 0 (Return included in sequence)
9932/// * Requires stack fixups? No
9933///
9934/// \p MachineOutlinerNoLRSave implies that the function should be called using
9935/// a BL instruction, but doesn't require LR to be saved and restored. This
9936/// happens when LR is known to be dead.
9937///
9938/// That is,
9939///
9940/// I1 OUTLINED_FUNCTION:
9941/// I2 --> BL OUTLINED_FUNCTION I1
9942/// I3 I2
9943/// I3
9944/// RET
9945///
9946/// * Call construction overhead: 1 (BL)
9947/// * Frame construction overhead: 1 (RET)
9948/// * Requires stack fixups? No
9949///
9950/// \p MachineOutlinerThunk implies that the function is being created from
9951/// a sequence of instructions ending in a call. The outlined function is
9952/// called with a BL instruction, and the outlined function tail-calls the
9953/// original call destination.
9954///
9955/// That is,
9956///
9957/// I1 OUTLINED_FUNCTION:
9958/// I2 --> BL OUTLINED_FUNCTION I1
9959/// BL f I2
9960/// B f
9961/// * Call construction overhead: 1 (BL)
9962/// * Frame construction overhead: 0
9963/// * Requires stack fixups? No
9964///
9965/// \p MachineOutlinerRegSave implies that the function should be called with a
9966/// save and restore of LR to an available register. This allows us to avoid
9967/// stack fixups. Note that this outlining variant is compatible with the
9968/// NoLRSave case.
9969///
9970/// That is,
9971///
9972/// I1 Save LR OUTLINED_FUNCTION:
9973/// I2 --> BL OUTLINED_FUNCTION I1
9974/// I3 Restore LR I2
9975/// I3
9976/// RET
9977///
9978/// * Call construction overhead: 3 (save + BL + restore)
9979/// * Frame construction overhead: 1 (ret)
9980/// * Requires stack fixups? No
9982 MachineOutlinerDefault, ///< Emit a save, restore, call, and return.
9983 MachineOutlinerTailCall, ///< Only emit a branch.
9984 MachineOutlinerNoLRSave, ///< Emit a call and return.
9985 MachineOutlinerThunk, ///< Emit a call and tail-call.
9986 MachineOutlinerRegSave ///< Same as default, but save to a register.
9987 };
9988
9994
9996 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
// Returns the first GPR64 from the allocation order that is not reserved, is
// not LR/X16/X17, and is free both inside and across/out of the candidate
// sequence; returns an invalid Register() when no such register exists.
9997 MachineFunction *MF = C.getMF();
9998 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
9999 const AArch64RegisterInfo *ARI =
10000 static_cast<const AArch64RegisterInfo *>(&TRI);
10001 // Check if there is an available register across the sequence that we can
10002 // use.
10003 for (unsigned Reg : AArch64::GPR64RegClass) {
10004 if (!ARI->isReservedReg(*MF, Reg) &&
10005 Reg != AArch64::LR && // LR is not reserved, but don't use it.
10006 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
10007 Reg != AArch64::X17 && // Ditto for X17.
10008 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
10009 C.isAvailableInsideSeq(Reg, TRI))
10010 return Reg;
10011 }
// No suitable scratch register found.
10012 return Register();
10013 }
10014
static bool
// NOTE(review): this listing dropped the signature line naming the helper and
// declaring parameter `a`; the caller below invokes it as
// outliningCandidatesSigningScopeConsensus(a, b) — confirm against upstream.
10017 const outliner::Candidate &b) {
10018 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10019 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10020
// True when both candidates' functions agree on their return-address-signing
// condition, so one outlined body can serve all call sites.
10021 return MFIa->getSignReturnAddressCondition() ==
// NOTE(review): the right-hand-side line (the matching MFIb->... call) was
// dropped from this listing.
10023 }
10024
static bool
// NOTE(review): this listing dropped the signature line; the caller below
// invokes this helper as outliningCandidatesSigningKeyConsensus(a, b).
10027 const outliner::Candidate &b) {
10028 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10029 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10030
// Both candidates must sign return addresses with the same key for a single
// outlined body to be valid at every call site.
10031 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10032 }
10033
10035 const outliner::Candidate &b) {
10036 const AArch64Subtarget &SubtargetA =
10038 const AArch64Subtarget &SubtargetB =
10039 b.getMF()->getSubtarget<AArch64Subtarget>();
10040 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10041}
10042
10043std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10044AArch64InstrInfo::getOutliningCandidateInfo(
10045 const MachineModuleInfo &MMI,
10046 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10047 unsigned MinRepeats) const {
10048 unsigned SequenceSize = 0;
10049 for (auto &MI : RepeatedSequenceLocs[0])
10050 SequenceSize += getInstSizeInBytes(MI);
10051
10052 unsigned NumBytesToCreateFrame = 0;
10053
10054 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
10055 // These instructions are fused together by the scheduler.
10056 // Any candidate where ADRP is the last instruction should be rejected
10057 // as that will lead to splitting ADRP pair.
10058 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10059 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10060 if (LastMI.getOpcode() == AArch64::ADRP &&
10061 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10062 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10063 return std::nullopt;
10064 }
10065
10066 // Similarly any candidate where the first instruction is ADD/LDR with a
10067 // page offset should be rejected to avoid ADRP splitting.
10068 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10069 FirstMI.getOpcode() == AArch64::LDRXui) &&
10070 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10071 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10072 return std::nullopt;
10073 }
10074
10075 // We only allow outlining for functions having exactly matching return
10076 // address signing attributes, i.e., all share the same value for the
10077 // attribute "sign-return-address" and all share the same type of key they
10078 // are signed with.
10079 // Additionally we require all functions to simultaneously either support
10080 // v8.3a features or not. Otherwise an outlined function could get signed
10081 // using dedicated v8.3 instructions and a call from a function that doesn't
10082 // support v8.3 instructions would therefore be invalid.
10083 if (std::adjacent_find(
10084 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
10085 [](const outliner::Candidate &a, const outliner::Candidate &b) {
10086 // Return true if a and b are non-equal w.r.t. return address
10087 // signing or support of v8.3a features
10088 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10089 outliningCandidatesSigningKeyConsensus(a, b) &&
10090 outliningCandidatesV8_3OpsConsensus(a, b)) {
10091 return false;
10092 }
10093 return true;
10094 }) != RepeatedSequenceLocs.end()) {
10095 return std::nullopt;
10096 }
10097
10098 // Since at this point all candidates agree on their return address signing
10099 // picking just one is fine. If the candidate functions potentially sign their
10100 // return addresses, the outlined function should do the same. Note that in
10101 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10102 // not certainly true that the outlined function will have to sign its return
10103 // address but this decision is made later, when the decision to outline
10104 // has already been made.
10105 // The same holds for the number of additional instructions we need: On
10106 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10107 // necessary. However, at this point we don't know if the outlined function
10108 // will have a RET instruction so we assume the worst.
10109 const TargetRegisterInfo &TRI = getRegisterInfo();
10110 // Performing a tail call may require extra checks when PAuth is enabled.
10111 // If PAuth is disabled, set it to zero for uniformity.
10112 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10113 const auto RASignCondition = RepeatedSequenceLocs[0]
10114 .getMF()
10115 ->getInfo<AArch64FunctionInfo>()
10116 ->getSignReturnAddressCondition();
10117 if (RASignCondition != SignReturnAddress::None) {
10118 // One PAC and one AUT instructions
10119 NumBytesToCreateFrame += 8;
10120
10121 // PAuth is enabled - set extra tail call cost, if any.
10122 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10123 *RepeatedSequenceLocs[0].getMF());
10124 NumBytesToCheckLRInTCEpilogue =
10126 // Checking the authenticated LR value may significantly impact
10127 // SequenceSize, so account for it for more precise results.
10128 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
10129 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10130
10131 // We have to check if sp modifying instructions would get outlined.
10132 // If so we only allow outlining if sp is unchanged overall, so matching
10133 // sub and add instructions are okay to outline, all other sp modifications
10134 // are not
10135 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10136 int SPValue = 0;
10137 for (auto &MI : C) {
10138 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
10139 switch (MI.getOpcode()) {
10140 case AArch64::ADDXri:
10141 case AArch64::ADDWri:
10142 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10143 assert(MI.getOperand(2).isImm() &&
10144 "Expected operand to be immediate");
10145 assert(MI.getOperand(1).isReg() &&
10146 "Expected operand to be a register");
10147 // Check if the add just increments sp. If so, we search for
10148 // matching sub instructions that decrement sp. If not, the
10149 // modification is illegal
10150 if (MI.getOperand(1).getReg() == AArch64::SP)
10151 SPValue += MI.getOperand(2).getImm();
10152 else
10153 return true;
10154 break;
10155 case AArch64::SUBXri:
10156 case AArch64::SUBWri:
10157 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10158 assert(MI.getOperand(2).isImm() &&
10159 "Expected operand to be immediate");
10160 assert(MI.getOperand(1).isReg() &&
10161 "Expected operand to be a register");
10162 // Check if the sub just decrements sp. If so, we search for
10163 // matching add instructions that increment sp. If not, the
10164 // modification is illegal
10165 if (MI.getOperand(1).getReg() == AArch64::SP)
10166 SPValue -= MI.getOperand(2).getImm();
10167 else
10168 return true;
10169 break;
10170 default:
10171 return true;
10172 }
10173 }
10174 }
10175 if (SPValue)
10176 return true;
10177 return false;
10178 };
10179 // Remove candidates with illegal stack modifying instructions
10180 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10181
10182 // If the sequence doesn't have enough candidates left, then we're done.
10183 if (RepeatedSequenceLocs.size() < MinRepeats)
10184 return std::nullopt;
10185 }
10186
10187 // Properties about candidate MBBs that hold for all of them.
10188 unsigned FlagsSetInAll = 0xF;
10189
10190 // Compute liveness information for each candidate, and set FlagsSetInAll.
10191 for (outliner::Candidate &C : RepeatedSequenceLocs)
10192 FlagsSetInAll &= C.Flags;
10193
10194 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10195
10196 // Helper lambda which sets call information for every candidate.
10197 auto SetCandidateCallInfo =
10198 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10199 for (outliner::Candidate &C : RepeatedSequenceLocs)
10200 C.setCallInfo(CallID, NumBytesForCall);
10201 };
10202
10203 unsigned FrameID = MachineOutlinerDefault;
10204 NumBytesToCreateFrame += 4;
10205
10206 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10207 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10208 });
10209
10210 // We check to see if CFI Instructions are present, and if they are
10211 // we find the number of CFI Instructions in the candidates.
10212 unsigned CFICount = 0;
10213 for (auto &I : RepeatedSequenceLocs[0]) {
10214 if (I.isCFIInstruction())
10215 CFICount++;
10216 }
10217
10218 // We compare the number of found CFI Instructions to the number of CFI
10219 // instructions in the parent function for each candidate. We must check this
10220 // since if we outline one of the CFI instructions in a function, we have to
10221 // outline them all for correctness. If we do not, the address offsets will be
10222 // incorrect between the two sections of the program.
10223 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10224 std::vector<MCCFIInstruction> CFIInstructions =
10225 C.getMF()->getFrameInstructions();
10226
10227 if (CFICount > 0 && CFICount != CFIInstructions.size())
10228 return std::nullopt;
10229 }
10230
10231 // Returns true if an instructions is safe to fix up, false otherwise.
10232 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10233 if (MI.isCall())
10234 return true;
10235
10236 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10237 !MI.readsRegister(AArch64::SP, &TRI))
10238 return true;
10239
10240 // Any modification of SP will break our code to save/restore LR.
10241 // FIXME: We could handle some instructions which add a constant
10242 // offset to SP, with a bit more work.
10243 if (MI.modifiesRegister(AArch64::SP, &TRI))
10244 return false;
10245
10246 // At this point, we have a stack instruction that we might need to
10247 // fix up. We'll handle it if it's a load or store.
10248 if (MI.mayLoadOrStore()) {
10249 const MachineOperand *Base; // Filled with the base operand of MI.
10250 int64_t Offset; // Filled with the offset of MI.
10251 bool OffsetIsScalable;
10252
10253 // Does it allow us to offset the base operand and is the base the
10254 // register SP?
10255 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10256 !Base->isReg() || Base->getReg() != AArch64::SP)
10257 return false;
10258
10259 // Fixe-up code below assumes bytes.
10260 if (OffsetIsScalable)
10261 return false;
10262
10263 // Find the minimum/maximum offset for this instruction and check
10264 // if fixing it up would be in range.
10265 int64_t MinOffset,
10266 MaxOffset; // Unscaled offsets for the instruction.
10267 // The scale to multiply the offsets by.
10268 TypeSize Scale(0U, false), DummyWidth(0U, false);
10269 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10270
10271 Offset += 16; // Update the offset to what it would be if we outlined.
10272 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10273 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10274 return false;
10275
10276 // It's in range, so we can outline it.
10277 return true;
10278 }
10279
10280 // FIXME: Add handling for instructions like "add x0, sp, #8".
10281
10282 // We can't fix it up, so don't outline it.
10283 return false;
10284 };
10285
10286 // True if it's possible to fix up each stack instruction in this sequence.
10287 // Important for frames/call variants that modify the stack.
10288 bool AllStackInstrsSafe =
10289 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10290
10291 // If the last instruction in any candidate is a terminator, then we should
10292 // tail call all of the candidates.
10293 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10294 FrameID = MachineOutlinerTailCall;
10295 NumBytesToCreateFrame = 0;
10296 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10297 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10298 }
10299
10300 else if (LastInstrOpcode == AArch64::BL ||
10301 ((LastInstrOpcode == AArch64::BLR ||
10302 LastInstrOpcode == AArch64::BLRNoIP) &&
10303 !HasBTI)) {
10304 // FIXME: Do we need to check if the code after this uses the value of LR?
10305 FrameID = MachineOutlinerThunk;
10306 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10307 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10308 }
10309
10310 else {
10311 // We need to decide how to emit calls + frames. We can always emit the same
10312 // frame if we don't need to save to the stack. If we have to save to the
10313 // stack, then we need a different frame.
10314 unsigned NumBytesNoStackCalls = 0;
10315 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10316
10317 // Check if we have to save LR.
10318 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10319 bool LRAvailable =
10321 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10322 : true;
10323 // If we have a noreturn caller, then we're going to be conservative and
10324 // say that we have to save LR. If we don't have a ret at the end of the
10325 // block, then we can't reason about liveness accurately.
10326 //
10327 // FIXME: We can probably do better than always disabling this in
10328 // noreturn functions by fixing up the liveness info.
10329 bool IsNoReturn =
10330 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10331
10332 // Is LR available? If so, we don't need a save.
10333 if (LRAvailable && !IsNoReturn) {
10334 NumBytesNoStackCalls += 4;
10335 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10336 CandidatesWithoutStackFixups.push_back(C);
10337 }
10338
10339 // Is an unused register available? If so, we won't modify the stack, so
10340 // we can outline with the same frame type as those that don't save LR.
10341 else if (findRegisterToSaveLRTo(C)) {
10342 NumBytesNoStackCalls += 12;
10343 C.setCallInfo(MachineOutlinerRegSave, 12);
10344 CandidatesWithoutStackFixups.push_back(C);
10345 }
10346
10347 // Is SP used in the sequence at all? If not, we don't have to modify
10348 // the stack, so we are guaranteed to get the same frame.
10349 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10350 NumBytesNoStackCalls += 12;
10351 C.setCallInfo(MachineOutlinerDefault, 12);
10352 CandidatesWithoutStackFixups.push_back(C);
10353 }
10354
10355 // If we outline this, we need to modify the stack. Pretend we don't
10356 // outline this by saving all of its bytes.
10357 else {
10358 NumBytesNoStackCalls += SequenceSize;
10359 }
10360 }
10361
10362 // If there are no places where we have to save LR, then note that we
10363 // don't have to update the stack. Otherwise, give every candidate the
10364 // default call type, as long as it's safe to do so.
10365 if (!AllStackInstrsSafe ||
10366 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10367 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10368 FrameID = MachineOutlinerNoLRSave;
10369 if (RepeatedSequenceLocs.size() < MinRepeats)
10370 return std::nullopt;
10371 } else {
10372 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10373
10374 // Bugzilla ID: 46767
10375 // TODO: Check if fixing up the stack more than once is safe so we can
10376 // outline these.
10377 //
10378 // An outline resulting in a caller that requires stack fixups at the
10379 // callsite to a callee that also requires stack fixups can happen when
10380 // there are no available registers at the candidate callsite for a
10381 // candidate that itself also has calls.
10382 //
10383 // In other words if function_containing_sequence in the following pseudo
10384 // assembly requires that we save LR at the point of the call, but there
10385 // are no available registers: in this case we save using SP and as a
10386 // result the SP offsets requires stack fixups by multiples of 16.
10387 //
10388 // function_containing_sequence:
10389 // ...
10390 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10391 // call OUTLINED_FUNCTION_N
10392 // restore LR from SP
10393 // ...
10394 //
10395 // OUTLINED_FUNCTION_N:
10396 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10397 // ...
10398 // bl foo
10399 // restore LR from SP
10400 // ret
10401 //
10402 // Because the code to handle more than one stack fixup does not
10403 // currently have the proper checks for legality, these cases will assert
10404 // in the AArch64 MachineOutliner. This is because the code to do this
10405 // needs more hardening, testing, better checks that generated code is
10406 // legal, etc and because it is only verified to handle a single pass of
10407 // stack fixup.
10408 //
10409 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10410 // these cases until they are known to be handled. Bugzilla 46767 is
10411 // referenced in comments at the assert site.
10412 //
10413 // To avoid asserting (or generating non-legal code on noassert builds)
10414 // we remove all candidates which would need more than one stack fixup by
10415 // pruning the cases where the candidate has calls while also having no
10416 // available LR and having no available general purpose registers to copy
10417 // LR to (ie one extra stack save/restore).
10418 //
10419 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10420 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10421 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10422 return (llvm::any_of(C, IsCall)) &&
10423 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10424 !findRegisterToSaveLRTo(C));
10425 });
10426 }
10427 }
10428
10429 // If we dropped all of the candidates, bail out here.
10430 if (RepeatedSequenceLocs.size() < MinRepeats)
10431 return std::nullopt;
10432 }
10433
10434 // Does every candidate's MBB contain a call? If so, then we might have a call
10435 // in the range.
10436 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10437 // Check if the range contains a call. These require a save + restore of the
10438 // link register.
10439 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10440 bool ModStackToSaveLR = false;
10441 if (any_of(drop_end(FirstCand),
10442 [](const MachineInstr &MI) { return MI.isCall(); }))
10443 ModStackToSaveLR = true;
10444
10445 // Handle the last instruction separately. If this is a tail call, then the
10446 // last instruction is a call. We don't want to save + restore in this case.
10447 // However, it could be possible that the last instruction is a call without
10448 // it being valid to tail call this sequence. We should consider this as
10449 // well.
10450 else if (FrameID != MachineOutlinerThunk &&
10451 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10452 ModStackToSaveLR = true;
10453
10454 if (ModStackToSaveLR) {
10455 // We can't fix up the stack. Bail out.
10456 if (!AllStackInstrsSafe)
10457 return std::nullopt;
10458
10459 // Save + restore LR.
10460 NumBytesToCreateFrame += 8;
10461 }
10462 }
10463
10464 // If we have CFI instructions, we can only outline if the outlined section
10465 // can be a tail call
10466 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10467 return std::nullopt;
10468
10469 return std::make_unique<outliner::OutlinedFunction>(
10470 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10471}
10472
10473void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10474 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10475 // If a bunch of candidates reach this point they must agree on their return
10476 // address signing. It is therefore enough to just consider the signing
10477 // behaviour of one of them
10478 const auto &CFn = Candidates.front().getMF()->getFunction();
10479
10480 if (CFn.hasFnAttribute("ptrauth-returns"))
10481 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10482 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10483 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10484 // Since all candidates belong to the same module, just copy the
10485 // function-level attributes of an arbitrary function.
10486 if (CFn.hasFnAttribute("sign-return-address"))
10487 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10488 if (CFn.hasFnAttribute("sign-return-address-key"))
10489 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10490
10491 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10492}
10493
10494bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10495 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10496 const Function &F = MF.getFunction();
10497
10498 // Can F be deduplicated by the linker? If it can, don't outline from it.
10499 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10500 return false;
10501
10502 // Don't outline from functions with section markings; the program could
10503 // expect that all the code is in the named section.
10504 // FIXME: Allow outlining from multiple functions with the same section
10505 // marking.
10506 if (F.hasSection())
10507 return false;
10508
10509 // Outlining from functions with redzones is unsafe since the outliner may
10510 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10511 // outline from it.
10512 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10513 if (!AFI || AFI->hasRedZone().value_or(true))
10514 return false;
10515
10516 // FIXME: Determine whether it is safe to outline from functions which contain
10517 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10518 // outlined together and ensure it is safe to outline with async unwind info,
10519 // required for saving & restoring VG around calls.
10520 if (AFI->hasStreamingModeChanges())
10521 return false;
10522
10523 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10525 return false;
10526
10527 // It's safe to outline from MF.
10528 return true;
10529}
10530
10532AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10533 unsigned &Flags) const {
10535 "Must track liveness!");
10537 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10538 Ranges;
10539 // According to the AArch64 Procedure Call Standard, the following are
10540 // undefined on entry/exit from a function call:
10541 //
10542 // * Registers x16, x17, (and thus w16, w17)
10543 // * Condition codes (and thus the NZCV register)
10544 //
10545 // If any of these registers are used inside or live across an outlined
10546 // function, then they may be modified later, either by the compiler or
10547 // some other tool (like the linker).
10548 //
10549 // To avoid outlining in these situations, partition each block into ranges
10550 // where these registers are dead. We will only outline from those ranges.
10551 LiveRegUnits LRU(getRegisterInfo());
10552 auto AreAllUnsafeRegsDead = [&LRU]() {
10553 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10554 LRU.available(AArch64::NZCV);
10555 };
10556
10557 // We need to know if LR is live across an outlining boundary later on in
10558 // order to decide how we'll create the outlined call, frame, etc.
10559 //
10560 // It's pretty expensive to check this for *every candidate* within a block.
10561 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10562 // to compute liveness from the end of the block for O(n) candidates within
10563 // the block.
10564 //
10565 // So, to improve the average case, let's keep track of liveness from the end
10566 // of the block to the beginning of *every outlinable range*. If we know that
10567 // LR is available in every range we could outline from, then we know that
10568 // we don't need to check liveness for any candidate within that range.
10569 bool LRAvailableEverywhere = true;
10570 // Compute liveness bottom-up.
10571 LRU.addLiveOuts(MBB);
10572 // Update flags that require info about the entire MBB.
10573 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10574 if (MI.isCall() && !MI.isTerminator())
10576 };
10577 // Range: [RangeBegin, RangeEnd)
10578 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10579 unsigned RangeLen;
10580 auto CreateNewRangeStartingAt =
10581 [&RangeBegin, &RangeEnd,
10582 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10583 RangeBegin = NewBegin;
10584 RangeEnd = std::next(RangeBegin);
10585 RangeLen = 0;
10586 };
10587 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10588 // At least one unsafe register is not dead. We do not want to outline at
10589 // this point. If it is long enough to outline from and does not cross a
10590 // bundle boundary, save the range [RangeBegin, RangeEnd).
10591 if (RangeLen <= 1)
10592 return;
10593 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10594 return;
10595 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10596 return;
10597 Ranges.emplace_back(RangeBegin, RangeEnd);
10598 };
10599 // Find the first point where all unsafe registers are dead.
10600 // FIND: <safe instr> <-- end of first potential range
10601 // SKIP: <unsafe def>
10602 // SKIP: ... everything between ...
10603 // SKIP: <unsafe use>
10604 auto FirstPossibleEndPt = MBB.instr_rbegin();
10605 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10606 LRU.stepBackward(*FirstPossibleEndPt);
10607 // Update flags that impact how we outline across the entire block,
10608 // regardless of safety.
10609 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10610 if (AreAllUnsafeRegsDead())
10611 break;
10612 }
10613 // If we exhausted the entire block, we have no safe ranges to outline.
10614 if (FirstPossibleEndPt == MBB.instr_rend())
10615 return Ranges;
10616 // Current range.
10617 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10618 // StartPt points to the first place where all unsafe registers
10619 // are dead (if there is any such point). Begin partitioning the MBB into
10620 // ranges.
10621 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10622 LRU.stepBackward(MI);
10623 UpdateWholeMBBFlags(MI);
10624 if (!AreAllUnsafeRegsDead()) {
10625 SaveRangeIfNonEmpty();
10626 CreateNewRangeStartingAt(MI.getIterator());
10627 continue;
10628 }
10629 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10630 RangeBegin = MI.getIterator();
10631 ++RangeLen;
10632 }
10633 // Above loop misses the last (or only) range. If we are still safe, then
10634 // let's save the range.
10635 if (AreAllUnsafeRegsDead())
10636 SaveRangeIfNonEmpty();
10637 if (Ranges.empty())
10638 return Ranges;
10639 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
10640 // the order.
10641 std::reverse(Ranges.begin(), Ranges.end());
10642 // If there is at least one outlinable range where LR is unavailable
10643 // somewhere, remember that.
10644 if (!LRAvailableEverywhere)
10646 return Ranges;
10647}
10648
10650AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10652 unsigned Flags) const {
10653 MachineInstr &MI = *MIT;
10654
10655 // Don't outline anything used for return address signing. The outlined
10656 // function will get signed later if needed
10657 switch (MI.getOpcode()) {
10658 case AArch64::PACM:
10659 case AArch64::PACIASP:
10660 case AArch64::PACIBSP:
10661 case AArch64::PACIASPPC:
10662 case AArch64::PACIBSPPC:
10663 case AArch64::AUTIASP:
10664 case AArch64::AUTIBSP:
10665 case AArch64::AUTIASPPCi:
10666 case AArch64::AUTIASPPCr:
10667 case AArch64::AUTIBSPPCi:
10668 case AArch64::AUTIBSPPCr:
10669 case AArch64::RETAA:
10670 case AArch64::RETAB:
10671 case AArch64::RETAASPPCi:
10672 case AArch64::RETAASPPCr:
10673 case AArch64::RETABSPPCi:
10674 case AArch64::RETABSPPCr:
10675 case AArch64::EMITBKEY:
10676 case AArch64::PAUTH_PROLOGUE:
10677 case AArch64::PAUTH_EPILOGUE:
10679 }
10680
10681 // We can only outline these if we will tail call the outlined function, or
10682 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10683 // in a tail call.
10684 //
10685 // FIXME: If the proper fixups for the offset are implemented, this should be
10686 // possible.
10687 if (MI.isCFIInstruction())
10689
10690 // Is this a terminator for a basic block?
10691 if (MI.isTerminator())
10692 // TargetInstrInfo::getOutliningType has already filtered out anything
10693 // that would break this, so we can allow it here.
10695
10696 // Make sure none of the operands are un-outlinable.
10697 for (const MachineOperand &MOP : MI.operands()) {
10698 // A check preventing CFI indices was here before, but only CFI
10699 // instructions should have those.
10700 assert(!MOP.isCFIIndex());
10701
10702 // If it uses LR or W30 explicitly, then don't touch it.
10703 if (MOP.isReg() && !MOP.isImplicit() &&
10704 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10706 }
10707
10708 // Special cases for instructions that can always be outlined, but will fail
10709 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
10710 // be outlined because they don't require a *specific* value to be in LR.
10711 if (MI.getOpcode() == AArch64::ADRP)
10713
10714 // If MI is a call we might be able to outline it. We don't want to outline
10715 // any calls that rely on the position of items on the stack. When we outline
10716 // something containing a call, we have to emit a save and restore of LR in
10717 // the outlined function. Currently, this always happens by saving LR to the
10718 // stack. Thus, if we outline, say, half the parameters for a function call
10719 // plus the call, then we'll break the callee's expectations for the layout
10720 // of the stack.
10721 //
10722 // FIXME: Allow calls to functions which construct a stack frame, as long
10723 // as they don't access arguments on the stack.
10724 // FIXME: Figure out some way to analyze functions defined in other modules.
10725 // We should be able to compute the memory usage based on the IR calling
10726 // convention, even if we can't see the definition.
10727 if (MI.isCall()) {
10728 // Get the function associated with the call. Look at each operand and find
10729 // the one that represents the callee and get its name.
10730 const Function *Callee = nullptr;
10731 for (const MachineOperand &MOP : MI.operands()) {
10732 if (MOP.isGlobal()) {
10733 Callee = dyn_cast<Function>(MOP.getGlobal());
10734 break;
10735 }
10736 }
10737
10738 // Never outline calls to mcount. There isn't any rule that would require
10739 // this, but the Linux kernel's "ftrace" feature depends on it.
10740 if (Callee && Callee->getName() == "\01_mcount")
10742
10743 // If we don't know anything about the callee, assume it depends on the
10744 // stack layout of the caller. In that case, it's only legal to outline
10745 // as a tail-call. Explicitly list the call instructions we know about so we
10746 // don't get unexpected results with call pseudo-instructions.
10747 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10748 if (MI.getOpcode() == AArch64::BLR ||
10749 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10750 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10751
10752 if (!Callee)
10753 return UnknownCallOutlineType;
10754
10755 // We have a function we have information about. Check it if it's something
10756 // can safely outline.
10757 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10758
10759 // We don't know what's going on with the callee at all. Don't touch it.
10760 if (!CalleeMF)
10761 return UnknownCallOutlineType;
10762
10763 // Check if we know anything about the callee saves on the function. If we
10764 // don't, then don't touch it, since that implies that we haven't
10765 // computed anything about its stack frame yet.
10766 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10767 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10768 MFI.getNumObjects() > 0)
10769 return UnknownCallOutlineType;
10770
10771 // At this point, we can say that CalleeMF ought to not pass anything on the
10772 // stack. Therefore, we can outline it.
10774 }
10775
10776 // Don't touch the link register or W30.
10777 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
10778 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
10780
10781 // Don't outline BTI instructions, because that will prevent the outlining
10782 // site from being indirectly callable.
10783 if (hasBTISemantics(MI))
10785
10787}
10788
10789void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10790 for (MachineInstr &MI : MBB) {
10791 const MachineOperand *Base;
10792 TypeSize Width(0, false);
10793 int64_t Offset;
10794 bool OffsetIsScalable;
10795
10796 // Is this a load or store with an immediate offset with SP as the base?
10797 if (!MI.mayLoadOrStore() ||
10798 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
10799 &RI) ||
10800 (Base->isReg() && Base->getReg() != AArch64::SP))
10801 continue;
10802
10803 // It is, so we have to fix it up.
10804 TypeSize Scale(0U, false);
10805 int64_t Dummy1, Dummy2;
10806
10807 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
10808 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10809 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
10810 assert(Scale != 0 && "Unexpected opcode!");
10811 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10812
10813 // We've pushed the return address to the stack, so add 16 to the offset.
10814 // This is safe, since we already checked if it would overflow when we
10815 // checked if this instruction was legal to outline.
10816 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10817 StackOffsetOperand.setImm(NewImm);
10818 }
10819}
10820
10822 const AArch64InstrInfo *TII,
10823 bool ShouldSignReturnAddr) {
10824 if (!ShouldSignReturnAddr)
10825 return;
10826
10827 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
10829 TII->createPauthEpilogueInstr(MBB, DebugLoc());
10830}
10831
10832void AArch64InstrInfo::buildOutlinedFrame(
10834 const outliner::OutlinedFunction &OF) const {
10835
10836 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
10837
10838 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10839 FI->setOutliningStyle("Tail Call");
10840 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
10841 // For thunk outlining, rewrite the last instruction from a call to a
10842 // tail-call.
10843 MachineInstr *Call = &*--MBB.instr_end();
10844 unsigned TailOpcode;
10845 if (Call->getOpcode() == AArch64::BL) {
10846 TailOpcode = AArch64::TCRETURNdi;
10847 } else {
10848 assert(Call->getOpcode() == AArch64::BLR ||
10849 Call->getOpcode() == AArch64::BLRNoIP);
10850 TailOpcode = AArch64::TCRETURNriALL;
10851 }
10852 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
10853 .add(Call->getOperand(0))
10854 .addImm(0);
10855 MBB.insert(MBB.end(), TC);
10857
10858 FI->setOutliningStyle("Thunk");
10859 }
10860
10861 bool IsLeafFunction = true;
10862
10863 // Is there a call in the outlined range?
10864 auto IsNonTailCall = [](const MachineInstr &MI) {
10865 return MI.isCall() && !MI.isReturn();
10866 };
10867
10868 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
10869 // Fix up the instructions in the range, since we're going to modify the
10870 // stack.
10871
10872 // Bugzilla ID: 46767
10873 // TODO: Check if fixing up twice is safe so we can outline these.
10874 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
10875 "Can only fix up stack references once");
10876 fixupPostOutline(MBB);
10877
10878 IsLeafFunction = false;
10879
10880 // LR has to be a live in so that we can save it.
10881 if (!MBB.isLiveIn(AArch64::LR))
10882 MBB.addLiveIn(AArch64::LR);
10883
10886
10887 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10888 OF.FrameConstructionID == MachineOutlinerThunk)
10889 Et = std::prev(MBB.end());
10890
10891 // Insert a save before the outlined region
10892 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10893 .addReg(AArch64::SP, RegState::Define)
10894 .addReg(AArch64::LR)
10895 .addReg(AArch64::SP)
10896 .addImm(-16);
10897 It = MBB.insert(It, STRXpre);
10898
10899 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
10900 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
10901
10902 // Add a CFI saying the stack was moved 16 B down.
10903 CFIBuilder.buildDefCFAOffset(16);
10904
10905 // Add a CFI saying that the LR that we want to find is now 16 B higher
10906 // than before.
10907 CFIBuilder.buildOffset(AArch64::LR, -16);
10908 }
10909
10910 // Insert a restore before the terminator for the function.
10911 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10912 .addReg(AArch64::SP, RegState::Define)
10913 .addReg(AArch64::LR, RegState::Define)
10914 .addReg(AArch64::SP)
10915 .addImm(16);
10916 Et = MBB.insert(Et, LDRXpost);
10917 }
10918
10919 auto RASignCondition = FI->getSignReturnAddressCondition();
10920 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
10921 RASignCondition, !IsLeafFunction);
10922
10923 // If this is a tail call outlined function, then there's already a return.
10924 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10925 OF.FrameConstructionID == MachineOutlinerThunk) {
10926 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10927 return;
10928 }
10929
10930 // It's not a tail call, so we have to insert the return ourselves.
10931
10932 // LR has to be a live in so that we can return to it.
10933 if (!MBB.isLiveIn(AArch64::LR))
10934 MBB.addLiveIn(AArch64::LR);
10935
10936 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
10937 .addReg(AArch64::LR);
10938 MBB.insert(MBB.end(), ret);
10939
10940 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10941
10942 FI->setOutliningStyle("Function");
10943
10944 // Did we have to modify the stack by saving the link register?
10945 if (OF.FrameConstructionID != MachineOutlinerDefault)
10946 return;
10947
10948 // We modified the stack.
10949 // Walk over the basic block and fix up all the stack accesses.
10950 fixupPostOutline(MBB);
10951}
10952
10953MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
10956
10957 // Are we tail calling?
10958 if (C.CallConstructionID == MachineOutlinerTailCall) {
10959 // If yes, then we can just branch to the label.
10960 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
10961 .addGlobalAddress(M.getNamedValue(MF.getName()))
10962 .addImm(0));
10963 return It;
10964 }
10965
10966 // Are we saving the link register?
10967 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
10968 C.CallConstructionID == MachineOutlinerThunk) {
10969 // No, so just insert the call.
10970 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10971 .addGlobalAddress(M.getNamedValue(MF.getName())));
10972 return It;
10973 }
10974
10975 // We want to return the spot where we inserted the call.
10977
10978 // Instructions for saving and restoring LR around the call instruction we're
10979 // going to insert.
10980 MachineInstr *Save;
10981 MachineInstr *Restore;
10982 // Can we save to a register?
10983 if (C.CallConstructionID == MachineOutlinerRegSave) {
10984 // FIXME: This logic should be sunk into a target-specific interface so that
10985 // we don't have to recompute the register.
10986 Register Reg = findRegisterToSaveLRTo(C);
10987 assert(Reg && "No callee-saved register available?");
10988
10989 // LR has to be a live in so that we can save it.
10990 if (!MBB.isLiveIn(AArch64::LR))
10991 MBB.addLiveIn(AArch64::LR);
10992
10993 // Save and restore LR from Reg.
10994 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
10995 .addReg(AArch64::XZR)
10996 .addReg(AArch64::LR)
10997 .addImm(0);
10998 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
10999 .addReg(AArch64::XZR)
11000 .addReg(Reg)
11001 .addImm(0);
11002 } else {
11003 // We have the default case. Save and restore from SP.
11004 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
11005 .addReg(AArch64::SP, RegState::Define)
11006 .addReg(AArch64::LR)
11007 .addReg(AArch64::SP)
11008 .addImm(-16);
11009 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
11010 .addReg(AArch64::SP, RegState::Define)
11011 .addReg(AArch64::LR, RegState::Define)
11012 .addReg(AArch64::SP)
11013 .addImm(16);
11014 }
11015
11016 It = MBB.insert(It, Save);
11017 It++;
11018
11019 // Insert the call.
11020 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11021 .addGlobalAddress(M.getNamedValue(MF.getName())));
11022 CallPt = It;
11023 It++;
11024
11025 It = MBB.insert(It, Restore);
11026 return CallPt;
11027}
11028
11029bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11030 MachineFunction &MF) const {
11031 return MF.getFunction().hasMinSize();
11032}
11033
11034void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
11036 DebugLoc &DL,
11037 bool AllowSideEffects) const {
11038 const MachineFunction &MF = *MBB.getParent();
11039 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
11040 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
11041
11042 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
11043 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
11044 } else if (STI.isSVEorStreamingSVEAvailable()) {
11045 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
11046 .addImm(0)
11047 .addImm(0);
11048 } else if (STI.isNeonAvailable()) {
11049 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
11050 .addImm(0);
11051 } else {
11052 // This is a streaming-compatible function without SVE. We don't have full
11053 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
11054 // So given `movi v..` would be illegal use `fmov d..` instead.
11055 assert(STI.hasNEON() && "Expected to have NEON.");
11056 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
11057 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
11058 }
11059}
11060
11061std::optional<DestSourcePair>
11063
11064 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
11065 // and zero immediate operands used as an alias for mov instruction.
11066 if (((MI.getOpcode() == AArch64::ORRWrs &&
11067 MI.getOperand(1).getReg() == AArch64::WZR &&
11068 MI.getOperand(3).getImm() == 0x0) ||
11069 (MI.getOpcode() == AArch64::ORRWrr &&
11070 MI.getOperand(1).getReg() == AArch64::WZR)) &&
11071 // Check that the w->w move is not a zero-extending w->x mov.
11072 (!MI.getOperand(0).getReg().isVirtual() ||
11073 MI.getOperand(0).getSubReg() == 0) &&
11074 (!MI.getOperand(0).getReg().isPhysical() ||
11075 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
11076 /*TRI=*/nullptr) == -1))
11077 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11078
11079 if (MI.getOpcode() == AArch64::ORRXrs &&
11080 MI.getOperand(1).getReg() == AArch64::XZR &&
11081 MI.getOperand(3).getImm() == 0x0)
11082 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11083
11084 return std::nullopt;
11085}
11086
11087std::optional<DestSourcePair>
11089 if ((MI.getOpcode() == AArch64::ORRWrs &&
11090 MI.getOperand(1).getReg() == AArch64::WZR &&
11091 MI.getOperand(3).getImm() == 0x0) ||
11092 (MI.getOpcode() == AArch64::ORRWrr &&
11093 MI.getOperand(1).getReg() == AArch64::WZR))
11094 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11095 return std::nullopt;
11096}
11097
11098std::optional<RegImmPair>
11099AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11100 int Sign = 1;
11101 int64_t Offset = 0;
11102
11103 // TODO: Handle cases where Reg is a super- or sub-register of the
11104 // destination register.
11105 const MachineOperand &Op0 = MI.getOperand(0);
11106 if (!Op0.isReg() || Reg != Op0.getReg())
11107 return std::nullopt;
11108
11109 switch (MI.getOpcode()) {
11110 default:
11111 return std::nullopt;
11112 case AArch64::SUBWri:
11113 case AArch64::SUBXri:
11114 case AArch64::SUBSWri:
11115 case AArch64::SUBSXri:
11116 Sign *= -1;
11117 [[fallthrough]];
11118 case AArch64::ADDSWri:
11119 case AArch64::ADDSXri:
11120 case AArch64::ADDWri:
11121 case AArch64::ADDXri: {
11122 // TODO: Third operand can be global address (usually some string).
11123 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
11124 !MI.getOperand(2).isImm())
11125 return std::nullopt;
11126 int Shift = MI.getOperand(3).getImm();
11127 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11128 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
11129 }
11130 }
11131 return RegImmPair{MI.getOperand(1).getReg(), Offset};
11132}
11133
11134/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11135/// the destination register then, if possible, describe the value in terms of
11136/// the source register.
11137static std::optional<ParamLoadedValue>
11139 const TargetInstrInfo *TII,
11140 const TargetRegisterInfo *TRI) {
11141 auto DestSrc = TII->isCopyLikeInstr(MI);
11142 if (!DestSrc)
11143 return std::nullopt;
11144
11145 Register DestReg = DestSrc->Destination->getReg();
11146 Register SrcReg = DestSrc->Source->getReg();
11147
11148 if (!DestReg.isValid() || !SrcReg.isValid())
11149 return std::nullopt;
11150
11151 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
11152
11153 // If the described register is the destination, just return the source.
11154 if (DestReg == DescribedReg)
11155 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11156
11157 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11158 if (MI.getOpcode() == AArch64::ORRWrs &&
11159 TRI->isSuperRegister(DestReg, DescribedReg))
11160 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11161
11162 // We may need to describe the lower part of a ORRXrs move.
11163 if (MI.getOpcode() == AArch64::ORRXrs &&
11164 TRI->isSubRegister(DestReg, DescribedReg)) {
11165 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
11166 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
11167 }
11168
11169 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11170 "Unhandled ORR[XW]rs copy case");
11171
11172 return std::nullopt;
11173}
11174
11175bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11176 // Functions cannot be split to different sections on AArch64 if they have
11177 // a red zone. This is because relaxing a cross-section branch may require
11178 // incrementing the stack pointer to spill a register, which would overwrite
11179 // the red zone.
11180 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11181 return false;
11182
11184}
11185
11186bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11187 const MachineBasicBlock &MBB) const {
11188 // Asm Goto blocks can contain conditional branches to goto labels, which can
11189 // get moved out of range of the branch instruction.
11190 auto isAsmGoto = [](const MachineInstr &MI) {
11191 return MI.getOpcode() == AArch64::INLINEASM_BR;
11192 };
11193 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11194 return false;
11195
11196 // Because jump tables are label-relative instead of table-relative, they all
11197 // must be in the same section or relocation fixup handling will fail.
11198
11199 // Check if MBB is a jump table target
11200 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11201 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11202 return llvm::is_contained(JTE.MBBs, &MBB);
11203 };
11204 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11205 return false;
11206
11207 // Check if MBB contains a jump table lookup
11208 for (const MachineInstr &MI : MBB) {
11209 switch (MI.getOpcode()) {
11210 case TargetOpcode::G_BRJT:
11211 case AArch64::JumpTableDest32:
11212 case AArch64::JumpTableDest16:
11213 case AArch64::JumpTableDest8:
11214 return false;
11215 default:
11216 continue;
11217 }
11218 }
11219
11220 // MBB isn't a special case, so it's safe to be split to the cold section.
11221 return true;
11222}
11223
11224std::optional<ParamLoadedValue>
11225AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11226 Register Reg) const {
11227 const MachineFunction *MF = MI.getMF();
11228 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11229 switch (MI.getOpcode()) {
11230 case AArch64::MOVZWi:
11231 case AArch64::MOVZXi: {
11232 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11233 // 64-bit parameters, so we need to consider super-registers.
11234 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11235 return std::nullopt;
11236
11237 if (!MI.getOperand(1).isImm())
11238 return std::nullopt;
11239 int64_t Immediate = MI.getOperand(1).getImm();
11240 int Shift = MI.getOperand(2).getImm();
11241 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11242 nullptr);
11243 }
11244 case AArch64::ORRWrs:
11245 case AArch64::ORRXrs:
11246 return describeORRLoadedValue(MI, Reg, this, TRI);
11247 }
11248
11250}
11251
11252bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11253 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11254 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11255 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11256 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11257
11258 // Anyexts are nops.
11259 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11260 return true;
11261
11262 Register DefReg = ExtMI.getOperand(0).getReg();
11263 if (!MRI.hasOneNonDBGUse(DefReg))
11264 return false;
11265
11266 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11267 // addressing mode.
11268 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11269 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11270}
11271
11272uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11273 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11274}
11275
11276bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11277 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11278}
11279
11280bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11281 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11282}
11283
11284unsigned int
11285AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11286 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11287}
11288
11289bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11290 unsigned Scale) const {
11291 if (Offset && Scale)
11292 return false;
11293
11294 // Check Reg + Imm
11295 if (!Scale) {
11296 // 9-bit signed offset
11297 if (isInt<9>(Offset))
11298 return true;
11299
11300 // 12-bit unsigned offset
11301 unsigned Shift = Log2_64(NumBytes);
11302 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11303 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11304 (Offset >> Shift) << Shift == Offset)
11305 return true;
11306 return false;
11307 }
11308
11309 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11310 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11311}
11312
11314 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11315 return AArch64::BLRNoIP;
11316 else
11317 return AArch64::BLR;
11318}
11319
11321 DebugLoc DL) const {
11322 MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
11323 auto Builder = BuildMI(MBB, InsertPt, DL, get(AArch64::PAUTH_EPILOGUE))
11325
11326 const auto *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
11327 if (AFI->branchProtectionPAuthLR() && !Subtarget.hasPAuthLR())
11328 Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
11329}
11330
11332AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11333 Register TargetReg, bool FrameSetup) const {
11334 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11335
11336 MachineBasicBlock &MBB = *MBBI->getParent();
11337 MachineFunction &MF = *MBB.getParent();
11338 const AArch64InstrInfo *TII =
11339 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11340 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11341 DebugLoc DL = MBB.findDebugLoc(MBBI);
11342
11343 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11344 MachineBasicBlock *LoopTestMBB =
11345 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11346 MF.insert(MBBInsertPoint, LoopTestMBB);
11347 MachineBasicBlock *LoopBodyMBB =
11348 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11349 MF.insert(MBBInsertPoint, LoopBodyMBB);
11350 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11351 MF.insert(MBBInsertPoint, ExitMBB);
11352 MachineInstr::MIFlag Flags =
11354
11355 // LoopTest:
11356 // SUB SP, SP, #ProbeSize
11357 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11358 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11359
11360 // CMP SP, TargetReg
11361 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11362 AArch64::XZR)
11363 .addReg(AArch64::SP)
11364 .addReg(TargetReg)
11366 .setMIFlags(Flags);
11367
11368 // B.<Cond> LoopExit
11369 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
11371 .addMBB(ExitMBB)
11372 .setMIFlags(Flags);
11373
11374 // LDR XZR, [SP]
11375 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::LDRXui))
11376 .addDef(AArch64::XZR)
11377 .addReg(AArch64::SP)
11378 .addImm(0)
11382 Align(8)))
11383 .setMIFlags(Flags);
11384
11385 // B loop
11386 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11387 .addMBB(LoopTestMBB)
11388 .setMIFlags(Flags);
11389
11390 // LoopExit:
11391 // MOV SP, TargetReg
11392 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11393 .addReg(TargetReg)
11394 .addImm(0)
11396 .setMIFlags(Flags);
11397
11398 // LDR XZR, [SP]
11399 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11400 .addReg(AArch64::XZR, RegState::Define)
11401 .addReg(AArch64::SP)
11402 .addImm(0)
11403 .setMIFlags(Flags);
11404
11405 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
11407
11408 LoopTestMBB->addSuccessor(ExitMBB);
11409 LoopTestMBB->addSuccessor(LoopBodyMBB);
11410 LoopBodyMBB->addSuccessor(LoopTestMBB);
11411 MBB.addSuccessor(LoopTestMBB);
11412
11413 // Update liveins.
11414 if (MF.getRegInfo().reservedRegsFrozen())
11415 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11416
11417 return ExitMBB->begin();
11418}
11419
11420namespace {
11421class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11422 MachineFunction *MF;
11423 const TargetInstrInfo *TII;
11424 const TargetRegisterInfo *TRI;
11425 MachineRegisterInfo &MRI;
11426
11427 /// The block of the loop
11428 MachineBasicBlock *LoopBB;
11429 /// The conditional branch of the loop
11430 MachineInstr *CondBranch;
11431 /// The compare instruction for loop control
11432 MachineInstr *Comp;
11433 /// The number of the operand of the loop counter value in Comp
11434 unsigned CompCounterOprNum;
11435 /// The instruction that updates the loop counter value
11436 MachineInstr *Update;
11437 /// The number of the operand of the loop counter value in Update
11438 unsigned UpdateCounterOprNum;
11439 /// The initial value of the loop counter
11440 Register Init;
11441 /// True iff Update is a predecessor of Comp
11442 bool IsUpdatePriorComp;
11443
11444 /// The normalized condition used by createTripCountGreaterCondition()
11446
11447public:
11448 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11449 MachineInstr *Comp, unsigned CompCounterOprNum,
11450 MachineInstr *Update, unsigned UpdateCounterOprNum,
11451 Register Init, bool IsUpdatePriorComp,
11452 const SmallVectorImpl<MachineOperand> &Cond)
11453 : MF(Comp->getParent()->getParent()),
11454 TII(MF->getSubtarget().getInstrInfo()),
11455 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11456 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11457 CompCounterOprNum(CompCounterOprNum), Update(Update),
11458 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11459 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11460
11461 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11462 // Make the instructions for loop control be placed in stage 0.
11463 // The predecessors of Comp are considered by the caller.
11464 return MI == Comp;
11465 }
11466
11467 std::optional<bool> createTripCountGreaterCondition(
11468 int TC, MachineBasicBlock &MBB,
11469 SmallVectorImpl<MachineOperand> &CondParam) override {
11470 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11471 // Cond is normalized for such use.
11472 // The predecessors of the branch are assumed to have already been inserted.
11473 CondParam = Cond;
11474 return {};
11475 }
11476
11477 void createRemainingIterationsGreaterCondition(
11478 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11479 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11480
11481 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11482
11483 void adjustTripCount(int TripCountAdjust) override {}
11484
11485 bool isMVEExpanderSupported() override { return true; }
11486};
11487} // namespace
11488
11489/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11490/// is replaced by ReplaceReg. The output register is newly created.
11491/// The other operands are unchanged from MI.
11492static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11493 Register ReplaceReg, MachineBasicBlock &MBB,
11494 MachineBasicBlock::iterator InsertTo) {
11495 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11496 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11497 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11498 Register Result = 0;
11499 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11500 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11501 Result = MRI.createVirtualRegister(
11502 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11503 NewMI->getOperand(I).setReg(Result);
11504 } else if (I == ReplaceOprNum) {
11505 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11506 NewMI->getOperand(I).setReg(ReplaceReg);
11507 }
11508 }
11509 MBB.insert(InsertTo, NewMI);
11510 return Result;
11511}
11512
11513void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11516 // Create and accumulate conditions for next TC iterations.
11517 // Example:
11518 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11519 // # iteration of the kernel
11520 //
11521 // # insert the following instructions
11522 // cond = CSINCXr 0, 0, C, implicit $nzcv
11523 // counter = ADDXri counter, 1 # clone from this->Update
11524 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11525 // cond = CSINCXr cond, cond, C, implicit $nzcv
11526 // ... (repeat TC times)
11527 // SUBSXri cond, 0, implicit-def $nzcv
11528
11529 assert(CondBranch->getOpcode() == AArch64::Bcc);
11530 // CondCode to exit the loop
11532 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
11533 if (CondBranch->getOperand(1).getMBB() == LoopBB)
11535
11536 // Accumulate conditions to exit the loop
11537 Register AccCond = AArch64::XZR;
11538
11539 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11540 auto AccumulateCond = [&](Register CurCond,
11542 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11543 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11544 .addReg(NewCond, RegState::Define)
11545 .addReg(CurCond)
11546 .addReg(CurCond)
11548 return NewCond;
11549 };
11550
11551 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11552 // Update and Comp for I==0 are already exists in MBB
11553 // (MBB is an unrolled kernel)
11554 Register Counter;
11555 for (int I = 0; I <= TC; ++I) {
11556 Register NextCounter;
11557 if (I != 0)
11558 NextCounter =
11559 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11560
11561 AccCond = AccumulateCond(AccCond, CC);
11562
11563 if (I != TC) {
11564 if (I == 0) {
11565 if (Update != Comp && IsUpdatePriorComp) {
11566 Counter =
11567 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11568 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11569 MBB.end());
11570 } else {
11571 // can use already calculated value
11572 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11573 }
11574 } else if (Update != Comp) {
11575 NextCounter =
11576 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11577 }
11578 }
11579 Counter = NextCounter;
11580 }
11581 } else {
11582 Register Counter;
11583 if (LastStage0Insts.empty()) {
11584 // use initial counter value (testing if the trip count is sufficient to
11585 // be executed by pipelined code)
11586 Counter = Init;
11587 if (IsUpdatePriorComp)
11588 Counter =
11589 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11590 } else {
11591 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11592 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11593 }
11594
11595 for (int I = 0; I <= TC; ++I) {
11596 Register NextCounter;
11597 NextCounter =
11598 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11599 AccCond = AccumulateCond(AccCond, CC);
11600 if (I != TC && Update != Comp)
11601 NextCounter =
11602 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11603 Counter = NextCounter;
11604 }
11605 }
11606
11607 // If AccCond == 0, the remainder is greater than TC.
11608 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11609 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11610 .addReg(AccCond)
11611 .addImm(0)
11612 .addImm(0);
11613 Cond.clear();
11615}
11616
11617static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11618 Register &RegMBB, Register &RegOther) {
11619 assert(Phi.getNumOperands() == 5);
11620 if (Phi.getOperand(2).getMBB() == MBB) {
11621 RegMBB = Phi.getOperand(1).getReg();
11622 RegOther = Phi.getOperand(3).getReg();
11623 } else {
11624 assert(Phi.getOperand(4).getMBB() == MBB);
11625 RegMBB = Phi.getOperand(3).getReg();
11626 RegOther = Phi.getOperand(1).getReg();
11627 }
11628}
11629
11631 if (!Reg.isVirtual())
11632 return false;
11633 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11634 return MRI.getVRegDef(Reg)->getParent() != BB;
11635}
11636
11637/// If Reg is an induction variable, return true and set some parameters
11638static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11639 MachineInstr *&UpdateInst,
11640 unsigned &UpdateCounterOprNum, Register &InitReg,
11641 bool &IsUpdatePriorComp) {
11642 // Example:
11643 //
11644 // Preheader:
11645 // InitReg = ...
11646 // LoopBB:
11647 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11648 // Reg = COPY Reg0 ; COPY is ignored.
11649 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11650 // ; Reg is the value calculated in the previous
11651 // ; iteration, so IsUpdatePriorComp == false.
11652
11653 if (LoopBB->pred_size() != 2)
11654 return false;
11655 if (!Reg.isVirtual())
11656 return false;
11657 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11658 UpdateInst = nullptr;
11659 UpdateCounterOprNum = 0;
11660 InitReg = 0;
11661 IsUpdatePriorComp = true;
11662 Register CurReg = Reg;
11663 while (true) {
11664 MachineInstr *Def = MRI.getVRegDef(CurReg);
11665 if (Def->getParent() != LoopBB)
11666 return false;
11667 if (Def->isCopy()) {
11668 // Ignore copy instructions unless they contain subregisters
11669 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11670 return false;
11671 CurReg = Def->getOperand(1).getReg();
11672 } else if (Def->isPHI()) {
11673 if (InitReg != 0)
11674 return false;
11675 if (!UpdateInst)
11676 IsUpdatePriorComp = false;
11677 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11678 } else {
11679 if (UpdateInst)
11680 return false;
11681 switch (Def->getOpcode()) {
11682 case AArch64::ADDSXri:
11683 case AArch64::ADDSWri:
11684 case AArch64::SUBSXri:
11685 case AArch64::SUBSWri:
11686 case AArch64::ADDXri:
11687 case AArch64::ADDWri:
11688 case AArch64::SUBXri:
11689 case AArch64::SUBWri:
11690 UpdateInst = Def;
11691 UpdateCounterOprNum = 1;
11692 break;
11693 case AArch64::ADDSXrr:
11694 case AArch64::ADDSWrr:
11695 case AArch64::SUBSXrr:
11696 case AArch64::SUBSWrr:
11697 case AArch64::ADDXrr:
11698 case AArch64::ADDWrr:
11699 case AArch64::SUBXrr:
11700 case AArch64::SUBWrr:
11701 UpdateInst = Def;
11702 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11703 UpdateCounterOprNum = 1;
11704 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11705 UpdateCounterOprNum = 2;
11706 else
11707 return false;
11708 break;
11709 default:
11710 return false;
11711 }
11712 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11713 }
11714
11715 if (!CurReg.isVirtual())
11716 return false;
11717 if (Reg == CurReg)
11718 break;
11719 }
11720
11721 if (!UpdateInst)
11722 return false;
11723
11724 return true;
11725}
11726
11727std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11729 // Accept loops that meet the following conditions
11730 // * The conditional branch is BCC
11731 // * The compare instruction is ADDS/SUBS/WHILEXX
11732 // * One operand of the compare is an induction variable and the other is a
11733 // loop invariant value
11734 // * The induction variable is incremented/decremented by a single instruction
11735 // * Does not contain CALL or instructions which have unmodeled side effects
11736
11737 for (MachineInstr &MI : *LoopBB)
11738 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11739 // This instruction may use NZCV, which interferes with the instruction to
11740 // be inserted for loop control.
11741 return nullptr;
11742
11743 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11745 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11746 return nullptr;
11747
11748 // Infinite loops are not supported
11749 if (TBB == LoopBB && FBB == LoopBB)
11750 return nullptr;
11751
11752 // Must be conditional branch
11753 if (TBB != LoopBB && FBB == nullptr)
11754 return nullptr;
11755
11756 assert((TBB == LoopBB || FBB == LoopBB) &&
11757 "The Loop must be a single-basic-block loop");
11758
11759 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11761
11762 if (CondBranch->getOpcode() != AArch64::Bcc)
11763 return nullptr;
11764
11765 // Normalization for createTripCountGreaterCondition()
11766 if (TBB == LoopBB)
11768
11769 MachineInstr *Comp = nullptr;
11770 unsigned CompCounterOprNum = 0;
11771 for (MachineInstr &MI : reverse(*LoopBB)) {
11772 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
11773 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11774 // operands is a loop invariant value
11775
11776 switch (MI.getOpcode()) {
11777 case AArch64::SUBSXri:
11778 case AArch64::SUBSWri:
11779 case AArch64::ADDSXri:
11780 case AArch64::ADDSWri:
11781 Comp = &MI;
11782 CompCounterOprNum = 1;
11783 break;
11784 case AArch64::ADDSWrr:
11785 case AArch64::ADDSXrr:
11786 case AArch64::SUBSWrr:
11787 case AArch64::SUBSXrr:
11788 Comp = &MI;
11789 break;
11790 default:
11791 if (isWhileOpcode(MI.getOpcode())) {
11792 Comp = &MI;
11793 break;
11794 }
11795 return nullptr;
11796 }
11797
11798 if (CompCounterOprNum == 0) {
11799 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
11800 CompCounterOprNum = 2;
11801 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
11802 CompCounterOprNum = 1;
11803 else
11804 return nullptr;
11805 }
11806 break;
11807 }
11808 }
11809 if (!Comp)
11810 return nullptr;
11811
11812 MachineInstr *Update = nullptr;
11813 Register Init;
11814 bool IsUpdatePriorComp;
11815 unsigned UpdateCounterOprNum;
11816 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
11817 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
11818 return nullptr;
11819
11820 return std::make_unique<AArch64PipelinerLoopInfo>(
11821 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
11822 Init, IsUpdatePriorComp, Cond);
11823}
11824
11825/// verifyInstruction - Perform target specific instruction verification.
11826bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11827 StringRef &ErrInfo) const {
11828 // Verify that immediate offsets on load/store instructions are within range.
11829 // Stack objects with an FI operand are excluded as they can be fixed up
11830 // during PEI.
11831 TypeSize Scale(0U, false), Width(0U, false);
11832 int64_t MinOffset, MaxOffset;
11833 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11834 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
11835 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
11836 int64_t Imm = MI.getOperand(ImmIdx).getImm();
11837 if (Imm < MinOffset || Imm > MaxOffset) {
11838 ErrInfo = "Unexpected immediate on load/store instruction";
11839 return false;
11840 }
11841 }
11842 }
11843
11844 const MCInstrDesc &MCID = MI.getDesc();
11845 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11846 const MachineOperand &MO = MI.getOperand(Op);
11847 switch (MCID.operands()[Op].OperandType) {
11849 if (!MO.isImm() || MO.getImm() != 0) {
11850 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11851 return false;
11852 }
11853 break;
11855 if (!MO.isImm() ||
11857 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
11858 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
11859 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11860 return false;
11861 }
11862 break;
11863 default:
11864 break;
11865 }
11866 }
11867 return true;
11868}
11869
11870#define GET_INSTRINFO_HELPERS
11871#define GET_INSTRMAP_INFO
11872#include "AArch64GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if in a streaming call site region without SME-FA64.
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static std::optional< unsigned > getLFIInstSizeInBytes(const MachineInstr &MI)
Return the maximum number of bytes of code the specified instruction may be after LFI rewriting.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if the instruction at I is in a streaming call site region, within a single basic block.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, RegState State, const TargetRegisterInfo *TRI)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdxNeg - Helper to generate fused multiply indexed instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerRegSave
Emit a call and tail-call.
@ MachineOutlinerNoLRSave
Only emit a branch.
@ MachineOutlinerThunk
Emit a call and return.
@ MachineOutlinerDefault
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation A - (B + C) ==> (A - B) - C A - (B + C) ==> (A - C) - B.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
@ Default
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Check Debug Module
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
PowerPC Reduce CR logical Operation
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if opcode Opc is a memory operation.
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
static bool isZExtLoad(const MachineInstr &MI)
Returns whether the instruction is a zero-extending load.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that set flags when possible.
void createPauthEpilogueInstr(MachineBasicBlock &MBB, DebugLoc DL) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operator of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operator of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if pairing the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSExtLoad(const MachineInstr &MI)
Returns whether the instruction is a sign-extending load.
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace csincr-branch sequence by simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operator of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:123
bool empty() const
Definition DenseMap.h:109
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:64
bool usesWindowsCFI() const
Definition MCAsmInfo.h:665
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:576
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:618
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:591
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:688
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
Instructions::const_iterator const_instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
bool def_empty(Register RegNo) const
def_empty - Return true if there are no instructions defining the specified register (it may be live-...
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
bool hasOneDef(Register RegNo) const
Return true if there is exactly one operand defining the specified register.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:207
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
constexpr double e
InstrType
Represents how an instruction should be mapped by the outliner.
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
iterator end() const
Definition BasicBlock.h:89
LLVM_ABI Instruction & back() const
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr RegState getDefRegState(bool B)
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
static MCRegister getXRegFromWReg(MCRegister Reg)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2192
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
constexpr RegState getUndefRegState(bool B)
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-in's for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.