LLVM 23.0.0git
AArch64InstrInfo.cpp
Go to the documentation of this file.
1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/DebugLoc.h"
45#include "llvm/IR/GlobalValue.h"
46#include "llvm/IR/Module.h"
47#include "llvm/MC/MCAsmInfo.h"
48#include "llvm/MC/MCInst.h"
50#include "llvm/MC/MCInstrDesc.h"
55#include "llvm/Support/LEB128.h"
59#include <cassert>
60#include <cstdint>
61#include <iterator>
62#include <utility>
63
64using namespace llvm;
65
66#define GET_INSTRINFO_CTOR_DTOR
67#include "AArch64GenInstrInfo.inc"
68
69#define DEBUG_TYPE "AArch64InstrInfo"
70
71STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
72STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
73 "instructions expanded from canonical COPY");
74STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
75 "instructions expanded from canonical COPY");
76STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
77 "instructions expanded from canonical COPY");
78// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
79
81 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
82 cl::desc("Restrict range of CB instructions (DEBUG)"));
83
85 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
86 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
87
89 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
90 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
91
93 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
94 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
95
97 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
98 cl::desc("Restrict range of B instructions (DEBUG)"));
99
101 "aarch64-search-limit", cl::Hidden, cl::init(2048),
102 cl::desc("Restrict range of instructions to search for the "
103 "machine-combiner gather pattern optimization"));
104
106 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
107 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
108 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
109
110/// Return the maximum number of bytes of code the specified instruction may be
111/// after LFI rewriting. If the instruction is not rewritten, std::nullopt is
112/// returned (use default sizing).
113///
114/// NOTE: the size estimates here must be kept in sync with the rewrites in
115/// AArch64MCLFIRewriter.cpp. Sizes may be overestimates of the rewritten
116/// instruction sequences.
117static std::optional<unsigned> getLFIInstSizeInBytes(const MachineInstr &MI) {
118 switch (MI.getOpcode()) {
119 case AArch64::SVC:
120 // SVC expands to 4 instructions.
121 return 16;
122 case AArch64::BR:
123 case AArch64::BLR:
124 // Indirect branches/calls expand to 2 instructions (guard + br/blr).
125 return 8;
126 case AArch64::RET:
127 // RET through LR is not rewritten, but RET through another register
128 // expands to 2 instructions (guard + ret).
129 if (MI.getOperand(0).getReg() != AArch64::LR)
130 return 8;
131 return 4;
132 default:
133 break;
134 }
135
136 // Instructions that explicitly modify LR expand to 2 instructions.
137 for (const MachineOperand &MO : MI.explicit_operands())
138 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::LR)
139 return 8;
140
141 // Default case: instructions that don't cause expansion.
142 // - TP accesses in LFI are a single load/store, so no expansion.
143 // - All remaining instructions are not rewritten.
144 return std::nullopt;
145}
146
147/// GetInstSize - Return the number of bytes of code the specified
148/// instruction may be. This returns the maximum number of bytes.
150 const MachineBasicBlock &MBB = *MI.getParent();
151 const MachineFunction *MF = MBB.getParent();
152 const Function &F = MF->getFunction();
153 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
154
155 {
156 auto Op = MI.getOpcode();
157 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
158 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
159 }
160
161 // Meta-instructions emit no code.
162 if (MI.isMetaInstruction())
163 return 0;
164
165 // FIXME: We currently only handle pseudoinstructions that don't get expanded
166 // before the assembly printer.
167 unsigned NumBytes = 0;
168 const MCInstrDesc &Desc = MI.getDesc();
169
170 // LFI rewriter expansions that supersede normal sizing.
171 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
172 if (STI.isLFI())
173 if (auto Size = getLFIInstSizeInBytes(MI))
174 return *Size;
175
176 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
177 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
178
179 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
180 if (!MFI->shouldSignReturnAddress(*MF))
181 return NumBytes;
182
183 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
184 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
185 return NumBytes;
186 }
187
188 // Size should be preferably set in
189 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
190 // Specific cases handle instructions of variable sizes
191 switch (Desc.getOpcode()) {
192 default:
193 if (Desc.getSize())
194 return Desc.getSize();
195
196 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
197 // with fixed constant size but not specified in .td file) is a normal
198 // 4-byte insn.
199 NumBytes = 4;
200 break;
201 case TargetOpcode::STACKMAP:
202 // The upper bound for a stackmap intrinsic is the full length of its shadow
203 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
204 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
205 break;
206 case TargetOpcode::PATCHPOINT:
207 // The size of the patchpoint intrinsic is the number of bytes requested
208 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
209 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
210 break;
211 case TargetOpcode::STATEPOINT:
212 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
213 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
214 // No patch bytes means a normal call inst is emitted
215 if (NumBytes == 0)
216 NumBytes = 4;
217 break;
218 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
219 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
220 // instructions are expanded to the specified number of NOPs. Otherwise,
221 // they are expanded to 36-byte XRay sleds.
222 NumBytes =
223 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
224 break;
225 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
226 case TargetOpcode::PATCHABLE_TAIL_CALL:
227 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
228 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
229 NumBytes = 36;
230 break;
231 case TargetOpcode::PATCHABLE_EVENT_CALL:
232 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
233 NumBytes = 24;
234 break;
235
236 case AArch64::SPACE:
237 NumBytes = MI.getOperand(1).getImm();
238 break;
239 case TargetOpcode::BUNDLE:
240 NumBytes = getInstBundleSize(MI);
241 break;
242 }
243
244 return NumBytes;
245}
246
249 // Block ends with fall-through condbranch.
250 switch (LastInst->getOpcode()) {
251 default:
252 llvm_unreachable("Unknown branch instruction?");
253 case AArch64::Bcc:
254 Target = LastInst->getOperand(1).getMBB();
255 Cond.push_back(LastInst->getOperand(0));
256 break;
257 case AArch64::CBZW:
258 case AArch64::CBZX:
259 case AArch64::CBNZW:
260 case AArch64::CBNZX:
261 Target = LastInst->getOperand(1).getMBB();
262 Cond.push_back(MachineOperand::CreateImm(-1));
263 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
264 Cond.push_back(LastInst->getOperand(0));
265 break;
266 case AArch64::TBZW:
267 case AArch64::TBZX:
268 case AArch64::TBNZW:
269 case AArch64::TBNZX:
270 Target = LastInst->getOperand(2).getMBB();
271 Cond.push_back(MachineOperand::CreateImm(-1));
272 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
273 Cond.push_back(LastInst->getOperand(0));
274 Cond.push_back(LastInst->getOperand(1));
275 break;
276 case AArch64::CBWPri:
277 case AArch64::CBXPri:
278 case AArch64::CBWPrr:
279 case AArch64::CBXPrr:
280 Target = LastInst->getOperand(3).getMBB();
281 Cond.push_back(MachineOperand::CreateImm(-1));
282 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
283 Cond.push_back(LastInst->getOperand(0));
284 Cond.push_back(LastInst->getOperand(1));
285 Cond.push_back(LastInst->getOperand(2));
286 break;
287 case AArch64::CBBAssertExt:
288 case AArch64::CBHAssertExt:
289 Target = LastInst->getOperand(3).getMBB();
290 Cond.push_back(MachineOperand::CreateImm(-1)); // -1
291 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
292 Cond.push_back(LastInst->getOperand(0)); // Cond
293 Cond.push_back(LastInst->getOperand(1)); // Op0
294 Cond.push_back(LastInst->getOperand(2)); // Op1
295 Cond.push_back(LastInst->getOperand(4)); // Ext0
296 Cond.push_back(LastInst->getOperand(5)); // Ext1
297 break;
298 }
299}
300
301static unsigned getBranchDisplacementBits(unsigned Opc) {
302 switch (Opc) {
303 default:
304 llvm_unreachable("unexpected opcode!");
305 case AArch64::B:
306 return BDisplacementBits;
307 case AArch64::TBNZW:
308 case AArch64::TBZW:
309 case AArch64::TBNZX:
310 case AArch64::TBZX:
311 return TBZDisplacementBits;
312 case AArch64::CBNZW:
313 case AArch64::CBZW:
314 case AArch64::CBNZX:
315 case AArch64::CBZX:
316 return CBZDisplacementBits;
317 case AArch64::Bcc:
318 return BCCDisplacementBits;
319 case AArch64::CBWPri:
320 case AArch64::CBXPri:
321 case AArch64::CBBAssertExt:
322 case AArch64::CBHAssertExt:
323 case AArch64::CBWPrr:
324 case AArch64::CBXPrr:
325 return CBDisplacementBits;
326 }
327}
328
330 int64_t BrOffset) const {
331 unsigned Bits = getBranchDisplacementBits(BranchOp);
332 assert(Bits >= 3 && "max branch displacement must be enough to jump"
333 "over conditional branch expansion");
334 return isIntN(Bits, BrOffset / 4);
335}
336
339 switch (MI.getOpcode()) {
340 default:
341 llvm_unreachable("unexpected opcode!");
342 case AArch64::B:
343 return MI.getOperand(0).getMBB();
344 case AArch64::TBZW:
345 case AArch64::TBNZW:
346 case AArch64::TBZX:
347 case AArch64::TBNZX:
348 return MI.getOperand(2).getMBB();
349 case AArch64::CBZW:
350 case AArch64::CBNZW:
351 case AArch64::CBZX:
352 case AArch64::CBNZX:
353 case AArch64::Bcc:
354 return MI.getOperand(1).getMBB();
355 case AArch64::CBWPri:
356 case AArch64::CBXPri:
357 case AArch64::CBBAssertExt:
358 case AArch64::CBHAssertExt:
359 case AArch64::CBWPrr:
360 case AArch64::CBXPrr:
361 return MI.getOperand(3).getMBB();
362 }
363}
364
366 MachineBasicBlock &NewDestBB,
367 MachineBasicBlock &RestoreBB,
368 const DebugLoc &DL,
369 int64_t BrOffset,
370 RegScavenger *RS) const {
371 assert(RS && "RegScavenger required for long branching");
372 assert(MBB.empty() &&
373 "new block should be inserted for expanding unconditional branch");
374 assert(MBB.pred_size() == 1);
375 assert(RestoreBB.empty() &&
376 "restore block should be inserted for restoring clobbered registers");
377
378 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
379 // Offsets outside of the signed 33-bit range are not supported for ADRP +
380 // ADD.
381 if (!isInt<33>(BrOffset))
383 "Branch offsets outside of the signed 33-bit range not supported");
384
385 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
386 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
387 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
388 .addReg(Reg)
389 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
390 .addImm(0);
391 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
392 };
393
394 RS->enterBasicBlockEnd(MBB);
395 // If X16 is unused, we can rely on the linker to insert a range extension
396 // thunk if NewDestBB is out of range of a single B instruction.
397 constexpr Register Reg = AArch64::X16;
398 if (!RS->isRegUsed(Reg)) {
399 insertUnconditionalBranch(MBB, &NewDestBB, DL);
400 RS->setRegUsed(Reg);
401 return;
402 }
403
404 // If there's a free register and it's worth inflating the code size,
405 // manually insert the indirect branch.
406 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
407 if (Scavenged != AArch64::NoRegister &&
408 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
409 buildIndirectBranch(Scavenged, NewDestBB);
410 RS->setRegUsed(Scavenged);
411 return;
412 }
413
414 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
415 // with red zones.
416 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
417 if (!AFI || AFI->hasRedZone().value_or(true))
419 "Unable to insert indirect branch inside function that has red zone");
420
421 // Otherwise, spill X16 and defer range extension to the linker.
422 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
423 .addReg(AArch64::SP, RegState::Define)
424 .addReg(Reg)
425 .addReg(AArch64::SP)
426 .addImm(-16);
427
428 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
429
430 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
431 .addReg(AArch64::SP, RegState::Define)
433 .addReg(AArch64::SP)
434 .addImm(16);
435}
436
437// Branch analysis.
440 MachineBasicBlock *&FBB,
442 bool AllowModify) const {
443 // If the block has no terminators, it just falls into the block after it.
444 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
445 if (I == MBB.end())
446 return false;
447
448 // Skip over SpeculationBarrierEndBB terminators
449 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
450 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
451 --I;
452 }
453
454 if (!isUnpredicatedTerminator(*I))
455 return false;
456
457 // Get the last instruction in the block.
458 MachineInstr *LastInst = &*I;
459
460 // If there is only one terminator instruction, process it.
461 unsigned LastOpc = LastInst->getOpcode();
462 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
463 if (isUncondBranchOpcode(LastOpc)) {
464 TBB = LastInst->getOperand(0).getMBB();
465 return false;
466 }
467 if (isCondBranchOpcode(LastOpc)) {
468 // Block ends with fall-through condbranch.
469 parseCondBranch(LastInst, TBB, Cond);
470 return false;
471 }
472 return true; // Can't handle indirect branch.
473 }
474
475 // Get the instruction before it if it is a terminator.
476 MachineInstr *SecondLastInst = &*I;
477 unsigned SecondLastOpc = SecondLastInst->getOpcode();
478
479 // If AllowModify is true and the block ends with two or more unconditional
480 // branches, delete all but the first unconditional branch.
481 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
482 while (isUncondBranchOpcode(SecondLastOpc)) {
483 LastInst->eraseFromParent();
484 LastInst = SecondLastInst;
485 LastOpc = LastInst->getOpcode();
486 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
487 // Return now the only terminator is an unconditional branch.
488 TBB = LastInst->getOperand(0).getMBB();
489 return false;
490 }
491 SecondLastInst = &*I;
492 SecondLastOpc = SecondLastInst->getOpcode();
493 }
494 }
495
496 // If we're allowed to modify and the block ends in a unconditional branch
497 // which could simply fallthrough, remove the branch. (Note: This case only
498 // matters when we can't understand the whole sequence, otherwise it's also
499 // handled by BranchFolding.cpp.)
500 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
501 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
502 LastInst->eraseFromParent();
503 LastInst = SecondLastInst;
504 LastOpc = LastInst->getOpcode();
505 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
506 assert(!isUncondBranchOpcode(LastOpc) &&
507 "unreachable unconditional branches removed above");
508
509 if (isCondBranchOpcode(LastOpc)) {
510 // Block ends with fall-through condbranch.
511 parseCondBranch(LastInst, TBB, Cond);
512 return false;
513 }
514 return true; // Can't handle indirect branch.
515 }
516 SecondLastInst = &*I;
517 SecondLastOpc = SecondLastInst->getOpcode();
518 }
519
520 // If there are three terminators, we don't know what sort of block this is.
521 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
522 return true;
523
524 // If the block ends with a B and a Bcc, handle it.
525 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
526 parseCondBranch(SecondLastInst, TBB, Cond);
527 FBB = LastInst->getOperand(0).getMBB();
528 return false;
529 }
530
531 // If the block ends with two unconditional branches, handle it. The second
532 // one is not executed, so remove it.
533 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
534 TBB = SecondLastInst->getOperand(0).getMBB();
535 I = LastInst;
536 if (AllowModify)
537 I->eraseFromParent();
538 return false;
539 }
540
541 // ...likewise if it ends with an indirect branch followed by an unconditional
542 // branch.
543 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
544 I = LastInst;
545 if (AllowModify)
546 I->eraseFromParent();
547 return true;
548 }
549
550 // Otherwise, can't handle this.
551 return true;
552}
553
555 MachineBranchPredicate &MBP,
556 bool AllowModify) const {
557 // Use analyzeBranch to validate the branch pattern.
558 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
560 if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
561 return true;
562
563 // analyzeBranch returns success with empty Cond for unconditional branches.
564 if (Cond.empty())
565 return true;
566
567 MBP.TrueDest = TBB;
568 assert(MBP.TrueDest && "expected!");
569 MBP.FalseDest = FBB ? FBB : MBB.getNextNode();
570
571 MBP.ConditionDef = nullptr;
572 MBP.SingleUseCondition = false;
573
574 // Find the conditional branch. After analyzeBranch succeeds with non-empty
575 // Cond, there's exactly one conditional branch - either last (fallthrough)
576 // or second-to-last (followed by unconditional B).
577 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
578 if (I == MBB.end())
579 return true;
580
581 if (isUncondBranchOpcode(I->getOpcode())) {
582 if (I == MBB.begin())
583 return true;
584 --I;
585 }
586
587 MachineInstr *CondBranch = &*I;
588 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
589
590 switch (CondBranch->getOpcode()) {
591 default:
592 return true;
593
594 case AArch64::Bcc:
595 // Bcc takes the NZCV flag as the operand to branch on, walk up the
596 // instruction stream to find the last instruction to define NZCV.
598 if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
599 MBP.ConditionDef = &MI;
600 break;
601 }
602 }
603 return false;
604
605 case AArch64::CBZW:
606 case AArch64::CBZX:
607 case AArch64::CBNZW:
608 case AArch64::CBNZX: {
609 MBP.LHS = CondBranch->getOperand(0);
610 MBP.RHS = MachineOperand::CreateImm(0);
611 unsigned Opc = CondBranch->getOpcode();
612 MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
613 ? MachineBranchPredicate::PRED_NE
614 : MachineBranchPredicate::PRED_EQ;
615 Register CondReg = MBP.LHS.getReg();
616 if (CondReg.isVirtual())
617 MBP.ConditionDef = MRI.getVRegDef(CondReg);
618 return false;
619 }
620
621 case AArch64::TBZW:
622 case AArch64::TBZX:
623 case AArch64::TBNZW:
624 case AArch64::TBNZX: {
625 Register CondReg = CondBranch->getOperand(0).getReg();
626 if (CondReg.isVirtual())
627 MBP.ConditionDef = MRI.getVRegDef(CondReg);
628 return false;
629 }
630 }
631}
632
635 if (Cond[0].getImm() != -1) {
636 // Regular Bcc
637 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
639 } else {
640 // Folded compare-and-branch
641 switch (Cond[1].getImm()) {
642 default:
643 llvm_unreachable("Unknown conditional branch!");
644 case AArch64::CBZW:
645 Cond[1].setImm(AArch64::CBNZW);
646 break;
647 case AArch64::CBNZW:
648 Cond[1].setImm(AArch64::CBZW);
649 break;
650 case AArch64::CBZX:
651 Cond[1].setImm(AArch64::CBNZX);
652 break;
653 case AArch64::CBNZX:
654 Cond[1].setImm(AArch64::CBZX);
655 break;
656 case AArch64::TBZW:
657 Cond[1].setImm(AArch64::TBNZW);
658 break;
659 case AArch64::TBNZW:
660 Cond[1].setImm(AArch64::TBZW);
661 break;
662 case AArch64::TBZX:
663 Cond[1].setImm(AArch64::TBNZX);
664 break;
665 case AArch64::TBNZX:
666 Cond[1].setImm(AArch64::TBZX);
667 break;
668
669 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
670 case AArch64::CBWPri:
671 case AArch64::CBXPri:
672 case AArch64::CBBAssertExt:
673 case AArch64::CBHAssertExt:
674 case AArch64::CBWPrr:
675 case AArch64::CBXPrr: {
676 // Pseudos using standard 4bit Arm condition codes
678 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
680 }
681 }
682 }
683
684 return false;
685}
686
688 int *BytesRemoved) const {
689 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
690 if (I == MBB.end())
691 return 0;
692
693 if (!isUncondBranchOpcode(I->getOpcode()) &&
694 !isCondBranchOpcode(I->getOpcode()))
695 return 0;
696
697 // Remove the branch.
698 I->eraseFromParent();
699
700 I = MBB.end();
701
702 if (I == MBB.begin()) {
703 if (BytesRemoved)
704 *BytesRemoved = 4;
705 return 1;
706 }
707 --I;
708 if (!isCondBranchOpcode(I->getOpcode())) {
709 if (BytesRemoved)
710 *BytesRemoved = 4;
711 return 1;
712 }
713
714 // Remove the branch.
715 I->eraseFromParent();
716 if (BytesRemoved)
717 *BytesRemoved = 8;
718
719 return 2;
720}
721
722void AArch64InstrInfo::instantiateCondBranch(
725 if (Cond[0].getImm() != -1) {
726 // Regular Bcc
727 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
728 } else {
729 // Folded compare-and-branch
730 // Note that we use addOperand instead of addReg to keep the flags.
731
732 // cbz, cbnz
733 const MachineInstrBuilder MIB =
734 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
735
736 // tbz/tbnz
737 if (Cond.size() > 3)
738 MIB.add(Cond[3]);
739
740 // cb
741 if (Cond.size() > 4)
742 MIB.add(Cond[4]);
743
744 MIB.addMBB(TBB);
745
746 // cb[b,h]
747 if (Cond.size() > 5) {
748 MIB.addImm(Cond[5].getImm());
749 MIB.addImm(Cond[6].getImm());
750 }
751 }
752}
753
756 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
757 // Shouldn't be a fall through.
758 assert(TBB && "insertBranch must not be told to insert a fallthrough");
759
760 if (!FBB) {
761 if (Cond.empty()) // Unconditional branch?
762 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
763 else
764 instantiateCondBranch(MBB, DL, TBB, Cond);
765
766 if (BytesAdded)
767 *BytesAdded = 4;
768
769 return 1;
770 }
771
772 // Two-way conditional branch.
773 instantiateCondBranch(MBB, DL, TBB, Cond);
774 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
775
776 if (BytesAdded)
777 *BytesAdded = 8;
778
779 return 2;
780}
781
783 const TargetInstrInfo &TII) {
784 for (MachineInstr &MI : MBB->terminators()) {
785 unsigned Opc = MI.getOpcode();
786 switch (Opc) {
787 case AArch64::CBZW:
788 case AArch64::CBZX:
789 case AArch64::TBZW:
790 case AArch64::TBZX:
791 // CBZ/TBZ with WZR/XZR -> unconditional B
792 if (MI.getOperand(0).getReg() == AArch64::WZR ||
793 MI.getOperand(0).getReg() == AArch64::XZR) {
794 DEBUG_WITH_TYPE("optimizeTerminators",
795 dbgs() << "Removing always taken branch: " << MI);
796 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
797 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
798 for (auto *S : Succs)
799 if (S != Target)
800 MBB->removeSuccessor(S);
801 DebugLoc DL = MI.getDebugLoc();
802 while (MBB->rbegin() != &MI)
803 MBB->rbegin()->eraseFromParent();
804 MI.eraseFromParent();
805 BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
806 return true;
807 }
808 break;
809 case AArch64::CBNZW:
810 case AArch64::CBNZX:
811 case AArch64::TBNZW:
812 case AArch64::TBNZX:
813 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
814 if (MI.getOperand(0).getReg() == AArch64::WZR ||
815 MI.getOperand(0).getReg() == AArch64::XZR) {
816 DEBUG_WITH_TYPE("optimizeTerminators",
817 dbgs() << "Removing never taken branch: " << MI);
818 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
819 MI.getParent()->removeSuccessor(Target);
820 MI.eraseFromParent();
821 return true;
822 }
823 break;
824 }
825 }
826 return false;
827}
828
829// Find the original register that VReg is copied from.
830static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
831 while (Register::isVirtualRegister(VReg)) {
832 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
833 if (!DefMI->isFullCopy())
834 return VReg;
835 VReg = DefMI->getOperand(1).getReg();
836 }
837 return VReg;
838}
839
840// Determine if VReg is defined by an instruction that can be folded into a
841// csel instruction. If so, return the folded opcode, and the replacement
842// register.
843static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
844 unsigned *NewReg = nullptr) {
845 VReg = removeCopies(MRI, VReg);
847 return 0;
848
849 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
850 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
851 unsigned Opc = 0;
852 unsigned SrcReg = 0;
853 switch (DefMI->getOpcode()) {
854 case AArch64::SUBREG_TO_REG:
855 // Check for the following way to define an 64-bit immediate:
856 // %0:gpr32 = MOVi32imm 1
857 // %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
858 if (!DefMI->getOperand(1).isReg())
859 return 0;
860 if (!DefMI->getOperand(2).isImm() ||
861 DefMI->getOperand(2).getImm() != AArch64::sub_32)
862 return 0;
863 DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
864 if (DefMI->getOpcode() != AArch64::MOVi32imm)
865 return 0;
866 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
867 return 0;
868 assert(Is64Bit);
869 SrcReg = AArch64::XZR;
870 Opc = AArch64::CSINCXr;
871 break;
872
873 case AArch64::MOVi32imm:
874 case AArch64::MOVi64imm:
875 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
876 return 0;
877 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
878 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
879 break;
880
881 case AArch64::ADDSXri:
882 case AArch64::ADDSWri:
883 // if NZCV is used, do not fold.
884 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
885 true) == -1)
886 return 0;
887 // fall-through to ADDXri and ADDWri.
888 [[fallthrough]];
889 case AArch64::ADDXri:
890 case AArch64::ADDWri:
891 // add x, 1 -> csinc.
892 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
893 DefMI->getOperand(3).getImm() != 0)
894 return 0;
895 SrcReg = DefMI->getOperand(1).getReg();
896 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
897 break;
898
899 case AArch64::ORNXrr:
900 case AArch64::ORNWrr: {
901 // not x -> csinv, represented as orn dst, xzr, src.
902 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
903 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
904 return 0;
905 SrcReg = DefMI->getOperand(2).getReg();
906 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
907 break;
908 }
909
910 case AArch64::SUBSXrr:
911 case AArch64::SUBSWrr:
912 // if NZCV is used, do not fold.
913 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
914 true) == -1)
915 return 0;
916 // fall-through to SUBXrr and SUBWrr.
917 [[fallthrough]];
918 case AArch64::SUBXrr:
919 case AArch64::SUBWrr: {
920 // neg x -> csneg, represented as sub dst, xzr, src.
921 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
922 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
923 return 0;
924 SrcReg = DefMI->getOperand(2).getReg();
925 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
926 break;
927 }
928 default:
929 return 0;
930 }
931 assert(Opc && SrcReg && "Missing parameters");
932
933 if (NewReg)
934 *NewReg = SrcReg;
935 return Opc;
936}
937
940 Register DstReg, Register TrueReg,
941 Register FalseReg, int &CondCycles,
942 int &TrueCycles,
943 int &FalseCycles) const {
944 // Check register classes.
945 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
946 const TargetRegisterClass *RC =
947 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
948 if (!RC)
949 return false;
950
951 // Also need to check the dest regclass, in case we're trying to optimize
952 // something like:
953 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
954 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
955 return false;
956
957 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
958 unsigned ExtraCondLat = Cond.size() != 1;
959
960 // GPRs are handled by csel.
961 // FIXME: Fold in x+1, -x, and ~x when applicable.
962 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
963 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
964 // Single-cycle csel, csinc, csinv, and csneg.
965 CondCycles = 1 + ExtraCondLat;
966 TrueCycles = FalseCycles = 1;
967 if (canFoldIntoCSel(MRI, TrueReg))
968 TrueCycles = 0;
969 else if (canFoldIntoCSel(MRI, FalseReg))
970 FalseCycles = 0;
971 return true;
972 }
973
974 // Scalar floating point is handled by fcsel.
975 // FIXME: Form fabs, fmin, and fmax when applicable.
976 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
977 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
978 CondCycles = 5 + ExtraCondLat;
979 TrueCycles = FalseCycles = 2;
980 return true;
981 }
982
983 // Can't do vectors.
984 return false;
985}
986
                                    const DebugLoc &DL, Register DstReg,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Lower a select DstReg = (Cond) ? TrueReg : FalseReg by re-materializing
  // the compare described by the parsed branch condition in Cond, then
  // emitting a csel/fcsel that consumes the resulting NZCV flags.
  // NOTE(review): several single source lines of this function appear to be
  // missing from this copy of the file (e.g. the 'Cond' parameter line, the
  // declaration of 'CC', a condition-inversion assignment, some .addImm
  // operands, and two guarding 'if' lines in the size-7 case); verify
  // against the upstream file before relying on this text.

  // Parse the condition code, see parseCondBranch() above.
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    // A plain conditional branch carries the condition code directly.
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    // NOTE(review): the encoded-immediate argument of each .addImm below
    // appears to be missing from this copy.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
    break;
  }
  case 5: { // cb
    // We must insert a cmp, that is a subs
    // 0 1 2 3 4
    // Cond is { -1, Opcode, CC, Op0, Op1 }

    unsigned SubsOpc, SubsDestReg;
    bool IsImm = false;
    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBWPri:
      SubsOpc = AArch64::SUBSWri;
      SubsDestReg = AArch64::WZR;
      IsImm = true;
      break;
    case AArch64::CBXPri:
      SubsOpc = AArch64::SUBSXri;
      SubsDestReg = AArch64::XZR;
      IsImm = true;
      break;
    case AArch64::CBWPrr:
      SubsOpc = AArch64::SUBSWrr;
      SubsDestReg = AArch64::WZR;
      IsImm = false;
      break;
    case AArch64::CBXPrr:
      SubsOpc = AArch64::SUBSXrr;
      SubsDestReg = AArch64::XZR;
      IsImm = false;
      break;
    }

    // Immediate forms carry an extra shift operand (0); register forms
    // compare two registers directly.
    if (IsImm)
      BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
          .addReg(Cond[3].getReg())
          .addImm(Cond[4].getImm())
          .addImm(0);
    else
      BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
          .addReg(Cond[3].getReg())
          .addReg(Cond[4].getReg());
  } break;
  case 7: { // cb[b,h]
    // We must insert a cmp, that is a subs, but also zero- or sign-extensions
    // that have been folded. For the first operand we codegen an explicit
    // extension, for the second operand we fold the extension into cmp.
    // 0 1 2 3 4 5 6
    // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }

    // We need a new register for the now explicitly extended register
    Register Reg = Cond[4].getReg();
      unsigned ExtOpc;
      unsigned ExtBits;
      // NOTE(review): the initializer of ExtendType appears to be missing
      // from this copy.
      AArch64_AM::ShiftExtendType ExtendType =
      switch (ExtendType) {
      default:
        llvm_unreachable("Unknown shift-extend for CB instruction");
      case AArch64_AM::SXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for SXTB shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
        break;
      case AArch64_AM::SXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for SXTH shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
        break;
      case AArch64_AM::UXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for UXTB shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
        break;
      case AArch64_AM::UXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for UXTH shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
        break;
      }

      // Build the explicit extension of the first operand
      // NOTE(review): the declaration of 'MBBI' (the builder for this
      // instruction) appears to be missing from this copy.
      Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
      if (ExtOpc != AArch64::ANDWri)
        MBBI.addImm(0);
      MBBI.addImm(ExtBits);
    }

    // Now, subs with an extended second operand
      AArch64_AM::ShiftExtendType ExtendType =
      MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
      MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
          .addReg(Cond[3].getReg())
          .addReg(Reg)
          .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
    } // If no extension is needed, just a regular subs
    else {
      MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
      MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
          .addReg(Cond[3].getReg())
          .addReg(Reg);
    }

    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
  } break;
  }

  // Pick the select opcode from the destination register bank: CSEL for
  // GPRs (with folding of simple producers), FCSEL for FPRs.
  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinc and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      // NOTE(review): the assignment inverting CC appears to be missing
      // from this copy.
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewReg;
      Opc = FoldedOpc;
      // Extend the live range of NewReg.
      MRI.clearKillFlags(NewReg);
    }
  }

  // Pull all virtual register into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  // FalseReg might be WZR or XZR if the folded operand is a literal 1.
  assert(
      (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
       FalseReg == AArch64::XZR) &&
      "FalseReg was folded into a non-virtual register other than WZR or XZR");
  if (FalseReg.isVirtual())
    MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}
1248
// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  // Any 32-bit immediate is considered cheap without further analysis.
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  // Operand 1 holds the immediate being materialized (MOVi32imm/MOVi64imm).
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  // NOTE(review): the declaration of 'Is' (the expansion scratch vector)
  // appears to be missing from this copy of the file.
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  // Cheap iff the full expansion takes no more than two instructions.
  return Is.size() <= 2;
}
1262
1263// Check if a COPY instruction is cheap.
1264static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1265 assert(MI.isCopy() && "Expected COPY instruction");
1266 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1267
1268 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1269 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1270 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1271 if (Reg.isVirtual())
1272 return MRI.getRegClass(Reg);
1273 if (Reg.isPhysical())
1274 return RI.getMinimalPhysRegClass(Reg);
1275 return nullptr;
1276 };
1277 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1278 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1279 if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1280 return false;
1281
1282 return MI.isAsCheapAsAMove();
1283}
1284
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
  // Exynos subtargets carry their own table of cheap-as-move instructions;
  // consult it first, then fall back to the generic MI flag.
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case TargetOpcode::COPY:
    // Cross-register-bank copies are filtered out by isCheapCopy().
    return isCheapCopy(MI, RI);

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    // Shifted add/sub is cheap only on cores with a fast ALU+LSL path and
    // only for small shift amounts (operand 3 is the shift immediate).
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}
1316
/// Return true when \p MI is a shifted/extended ALU operation or a
/// register-offset memory access whose shift/extend is free on Falkor
/// (i.e. incurs no extra latency over the unshifted form).
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  // Shifted adds: free for no shift, or LSL with a small amount.
  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  // Extended adds: only zero-extends with a shift of at most 4 are free.
  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  // Shifted subtracts (32-bit): free for no shift, or "asr #31"
  // (sign-bit extraction).
  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  // Shifted subtracts (64-bit): as above, with "asr #63".
  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  // Extended subtracts: only plain zero-extends (shift amount 0) are free.
  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  // Register-offset loads/stores/prefetches: operand 3 selects a signed
  // (sign-extended) offset; only the unsigned form is fast.
  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}
1438
1439bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1440 unsigned Opc = MI.getOpcode();
1441 switch (Opc) {
1442 default:
1443 return false;
1444 case AArch64::SEH_StackAlloc:
1445 case AArch64::SEH_SaveFPLR:
1446 case AArch64::SEH_SaveFPLR_X:
1447 case AArch64::SEH_SaveReg:
1448 case AArch64::SEH_SaveReg_X:
1449 case AArch64::SEH_SaveRegP:
1450 case AArch64::SEH_SaveRegP_X:
1451 case AArch64::SEH_SaveFReg:
1452 case AArch64::SEH_SaveFReg_X:
1453 case AArch64::SEH_SaveFRegP:
1454 case AArch64::SEH_SaveFRegP_X:
1455 case AArch64::SEH_SetFP:
1456 case AArch64::SEH_AddFP:
1457 case AArch64::SEH_Nop:
1458 case AArch64::SEH_PrologEnd:
1459 case AArch64::SEH_EpilogStart:
1460 case AArch64::SEH_EpilogEnd:
1461 case AArch64::SEH_PACSignLR:
1462 case AArch64::SEH_SaveAnyRegI:
1463 case AArch64::SEH_SaveAnyRegIP:
1464 case AArch64::SEH_SaveAnyRegQP:
1465 case AArch64::SEH_SaveAnyRegQPX:
1466 case AArch64::SEH_AllocZ:
1467 case AArch64::SEH_SaveZReg:
1468 case AArch64::SEH_SavePReg:
1469 return true;
1470 }
1471}
1472
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    // Report it as a copy into the sub_32 lane so the register coalescer
    // can eliminate the extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}
1492
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  // NOTE(review): the guard condition preceding this early return appears
  // to be missing from this copy of the file.
    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
  // base are identical, and the offset of a lower memory access +
  // the width doesn't overlap the offset of a higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      // Same base and same scalability: the accesses are disjoint when the
      // lower interval [LowOffset, LowOffset+LowWidth) ends at or before
      // the higher one starts.
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  return false;
}
1531
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  // NOTE(review): the condition guarding this early return (presumably the
  // delegation to the base-class isSchedulingBoundary) appears to be
  // missing from this copy of the file.
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  // SEH unwind markers must stay with the instruction they annotate.
  if (isSEHInstruction(MI))
    return true;
  // Do not schedule across an instruction immediately followed by a CFI
  // directive, so the CFI stays attached to it.
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}
1562
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  // FIXME: Pass subregisters out of analyzeCompare
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
  case AArch64::PTEST_PP_FIRST:
    // SVE predicate test: report the governing predicate (operand 0) and
    // the tested predicate (operand 1) as the two sources.
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    if (MI.getOperand(2).getSubReg())
      return false;

    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();

    // FIXME: Pass subregisters out of analyzeCompare
    if (MI.getOperand(2).getSubReg())
      return false;

    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    // Immediate compare: the immediate is in operand 2.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the others xxxS
    // instructions.
    // NOTE(review): the assignment of CmpValue (via a logical-immediate
    // decode) appears to have lost its first line in this copy.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}
1638
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  // NOTE(review): the declarations of TII/TRI used below appear to be on
  // lines missing from this copy of the file.
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  // Re-check every operand against the (possibly just-changed) opcode's
  // register-class constraints, tightening virtual-register classes where
  // necessary; returns false if any operand cannot be made to fit.
  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      // Physical registers must satisfy the constraint as-is.
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}
1676
/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
      MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
    MIDefinesZeroReg = true;

  // Each flag-setting form maps 1:1 to its non-flag-setting counterpart.
  // The ri/rs variants must keep the S form when WZR/XZR is the destination,
  // because dropping S would re-encode the zero register as SP.
  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}
1725
/// Bitmask describing which kinds of NZCV flag accesses to look for when
/// scanning an instruction range (see areCFlagsAccessedBetweenInstrs).
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1727
/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  // NOTE(review): the range expression of this loop (presumably a reverse
  // walk skipping debug instructions) appears to be missing from this copy.
  for (const MachineInstr &Instr :
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}
1760
/// Decide whether the PTEST \p PTest (with governing predicate \p Mask and
/// tested predicate defined by \p Pred) is redundant, i.e. the flags it
/// would set are already produced by \p Pred or can be produced by switching
/// \p Pred to a flag-setting variant.  Returns the opcode \p Pred should use
/// when the PTEST is removable, or std::nullopt otherwise.
std::optional<unsigned>
AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
                                      MachineInstr *Pred,
                                      const MachineRegisterInfo *MRI) const {
  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (PredIsWhileLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any" since WHILcc does an implicit
    // PTEST(ALL, PG) check and PG is always a subset of ALL.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask.
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode))
      return PredOpcode;

    // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
    // WHILEcc performs an implicit PTEST with an all active mask, setting
    // the N flag as the PTEST_FIRST would.
    if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
        isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
      return PredOpcode;

    return {};
  }

  if (PredIsPTestLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would and the condition is
    // "any" since PG is always a subset of the governing predicate of the
    // ptest-like instruction.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());

    // If the PTEST like instruction's general predicate is not `Mask`, attempt
    // to look through a copy and try again. This is because some instructions
    // take a predicate whose register class is a subset of its result class.
    if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
        PTestLikeMask->getOperand(1).getReg().isVirtual())
      PTestLikeMask =
          MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // the element size matches and either the PTEST_LIKE instruction uses
    // the same all active mask or the condition is "any".
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode)) {
      if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
        return PredOpcode;
    }

    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
                                  PTest->getOpcode() == AArch64::PTEST_PP_ANY))
      return PredOpcode;

    return {};
  }

  // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
  // opcode so the PTEST becomes redundant.
  switch (PredOpcode) {
  case AArch64::AND_PPzPP:
  case AArch64::BIC_PPzPP:
  case AArch64::EOR_PPzPP:
  case AArch64::NAND_PPzPP:
  case AArch64::NOR_PPzPP:
  case AArch64::ORN_PPzPP:
  case AArch64::ORR_PPzPP:
  case AArch64::BRKA_PPzP:
  case AArch64::BRKPA_PPzPP:
  case AArch64::BRKB_PPzP:
  case AArch64::BRKPB_PPzPP:
  case AArch64::RDFFR_PPz: {
    // Check to see if our mask is the same. If not the resulting flag bits
    // may be different and we can't remove the ptest.
    auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    if (Mask != PredMask)
      return {};
    break;
  }
  case AArch64::BRKN_PPzP: {
    // BRKN uses an all active implicit mask to set flags unlike the other
    // flag-setting instructions.
    // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
    if ((MaskOpcode != AArch64::PTRUE_B) ||
        (Mask->getOperand(1).getImm() != 31))
      return {};
    break;
  }
  case AArch64::PTRUE_B:
    // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
    break;
  default:
    // Bail out if we don't recognize the input
    return {};
  }

  return convertToFlagSettingOpc(PredOpcode);
}
1894
1895/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1896/// operation which could set the flags in an identical manner
1897bool AArch64InstrInfo::optimizePTestInstr(
1898 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1899 const MachineRegisterInfo *MRI) const {
1900 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1901 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1902
1903 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1904 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1905 // before the branch to extract each subregister.
1906 auto Op = Pred->getOperand(1);
1907 if (Op.isReg() && Op.getReg().isVirtual() &&
1908 Op.getSubReg() == AArch64::psub0)
1909 Pred = MRI->getUniqueVRegDef(Op.getReg());
1910 }
1911
1912 unsigned PredOpcode = Pred->getOpcode();
1913 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1914 if (!NewOp)
1915 return false;
1916
1917 const TargetRegisterInfo *TRI = &getRegisterInfo();
1918
1919 // If another instruction between Pred and PTest accesses flags, don't remove
1920 // the ptest or update the earlier instruction to modify them.
1921 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1922 return false;
1923
1924 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1925 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1926 // operand to be replaced with an equivalent instruction that also sets the
1927 // flags.
1928 PTest->eraseFromParent();
1929 if (*NewOp != PredOpcode) {
1930 Pred->setDesc(get(*NewOp));
1931 bool succeeded = UpdateOperandRegClass(*Pred);
1932 (void)succeeded;
1933 assert(succeeded && "Operands have incompatible register classes!");
1934 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1935 }
1936
1937 // Ensure that the flags def is live.
1938 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1939 unsigned i = 0, e = Pred->getNumOperands();
1940 for (; i != e; ++i) {
1941 MachineOperand &MO = Pred->getOperand(i);
1942 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1943 MO.setIsDead(false);
1944 break;
1945 }
1946 }
1947 }
1948 return true;
1949}
1950
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be truly compare
/// instruction
/// when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
    int64_t CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx =
      CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
  if (DeadNZCVIdx != -1) {
    // NZCV is dead. If the instruction also only writes the zero register it
    // has no visible effect and can be erased outright.
    if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
        CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    // Otherwise demote to the non-flag-setting opcode and drop the dead
    // NZCV def operand.
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.removeOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  // SVE predicate tests have their own removal logic.
  if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  // Only immediate compares (SrcReg2 == 0) are handled below.
  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}
2007
/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
/// or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  // These opcodes already set NZCV; the instruction is its own S form.
  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
  case AArch64::SUBSXrx:
  case AArch64::ANDSWri:
  case AArch64::ANDSWrr:
  case AArch64::ANDSWrs:
  case AArch64::ANDSXri:
  case AArch64::ANDSXrr:
  case AArch64::ANDSXrs:
  case AArch64::BICSWrr:
  case AArch64::BICSXrr:
  case AArch64::BICSWrs:
  case AArch64::BICSXrs:
    return Instr.getOpcode();

  // Non-flag-setting forms: map each to the S form computing the same value
  // while additionally defining NZCV.
  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADDWrx:
    return AArch64::ADDSWrx;
  case AArch64::ADDXrx:
    return AArch64::ADDSXrx;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SUBWrx:
    return AArch64::SUBSWrx;
  case AArch64::SUBXrx:
    return AArch64::SUBSXrx;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  case AArch64::ANDWrr:
    return AArch64::ANDSWrr;
  case AArch64::ANDWrs:
    return AArch64::ANDSWrs;
  case AArch64::ANDXrr:
    return AArch64::ANDSXrr;
  case AArch64::ANDXrs:
    return AArch64::ANDSXrs;
  case AArch64::BICWrr:
    return AArch64::BICSWrr;
  case AArch64::BICXrr:
    return AArch64::BICSXrr;
  case AArch64::BICWrs:
    return AArch64::BICSWrs;
  case AArch64::BICXrs:
    return AArch64::BICSXrs;
  }
}
2095
2096/// Check if AArch64::NZCV should be alive in successors of MBB.
2098 for (auto *BB : MBB->successors())
2099 if (BB->isLiveIn(AArch64::NZCV))
2100 return true;
2101 return false;
2102}
2103
2104/// \returns The condition code operand index for \p Instr if it is a branch
2105/// or select and -1 otherwise.
2106int AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(
2107 const MachineInstr &Instr) {
2108 switch (Instr.getOpcode()) {
2109 default:
2110 return -1;
2111
2112 case AArch64::Bcc: {
2113 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2114 assert(Idx >= 2);
2115 return Idx - 2;
2116 }
2117
2118 case AArch64::CSINVWr:
2119 case AArch64::CSINVXr:
2120 case AArch64::CSINCWr:
2121 case AArch64::CSINCXr:
2122 case AArch64::CSELWr:
2123 case AArch64::CSELXr:
2124 case AArch64::CSNEGWr:
2125 case AArch64::CSNEGXr:
2126 case AArch64::FCSELSrrr:
2127 case AArch64::FCSELDrrr: {
2128 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2129 assert(Idx >= 1);
2130 return Idx - 1;
2131 }
2132 }
2133}
2134
2135/// Find a condition code used by the instruction.
2136/// Returns AArch64CC::Invalid if either the instruction does not use condition
2137/// codes or we don't optimize CmpInstr in the presence of such instructions.
2139 int CCIdx =
2140 AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2141 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2142 Instr.getOperand(CCIdx).getImm())
2144}
2145
2148 UsedNZCV UsedFlags;
2149 switch (CC) {
2150 default:
2151 break;
2152
2153 case AArch64CC::EQ: // Z set
2154 case AArch64CC::NE: // Z clear
2155 UsedFlags.Z = true;
2156 break;
2157
2158 case AArch64CC::HI: // Z clear and C set
2159 case AArch64CC::LS: // Z set or C clear
2160 UsedFlags.Z = true;
2161 [[fallthrough]];
2162 case AArch64CC::HS: // C set
2163 case AArch64CC::LO: // C clear
2164 UsedFlags.C = true;
2165 break;
2166
2167 case AArch64CC::MI: // N set
2168 case AArch64CC::PL: // N clear
2169 UsedFlags.N = true;
2170 break;
2171
2172 case AArch64CC::VS: // V set
2173 case AArch64CC::VC: // V clear
2174 UsedFlags.V = true;
2175 break;
2176
2177 case AArch64CC::GT: // Z clear, N and V the same
2178 case AArch64CC::LE: // Z set, N and V differ
2179 UsedFlags.Z = true;
2180 [[fallthrough]];
2181 case AArch64CC::GE: // N and V the same
2182 case AArch64CC::LT: // N and V differ
2183 UsedFlags.N = true;
2184 UsedFlags.V = true;
2185 break;
2186 }
2187 return UsedFlags;
2188}
2189
2190/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV
2191/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
2192/// \returns std::nullopt otherwise.
2193///
2194/// Collect instructions using that flags in \p CCUseInstrs if provided.
2195std::optional<UsedNZCV>
2197 const TargetRegisterInfo &TRI,
2198 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2199 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2200 if (MI.getParent() != CmpParent)
2201 return std::nullopt;
2202
2203 if (areCFlagsAliveInSuccessors(CmpParent))
2204 return std::nullopt;
2205
2206 UsedNZCV NZCVUsedAfterCmp;
2208 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2209 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
2211 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2212 return std::nullopt;
2213 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2214 if (CCUseInstrs)
2215 CCUseInstrs->push_back(&Instr);
2216 }
2217 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2218 break;
2219 }
2220 return NZCVUsedAfterCmp;
2221}
2222
2223static bool isADDSRegImm(unsigned Opcode) {
2224 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2225}
2226
2227static bool isSUBSRegImm(unsigned Opcode) {
2228 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2229}
2230
2232 unsigned Opc = sForm(MI);
2233 switch (Opc) {
2234 case AArch64::ANDSWri:
2235 case AArch64::ANDSWrr:
2236 case AArch64::ANDSWrs:
2237 case AArch64::ANDSXri:
2238 case AArch64::ANDSXrr:
2239 case AArch64::ANDSXrs:
2240 case AArch64::BICSWrr:
2241 case AArch64::BICSXrr:
2242 case AArch64::BICSWrs:
2243 case AArch64::BICSXrs:
2244 return true;
2245 default:
2246 return false;
2247 }
2248}
2249
2250/// Check if CmpInstr can be substituted by MI.
2251///
2252/// CmpInstr can be substituted:
2253/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2254/// - and, MI and CmpInstr are from the same MachineBB
2255/// - and, condition flags are not alive in successors of the CmpInstr parent
2256/// - and, if MI opcode is the S form there must be no defs of flags between
2257/// MI and CmpInstr
2258/// or if MI opcode is not the S form there must be neither defs of flags
2259/// nor uses of flags between MI and CmpInstr.
2260/// - and, if C/V flags are not used after CmpInstr
2261/// or if N flag is used but MI produces poison value if signed overflow
2262/// occurs.
2264 const TargetRegisterInfo &TRI) {
2265 // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction
2266 // that may or may not set flags.
2267 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2268
2269 const unsigned CmpOpcode = CmpInstr.getOpcode();
2270 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2271 return false;
2272
2273 assert((CmpInstr.getOperand(2).isImm() &&
2274 CmpInstr.getOperand(2).getImm() == 0) &&
2275 "Caller guarantees that CmpInstr compares with constant 0");
2276
2277 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2278 if (!NZVCUsed || NZVCUsed->C)
2279 return false;
2280
2281 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2282 // '%vreg = add ...' or '%vreg = sub ...'.
2283 // Condition flag V is used to indicate signed overflow.
2284 // 1) MI and CmpInstr set N and V to the same value.
2285 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2286 // signed overflow occurs, so CmpInstr could still be simplified away.
2287 // Note that Ands and Bics instructions always clear the V flag.
2288 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2289 return false;
2290
2291 AccessKind AccessToCheck = AK_Write;
2292 if (sForm(MI) != MI.getOpcode())
2293 AccessToCheck = AK_All;
2294 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2295}
2296
2297/// Substitute an instruction comparing to zero with another instruction
2298/// which produces needed condition flags.
2299///
2300/// Return true on success.
2301bool AArch64InstrInfo::substituteCmpToZero(
2302 MachineInstr &CmpInstr, unsigned SrcReg,
2303 const MachineRegisterInfo &MRI) const {
2304 // Get the unique definition of SrcReg.
2305 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2306 if (!MI)
2307 return false;
2308
2309 const TargetRegisterInfo &TRI = getRegisterInfo();
2310
2311 unsigned NewOpc = sForm(*MI);
2312 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2313 return false;
2314
2315 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2316 return false;
2317
2318 // Update the instruction to set NZCV.
2319 MI->setDesc(get(NewOpc));
2320 CmpInstr.eraseFromParent();
2322 (void)succeeded;
2323 assert(succeeded && "Some operands reg class are incompatible!");
2324 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2325 return true;
2326}
2327
2328/// \returns True if \p CmpInstr can be removed.
2329///
2330/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2331/// codes used in \p CCUseInstrs must be inverted.
2333 int CmpValue, const TargetRegisterInfo &TRI,
2335 bool &IsInvertCC) {
2336 assert((CmpValue == 0 || CmpValue == 1) &&
2337 "Only comparisons to 0 or 1 considered for removal!");
2338
2339 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2340 unsigned MIOpc = MI.getOpcode();
2341 if (MIOpc == AArch64::CSINCWr) {
2342 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2343 MI.getOperand(2).getReg() != AArch64::WZR)
2344 return false;
2345 } else if (MIOpc == AArch64::CSINCXr) {
2346 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2347 MI.getOperand(2).getReg() != AArch64::XZR)
2348 return false;
2349 } else {
2350 return false;
2351 }
2353 if (MICC == AArch64CC::Invalid)
2354 return false;
2355
2356 // NZCV needs to be defined
2357 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2358 return false;
2359
2360 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2361 const unsigned CmpOpcode = CmpInstr.getOpcode();
2362 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2363 if (CmpValue && !IsSubsRegImm)
2364 return false;
2365 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2366 return false;
2367
2368 // MI conditions allowed: eq, ne, mi, pl
2369 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2370 if (MIUsedNZCV.C || MIUsedNZCV.V)
2371 return false;
2372
2373 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2374 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2375 // Condition flags are not used in CmpInstr basic block successors and only
2376 // Z or N flags allowed to be used after CmpInstr within its basic block
2377 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2378 return false;
2379 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2380 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2381 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2382 return false;
2383 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
2384 if (MIUsedNZCV.N && !CmpValue)
2385 return false;
2386
2387 // There must be no defs of flags between MI and CmpInstr
2388 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2389 return false;
2390
2391 // Condition code is inverted in the following cases:
2392 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2393 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2394 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2395 (!CmpValue && MICC == AArch64CC::NE);
2396 return true;
2397}
2398
2399/// Remove comparison in csinc-cmp sequence
2400///
2401/// Examples:
2402/// 1. \code
2403/// csinc w9, wzr, wzr, ne
2404/// cmp w9, #0
2405/// b.eq
2406/// \endcode
2407/// to
2408/// \code
2409/// csinc w9, wzr, wzr, ne
2410/// b.ne
2411/// \endcode
2412///
2413/// 2. \code
2414/// csinc x2, xzr, xzr, mi
2415/// cmp x2, #1
2416/// b.pl
2417/// \endcode
2418/// to
2419/// \code
2420/// csinc x2, xzr, xzr, mi
2421/// b.pl
2422/// \endcode
2423///
2424/// \param CmpInstr comparison instruction
2425/// \return True when comparison removed
2426bool AArch64InstrInfo::removeCmpToZeroOrOne(
2427 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2428 const MachineRegisterInfo &MRI) const {
2429 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2430 if (!MI)
2431 return false;
2432 const TargetRegisterInfo &TRI = getRegisterInfo();
2433 SmallVector<MachineInstr *, 4> CCUseInstrs;
2434 bool IsInvertCC = false;
2435 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2436 IsInvertCC))
2437 return false;
2438 // Make transformation
2439 CmpInstr.eraseFromParent();
2440 if (IsInvertCC) {
2441 // Invert condition codes in CmpInstr CC users
2442 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2443 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2444 assert(Idx >= 0 && "Unexpected instruction using CC.");
2445 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2447 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2448 CCOperand.setImm(CCUse);
2449 }
2450 }
2451 return true;
2452}
2453
2454bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2455 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2456 MI.getOpcode() != AArch64::CATCHRET)
2457 return false;
2458
2459 MachineBasicBlock &MBB = *MI.getParent();
2460 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2461 auto TRI = Subtarget.getRegisterInfo();
2462 DebugLoc DL = MI.getDebugLoc();
2463
2464 if (MI.getOpcode() == AArch64::CATCHRET) {
2465 // Skip to the first instruction before the epilog.
2466 const TargetInstrInfo *TII =
2468 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2470 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2471 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2472 FirstEpilogSEH != MBB.begin())
2473 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2474 if (FirstEpilogSEH != MBB.begin())
2475 FirstEpilogSEH = std::next(FirstEpilogSEH);
2476 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2477 .addReg(AArch64::X0, RegState::Define)
2478 .addMBB(TargetMBB);
2479 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2480 .addReg(AArch64::X0, RegState::Define)
2481 .addReg(AArch64::X0)
2482 .addMBB(TargetMBB)
2483 .addImm(0);
2484 TargetMBB->setMachineBlockAddressTaken();
2485 return true;
2486 }
2487
2488 Register Reg = MI.getOperand(0).getReg();
2490 if (M.getStackProtectorGuard() == "sysreg") {
2491 const AArch64SysReg::SysReg *SrcReg =
2492 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2493 if (!SrcReg)
2494 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2495
2496 // mrs xN, sysreg
2497 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2499 .addImm(SrcReg->Encoding);
2500 int Offset = M.getStackProtectorGuardOffset();
2501 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2502 // ldr xN, [xN, #offset]
2503 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2504 .addDef(Reg)
2506 .addImm(Offset / 8);
2507 } else if (Offset >= -256 && Offset <= 255) {
2508 // ldur xN, [xN, #offset]
2509 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2510 .addDef(Reg)
2512 .addImm(Offset);
2513 } else if (Offset >= -4095 && Offset <= 4095) {
2514 if (Offset > 0) {
2515 // add xN, xN, #offset
2516 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2517 .addDef(Reg)
2519 .addImm(Offset)
2520 .addImm(0);
2521 } else {
2522 // sub xN, xN, #offset
2523 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2524 .addDef(Reg)
2526 .addImm(-Offset)
2527 .addImm(0);
2528 }
2529 // ldr xN, [xN]
2530 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2531 .addDef(Reg)
2533 .addImm(0);
2534 } else {
2535 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2536 // than 23760.
2537 // It might be nice to use AArch64::MOVi32imm here, which would get
2538 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2539 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2540 // AArch64FrameLowering might help us find such a scratch register
2541 // though. If we failed to find a scratch register, we could emit a
2542 // stream of add instructions to build up the immediate. Or, we could try
2543 // to insert a AArch64::MOVi32imm before register allocation so that we
2544 // didn't need to scavenge for a scratch register.
2545 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2546 }
2547 MBB.erase(MI);
2548 return true;
2549 }
2550
2551 const GlobalValue *GV =
2552 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2553 const TargetMachine &TM = MBB.getParent()->getTarget();
2554 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2555 const unsigned char MO_NC = AArch64II::MO_NC;
2556
2557 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2558 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2559 .addGlobalAddress(GV, 0, OpFlags);
2560 if (Subtarget.isTargetILP32()) {
2561 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2562 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2563 .addDef(Reg32, RegState::Dead)
2565 .addImm(0)
2566 .addMemOperand(*MI.memoperands_begin())
2568 } else {
2569 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2571 .addImm(0)
2572 .addMemOperand(*MI.memoperands_begin());
2573 }
2574 } else if (TM.getCodeModel() == CodeModel::Large) {
2575 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2576 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2577 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2578 .addImm(0);
2579 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2581 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2582 .addImm(16);
2583 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2585 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2586 .addImm(32);
2587 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2590 .addImm(48);
2591 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2593 .addImm(0)
2594 .addMemOperand(*MI.memoperands_begin());
2595 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2596 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2597 .addGlobalAddress(GV, 0, OpFlags);
2598 } else {
2599 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2600 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2601 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2602 if (Subtarget.isTargetILP32()) {
2603 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2604 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2605 .addDef(Reg32, RegState::Dead)
2607 .addGlobalAddress(GV, 0, LoFlags)
2608 .addMemOperand(*MI.memoperands_begin())
2610 } else {
2611 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2613 .addGlobalAddress(GV, 0, LoFlags)
2614 .addMemOperand(*MI.memoperands_begin());
2615 }
2616 }
2617
2618 MBB.erase(MI);
2619
2620 return true;
2621}
2622
2623// Return true if this instruction simply sets its single destination register
2624// to zero. This is equivalent to a register rename of the zero-register.
2626 switch (MI.getOpcode()) {
2627 default:
2628 break;
2629 case AArch64::MOVZWi:
2630 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2631 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2632 assert(MI.getDesc().getNumOperands() == 3 &&
2633 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2634 return true;
2635 }
2636 break;
2637 case AArch64::ANDWri: // and Rd, Rzr, #imm
2638 return MI.getOperand(1).getReg() == AArch64::WZR;
2639 case AArch64::ANDXri:
2640 return MI.getOperand(1).getReg() == AArch64::XZR;
2641 case TargetOpcode::COPY:
2642 return MI.getOperand(1).getReg() == AArch64::WZR;
2643 }
2644 return false;
2645}
2646
2647// Return true if this instruction simply renames a general register without
2648// modifying bits.
2650 switch (MI.getOpcode()) {
2651 default:
2652 break;
2653 case TargetOpcode::COPY: {
2654 // GPR32 copies will by lowered to ORRXrs
2655 Register DstReg = MI.getOperand(0).getReg();
2656 return (AArch64::GPR32RegClass.contains(DstReg) ||
2657 AArch64::GPR64RegClass.contains(DstReg));
2658 }
2659 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2660 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2661 assert(MI.getDesc().getNumOperands() == 4 &&
2662 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2663 return true;
2664 }
2665 break;
2666 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2667 if (MI.getOperand(2).getImm() == 0) {
2668 assert(MI.getDesc().getNumOperands() == 4 &&
2669 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2670 return true;
2671 }
2672 break;
2673 }
2674 return false;
2675}
2676
2677// Return true if this instruction simply renames a general register without
2678// modifying bits.
2680 switch (MI.getOpcode()) {
2681 default:
2682 break;
2683 case TargetOpcode::COPY: {
2684 Register DstReg = MI.getOperand(0).getReg();
2685 return AArch64::FPR128RegClass.contains(DstReg);
2686 }
2687 case AArch64::ORRv16i8:
2688 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2689 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2690 "invalid ORRv16i8 operands");
2691 return true;
2692 }
2693 break;
2694 }
2695 return false;
2696}
2697
2698static bool isFrameLoadOpcode(int Opcode) {
2699 switch (Opcode) {
2700 default:
2701 return false;
2702 case AArch64::LDRWui:
2703 case AArch64::LDRXui:
2704 case AArch64::LDRBui:
2705 case AArch64::LDRHui:
2706 case AArch64::LDRSui:
2707 case AArch64::LDRDui:
2708 case AArch64::LDRQui:
2709 case AArch64::LDR_PXI:
2710 return true;
2711 }
2712}
2713
2715 int &FrameIndex) const {
2716 if (!isFrameLoadOpcode(MI.getOpcode()))
2717 return Register();
2718
2719 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2720 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2721 FrameIndex = MI.getOperand(1).getIndex();
2722 return MI.getOperand(0).getReg();
2723 }
2724 return Register();
2725}
2726
2727static bool isFrameStoreOpcode(int Opcode) {
2728 switch (Opcode) {
2729 default:
2730 return false;
2731 case AArch64::STRWui:
2732 case AArch64::STRXui:
2733 case AArch64::STRBui:
2734 case AArch64::STRHui:
2735 case AArch64::STRSui:
2736 case AArch64::STRDui:
2737 case AArch64::STRQui:
2738 case AArch64::STR_PXI:
2739 return true;
2740 }
2741}
2742
2744 int &FrameIndex) const {
2745 if (!isFrameStoreOpcode(MI.getOpcode()))
2746 return Register();
2747
2748 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2749 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2750 FrameIndex = MI.getOperand(1).getIndex();
2751 return MI.getOperand(0).getReg();
2752 }
2753 return Register();
2754}
2755
2757 int &FrameIndex) const {
2758 if (!isFrameStoreOpcode(MI.getOpcode()))
2759 return Register();
2760
2761 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2762 return Reg;
2763
2765 if (hasStoreToStackSlot(MI, Accesses)) {
2766 if (Accesses.size() > 1)
2767 return Register();
2768
2769 FrameIndex =
2770 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2771 ->getFrameIndex();
2772 return MI.getOperand(0).getReg();
2773 }
2774 return Register();
2775}
2776
2778 int &FrameIndex) const {
2779 if (!isFrameLoadOpcode(MI.getOpcode()))
2780 return Register();
2781
2782 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2783 return Reg;
2784
2786 if (hasLoadFromStackSlot(MI, Accesses)) {
2787 if (Accesses.size() > 1)
2788 return Register();
2789
2790 FrameIndex =
2791 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2792 ->getFrameIndex();
2793 return MI.getOperand(0).getReg();
2794 }
2795 return Register();
2796}
2797
2798/// Check all MachineMemOperands for a hint to suppress pairing.
2800 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2801 return MMO->getFlags() & MOSuppressPair;
2802 });
2803}
2804
2805/// Set a flag on the first MachineMemOperand to suppress pairing.
2807 if (MI.memoperands_empty())
2808 return;
2809 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2810}
2811
2812/// Check all MachineMemOperands for a hint that the load/store is strided.
2814 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2815 return MMO->getFlags() & MOStridedAccess;
2816 });
2817}
2818
2820 switch (Opc) {
2821 default:
2822 return false;
2823 case AArch64::STURSi:
2824 case AArch64::STRSpre:
2825 case AArch64::STURDi:
2826 case AArch64::STRDpre:
2827 case AArch64::STURQi:
2828 case AArch64::STRQpre:
2829 case AArch64::STURBBi:
2830 case AArch64::STURHHi:
2831 case AArch64::STURWi:
2832 case AArch64::STRWpre:
2833 case AArch64::STURXi:
2834 case AArch64::STRXpre:
2835 case AArch64::LDURSi:
2836 case AArch64::LDRSpre:
2837 case AArch64::LDURDi:
2838 case AArch64::LDRDpre:
2839 case AArch64::LDURQi:
2840 case AArch64::LDRQpre:
2841 case AArch64::LDURWi:
2842 case AArch64::LDRWpre:
2843 case AArch64::LDURXi:
2844 case AArch64::LDRXpre:
2845 case AArch64::LDRSWpre:
2846 case AArch64::LDURSWi:
2847 case AArch64::LDURHHi:
2848 case AArch64::LDURBBi:
2849 case AArch64::LDURSBWi:
2850 case AArch64::LDURSHWi:
2851 return true;
2852 }
2853}
2854
2855std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2856 switch (Opc) {
2857 default: return {};
2858 case AArch64::PRFMui: return AArch64::PRFUMi;
2859 case AArch64::LDRXui: return AArch64::LDURXi;
2860 case AArch64::LDRWui: return AArch64::LDURWi;
2861 case AArch64::LDRBui: return AArch64::LDURBi;
2862 case AArch64::LDRHui: return AArch64::LDURHi;
2863 case AArch64::LDRSui: return AArch64::LDURSi;
2864 case AArch64::LDRDui: return AArch64::LDURDi;
2865 case AArch64::LDRQui: return AArch64::LDURQi;
2866 case AArch64::LDRBBui: return AArch64::LDURBBi;
2867 case AArch64::LDRHHui: return AArch64::LDURHHi;
2868 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2869 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2870 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2871 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2872 case AArch64::LDRSWui: return AArch64::LDURSWi;
2873 case AArch64::STRXui: return AArch64::STURXi;
2874 case AArch64::STRWui: return AArch64::STURWi;
2875 case AArch64::STRBui: return AArch64::STURBi;
2876 case AArch64::STRHui: return AArch64::STURHi;
2877 case AArch64::STRSui: return AArch64::STURSi;
2878 case AArch64::STRDui: return AArch64::STURDi;
2879 case AArch64::STRQui: return AArch64::STURQi;
2880 case AArch64::STRBBui: return AArch64::STURBBi;
2881 case AArch64::STRHHui: return AArch64::STURHHi;
2882 }
2883}
2884
2886 switch (Opc) {
2887 default:
2888 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2889 case AArch64::ADDG:
2890 case AArch64::LDAPURBi:
2891 case AArch64::LDAPURHi:
2892 case AArch64::LDAPURi:
2893 case AArch64::LDAPURSBWi:
2894 case AArch64::LDAPURSBXi:
2895 case AArch64::LDAPURSHWi:
2896 case AArch64::LDAPURSHXi:
2897 case AArch64::LDAPURSWi:
2898 case AArch64::LDAPURXi:
2899 case AArch64::LDR_PPXI:
2900 case AArch64::LDR_PXI:
2901 case AArch64::LDR_ZXI:
2902 case AArch64::LDR_ZZXI:
2903 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2904 case AArch64::LDR_ZZZXI:
2905 case AArch64::LDR_ZZZZXI:
2906 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2907 case AArch64::LDRBBui:
2908 case AArch64::LDRBui:
2909 case AArch64::LDRDui:
2910 case AArch64::LDRHHui:
2911 case AArch64::LDRHui:
2912 case AArch64::LDRQui:
2913 case AArch64::LDRSBWui:
2914 case AArch64::LDRSBXui:
2915 case AArch64::LDRSHWui:
2916 case AArch64::LDRSHXui:
2917 case AArch64::LDRSui:
2918 case AArch64::LDRSWui:
2919 case AArch64::LDRWui:
2920 case AArch64::LDRXui:
2921 case AArch64::LDURBBi:
2922 case AArch64::LDURBi:
2923 case AArch64::LDURDi:
2924 case AArch64::LDURHHi:
2925 case AArch64::LDURHi:
2926 case AArch64::LDURQi:
2927 case AArch64::LDURSBWi:
2928 case AArch64::LDURSBXi:
2929 case AArch64::LDURSHWi:
2930 case AArch64::LDURSHXi:
2931 case AArch64::LDURSi:
2932 case AArch64::LDURSWi:
2933 case AArch64::LDURWi:
2934 case AArch64::LDURXi:
2935 case AArch64::PRFMui:
2936 case AArch64::PRFUMi:
2937 case AArch64::ST2Gi:
2938 case AArch64::STGi:
2939 case AArch64::STLURBi:
2940 case AArch64::STLURHi:
2941 case AArch64::STLURWi:
2942 case AArch64::STLURXi:
2943 case AArch64::StoreSwiftAsyncContext:
2944 case AArch64::STR_PPXI:
2945 case AArch64::STR_PXI:
2946 case AArch64::STR_ZXI:
2947 case AArch64::STR_ZZXI:
2948 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2949 case AArch64::STR_ZZZXI:
2950 case AArch64::STR_ZZZZXI:
2951 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2952 case AArch64::STRBBui:
2953 case AArch64::STRBui:
2954 case AArch64::STRDui:
2955 case AArch64::STRHHui:
2956 case AArch64::STRHui:
2957 case AArch64::STRQui:
2958 case AArch64::STRSui:
2959 case AArch64::STRWui:
2960 case AArch64::STRXui:
2961 case AArch64::STURBBi:
2962 case AArch64::STURBi:
2963 case AArch64::STURDi:
2964 case AArch64::STURHHi:
2965 case AArch64::STURHi:
2966 case AArch64::STURQi:
2967 case AArch64::STURSi:
2968 case AArch64::STURWi:
2969 case AArch64::STURXi:
2970 case AArch64::STZ2Gi:
2971 case AArch64::STZGi:
2972 case AArch64::TAGPstack:
2973 return 2;
2974 case AArch64::LD1B_D_IMM:
2975 case AArch64::LD1B_H_IMM:
2976 case AArch64::LD1B_IMM:
2977 case AArch64::LD1B_S_IMM:
2978 case AArch64::LD1D_IMM:
2979 case AArch64::LD1H_D_IMM:
2980 case AArch64::LD1H_IMM:
2981 case AArch64::LD1H_S_IMM:
2982 case AArch64::LD1RB_D_IMM:
2983 case AArch64::LD1RB_H_IMM:
2984 case AArch64::LD1RB_IMM:
2985 case AArch64::LD1RB_S_IMM:
2986 case AArch64::LD1RD_IMM:
2987 case AArch64::LD1RH_D_IMM:
2988 case AArch64::LD1RH_IMM:
2989 case AArch64::LD1RH_S_IMM:
2990 case AArch64::LD1RSB_D_IMM:
2991 case AArch64::LD1RSB_H_IMM:
2992 case AArch64::LD1RSB_S_IMM:
2993 case AArch64::LD1RSH_D_IMM:
2994 case AArch64::LD1RSH_S_IMM:
2995 case AArch64::LD1RSW_IMM:
2996 case AArch64::LD1RW_D_IMM:
2997 case AArch64::LD1RW_IMM:
2998 case AArch64::LD1SB_D_IMM:
2999 case AArch64::LD1SB_H_IMM:
3000 case AArch64::LD1SB_S_IMM:
3001 case AArch64::LD1SH_D_IMM:
3002 case AArch64::LD1SH_S_IMM:
3003 case AArch64::LD1SW_D_IMM:
3004 case AArch64::LD1W_D_IMM:
3005 case AArch64::LD1W_IMM:
3006 case AArch64::LD2B_IMM:
3007 case AArch64::LD2D_IMM:
3008 case AArch64::LD2H_IMM:
3009 case AArch64::LD2W_IMM:
3010 case AArch64::LD3B_IMM:
3011 case AArch64::LD3D_IMM:
3012 case AArch64::LD3H_IMM:
3013 case AArch64::LD3W_IMM:
3014 case AArch64::LD4B_IMM:
3015 case AArch64::LD4D_IMM:
3016 case AArch64::LD4H_IMM:
3017 case AArch64::LD4W_IMM:
3018 case AArch64::LDG:
3019 case AArch64::LDNF1B_D_IMM:
3020 case AArch64::LDNF1B_H_IMM:
3021 case AArch64::LDNF1B_IMM:
3022 case AArch64::LDNF1B_S_IMM:
3023 case AArch64::LDNF1D_IMM:
3024 case AArch64::LDNF1H_D_IMM:
3025 case AArch64::LDNF1H_IMM:
3026 case AArch64::LDNF1H_S_IMM:
3027 case AArch64::LDNF1SB_D_IMM:
3028 case AArch64::LDNF1SB_H_IMM:
3029 case AArch64::LDNF1SB_S_IMM:
3030 case AArch64::LDNF1SH_D_IMM:
3031 case AArch64::LDNF1SH_S_IMM:
3032 case AArch64::LDNF1SW_D_IMM:
3033 case AArch64::LDNF1W_D_IMM:
3034 case AArch64::LDNF1W_IMM:
3035 case AArch64::LDNPDi:
3036 case AArch64::LDNPQi:
3037 case AArch64::LDNPSi:
3038 case AArch64::LDNPWi:
3039 case AArch64::LDNPXi:
3040 case AArch64::LDNT1B_ZRI:
3041 case AArch64::LDNT1D_ZRI:
3042 case AArch64::LDNT1H_ZRI:
3043 case AArch64::LDNT1W_ZRI:
3044 case AArch64::LDPDi:
3045 case AArch64::LDPQi:
3046 case AArch64::LDPSi:
3047 case AArch64::LDPWi:
3048 case AArch64::LDPXi:
3049 case AArch64::LDRBBpost:
3050 case AArch64::LDRBBpre:
3051 case AArch64::LDRBpost:
3052 case AArch64::LDRBpre:
3053 case AArch64::LDRDpost:
3054 case AArch64::LDRDpre:
3055 case AArch64::LDRHHpost:
3056 case AArch64::LDRHHpre:
3057 case AArch64::LDRHpost:
3058 case AArch64::LDRHpre:
3059 case AArch64::LDRQpost:
3060 case AArch64::LDRQpre:
3061 case AArch64::LDRSpost:
3062 case AArch64::LDRSpre:
3063 case AArch64::LDRWpost:
3064 case AArch64::LDRWpre:
3065 case AArch64::LDRXpost:
3066 case AArch64::LDRXpre:
3067 case AArch64::ST1B_D_IMM:
3068 case AArch64::ST1B_H_IMM:
3069 case AArch64::ST1B_IMM:
3070 case AArch64::ST1B_S_IMM:
3071 case AArch64::ST1D_IMM:
3072 case AArch64::ST1H_D_IMM:
3073 case AArch64::ST1H_IMM:
3074 case AArch64::ST1H_S_IMM:
3075 case AArch64::ST1W_D_IMM:
3076 case AArch64::ST1W_IMM:
3077 case AArch64::ST2B_IMM:
3078 case AArch64::ST2D_IMM:
3079 case AArch64::ST2H_IMM:
3080 case AArch64::ST2W_IMM:
3081 case AArch64::ST3B_IMM:
3082 case AArch64::ST3D_IMM:
3083 case AArch64::ST3H_IMM:
3084 case AArch64::ST3W_IMM:
3085 case AArch64::ST4B_IMM:
3086 case AArch64::ST4D_IMM:
3087 case AArch64::ST4H_IMM:
3088 case AArch64::ST4W_IMM:
3089 case AArch64::STGPi:
3090 case AArch64::STGPreIndex:
3091 case AArch64::STZGPreIndex:
3092 case AArch64::ST2GPreIndex:
3093 case AArch64::STZ2GPreIndex:
3094 case AArch64::STGPostIndex:
3095 case AArch64::STZGPostIndex:
3096 case AArch64::ST2GPostIndex:
3097 case AArch64::STZ2GPostIndex:
3098 case AArch64::STNPDi:
3099 case AArch64::STNPQi:
3100 case AArch64::STNPSi:
3101 case AArch64::STNPWi:
3102 case AArch64::STNPXi:
3103 case AArch64::STNT1B_ZRI:
3104 case AArch64::STNT1D_ZRI:
3105 case AArch64::STNT1H_ZRI:
3106 case AArch64::STNT1W_ZRI:
3107 case AArch64::STPDi:
3108 case AArch64::STPQi:
3109 case AArch64::STPSi:
3110 case AArch64::STPWi:
3111 case AArch64::STPXi:
3112 case AArch64::STRBBpost:
3113 case AArch64::STRBBpre:
3114 case AArch64::STRBpost:
3115 case AArch64::STRBpre:
3116 case AArch64::STRDpost:
3117 case AArch64::STRDpre:
3118 case AArch64::STRHHpost:
3119 case AArch64::STRHHpre:
3120 case AArch64::STRHpost:
3121 case AArch64::STRHpre:
3122 case AArch64::STRQpost:
3123 case AArch64::STRQpre:
3124 case AArch64::STRSpost:
3125 case AArch64::STRSpre:
3126 case AArch64::STRWpost:
3127 case AArch64::STRWpre:
3128 case AArch64::STRXpost:
3129 case AArch64::STRXpre:
3130 return 3;
3131 case AArch64::LDPDpost:
3132 case AArch64::LDPDpre:
3133 case AArch64::LDPQpost:
3134 case AArch64::LDPQpre:
3135 case AArch64::LDPSpost:
3136 case AArch64::LDPSpre:
3137 case AArch64::LDPWpost:
3138 case AArch64::LDPWpre:
3139 case AArch64::LDPXpost:
3140 case AArch64::LDPXpre:
3141 case AArch64::STGPpre:
3142 case AArch64::STGPpost:
3143 case AArch64::STPDpost:
3144 case AArch64::STPDpre:
3145 case AArch64::STPQpost:
3146 case AArch64::STPQpre:
3147 case AArch64::STPSpost:
3148 case AArch64::STPSpre:
3149 case AArch64::STPWpost:
3150 case AArch64::STPWpre:
3151 case AArch64::STPXpost:
3152 case AArch64::STPXpre:
3153 return 4;
3154 }
3155}
3156
3158 switch (MI.getOpcode()) {
3159 default:
3160 return false;
3161 // Scaled instructions.
3162 case AArch64::STRSui:
3163 case AArch64::STRDui:
3164 case AArch64::STRQui:
3165 case AArch64::STRXui:
3166 case AArch64::STRWui:
3167 case AArch64::LDRSui:
3168 case AArch64::LDRDui:
3169 case AArch64::LDRQui:
3170 case AArch64::LDRXui:
3171 case AArch64::LDRWui:
3172 case AArch64::LDRSWui:
3173 // Unscaled instructions.
3174 case AArch64::STURSi:
3175 case AArch64::STRSpre:
3176 case AArch64::STURDi:
3177 case AArch64::STRDpre:
3178 case AArch64::STURQi:
3179 case AArch64::STRQpre:
3180 case AArch64::STURWi:
3181 case AArch64::STRWpre:
3182 case AArch64::STURXi:
3183 case AArch64::STRXpre:
3184 case AArch64::LDURSi:
3185 case AArch64::LDRSpre:
3186 case AArch64::LDURDi:
3187 case AArch64::LDRDpre:
3188 case AArch64::LDURQi:
3189 case AArch64::LDRQpre:
3190 case AArch64::LDURWi:
3191 case AArch64::LDRWpre:
3192 case AArch64::LDURXi:
3193 case AArch64::LDRXpre:
3194 case AArch64::LDURSWi:
3195 case AArch64::LDRSWpre:
3196 // SVE instructions.
3197 case AArch64::LDR_ZXI:
3198 case AArch64::STR_ZXI:
3199 return true;
3200 }
3201}
3202
3204 switch (MI.getOpcode()) {
3205 default:
3206 assert((!MI.isCall() || !MI.isReturn()) &&
3207 "Unexpected instruction - was a new tail call opcode introduced?");
3208 return false;
3209 case AArch64::TCRETURNdi:
3210 case AArch64::TCRETURNri:
3211 case AArch64::TCRETURNrix16x17:
3212 case AArch64::TCRETURNrix17:
3213 case AArch64::TCRETURNrinotx16:
3214 case AArch64::TCRETURNriALL:
3215 case AArch64::AUTH_TCRETURN:
3216 case AArch64::AUTH_TCRETURN_BTI:
3217 return true;
3218 }
3219}
3220
3222 switch (Opc) {
3223 default:
3224 llvm_unreachable("Opcode has no flag setting equivalent!");
3225 // 32-bit cases:
3226 case AArch64::ADDWri:
3227 return AArch64::ADDSWri;
3228 case AArch64::ADDWrr:
3229 return AArch64::ADDSWrr;
3230 case AArch64::ADDWrs:
3231 return AArch64::ADDSWrs;
3232 case AArch64::ADDWrx:
3233 return AArch64::ADDSWrx;
3234 case AArch64::ANDWri:
3235 return AArch64::ANDSWri;
3236 case AArch64::ANDWrr:
3237 return AArch64::ANDSWrr;
3238 case AArch64::ANDWrs:
3239 return AArch64::ANDSWrs;
3240 case AArch64::BICWrr:
3241 return AArch64::BICSWrr;
3242 case AArch64::BICWrs:
3243 return AArch64::BICSWrs;
3244 case AArch64::SUBWri:
3245 return AArch64::SUBSWri;
3246 case AArch64::SUBWrr:
3247 return AArch64::SUBSWrr;
3248 case AArch64::SUBWrs:
3249 return AArch64::SUBSWrs;
3250 case AArch64::SUBWrx:
3251 return AArch64::SUBSWrx;
3252 // 64-bit cases:
3253 case AArch64::ADDXri:
3254 return AArch64::ADDSXri;
3255 case AArch64::ADDXrr:
3256 return AArch64::ADDSXrr;
3257 case AArch64::ADDXrs:
3258 return AArch64::ADDSXrs;
3259 case AArch64::ADDXrx:
3260 return AArch64::ADDSXrx;
3261 case AArch64::ANDXri:
3262 return AArch64::ANDSXri;
3263 case AArch64::ANDXrr:
3264 return AArch64::ANDSXrr;
3265 case AArch64::ANDXrs:
3266 return AArch64::ANDSXrs;
3267 case AArch64::BICXrr:
3268 return AArch64::BICSXrr;
3269 case AArch64::BICXrs:
3270 return AArch64::BICSXrs;
3271 case AArch64::SUBXri:
3272 return AArch64::SUBSXri;
3273 case AArch64::SUBXrr:
3274 return AArch64::SUBSXrr;
3275 case AArch64::SUBXrs:
3276 return AArch64::SUBSXrs;
3277 case AArch64::SUBXrx:
3278 return AArch64::SUBSXrx;
3279 // SVE instructions:
3280 case AArch64::AND_PPzPP:
3281 return AArch64::ANDS_PPzPP;
3282 case AArch64::BIC_PPzPP:
3283 return AArch64::BICS_PPzPP;
3284 case AArch64::EOR_PPzPP:
3285 return AArch64::EORS_PPzPP;
3286 case AArch64::NAND_PPzPP:
3287 return AArch64::NANDS_PPzPP;
3288 case AArch64::NOR_PPzPP:
3289 return AArch64::NORS_PPzPP;
3290 case AArch64::ORN_PPzPP:
3291 return AArch64::ORNS_PPzPP;
3292 case AArch64::ORR_PPzPP:
3293 return AArch64::ORRS_PPzPP;
3294 case AArch64::BRKA_PPzP:
3295 return AArch64::BRKAS_PPzP;
3296 case AArch64::BRKPA_PPzPP:
3297 return AArch64::BRKPAS_PPzPP;
3298 case AArch64::BRKB_PPzP:
3299 return AArch64::BRKBS_PPzP;
3300 case AArch64::BRKPB_PPzPP:
3301 return AArch64::BRKPBS_PPzPP;
3302 case AArch64::BRKN_PPzP:
3303 return AArch64::BRKNS_PPzP;
3304 case AArch64::RDFFR_PPz:
3305 return AArch64::RDFFRS_PPz;
3306 case AArch64::PTRUE_B:
3307 return AArch64::PTRUES_B;
3308 }
3309}
3310
3311// Is this a candidate for ld/st merging or pairing? For example, we don't
3312// touch volatiles or load/stores that have a hint to avoid pair formation.
3314
3315 bool IsPreLdSt = isPreLdSt(MI);
3316
3317 // If this is a volatile load/store, don't mess with it.
3318 if (MI.hasOrderedMemoryRef())
3319 return false;
3320
3321 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3322 // For Pre-inc LD/ST, the operand is shifted by one.
3323 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3324 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3325 "Expected a reg or frame index operand.");
3326
3327 // For Pre-indexed addressing quadword instructions, the third operand is the
3328 // immediate value.
3329 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3330
3331 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3332 return false;
3333
3334 // Can't merge/pair if the instruction modifies the base register.
3335 // e.g., ldr x0, [x0]
3336 // This case will never occur with an FI base.
3337 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3338 // STR<S,D,Q,W,X>pre, it can be merged.
3339 // For example:
3340 // ldr q0, [x11, #32]!
3341 // ldr q1, [x11, #16]
3342 // to
3343 // ldp q0, q1, [x11, #32]!
3344 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3345 Register BaseReg = MI.getOperand(1).getReg();
3347 if (MI.modifiesRegister(BaseReg, TRI))
3348 return false;
3349 }
3350
3351 // Pairing SVE fills/spills is only valid for little-endian targets that
3352 // implement VLS 128.
3353 switch (MI.getOpcode()) {
3354 default:
3355 break;
3356 case AArch64::LDR_ZXI:
3357 case AArch64::STR_ZXI:
3358 if (!Subtarget.isLittleEndian() ||
3359 Subtarget.getSVEVectorSizeInBits() != 128)
3360 return false;
3361 }
3362
3363 // Check if this load/store has a hint to avoid pair formation.
3364 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3366 return false;
3367
3368 // Do not pair any callee-save store/reload instructions in the
3369 // prologue/epilogue if the CFI information encoded the operations as separate
3370 // instructions, as that will cause the size of the actual prologue to mismatch
3371 // with the prologue size recorded in the Windows CFI.
3372 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
3373 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
3374 MI.getMF()->getFunction().needsUnwindTableEntry();
3375 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3377 return false;
3378
3379 // On some CPUs quad load/store pairs are slower than two single load/stores.
3380 if (Subtarget.isPaired128Slow()) {
3381 switch (MI.getOpcode()) {
3382 default:
3383 break;
3384 case AArch64::LDURQi:
3385 case AArch64::STURQi:
3386 case AArch64::LDRQui:
3387 case AArch64::STRQui:
3388 return false;
3389 }
3390 }
3391
3392 return true;
3393}
3394
3397 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3398 const TargetRegisterInfo *TRI) const {
3399 if (!LdSt.mayLoadOrStore())
3400 return false;
3401
3402 const MachineOperand *BaseOp;
3403 TypeSize WidthN(0, false);
3404 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3405 WidthN, TRI))
3406 return false;
3407 // The maximum vscale is 16 under AArch64, return the maximal extent for the
3408 // vector.
3409 Width = LocationSize::precise(WidthN);
3410 BaseOps.push_back(BaseOp);
3411 return true;
3412}
3413
3414std::optional<ExtAddrMode>
3416 const TargetRegisterInfo *TRI) const {
3417 const MachineOperand *Base; // Filled with the base operand of MI.
3418 int64_t Offset; // Filled with the offset of MI.
3419 bool OffsetIsScalable;
3420 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3421 return std::nullopt;
3422
3423 if (!Base->isReg())
3424 return std::nullopt;
3425 ExtAddrMode AM;
3426 AM.BaseReg = Base->getReg();
3427 AM.Displacement = Offset;
3428 AM.ScaledReg = 0;
3429 AM.Scale = 0;
3430 return AM;
3431}
3432
3434 Register Reg,
3435 const MachineInstr &AddrI,
3436 ExtAddrMode &AM) const {
3437 // Filter out instructions into which we cannot fold.
3438 unsigned NumBytes;
3439 int64_t OffsetScale = 1;
3440 switch (MemI.getOpcode()) {
3441 default:
3442 return false;
3443
3444 case AArch64::LDURQi:
3445 case AArch64::STURQi:
3446 NumBytes = 16;
3447 break;
3448
3449 case AArch64::LDURDi:
3450 case AArch64::STURDi:
3451 case AArch64::LDURXi:
3452 case AArch64::STURXi:
3453 NumBytes = 8;
3454 break;
3455
3456 case AArch64::LDURWi:
3457 case AArch64::LDURSWi:
3458 case AArch64::STURWi:
3459 NumBytes = 4;
3460 break;
3461
3462 case AArch64::LDURHi:
3463 case AArch64::STURHi:
3464 case AArch64::LDURHHi:
3465 case AArch64::STURHHi:
3466 case AArch64::LDURSHXi:
3467 case AArch64::LDURSHWi:
3468 NumBytes = 2;
3469 break;
3470
3471 case AArch64::LDRBroX:
3472 case AArch64::LDRBBroX:
3473 case AArch64::LDRSBXroX:
3474 case AArch64::LDRSBWroX:
3475 case AArch64::STRBroX:
3476 case AArch64::STRBBroX:
3477 case AArch64::LDURBi:
3478 case AArch64::LDURBBi:
3479 case AArch64::LDURSBXi:
3480 case AArch64::LDURSBWi:
3481 case AArch64::STURBi:
3482 case AArch64::STURBBi:
3483 case AArch64::LDRBui:
3484 case AArch64::LDRBBui:
3485 case AArch64::LDRSBXui:
3486 case AArch64::LDRSBWui:
3487 case AArch64::STRBui:
3488 case AArch64::STRBBui:
3489 NumBytes = 1;
3490 break;
3491
3492 case AArch64::LDRQroX:
3493 case AArch64::STRQroX:
3494 case AArch64::LDRQui:
3495 case AArch64::STRQui:
3496 NumBytes = 16;
3497 OffsetScale = 16;
3498 break;
3499
3500 case AArch64::LDRDroX:
3501 case AArch64::STRDroX:
3502 case AArch64::LDRXroX:
3503 case AArch64::STRXroX:
3504 case AArch64::LDRDui:
3505 case AArch64::STRDui:
3506 case AArch64::LDRXui:
3507 case AArch64::STRXui:
3508 NumBytes = 8;
3509 OffsetScale = 8;
3510 break;
3511
3512 case AArch64::LDRWroX:
3513 case AArch64::LDRSWroX:
3514 case AArch64::STRWroX:
3515 case AArch64::LDRWui:
3516 case AArch64::LDRSWui:
3517 case AArch64::STRWui:
3518 NumBytes = 4;
3519 OffsetScale = 4;
3520 break;
3521
3522 case AArch64::LDRHroX:
3523 case AArch64::STRHroX:
3524 case AArch64::LDRHHroX:
3525 case AArch64::STRHHroX:
3526 case AArch64::LDRSHXroX:
3527 case AArch64::LDRSHWroX:
3528 case AArch64::LDRHui:
3529 case AArch64::STRHui:
3530 case AArch64::LDRHHui:
3531 case AArch64::STRHHui:
3532 case AArch64::LDRSHXui:
3533 case AArch64::LDRSHWui:
3534 NumBytes = 2;
3535 OffsetScale = 2;
3536 break;
3537 }
3538
3539 // Check the fold operand is not the loaded/stored value.
3540 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3541 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3542 return false;
3543
3544 // Handle memory instructions with a [Reg, Reg] addressing mode.
3545 if (MemI.getOperand(2).isReg()) {
3546 // Bail if the addressing mode already includes extension of the offset
3547 // register.
3548 if (MemI.getOperand(3).getImm())
3549 return false;
3550
3551 // Check if we actually have a scaled offset.
3552 if (MemI.getOperand(4).getImm() == 0)
3553 OffsetScale = 1;
3554
3555 // If the address instructions is folded into the base register, then the
3556 // addressing mode must not have a scale. Then we can swap the base and the
3557 // scaled registers.
3558 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3559 return false;
3560
3561 switch (AddrI.getOpcode()) {
3562 default:
3563 return false;
3564
3565 case AArch64::SBFMXri:
3566 // sxtw Xa, Wm
3567 // ldr Xd, [Xn, Xa, lsl #N]
3568 // ->
3569 // ldr Xd, [Xn, Wm, sxtw #N]
3570 if (AddrI.getOperand(2).getImm() != 0 ||
3571 AddrI.getOperand(3).getImm() != 31)
3572 return false;
3573
3574 AM.BaseReg = MemI.getOperand(1).getReg();
3575 if (AM.BaseReg == Reg)
3576 AM.BaseReg = MemI.getOperand(2).getReg();
3577 AM.ScaledReg = AddrI.getOperand(1).getReg();
3578 AM.Scale = OffsetScale;
3579 AM.Displacement = 0;
3581 return true;
3582
3583 case TargetOpcode::SUBREG_TO_REG: {
3584 // mov Wa, Wm
3585 // ldr Xd, [Xn, Xa, lsl #N]
3586 // ->
3587 // ldr Xd, [Xn, Wm, uxtw #N]
3588
3589 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3590 if (AddrI.getOperand(2).getImm() != AArch64::sub_32)
3591 return false;
3592
3593 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3594 Register OffsetReg = AddrI.getOperand(1).getReg();
3595 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3596 return false;
3597
3598 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3599 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3600 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3601 DefMI.getOperand(3).getImm() != 0)
3602 return false;
3603
3604 AM.BaseReg = MemI.getOperand(1).getReg();
3605 if (AM.BaseReg == Reg)
3606 AM.BaseReg = MemI.getOperand(2).getReg();
3607 AM.ScaledReg = DefMI.getOperand(2).getReg();
3608 AM.Scale = OffsetScale;
3609 AM.Displacement = 0;
3611 return true;
3612 }
3613 }
3614 }
3615
3616 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3617
3618 // Check we are not breaking a potential conversion to an LDP.
3619 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3620 int64_t NewOffset) -> bool {
3621 int64_t MinOffset, MaxOffset;
3622 switch (NumBytes) {
3623 default:
3624 return true;
3625 case 4:
3626 MinOffset = -256;
3627 MaxOffset = 252;
3628 break;
3629 case 8:
3630 MinOffset = -512;
3631 MaxOffset = 504;
3632 break;
3633 case 16:
3634 MinOffset = -1024;
3635 MaxOffset = 1008;
3636 break;
3637 }
3638 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3639 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3640 };
3641 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3642 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3643 int64_t NewOffset = OldOffset + Disp;
3644 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3645 return false;
3646 // If the old offset would fit into an LDP, but the new offset wouldn't,
3647 // bail out.
3648 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3649 return false;
3650 AM.BaseReg = AddrI.getOperand(1).getReg();
3651 AM.ScaledReg = 0;
3652 AM.Scale = 0;
3653 AM.Displacement = NewOffset;
3655 return true;
3656 };
3657
3658 auto canFoldAddRegIntoAddrMode =
3659 [&](int64_t Scale,
3661 if (MemI.getOperand(2).getImm() != 0)
3662 return false;
3663 if ((unsigned)Scale != Scale)
3664 return false;
3665 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3666 return false;
3667 AM.BaseReg = AddrI.getOperand(1).getReg();
3668 AM.ScaledReg = AddrI.getOperand(2).getReg();
3669 AM.Scale = Scale;
3670 AM.Displacement = 0;
3671 AM.Form = Form;
3672 return true;
3673 };
3674
3675 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3676 unsigned Opcode = MemI.getOpcode();
3677 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3678 Subtarget.isSTRQroSlow();
3679 };
3680
3681 int64_t Disp = 0;
3682 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3683 switch (AddrI.getOpcode()) {
3684 default:
3685 return false;
3686
3687 case AArch64::ADDXri:
3688 // add Xa, Xn, #N
3689 // ldr Xd, [Xa, #M]
3690 // ->
3691 // ldr Xd, [Xn, #N'+M]
3692 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3693 return canFoldAddSubImmIntoAddrMode(Disp);
3694
3695 case AArch64::SUBXri:
3696 // sub Xa, Xn, #N
3697 // ldr Xd, [Xa, #M]
3698 // ->
3699 // ldr Xd, [Xn, #N'+M]
3700 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3701 return canFoldAddSubImmIntoAddrMode(-Disp);
3702
3703 case AArch64::ADDXrs: {
3704 // add Xa, Xn, Xm, lsl #N
3705 // ldr Xd, [Xa]
3706 // ->
3707 // ldr Xd, [Xn, Xm, lsl #N]
3708
3709 // Don't fold the add if the result would be slower, unless optimising for
3710 // size.
3711 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3713 return false;
3714 Shift = AArch64_AM::getShiftValue(Shift);
3715 if (!OptSize) {
3716 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3717 return false;
3718 if (avoidSlowSTRQ(MemI))
3719 return false;
3720 }
3721 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3722 }
3723
3724 case AArch64::ADDXrr:
3725 // add Xa, Xn, Xm
3726 // ldr Xd, [Xa]
3727 // ->
3728 // ldr Xd, [Xn, Xm, lsl #0]
3729
3730 // Don't fold the add if the result would be slower, unless optimising for
3731 // size.
3732 if (!OptSize && avoidSlowSTRQ(MemI))
3733 return false;
3734 return canFoldAddRegIntoAddrMode(1);
3735
3736 case AArch64::ADDXrx:
3737 // add Xa, Xn, Wm, {s,u}xtw #N
3738 // ldr Xd, [Xa]
3739 // ->
3740 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3741
3742 // Don't fold the add if the result would be slower, unless optimising for
3743 // size.
3744 if (!OptSize && avoidSlowSTRQ(MemI))
3745 return false;
3746
3747 // Can fold only sign-/zero-extend of a word.
3748 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3750 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3751 return false;
3752
3753 return canFoldAddRegIntoAddrMode(
3754 1ULL << AArch64_AM::getArithShiftValue(Imm),
3757 }
3758}
3759
3760// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3761// return the opcode of an instruction performing the same operation, but using
3762// the [Reg, Reg] addressing mode.
3763static unsigned regOffsetOpcode(unsigned Opcode) {
3764 switch (Opcode) {
3765 default:
3766 llvm_unreachable("Address folding not implemented for instruction");
3767
3768 case AArch64::LDURQi:
3769 case AArch64::LDRQui:
3770 return AArch64::LDRQroX;
3771 case AArch64::STURQi:
3772 case AArch64::STRQui:
3773 return AArch64::STRQroX;
3774 case AArch64::LDURDi:
3775 case AArch64::LDRDui:
3776 return AArch64::LDRDroX;
3777 case AArch64::STURDi:
3778 case AArch64::STRDui:
3779 return AArch64::STRDroX;
3780 case AArch64::LDURXi:
3781 case AArch64::LDRXui:
3782 return AArch64::LDRXroX;
3783 case AArch64::STURXi:
3784 case AArch64::STRXui:
3785 return AArch64::STRXroX;
3786 case AArch64::LDURWi:
3787 case AArch64::LDRWui:
3788 return AArch64::LDRWroX;
3789 case AArch64::LDURSWi:
3790 case AArch64::LDRSWui:
3791 return AArch64::LDRSWroX;
3792 case AArch64::STURWi:
3793 case AArch64::STRWui:
3794 return AArch64::STRWroX;
3795 case AArch64::LDURHi:
3796 case AArch64::LDRHui:
3797 return AArch64::LDRHroX;
3798 case AArch64::STURHi:
3799 case AArch64::STRHui:
3800 return AArch64::STRHroX;
3801 case AArch64::LDURHHi:
3802 case AArch64::LDRHHui:
3803 return AArch64::LDRHHroX;
3804 case AArch64::STURHHi:
3805 case AArch64::STRHHui:
3806 return AArch64::STRHHroX;
3807 case AArch64::LDURSHXi:
3808 case AArch64::LDRSHXui:
3809 return AArch64::LDRSHXroX;
3810 case AArch64::LDURSHWi:
3811 case AArch64::LDRSHWui:
3812 return AArch64::LDRSHWroX;
3813 case AArch64::LDURBi:
3814 case AArch64::LDRBui:
3815 return AArch64::LDRBroX;
3816 case AArch64::LDURBBi:
3817 case AArch64::LDRBBui:
3818 return AArch64::LDRBBroX;
3819 case AArch64::LDURSBXi:
3820 case AArch64::LDRSBXui:
3821 return AArch64::LDRSBXroX;
3822 case AArch64::LDURSBWi:
3823 case AArch64::LDRSBWui:
3824 return AArch64::LDRSBWroX;
3825 case AArch64::STURBi:
3826 case AArch64::STRBui:
3827 return AArch64::STRBroX;
3828 case AArch64::STURBBi:
3829 case AArch64::STRBBui:
3830 return AArch64::STRBBroX;
3831 }
3832}
3833
3834// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3835// the opcode of an instruction performing the same operation, but using the
3836// [Reg, #Imm] addressing mode with scaled offset.
3837unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3838 switch (Opcode) {
3839 default:
3840 llvm_unreachable("Address folding not implemented for instruction");
3841
3842 case AArch64::LDURQi:
3843 Scale = 16;
3844 return AArch64::LDRQui;
3845 case AArch64::STURQi:
3846 Scale = 16;
3847 return AArch64::STRQui;
3848 case AArch64::LDURDi:
3849 Scale = 8;
3850 return AArch64::LDRDui;
3851 case AArch64::STURDi:
3852 Scale = 8;
3853 return AArch64::STRDui;
3854 case AArch64::LDURXi:
3855 Scale = 8;
3856 return AArch64::LDRXui;
3857 case AArch64::STURXi:
3858 Scale = 8;
3859 return AArch64::STRXui;
3860 case AArch64::LDURWi:
3861 Scale = 4;
3862 return AArch64::LDRWui;
3863 case AArch64::LDURSWi:
3864 Scale = 4;
3865 return AArch64::LDRSWui;
3866 case AArch64::STURWi:
3867 Scale = 4;
3868 return AArch64::STRWui;
3869 case AArch64::LDURHi:
3870 Scale = 2;
3871 return AArch64::LDRHui;
3872 case AArch64::STURHi:
3873 Scale = 2;
3874 return AArch64::STRHui;
3875 case AArch64::LDURHHi:
3876 Scale = 2;
3877 return AArch64::LDRHHui;
3878 case AArch64::STURHHi:
3879 Scale = 2;
3880 return AArch64::STRHHui;
3881 case AArch64::LDURSHXi:
3882 Scale = 2;
3883 return AArch64::LDRSHXui;
3884 case AArch64::LDURSHWi:
3885 Scale = 2;
3886 return AArch64::LDRSHWui;
3887 case AArch64::LDURBi:
3888 Scale = 1;
3889 return AArch64::LDRBui;
3890 case AArch64::LDURBBi:
3891 Scale = 1;
3892 return AArch64::LDRBBui;
3893 case AArch64::LDURSBXi:
3894 Scale = 1;
3895 return AArch64::LDRSBXui;
3896 case AArch64::LDURSBWi:
3897 Scale = 1;
3898 return AArch64::LDRSBWui;
3899 case AArch64::STURBi:
3900 Scale = 1;
3901 return AArch64::STRBui;
3902 case AArch64::STURBBi:
3903 Scale = 1;
3904 return AArch64::STRBBui;
3905 case AArch64::LDRQui:
3906 case AArch64::STRQui:
3907 Scale = 16;
3908 return Opcode;
3909 case AArch64::LDRDui:
3910 case AArch64::STRDui:
3911 case AArch64::LDRXui:
3912 case AArch64::STRXui:
3913 Scale = 8;
3914 return Opcode;
3915 case AArch64::LDRWui:
3916 case AArch64::LDRSWui:
3917 case AArch64::STRWui:
3918 Scale = 4;
3919 return Opcode;
3920 case AArch64::LDRHui:
3921 case AArch64::STRHui:
3922 case AArch64::LDRHHui:
3923 case AArch64::STRHHui:
3924 case AArch64::LDRSHXui:
3925 case AArch64::LDRSHWui:
3926 Scale = 2;
3927 return Opcode;
3928 case AArch64::LDRBui:
3929 case AArch64::LDRBBui:
3930 case AArch64::LDRSBXui:
3931 case AArch64::LDRSBWui:
3932 case AArch64::STRBui:
3933 case AArch64::STRBBui:
3934 Scale = 1;
3935 return Opcode;
3936 }
3937}
3938
3939// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3940// the opcode of an instruction performing the same operation, but using the
3941// [Reg, #Imm] addressing mode with unscaled offset.
3942unsigned unscaledOffsetOpcode(unsigned Opcode) {
3943 switch (Opcode) {
3944 default:
3945 llvm_unreachable("Address folding not implemented for instruction");
3946
3947 case AArch64::LDURQi:
3948 case AArch64::STURQi:
3949 case AArch64::LDURDi:
3950 case AArch64::STURDi:
3951 case AArch64::LDURXi:
3952 case AArch64::STURXi:
3953 case AArch64::LDURWi:
3954 case AArch64::LDURSWi:
3955 case AArch64::STURWi:
3956 case AArch64::LDURHi:
3957 case AArch64::STURHi:
3958 case AArch64::LDURHHi:
3959 case AArch64::STURHHi:
3960 case AArch64::LDURSHXi:
3961 case AArch64::LDURSHWi:
3962 case AArch64::LDURBi:
3963 case AArch64::STURBi:
3964 case AArch64::LDURBBi:
3965 case AArch64::STURBBi:
3966 case AArch64::LDURSBWi:
3967 case AArch64::LDURSBXi:
3968 return Opcode;
3969 case AArch64::LDRQui:
3970 return AArch64::LDURQi;
3971 case AArch64::STRQui:
3972 return AArch64::STURQi;
3973 case AArch64::LDRDui:
3974 return AArch64::LDURDi;
3975 case AArch64::STRDui:
3976 return AArch64::STURDi;
3977 case AArch64::LDRXui:
3978 return AArch64::LDURXi;
3979 case AArch64::STRXui:
3980 return AArch64::STURXi;
3981 case AArch64::LDRWui:
3982 return AArch64::LDURWi;
3983 case AArch64::LDRSWui:
3984 return AArch64::LDURSWi;
3985 case AArch64::STRWui:
3986 return AArch64::STURWi;
3987 case AArch64::LDRHui:
3988 return AArch64::LDURHi;
3989 case AArch64::STRHui:
3990 return AArch64::STURHi;
3991 case AArch64::LDRHHui:
3992 return AArch64::LDURHHi;
3993 case AArch64::STRHHui:
3994 return AArch64::STURHHi;
3995 case AArch64::LDRSHXui:
3996 return AArch64::LDURSHXi;
3997 case AArch64::LDRSHWui:
3998 return AArch64::LDURSHWi;
3999 case AArch64::LDRBBui:
4000 return AArch64::LDURBBi;
4001 case AArch64::LDRBui:
4002 return AArch64::LDURBi;
4003 case AArch64::STRBBui:
4004 return AArch64::STURBBi;
4005 case AArch64::STRBui:
4006 return AArch64::STURBi;
4007 case AArch64::LDRSBWui:
4008 return AArch64::LDURSBWi;
4009 case AArch64::LDRSBXui:
4010 return AArch64::LDURSBXi;
4011 }
4012}
4013
4014// Given the opcode of a memory load/store instruction, return the opcode of an
4015// instruction performing the same operation, but using
4016// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
4017// offset register.
4018static unsigned offsetExtendOpcode(unsigned Opcode) {
4019 switch (Opcode) {
4020 default:
4021 llvm_unreachable("Address folding not implemented for instruction");
4022
4023 case AArch64::LDRQroX:
4024 case AArch64::LDURQi:
4025 case AArch64::LDRQui:
4026 return AArch64::LDRQroW;
4027 case AArch64::STRQroX:
4028 case AArch64::STURQi:
4029 case AArch64::STRQui:
4030 return AArch64::STRQroW;
4031 case AArch64::LDRDroX:
4032 case AArch64::LDURDi:
4033 case AArch64::LDRDui:
4034 return AArch64::LDRDroW;
4035 case AArch64::STRDroX:
4036 case AArch64::STURDi:
4037 case AArch64::STRDui:
4038 return AArch64::STRDroW;
4039 case AArch64::LDRXroX:
4040 case AArch64::LDURXi:
4041 case AArch64::LDRXui:
4042 return AArch64::LDRXroW;
4043 case AArch64::STRXroX:
4044 case AArch64::STURXi:
4045 case AArch64::STRXui:
4046 return AArch64::STRXroW;
4047 case AArch64::LDRWroX:
4048 case AArch64::LDURWi:
4049 case AArch64::LDRWui:
4050 return AArch64::LDRWroW;
4051 case AArch64::LDRSWroX:
4052 case AArch64::LDURSWi:
4053 case AArch64::LDRSWui:
4054 return AArch64::LDRSWroW;
4055 case AArch64::STRWroX:
4056 case AArch64::STURWi:
4057 case AArch64::STRWui:
4058 return AArch64::STRWroW;
4059 case AArch64::LDRHroX:
4060 case AArch64::LDURHi:
4061 case AArch64::LDRHui:
4062 return AArch64::LDRHroW;
4063 case AArch64::STRHroX:
4064 case AArch64::STURHi:
4065 case AArch64::STRHui:
4066 return AArch64::STRHroW;
4067 case AArch64::LDRHHroX:
4068 case AArch64::LDURHHi:
4069 case AArch64::LDRHHui:
4070 return AArch64::LDRHHroW;
4071 case AArch64::STRHHroX:
4072 case AArch64::STURHHi:
4073 case AArch64::STRHHui:
4074 return AArch64::STRHHroW;
4075 case AArch64::LDRSHXroX:
4076 case AArch64::LDURSHXi:
4077 case AArch64::LDRSHXui:
4078 return AArch64::LDRSHXroW;
4079 case AArch64::LDRSHWroX:
4080 case AArch64::LDURSHWi:
4081 case AArch64::LDRSHWui:
4082 return AArch64::LDRSHWroW;
4083 case AArch64::LDRBroX:
4084 case AArch64::LDURBi:
4085 case AArch64::LDRBui:
4086 return AArch64::LDRBroW;
4087 case AArch64::LDRBBroX:
4088 case AArch64::LDURBBi:
4089 case AArch64::LDRBBui:
4090 return AArch64::LDRBBroW;
4091 case AArch64::LDRSBXroX:
4092 case AArch64::LDURSBXi:
4093 case AArch64::LDRSBXui:
4094 return AArch64::LDRSBXroW;
4095 case AArch64::LDRSBWroX:
4096 case AArch64::LDURSBWi:
4097 case AArch64::LDRSBWui:
4098 return AArch64::LDRSBWroW;
4099 case AArch64::STRBroX:
4100 case AArch64::STURBi:
4101 case AArch64::STRBui:
4102 return AArch64::STRBroW;
4103 case AArch64::STRBBroX:
4104 case AArch64::STURBBi:
4105 case AArch64::STRBBui:
4106 return AArch64::STRBBroW;
4107 }
4108}
4109
4111 const ExtAddrMode &AM) const {
4112
4113 const DebugLoc &DL = MemI.getDebugLoc();
4114 MachineBasicBlock &MBB = *MemI.getParent();
4115 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4116
4118 if (AM.ScaledReg) {
4119 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4120 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4121 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4122 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4123 .addReg(MemI.getOperand(0).getReg(),
4124 getDefRegState(MemI.mayLoad()))
4125 .addReg(AM.BaseReg)
4126 .addReg(AM.ScaledReg)
4127 .addImm(0)
4128 .addImm(AM.Scale > 1)
4129 .setMemRefs(MemI.memoperands())
4130 .setMIFlags(MemI.getFlags());
4131 return B.getInstr();
4132 }
4133
4134 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4135 "Addressing mode not supported for folding");
4136
4137 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4138 unsigned Scale = 1;
4139 unsigned Opcode = MemI.getOpcode();
4140 if (isInt<9>(AM.Displacement))
4141 Opcode = unscaledOffsetOpcode(Opcode);
4142 else
4143 Opcode = scaledOffsetOpcode(Opcode, Scale);
4144
4145 auto B =
4146 BuildMI(MBB, MemI, DL, get(Opcode))
4147 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4148 .addReg(AM.BaseReg)
4149 .addImm(AM.Displacement / Scale)
4150 .setMemRefs(MemI.memoperands())
4151 .setMIFlags(MemI.getFlags());
4152 return B.getInstr();
4153 }
4154
4157 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4158 assert(AM.ScaledReg && !AM.Displacement &&
4159 "Address offset can be a register or an immediate, but not both");
4160 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4161 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4162 // Make sure the offset register is in the correct register class.
4163 Register OffsetReg = AM.ScaledReg;
4164 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4165 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4166 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4167 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4168 .addReg(AM.ScaledReg, {}, AArch64::sub_32);
4169 }
4170 auto B =
4171 BuildMI(MBB, MemI, DL, get(Opcode))
4172 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4173 .addReg(AM.BaseReg)
4174 .addReg(OffsetReg)
4176 .addImm(AM.Scale != 1)
4177 .setMemRefs(MemI.memoperands())
4178 .setMIFlags(MemI.getFlags());
4179
4180 return B.getInstr();
4181 }
4182
4184 "Function must not be called with an addressing mode it can't handle");
4185}
4186
4187 /// Return true if the opcode is a post-index ld/st instruction; such an
4188 /// instruction accesses memory at base+0 and only writes the incremented
/// address back to the base register after the access, so for address-analysis
/// purposes its effective immediate offset is zero.
4189 static bool isPostIndexLdStOpcode(unsigned Opcode) {
4190 switch (Opcode) {
4191 default:
4192 return false;
  // Post-indexed SIMD structure loads (LD1-LD4, incl. replicate/lane forms).
4193 case AArch64::LD1Fourv16b_POST:
4194 case AArch64::LD1Fourv1d_POST:
4195 case AArch64::LD1Fourv2d_POST:
4196 case AArch64::LD1Fourv2s_POST:
4197 case AArch64::LD1Fourv4h_POST:
4198 case AArch64::LD1Fourv4s_POST:
4199 case AArch64::LD1Fourv8b_POST:
4200 case AArch64::LD1Fourv8h_POST:
4201 case AArch64::LD1Onev16b_POST:
4202 case AArch64::LD1Onev1d_POST:
4203 case AArch64::LD1Onev2d_POST:
4204 case AArch64::LD1Onev2s_POST:
4205 case AArch64::LD1Onev4h_POST:
4206 case AArch64::LD1Onev4s_POST:
4207 case AArch64::LD1Onev8b_POST:
4208 case AArch64::LD1Onev8h_POST:
4209 case AArch64::LD1Rv16b_POST:
4210 case AArch64::LD1Rv1d_POST:
4211 case AArch64::LD1Rv2d_POST:
4212 case AArch64::LD1Rv2s_POST:
4213 case AArch64::LD1Rv4h_POST:
4214 case AArch64::LD1Rv4s_POST:
4215 case AArch64::LD1Rv8b_POST:
4216 case AArch64::LD1Rv8h_POST:
4217 case AArch64::LD1Threev16b_POST:
4218 case AArch64::LD1Threev1d_POST:
4219 case AArch64::LD1Threev2d_POST:
4220 case AArch64::LD1Threev2s_POST:
4221 case AArch64::LD1Threev4h_POST:
4222 case AArch64::LD1Threev4s_POST:
4223 case AArch64::LD1Threev8b_POST:
4224 case AArch64::LD1Threev8h_POST:
4225 case AArch64::LD1Twov16b_POST:
4226 case AArch64::LD1Twov1d_POST:
4227 case AArch64::LD1Twov2d_POST:
4228 case AArch64::LD1Twov2s_POST:
4229 case AArch64::LD1Twov4h_POST:
4230 case AArch64::LD1Twov4s_POST:
4231 case AArch64::LD1Twov8b_POST:
4232 case AArch64::LD1Twov8h_POST:
4233 case AArch64::LD1i16_POST:
4234 case AArch64::LD1i32_POST:
4235 case AArch64::LD1i64_POST:
4236 case AArch64::LD1i8_POST:
4237 case AArch64::LD2Rv16b_POST:
4238 case AArch64::LD2Rv1d_POST:
4239 case AArch64::LD2Rv2d_POST:
4240 case AArch64::LD2Rv2s_POST:
4241 case AArch64::LD2Rv4h_POST:
4242 case AArch64::LD2Rv4s_POST:
4243 case AArch64::LD2Rv8b_POST:
4244 case AArch64::LD2Rv8h_POST:
4245 case AArch64::LD2Twov16b_POST:
4246 case AArch64::LD2Twov2d_POST:
4247 case AArch64::LD2Twov2s_POST:
4248 case AArch64::LD2Twov4h_POST:
4249 case AArch64::LD2Twov4s_POST:
4250 case AArch64::LD2Twov8b_POST:
4251 case AArch64::LD2Twov8h_POST:
4252 case AArch64::LD2i16_POST:
4253 case AArch64::LD2i32_POST:
4254 case AArch64::LD2i64_POST:
4255 case AArch64::LD2i8_POST:
4256 case AArch64::LD3Rv16b_POST:
4257 case AArch64::LD3Rv1d_POST:
4258 case AArch64::LD3Rv2d_POST:
4259 case AArch64::LD3Rv2s_POST:
4260 case AArch64::LD3Rv4h_POST:
4261 case AArch64::LD3Rv4s_POST:
4262 case AArch64::LD3Rv8b_POST:
4263 case AArch64::LD3Rv8h_POST:
4264 case AArch64::LD3Threev16b_POST:
4265 case AArch64::LD3Threev2d_POST:
4266 case AArch64::LD3Threev2s_POST:
4267 case AArch64::LD3Threev4h_POST:
4268 case AArch64::LD3Threev4s_POST:
4269 case AArch64::LD3Threev8b_POST:
4270 case AArch64::LD3Threev8h_POST:
4271 case AArch64::LD3i16_POST:
4272 case AArch64::LD3i32_POST:
4273 case AArch64::LD3i64_POST:
4274 case AArch64::LD3i8_POST:
4275 case AArch64::LD4Fourv16b_POST:
4276 case AArch64::LD4Fourv2d_POST:
4277 case AArch64::LD4Fourv2s_POST:
4278 case AArch64::LD4Fourv4h_POST:
4279 case AArch64::LD4Fourv4s_POST:
4280 case AArch64::LD4Fourv8b_POST:
4281 case AArch64::LD4Fourv8h_POST:
4282 case AArch64::LD4Rv16b_POST:
4283 case AArch64::LD4Rv1d_POST:
4284 case AArch64::LD4Rv2d_POST:
4285 case AArch64::LD4Rv2s_POST:
4286 case AArch64::LD4Rv4h_POST:
4287 case AArch64::LD4Rv4s_POST:
4288 case AArch64::LD4Rv8b_POST:
4289 case AArch64::LD4Rv8h_POST:
4290 case AArch64::LD4i16_POST:
4291 case AArch64::LD4i32_POST:
4292 case AArch64::LD4i64_POST:
4293 case AArch64::LD4i8_POST:
  // Post-indexed scalar and pair loads (incl. RCpc LDAPR/LDIAPP forms).
4294 case AArch64::LDAPRWpost:
4295 case AArch64::LDAPRXpost:
4296 case AArch64::LDIAPPWpost:
4297 case AArch64::LDIAPPXpost:
4298 case AArch64::LDPDpost:
4299 case AArch64::LDPQpost:
4300 case AArch64::LDPSWpost:
4301 case AArch64::LDPSpost:
4302 case AArch64::LDPWpost:
4303 case AArch64::LDPXpost:
4304 case AArch64::LDRBBpost:
4305 case AArch64::LDRBpost:
4306 case AArch64::LDRDpost:
4307 case AArch64::LDRHHpost:
4308 case AArch64::LDRHpost:
4309 case AArch64::LDRQpost:
4310 case AArch64::LDRSBWpost:
4311 case AArch64::LDRSBXpost:
4312 case AArch64::LDRSHWpost:
4313 case AArch64::LDRSHXpost:
4314 case AArch64::LDRSWpost:
4315 case AArch64::LDRSpost:
4316 case AArch64::LDRWpost:
4317 case AArch64::LDRXpost:
  // Post-indexed stores: SIMD structure stores (ST1-ST4), MTE tag stores and
  // scalar/pair stores, in alphabetical opcode order.
4318 case AArch64::ST1Fourv16b_POST:
4319 case AArch64::ST1Fourv1d_POST:
4320 case AArch64::ST1Fourv2d_POST:
4321 case AArch64::ST1Fourv2s_POST:
4322 case AArch64::ST1Fourv4h_POST:
4323 case AArch64::ST1Fourv4s_POST:
4324 case AArch64::ST1Fourv8b_POST:
4325 case AArch64::ST1Fourv8h_POST:
4326 case AArch64::ST1Onev16b_POST:
4327 case AArch64::ST1Onev1d_POST:
4328 case AArch64::ST1Onev2d_POST:
4329 case AArch64::ST1Onev2s_POST:
4330 case AArch64::ST1Onev4h_POST:
4331 case AArch64::ST1Onev4s_POST:
4332 case AArch64::ST1Onev8b_POST:
4333 case AArch64::ST1Onev8h_POST:
4334 case AArch64::ST1Threev16b_POST:
4335 case AArch64::ST1Threev1d_POST:
4336 case AArch64::ST1Threev2d_POST:
4337 case AArch64::ST1Threev2s_POST:
4338 case AArch64::ST1Threev4h_POST:
4339 case AArch64::ST1Threev4s_POST:
4340 case AArch64::ST1Threev8b_POST:
4341 case AArch64::ST1Threev8h_POST:
4342 case AArch64::ST1Twov16b_POST:
4343 case AArch64::ST1Twov1d_POST:
4344 case AArch64::ST1Twov2d_POST:
4345 case AArch64::ST1Twov2s_POST:
4346 case AArch64::ST1Twov4h_POST:
4347 case AArch64::ST1Twov4s_POST:
4348 case AArch64::ST1Twov8b_POST:
4349 case AArch64::ST1Twov8h_POST:
4350 case AArch64::ST1i16_POST:
4351 case AArch64::ST1i32_POST:
4352 case AArch64::ST1i64_POST:
4353 case AArch64::ST1i8_POST:
4354 case AArch64::ST2GPostIndex:
4355 case AArch64::ST2Twov16b_POST:
4356 case AArch64::ST2Twov2d_POST:
4357 case AArch64::ST2Twov2s_POST:
4358 case AArch64::ST2Twov4h_POST:
4359 case AArch64::ST2Twov4s_POST:
4360 case AArch64::ST2Twov8b_POST:
4361 case AArch64::ST2Twov8h_POST:
4362 case AArch64::ST2i16_POST:
4363 case AArch64::ST2i32_POST:
4364 case AArch64::ST2i64_POST:
4365 case AArch64::ST2i8_POST:
4366 case AArch64::ST3Threev16b_POST:
4367 case AArch64::ST3Threev2d_POST:
4368 case AArch64::ST3Threev2s_POST:
4369 case AArch64::ST3Threev4h_POST:
4370 case AArch64::ST3Threev4s_POST:
4371 case AArch64::ST3Threev8b_POST:
4372 case AArch64::ST3Threev8h_POST:
4373 case AArch64::ST3i16_POST:
4374 case AArch64::ST3i32_POST:
4375 case AArch64::ST3i64_POST:
4376 case AArch64::ST3i8_POST:
4377 case AArch64::ST4Fourv16b_POST:
4378 case AArch64::ST4Fourv2d_POST:
4379 case AArch64::ST4Fourv2s_POST:
4380 case AArch64::ST4Fourv4h_POST:
4381 case AArch64::ST4Fourv4s_POST:
4382 case AArch64::ST4Fourv8b_POST:
4383 case AArch64::ST4Fourv8h_POST:
4384 case AArch64::ST4i16_POST:
4385 case AArch64::ST4i32_POST:
4386 case AArch64::ST4i64_POST:
4387 case AArch64::ST4i8_POST:
4388 case AArch64::STGPostIndex:
4389 case AArch64::STGPpost:
4390 case AArch64::STPDpost:
4391 case AArch64::STPQpost:
4392 case AArch64::STPSpost:
4393 case AArch64::STPWpost:
4394 case AArch64::STPXpost:
4395 case AArch64::STRBBpost:
4396 case AArch64::STRBpost:
4397 case AArch64::STRDpost:
4398 case AArch64::STRHHpost:
4399 case AArch64::STRHpost:
4400 case AArch64::STRQpost:
4401 case AArch64::STRSpost:
4402 case AArch64::STRWpost:
4403 case AArch64::STRXpost:
4404 case AArch64::STZ2GPostIndex:
4405 case AArch64::STZGPostIndex:
4406 return true;
4407 }
4408}
4409
4411 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4412 bool &OffsetIsScalable, TypeSize &Width,
4413 const TargetRegisterInfo *TRI) const {
4414 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4415 // Handle only loads/stores with base register followed by immediate offset.
4416 if (LdSt.getNumExplicitOperands() == 3) {
4417 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4418 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4419 !LdSt.getOperand(2).isImm())
4420 return false;
4421 } else if (LdSt.getNumExplicitOperands() == 4) {
4422 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4423 if (!LdSt.getOperand(1).isReg() ||
4424 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4425 !LdSt.getOperand(3).isImm())
4426 return false;
4427 } else
4428 return false;
4429
4430 // Get the scaling factor for the instruction and set the width for the
4431 // instruction.
4432 TypeSize Scale(0U, false);
4433 int64_t Dummy1, Dummy2;
4434
4435 // If this returns false, then it's an instruction we don't want to handle.
4436 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4437 return false;
4438
4439 // Compute the offset. Offset is calculated as the immediate operand
4440 // multiplied by the scaling factor. Unscaled instructions have scaling factor
4441 // set to 1. Postindex are a special case which have an offset of 0.
4442 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4443 BaseOp = &LdSt.getOperand(2);
4444 Offset = 0;
4445 } else if (LdSt.getNumExplicitOperands() == 3) {
4446 BaseOp = &LdSt.getOperand(1);
4447 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4448 } else {
4449 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4450 BaseOp = &LdSt.getOperand(2);
4451 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4452 }
4453 OffsetIsScalable = Scale.isScalable();
4454
4455 return BaseOp->isReg() || BaseOp->isFI();
4456}
4457
4460 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4461 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4462 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4463 return OfsOp;
4464}
4465
/// Describe the addressing properties of a load/store opcode.
///
/// On success, \p Scale is the multiplier applied to the instruction's
/// immediate operand to obtain a byte offset (1 for unscaled and
/// pre/post-indexed forms; scalable for SVE opcodes), \p Width is the number
/// of bytes accessed, and \p MinOffset / \p MaxOffset bound the raw immediate
/// operand itself (i.e. before scaling by \p Scale).
///
/// \returns false — with \p Scale and \p Width zeroed and
/// \p MinOffset == \p MaxOffset == 0 — when \p Opcode is not a memory
/// operation this helper understands.
4466 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4467 TypeSize &Width, int64_t &MinOffset,
4468 int64_t &MaxOffset) {
4469 switch (Opcode) {
4470 // Not a memory operation or something we want to handle.
4471 default:
4472 Scale = TypeSize::getFixed(0);
4473 Width = TypeSize::getFixed(0);
4474 MinOffset = MaxOffset = 0;
4475 return false;
4476 // LDR / STR: scaled unsigned-immediate forms (12-bit imm, scaled by size).
4477 case AArch64::LDRQui:
4478 case AArch64::STRQui:
4479 Scale = TypeSize::getFixed(16);
4480 Width = TypeSize::getFixed(16);
4481 MinOffset = 0;
4482 MaxOffset = 4095;
4483 break;
4484 case AArch64::LDRXui:
4485 case AArch64::LDRDui:
4486 case AArch64::STRXui:
4487 case AArch64::STRDui:
4488 case AArch64::PRFMui:
4489 Scale = TypeSize::getFixed(8);
4490 Width = TypeSize::getFixed(8);
4491 MinOffset = 0;
4492 MaxOffset = 4095;
4493 break;
4494 case AArch64::LDRWui:
4495 case AArch64::LDRSui:
4496 case AArch64::LDRSWui:
4497 case AArch64::STRWui:
4498 case AArch64::STRSui:
4499 Scale = TypeSize::getFixed(4);
4500 Width = TypeSize::getFixed(4);
4501 MinOffset = 0;
4502 MaxOffset = 4095;
4503 break;
4504 case AArch64::LDRHui:
4505 case AArch64::LDRHHui:
4506 case AArch64::LDRSHWui:
4507 case AArch64::LDRSHXui:
4508 case AArch64::STRHui:
4509 case AArch64::STRHHui:
4510 Scale = TypeSize::getFixed(2);
4511 Width = TypeSize::getFixed(2);
4512 MinOffset = 0;
4513 MaxOffset = 4095;
4514 break;
4515 case AArch64::LDRBui:
4516 case AArch64::LDRBBui:
4517 case AArch64::LDRSBWui:
4518 case AArch64::LDRSBXui:
4519 case AArch64::STRBui:
4520 case AArch64::STRBBui:
4521 Scale = TypeSize::getFixed(1);
4522 Width = TypeSize::getFixed(1);
4523 MinOffset = 0;
4524 MaxOffset = 4095;
4525 break;
4526 // Pre/post-indexed forms: unscaled signed 9-bit immediate.
4527 case AArch64::STRQpre:
4528 case AArch64::LDRQpost:
4529 Scale = TypeSize::getFixed(1);
4530 Width = TypeSize::getFixed(16);
4531 MinOffset = -256;
4532 MaxOffset = 255;
4533 break;
4534 case AArch64::LDRDpost:
4535 case AArch64::LDRDpre:
4536 case AArch64::LDRXpost:
4537 case AArch64::LDRXpre:
4538 case AArch64::STRDpost:
4539 case AArch64::STRDpre:
4540 case AArch64::STRXpost:
4541 case AArch64::STRXpre:
4542 Scale = TypeSize::getFixed(1);
4543 Width = TypeSize::getFixed(8);
4544 MinOffset = -256;
4545 MaxOffset = 255;
4546 break;
4547 case AArch64::STRWpost:
4548 case AArch64::STRWpre:
4549 case AArch64::LDRWpost:
4550 case AArch64::LDRWpre:
4551 case AArch64::STRSpost:
4552 case AArch64::STRSpre:
4553 case AArch64::LDRSpost:
4554 case AArch64::LDRSpre:
4555 Scale = TypeSize::getFixed(1);
4556 Width = TypeSize::getFixed(4);
4557 MinOffset = -256;
4558 MaxOffset = 255;
4559 break;
4560 case AArch64::LDRHpost:
4561 case AArch64::LDRHpre:
4562 case AArch64::STRHpost:
4563 case AArch64::STRHpre:
4564 case AArch64::LDRHHpost:
4565 case AArch64::LDRHHpre:
4566 case AArch64::STRHHpost:
4567 case AArch64::STRHHpre:
4568 Scale = TypeSize::getFixed(1);
4569 Width = TypeSize::getFixed(2);
4570 MinOffset = -256;
4571 MaxOffset = 255;
4572 break;
4573 case AArch64::LDRBpost:
4574 case AArch64::LDRBpre:
4575 case AArch64::STRBpost:
4576 case AArch64::STRBpre:
4577 case AArch64::LDRBBpost:
4578 case AArch64::LDRBBpre:
4579 case AArch64::STRBBpost:
4580 case AArch64::STRBBpre:
4581 Scale = TypeSize::getFixed(1);
4582 Width = TypeSize::getFixed(1);
4583 MinOffset = -256;
4584 MaxOffset = 255;
4585 break;
4586 // Unscaled forms (LDUR/STUR/LDAPUR/STLUR): signed 9-bit byte offset.
4587 case AArch64::LDURQi:
4588 case AArch64::STURQi:
4589 Scale = TypeSize::getFixed(1);
4590 Width = TypeSize::getFixed(16);
4591 MinOffset = -256;
4592 MaxOffset = 255;
4593 break;
4594 case AArch64::LDURXi:
4595 case AArch64::LDURDi:
4596 case AArch64::LDAPURXi:
4597 case AArch64::STURXi:
4598 case AArch64::STURDi:
4599 case AArch64::STLURXi:
4600 case AArch64::PRFUMi:
4601 Scale = TypeSize::getFixed(1);
4602 Width = TypeSize::getFixed(8);
4603 MinOffset = -256;
4604 MaxOffset = 255;
4605 break;
4606 case AArch64::LDURWi:
4607 case AArch64::LDURSi:
4608 case AArch64::LDURSWi:
4609 case AArch64::LDAPURi:
4610 case AArch64::LDAPURSWi:
4611 case AArch64::STURWi:
4612 case AArch64::STURSi:
4613 case AArch64::STLURWi:
4614 Scale = TypeSize::getFixed(1);
4615 Width = TypeSize::getFixed(4);
4616 MinOffset = -256;
4617 MaxOffset = 255;
4618 break;
4619 case AArch64::LDURHi:
4620 case AArch64::LDURHHi:
4621 case AArch64::LDURSHXi:
4622 case AArch64::LDURSHWi:
4623 case AArch64::LDAPURHi:
4624 case AArch64::LDAPURSHWi:
4625 case AArch64::LDAPURSHXi:
4626 case AArch64::STURHi:
4627 case AArch64::STURHHi:
4628 case AArch64::STLURHi:
4629 Scale = TypeSize::getFixed(1);
4630 Width = TypeSize::getFixed(2);
4631 MinOffset = -256;
4632 MaxOffset = 255;
4633 break;
4634 case AArch64::LDURBi:
4635 case AArch64::LDURBBi:
4636 case AArch64::LDURSBXi:
4637 case AArch64::LDURSBWi:
4638 case AArch64::LDAPURBi:
4639 case AArch64::LDAPURSBWi:
4640 case AArch64::LDAPURSBXi:
4641 case AArch64::STURBi:
4642 case AArch64::STURBBi:
4643 case AArch64::STLURBi:
4644 Scale = TypeSize::getFixed(1);
4645 Width = TypeSize::getFixed(1);
4646 MinOffset = -256;
4647 MaxOffset = 255;
4648 break;
4649 // LDP / STP (including pre/post inc): signed 7-bit imm scaled by element
// size; Width covers both registers of the pair.
4650 case AArch64::LDPQi:
4651 case AArch64::LDNPQi:
4652 case AArch64::STPQi:
4653 case AArch64::STNPQi:
4654 case AArch64::LDPQpost:
4655 case AArch64::LDPQpre:
4656 case AArch64::STPQpost:
4657 case AArch64::STPQpre:
4658 Scale = TypeSize::getFixed(16);
4659 Width = TypeSize::getFixed(16 * 2);
4660 MinOffset = -64;
4661 MaxOffset = 63;
4662 break;
4663 case AArch64::LDPXi:
4664 case AArch64::LDPDi:
4665 case AArch64::LDNPXi:
4666 case AArch64::LDNPDi:
4667 case AArch64::STPXi:
4668 case AArch64::STPDi:
4669 case AArch64::STNPXi:
4670 case AArch64::STNPDi:
4671 case AArch64::LDPDpost:
4672 case AArch64::LDPDpre:
4673 case AArch64::LDPXpost:
4674 case AArch64::LDPXpre:
4675 case AArch64::STPDpost:
4676 case AArch64::STPDpre:
4677 case AArch64::STPXpost:
4678 case AArch64::STPXpre:
4679 Scale = TypeSize::getFixed(8);
4680 Width = TypeSize::getFixed(8 * 2);
4681 MinOffset = -64;
4682 MaxOffset = 63;
4683 break;
4684 case AArch64::LDPWi:
4685 case AArch64::LDPSi:
4686 case AArch64::LDNPWi:
4687 case AArch64::LDNPSi:
4688 case AArch64::STPWi:
4689 case AArch64::STPSi:
4690 case AArch64::STNPWi:
4691 case AArch64::STNPSi:
4692 case AArch64::LDPSpost:
4693 case AArch64::LDPSpre:
4694 case AArch64::LDPWpost:
4695 case AArch64::LDPWpre:
4696 case AArch64::STPSpost:
4697 case AArch64::STPSpre:
4698 case AArch64::STPWpost:
4699 case AArch64::STPWpre:
4700 Scale = TypeSize::getFixed(4);
4701 Width = TypeSize::getFixed(4 * 2);
4702 MinOffset = -64;
4703 MaxOffset = 63;
4704 break;
4705 case AArch64::StoreSwiftAsyncContext:
4706 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4707 Scale = TypeSize::getFixed(1);
4708 Width = TypeSize::getFixed(8);
4709 MinOffset = 0;
4710 MaxOffset = 4095;
4711 break;
// MTE address-arithmetic ops: they compute a tagged address but access no
// memory, hence Width == 0.
4712 case AArch64::ADDG:
4713 Scale = TypeSize::getFixed(16);
4714 Width = TypeSize::getFixed(0);
4715 MinOffset = 0;
4716 MaxOffset = 63;
4717 break;
4718 case AArch64::TAGPstack:
4719 Scale = TypeSize::getFixed(16);
4720 Width = TypeSize::getFixed(0);
4721 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4722 // of 63 (not 64!).
4723 MinOffset = -63;
4724 MaxOffset = 63;
4725 break;
4726 case AArch64::LDG:
4727 case AArch64::STGi:
4728 case AArch64::STGPreIndex:
4729 case AArch64::STGPostIndex:
4730 case AArch64::STZGi:
4731 case AArch64::STZGPreIndex:
4732 case AArch64::STZGPostIndex:
4733 Scale = TypeSize::getFixed(16);
4734 Width = TypeSize::getFixed(16);
4735 MinOffset = -256;
4736 MaxOffset = 255;
4737 break;
4738 // SVE fills/spills and structured memory ops: scalable sizes (multiples of
// the vector/predicate length).
4739 case AArch64::STR_ZZZZXI:
4740 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4741 case AArch64::LDR_ZZZZXI:
4742 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4743 Scale = TypeSize::getScalable(16);
4744 Width = TypeSize::getScalable(16 * 4);
4745 MinOffset = -256;
4746 MaxOffset = 252;
4747 break;
4748 case AArch64::STR_ZZZXI:
4749 case AArch64::LDR_ZZZXI:
4750 Scale = TypeSize::getScalable(16);
4751 Width = TypeSize::getScalable(16 * 3);
4752 MinOffset = -256;
4753 MaxOffset = 253;
4754 break;
4755 case AArch64::STR_ZZXI:
4756 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4757 case AArch64::LDR_ZZXI:
4758 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4759 Scale = TypeSize::getScalable(16);
4760 Width = TypeSize::getScalable(16 * 2);
4761 MinOffset = -256;
4762 MaxOffset = 254;
4763 break;
4764 case AArch64::LDR_PXI:
4765 case AArch64::STR_PXI:
4766 Scale = TypeSize::getScalable(2);
4767 Width = TypeSize::getScalable(2);
4768 MinOffset = -256;
4769 MaxOffset = 255;
4770 break;
4771 case AArch64::LDR_PPXI:
4772 case AArch64::STR_PPXI:
4773 Scale = TypeSize::getScalable(2);
4774 Width = TypeSize::getScalable(2 * 2);
4775 MinOffset = -256;
4776 MaxOffset = 254;
4777 break;
4778 case AArch64::LDR_ZXI:
4779 case AArch64::STR_ZXI:
4780 Scale = TypeSize::getScalable(16);
4781 Width = TypeSize::getScalable(16);
4782 MinOffset = -256;
4783 MaxOffset = 255;
4784 break;
4785 case AArch64::LD1B_IMM:
4786 case AArch64::LD1H_IMM:
4787 case AArch64::LD1W_IMM:
4788 case AArch64::LD1D_IMM:
4789 case AArch64::LDNT1B_ZRI:
4790 case AArch64::LDNT1H_ZRI:
4791 case AArch64::LDNT1W_ZRI:
4792 case AArch64::LDNT1D_ZRI:
4793 case AArch64::ST1B_IMM:
4794 case AArch64::ST1H_IMM:
4795 case AArch64::ST1W_IMM:
4796 case AArch64::ST1D_IMM:
4797 case AArch64::STNT1B_ZRI:
4798 case AArch64::STNT1H_ZRI:
4799 case AArch64::STNT1W_ZRI:
4800 case AArch64::STNT1D_ZRI:
4801 case AArch64::LDNF1B_IMM:
4802 case AArch64::LDNF1H_IMM:
4803 case AArch64::LDNF1W_IMM:
4804 case AArch64::LDNF1D_IMM:
4805 // A full vector's worth of data
4806 // Width = mbytes * elements
4807 Scale = TypeSize::getScalable(16);
4808 Width = TypeSize::getScalable(16);
4809 MinOffset = -8;
4810 MaxOffset = 7;
4811 break;
4812 case AArch64::LD2B_IMM:
4813 case AArch64::LD2H_IMM:
4814 case AArch64::LD2W_IMM:
4815 case AArch64::LD2D_IMM:
4816 case AArch64::ST2B_IMM:
4817 case AArch64::ST2H_IMM:
4818 case AArch64::ST2W_IMM:
4819 case AArch64::ST2D_IMM:
4820 Scale = TypeSize::getScalable(32);
4821 Width = TypeSize::getScalable(16 * 2);
4822 MinOffset = -8;
4823 MaxOffset = 7;
4824 break;
4825 case AArch64::LD3B_IMM:
4826 case AArch64::LD3H_IMM:
4827 case AArch64::LD3W_IMM:
4828 case AArch64::LD3D_IMM:
4829 case AArch64::ST3B_IMM:
4830 case AArch64::ST3H_IMM:
4831 case AArch64::ST3W_IMM:
4832 case AArch64::ST3D_IMM:
4833 Scale = TypeSize::getScalable(48);
4834 Width = TypeSize::getScalable(16 * 3);
4835 MinOffset = -8;
4836 MaxOffset = 7;
4837 break;
4838 case AArch64::LD4B_IMM:
4839 case AArch64::LD4H_IMM:
4840 case AArch64::LD4W_IMM:
4841 case AArch64::LD4D_IMM:
4842 case AArch64::ST4B_IMM:
4843 case AArch64::ST4H_IMM:
4844 case AArch64::ST4W_IMM:
4845 case AArch64::ST4D_IMM:
4846 Scale = TypeSize::getScalable(64);
4847 Width = TypeSize::getScalable(16 * 4);
4848 MinOffset = -8;
4849 MaxOffset = 7;
4850 break;
4851 case AArch64::LD1B_H_IMM:
4852 case AArch64::LD1SB_H_IMM:
4853 case AArch64::LD1H_S_IMM:
4854 case AArch64::LD1SH_S_IMM:
4855 case AArch64::LD1W_D_IMM:
4856 case AArch64::LD1SW_D_IMM:
4857 case AArch64::ST1B_H_IMM:
4858 case AArch64::ST1H_S_IMM:
4859 case AArch64::ST1W_D_IMM:
4860 case AArch64::LDNF1B_H_IMM:
4861 case AArch64::LDNF1SB_H_IMM:
4862 case AArch64::LDNF1H_S_IMM:
4863 case AArch64::LDNF1SH_S_IMM:
4864 case AArch64::LDNF1W_D_IMM:
4865 case AArch64::LDNF1SW_D_IMM:
4866 // A half vector's worth of data
4867 // Width = mbytes * elements
4868 Scale = TypeSize::getScalable(8);
4869 Width = TypeSize::getScalable(8);
4870 MinOffset = -8;
4871 MaxOffset = 7;
4872 break;
4873 case AArch64::LD1B_S_IMM:
4874 case AArch64::LD1SB_S_IMM:
4875 case AArch64::LD1H_D_IMM:
4876 case AArch64::LD1SH_D_IMM:
4877 case AArch64::ST1B_S_IMM:
4878 case AArch64::ST1H_D_IMM:
4879 case AArch64::LDNF1B_S_IMM:
4880 case AArch64::LDNF1SB_S_IMM:
4881 case AArch64::LDNF1H_D_IMM:
4882 case AArch64::LDNF1SH_D_IMM:
4883 // A quarter vector's worth of data
4884 // Width = mbytes * elements
4885 Scale = TypeSize::getScalable(4);
4886 Width = TypeSize::getScalable(4);
4887 MinOffset = -8;
4888 MaxOffset = 7;
4889 break;
4890 case AArch64::LD1B_D_IMM:
4891 case AArch64::LD1SB_D_IMM:
4892 case AArch64::ST1B_D_IMM:
4893 case AArch64::LDNF1B_D_IMM:
4894 case AArch64::LDNF1SB_D_IMM:
4895 // An eighth of a vector's worth of data
4896 // Width = mbytes * elements
4897 Scale = TypeSize::getScalable(2);
4898 Width = TypeSize::getScalable(2);
4899 MinOffset = -8;
4900 MaxOffset = 7;
4901 break;
4902 case AArch64::ST2Gi:
4903 case AArch64::ST2GPreIndex:
4904 case AArch64::ST2GPostIndex:
4905 case AArch64::STZ2Gi:
4906 case AArch64::STZ2GPreIndex:
4907 case AArch64::STZ2GPostIndex:
4908 Scale = TypeSize::getFixed(16);
4909 Width = TypeSize::getFixed(32);
4910 MinOffset = -256;
4911 MaxOffset = 255;
4912 break;
4913 case AArch64::STGPi:
4914 case AArch64::STGPpost:
4915 case AArch64::STGPpre:
4916 Scale = TypeSize::getFixed(16);
4917 Width = TypeSize::getFixed(16);
4918 MinOffset = -64;
4919 MaxOffset = 63;
4920 break;
// SVE load-and-replicate: a single element read, with an element-sized,
// unsigned-immediate offset.
4921 case AArch64::LD1RB_IMM:
4922 case AArch64::LD1RB_H_IMM:
4923 case AArch64::LD1RB_S_IMM:
4924 case AArch64::LD1RB_D_IMM:
4925 case AArch64::LD1RSB_H_IMM:
4926 case AArch64::LD1RSB_S_IMM:
4927 case AArch64::LD1RSB_D_IMM:
4928 Scale = TypeSize::getFixed(1);
4929 Width = TypeSize::getFixed(1);
4930 MinOffset = 0;
4931 MaxOffset = 63;
4932 break;
4933 case AArch64::LD1RH_IMM:
4934 case AArch64::LD1RH_S_IMM:
4935 case AArch64::LD1RH_D_IMM:
4936 case AArch64::LD1RSH_S_IMM:
4937 case AArch64::LD1RSH_D_IMM:
4938 Scale = TypeSize::getFixed(2);
4939 Width = TypeSize::getFixed(2);
4940 MinOffset = 0;
4941 MaxOffset = 63;
4942 break;
4943 case AArch64::LD1RW_IMM:
4944 case AArch64::LD1RW_D_IMM:
4945 case AArch64::LD1RSW_IMM:
4946 Scale = TypeSize::getFixed(4);
4947 Width = TypeSize::getFixed(4);
4948 MinOffset = 0;
4949 MaxOffset = 63;
4950 break;
4951 case AArch64::LD1RD_IMM:
4952 Scale = TypeSize::getFixed(8);
4953 Width = TypeSize::getFixed(8);
4954 MinOffset = 0;
4955 MaxOffset = 63;
4956 break;
4957 }
4958
4959 return true;
4960}
4961
4962// Scaling factor for unscaled load or store.
4964 switch (Opc) {
4965 default:
4966 llvm_unreachable("Opcode has unknown scale!");
4967 case AArch64::LDRBui:
4968 case AArch64::LDRBBui:
4969 case AArch64::LDURBBi:
4970 case AArch64::LDRSBWui:
4971 case AArch64::LDURSBWi:
4972 case AArch64::STRBui:
4973 case AArch64::STRBBui:
4974 case AArch64::STURBBi:
4975 return 1;
4976 case AArch64::LDRHui:
4977 case AArch64::LDRHHui:
4978 case AArch64::LDURHHi:
4979 case AArch64::LDRSHWui:
4980 case AArch64::LDURSHWi:
4981 case AArch64::STRHui:
4982 case AArch64::STRHHui:
4983 case AArch64::STURHHi:
4984 return 2;
4985 case AArch64::LDRSui:
4986 case AArch64::LDURSi:
4987 case AArch64::LDRSpre:
4988 case AArch64::LDRSWui:
4989 case AArch64::LDURSWi:
4990 case AArch64::LDRSWpre:
4991 case AArch64::LDRWpre:
4992 case AArch64::LDRWui:
4993 case AArch64::LDURWi:
4994 case AArch64::STRSui:
4995 case AArch64::STURSi:
4996 case AArch64::STRSpre:
4997 case AArch64::STRWui:
4998 case AArch64::STURWi:
4999 case AArch64::STRWpre:
5000 case AArch64::LDPSi:
5001 case AArch64::LDPSWi:
5002 case AArch64::LDPWi:
5003 case AArch64::STPSi:
5004 case AArch64::STPWi:
5005 return 4;
5006 case AArch64::LDRDui:
5007 case AArch64::LDURDi:
5008 case AArch64::LDRDpre:
5009 case AArch64::LDRXui:
5010 case AArch64::LDURXi:
5011 case AArch64::LDRXpre:
5012 case AArch64::STRDui:
5013 case AArch64::STURDi:
5014 case AArch64::STRDpre:
5015 case AArch64::STRXui:
5016 case AArch64::STURXi:
5017 case AArch64::STRXpre:
5018 case AArch64::LDPDi:
5019 case AArch64::LDPXi:
5020 case AArch64::STPDi:
5021 case AArch64::STPXi:
5022 return 8;
5023 case AArch64::LDRQui:
5024 case AArch64::LDURQi:
5025 case AArch64::STRQui:
5026 case AArch64::STURQi:
5027 case AArch64::STRQpre:
5028 case AArch64::LDPQi:
5029 case AArch64::LDRQpre:
5030 case AArch64::STPQi:
5031 case AArch64::STGi:
5032 case AArch64::STZGi:
5033 case AArch64::ST2Gi:
5034 case AArch64::STZ2Gi:
5035 case AArch64::STGPi:
5036 return 16;
5037 }
5038}
5039
5041 switch (MI.getOpcode()) {
5042 default:
5043 return false;
5044 case AArch64::LDRWpre:
5045 case AArch64::LDRXpre:
5046 case AArch64::LDRSWpre:
5047 case AArch64::LDRSpre:
5048 case AArch64::LDRDpre:
5049 case AArch64::LDRQpre:
5050 return true;
5051 }
5052}
5053
5055 switch (MI.getOpcode()) {
5056 default:
5057 return false;
5058 case AArch64::STRWpre:
5059 case AArch64::STRXpre:
5060 case AArch64::STRSpre:
5061 case AArch64::STRDpre:
5062 case AArch64::STRQpre:
5063 return true;
5064 }
5065}
5066
5068 return isPreLd(MI) || isPreSt(MI);
5069}
5070
5072 switch (MI.getOpcode()) {
5073 default:
5074 return false;
5075 case AArch64::LDURBBi:
5076 case AArch64::LDURHHi:
5077 case AArch64::LDURWi:
5078 case AArch64::LDRBBui:
5079 case AArch64::LDRHHui:
5080 case AArch64::LDRWui:
5081 case AArch64::LDRBBroX:
5082 case AArch64::LDRHHroX:
5083 case AArch64::LDRWroX:
5084 case AArch64::LDRBBroW:
5085 case AArch64::LDRHHroW:
5086 case AArch64::LDRWroW:
5087 return true;
5088 }
5089}
5090
5092 switch (MI.getOpcode()) {
5093 default:
5094 return false;
5095 case AArch64::LDURSBWi:
5096 case AArch64::LDURSHWi:
5097 case AArch64::LDURSBXi:
5098 case AArch64::LDURSHXi:
5099 case AArch64::LDURSWi:
5100 case AArch64::LDRSBWui:
5101 case AArch64::LDRSHWui:
5102 case AArch64::LDRSBXui:
5103 case AArch64::LDRSHXui:
5104 case AArch64::LDRSWui:
5105 case AArch64::LDRSBWroX:
5106 case AArch64::LDRSHWroX:
5107 case AArch64::LDRSBXroX:
5108 case AArch64::LDRSHXroX:
5109 case AArch64::LDRSWroX:
5110 case AArch64::LDRSBWroW:
5111 case AArch64::LDRSHWroW:
5112 case AArch64::LDRSBXroW:
5113 case AArch64::LDRSHXroW:
5114 case AArch64::LDRSWroW:
5115 return true;
5116 }
5117}
5118
5120 switch (MI.getOpcode()) {
5121 default:
5122 return false;
5123 case AArch64::LDPSi:
5124 case AArch64::LDPSWi:
5125 case AArch64::LDPDi:
5126 case AArch64::LDPQi:
5127 case AArch64::LDPWi:
5128 case AArch64::LDPXi:
5129 case AArch64::STPSi:
5130 case AArch64::STPDi:
5131 case AArch64::STPQi:
5132 case AArch64::STPWi:
5133 case AArch64::STPXi:
5134 case AArch64::STGPi:
5135 return true;
5136 }
5137}
5138
5140 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5141 unsigned Idx =
5143 : 1;
5144 return MI.getOperand(Idx);
5145}
5146
5147const MachineOperand &
5149 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5150 unsigned Idx =
5152 : 2;
5153 return MI.getOperand(Idx);
5154}
5155
5156const MachineOperand &
5158 switch (MI.getOpcode()) {
5159 default:
5160 llvm_unreachable("Unexpected opcode");
5161 case AArch64::LDRBroX:
5162 case AArch64::LDRBBroX:
5163 case AArch64::LDRSBXroX:
5164 case AArch64::LDRSBWroX:
5165 case AArch64::LDRHroX:
5166 case AArch64::LDRHHroX:
5167 case AArch64::LDRSHXroX:
5168 case AArch64::LDRSHWroX:
5169 case AArch64::LDRWroX:
5170 case AArch64::LDRSroX:
5171 case AArch64::LDRSWroX:
5172 case AArch64::LDRDroX:
5173 case AArch64::LDRXroX:
5174 case AArch64::LDRQroX:
5175 return MI.getOperand(4);
5176 }
5177}
5178
5180 Register Reg) {
5181 if (MI.getParent() == nullptr)
5182 return nullptr;
5183 const MachineFunction *MF = MI.getParent()->getParent();
5184 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5185}
5186
5188 auto IsHFPR = [&](const MachineOperand &Op) {
5189 if (!Op.isReg())
5190 return false;
5191 auto Reg = Op.getReg();
5192 if (Reg.isPhysical())
5193 return AArch64::FPR16RegClass.contains(Reg);
5194 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5195 return TRC == &AArch64::FPR16RegClass ||
5196 TRC == &AArch64::FPR16_loRegClass;
5197 };
5198 return llvm::any_of(MI.operands(), IsHFPR);
5199}
5200
5202 auto IsQFPR = [&](const MachineOperand &Op) {
5203 if (!Op.isReg())
5204 return false;
5205 auto Reg = Op.getReg();
5206 if (Reg.isPhysical())
5207 return AArch64::FPR128RegClass.contains(Reg);
5208 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5209 return TRC == &AArch64::FPR128RegClass ||
5210 TRC == &AArch64::FPR128_loRegClass;
5211 };
5212 return llvm::any_of(MI.operands(), IsQFPR);
5213}
5214
// Returns true when MI is (or expands to) an instruction that is a valid
// branch target under BTI: BRK/HLT, PACIASP/PACIBSP, the PAUTH_PROLOGUE
// pseudo, or the HINT encodings of BTI and PACI(A|B)SP.
// NOTE(review): the signature line (source line 5215) was lost in this
// dump; by content this appears to be
// AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) — confirm.
 switch (MI.getOpcode()) {
 case AArch64::BRK:
 case AArch64::HLT:
 case AArch64::PACIASP:
 case AArch64::PACIBSP:
 // Implicit BTI behavior.
 return true;
 case AArch64::PAUTH_PROLOGUE:
 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
 return true;
 case AArch64::HINT: {
 unsigned Imm = MI.getOperand(0).getImm();
 // Explicit BTI instruction.
 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
 return true;
 // PACI(A|B)SP instructions.
 if (Imm == 25 || Imm == 27)
 return true;
 return false;
 }
 default:
 return false;
 }
}
5240
// Returns true if the given physical register is any FP/NEON register
// (B, H, S, D or Q form). Reg == 0 (no register) yields false; the assert
// documents that only physical registers are expected here.
// NOTE(review): the opening signature line (source line 5241) was lost in
// this dump; the assert text confirms the name isFpOrNEON.
 if (Reg == 0)
 return false;
 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
 return AArch64::FPR128RegClass.contains(Reg) ||
 AArch64::FPR64RegClass.contains(Reg) ||
 AArch64::FPR32RegClass.contains(Reg) ||
 AArch64::FPR16RegClass.contains(Reg) ||
 AArch64::FPR8RegClass.contains(Reg);
}
5251
// Returns true if any operand of MI is an FP/NEON register — physical FPR
// of any width (via isFpOrNEON above), or a virtual register whose recorded
// class is one of the FPR classes.
// NOTE(review): the signature line (source line 5252) was lost in this
// dump; by content this is the MachineInstr overload of isFpOrNEON.
 auto IsFPR = [&](const MachineOperand &Op) {
 if (!Op.isReg())
 return false;
 auto Reg = Op.getReg();
 if (Reg.isPhysical())
 return isFpOrNEON(Reg);

 // Virtual register: consult the class recorded in MachineRegisterInfo.
 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
 return TRC == &AArch64::FPR128RegClass ||
 TRC == &AArch64::FPR128_loRegClass ||
 TRC == &AArch64::FPR64RegClass ||
 TRC == &AArch64::FPR64_loRegClass ||
 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
 TRC == &AArch64::FPR8RegClass;
 };
 return llvm::any_of(MI.operands(), IsFPR);
}
5270
5271// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5272// scaled.
 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
// NOTE(review): source line 5274 — which computes `Scale` from Opc,
// presumably via AArch64InstrInfo::getMemScale(Opc) — was lost in this
// dump; confirm against the original source.
 5275

 // If the byte-offset isn't a multiple of the stride, we can't scale this
 // offset.
 if (Offset % Scale != 0)
 return false;

 // Convert the byte-offset used by unscaled into an "element" offset used
 // by the scaled pair load/store instructions.
 Offset /= Scale;
 return true;
}
5286
5287static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5288 if (FirstOpc == SecondOpc)
5289 return true;
5290 // We can also pair sign-ext and zero-ext instructions.
5291 switch (FirstOpc) {
5292 default:
5293 return false;
5294 case AArch64::STRSui:
5295 case AArch64::STURSi:
5296 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5297 case AArch64::STRDui:
5298 case AArch64::STURDi:
5299 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5300 case AArch64::STRQui:
5301 case AArch64::STURQi:
5302 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5303 case AArch64::STRWui:
5304 case AArch64::STURWi:
5305 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5306 case AArch64::STRXui:
5307 case AArch64::STURXi:
5308 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5309 case AArch64::LDRSui:
5310 case AArch64::LDURSi:
5311 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5312 case AArch64::LDRDui:
5313 case AArch64::LDURDi:
5314 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5315 case AArch64::LDRQui:
5316 case AArch64::LDURQi:
5317 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5318 case AArch64::LDRWui:
5319 case AArch64::LDURWi:
5320 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5321 case AArch64::LDRSWui:
5322 case AArch64::LDURSWi:
5323 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5324 case AArch64::LDRXui:
5325 case AArch64::LDURXi:
5326 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5327 }
5328 // These instructions can't be paired based on their opcodes.
5329 return false;
5330}
5331
5332static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5333 int64_t Offset1, unsigned Opcode1, int FI2,
5334 int64_t Offset2, unsigned Opcode2) {
5335 // Accesses through fixed stack object frame indices may access a different
5336 // fixed stack slot. Check that the object offsets + offsets match.
5337 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5338 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5339 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5340 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5341 // Convert to scaled object offsets.
5342 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5343 if (ObjectOffset1 % Scale1 != 0)
5344 return false;
5345 ObjectOffset1 /= Scale1;
5346 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5347 if (ObjectOffset2 % Scale2 != 0)
5348 return false;
5349 ObjectOffset2 /= Scale2;
5350 ObjectOffset1 += Offset1;
5351 ObjectOffset2 += Offset2;
5352 return ObjectOffset1 + 1 == ObjectOffset2;
5353 }
5354
5355 return FI1 == FI2;
5356}
5357
5358/// Detect opportunities for ldp/stp formation.
5359///
5360/// Only called for LdSt for which getMemOperandWithOffset returns true.
// NOTE(review): the opening signature line (source line 5361,
// `bool AArch64InstrInfo::shouldClusterMemOps(`) was lost in this dump;
// confirm against the original source.
 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
 unsigned NumBytes) const {
 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
 const MachineOperand &BaseOp1 = *BaseOps1.front();
 const MachineOperand &BaseOp2 = *BaseOps2.front();
 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
 // Bases must be of the same kind (both registers or both frame indices).
 if (BaseOp1.getType() != BaseOp2.getType())
 return false;

 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
 "Only base registers and frame indices are supported.");

 // Check for both base regs and base FI.
 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
 return false;

 // Only cluster up to a single pair.
 if (ClusterSize > 2)
 return false;

 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
 return false;

 // Can we pair these instructions based on their opcodes?
 unsigned FirstOpc = FirstLdSt.getOpcode();
 unsigned SecondOpc = SecondLdSt.getOpcode();
 if (!canPairLdStOpc(FirstOpc, SecondOpc))
 return false;

 // Can't merge volatiles or load/stores that have a hint to avoid pair
 // formation, for example.
 if (!isCandidateToMergeOrPair(FirstLdSt) ||
 !isCandidateToMergeOrPair(SecondLdSt))
 return false;

 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
 return false;

 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
 return false;

 // Pairwise instructions have a 7-bit signed offset field.
 if (Offset1 > 63 || Offset1 < -64)
 return false;

 // The caller should already have ordered First/SecondLdSt by offset.
 // Note: except for non-equal frame index bases
 if (BaseOp1.isFI()) {
 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
 "Caller should have ordered offsets.");

 const MachineFrameInfo &MFI =
 FirstLdSt.getParent()->getParent()->getFrameInfo();
 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
 BaseOp2.getIndex(), Offset2, SecondOpc);
 }

 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");

 // Register-based pair: the second access must be exactly the next element.
 return Offset1 + 1 == Offset2;
}
5429
// Appends a (possibly sub-)register operand to MIB. With no SubIdx the
// register is added as-is; a physical register is resolved to its concrete
// sub-register through TRI, while a virtual register carries SubIdx as an
// operand sub-register index.
// NOTE(review): the first line of this helper's signature (source line
// 5430, presumably `static const MachineInstrBuilder &AddSubReg(const
// MachineInstrBuilder &MIB,`) was lost in this dump.
 MCRegister Reg, unsigned SubIdx,
 RegState State,
 const TargetRegisterInfo *TRI) {
 if (!SubIdx)
 return MIB.addReg(Reg, State);

 if (Reg.isPhysical())
 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
 return MIB.addReg(Reg, State, SubIdx);
}
5441
/// Returns true when copying a NumRegs-wide register tuple in the forward
/// direction (lowest sub-register first) would overwrite part of the source
/// tuple before it has been read, i.e. when the destination starts inside
/// the source sequence (register numbers taken modulo 32).
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // Unsigned wraparound followed by % 32 yields the positive remainder —
  // identical to masking with 0x1f.
  const unsigned Distance = (DestReg - SrcReg) % 32u;
  return Distance < NumRegs;
}
5448
// Copies a multi-register tuple (e.g. D/Q/Z register sequences) one
// sub-register at a time using the given ORR-style Opcode. If a forward
// copy would clobber source sub-registers before they are read, the
// sub-registers are copied in reverse order instead.
// NOTE(review): the opening signature lines (source line 5450, presumably
// `void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
// MachineBasicBlock::iterator I,`) and the TRI initialization at source
// line 5456 were lost in this dump; confirm against the original source.
 const DebugLoc &DL, MCRegister DestReg,
 MCRegister SrcReg, bool KillSrc,
 unsigned Opcode,
 ArrayRef<unsigned> Indices) const {
 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
 unsigned NumRegs = Indices.size();

 // Default: copy lowest sub-register first; reverse if that would clobber.
 int SubReg = 0, End = NumRegs, Incr = 1;
 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
 SubReg = NumRegs - 1;
 End = -1;
 Incr = -1;
 }

 for (; SubReg != End; SubReg += Incr) {
 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
 // ORR takes two source operands; only the last carries the kill flag.
 AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
 }
}
5475
// Copies a GPR register tuple sub-register by sub-register using a shifted
// ORR (Opcode) of the form `ORR dst, ZeroReg, src, #0`. Unlike the FPR
// tuple copy above, GPR sequences are asserted not to overlap, so a simple
// forward copy is always safe.
// NOTE(review): the opening signature lines (source lines 5476-5477,
// presumably `void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock
// &MBB, MachineBasicBlock::iterator I,`) and the TRI initialization at
// source line 5482 were lost in this dump; confirm against the original.
 const DebugLoc &DL, MCRegister DestReg,
 MCRegister SrcReg, bool KillSrc,
 unsigned Opcode, unsigned ZeroReg,
 llvm::ArrayRef<unsigned> Indices) const {
 unsigned NumRegs = Indices.size();

#ifndef NDEBUG
 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
 "GPR reg sequences should not be able to overlap");
#endif

 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
 MIB.addReg(ZeroReg);
 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
 // Shift amount 0: plain register move via ORR.
 MIB.addImm(0);
 }
}
5500
5501/// Returns true if the instruction at I is in a streaming call site region,
5502/// within a single basic block.
5503/// A "call site streaming region" starts after smstart and ends at smstop
5504/// around a call to a streaming function. This walks backward from I.
// NOTE(review): the opening signature lines (source lines 5505-5506,
// presumably `static bool isInStreamingCallSiteRegion(MachineBasicBlock
// &MBB, MachineBasicBlock::iterator I) {`) and the AFI initialization at
// source line 5508 were lost in this dump; confirm against the original.
 MachineFunction &MF = *MBB.getParent();
 // Fast path: functions without streaming-mode changes have no regions.
 if (!AFI->hasStreamingModeChanges())
 return false;
 // Walk backwards to find smstart/smstop
 for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
 unsigned Opc = MI.getOpcode();
 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
 // Check if this is SM change (not ZA)
 int64_t PState = MI.getOperand(0).getImm();
 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
 // Operand 1 is 1 for start, 0 for stop
 return MI.getOperand(1).getImm() == 1;
 }
 }
 }
 // No SM change found before I in this block: not inside a region.
 return false;
}
5525
5526/// Returns true if in a streaming call site region without SME-FA64.
// NOTE(review): the parameter lines of this helper (source lines
// 5528-5529, presumably `MachineBasicBlock &MBB, MachineBasicBlock::iterator
// I) {`) were lost in this dump; confirm against the original source.
 static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
 // NEON must be avoided when streaming mode is (possibly) active and the
 // subtarget lacks +sme-fa64 (which would make NEON legal while streaming).
 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
}
5532
// TargetInstrInfo::copyPhysReg for AArch64: expands a COPY between two
// physical registers into the cheapest concrete sequence for the register
// classes involved — zero-cycle GPR/FPR moves and zeroing where the
// subtarget supports them, SVE ORRs for predicate/Z registers and for
// streaming regions where NEON is unavailable, sub-register tuple copies,
// GPR<->FPR FMOVs, and NZCV transfers via MSR/MRS.
// NOTE(review): the opening signature lines (source lines 5533-5534,
// `void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// MachineBasicBlock::iterator I, ...`) and several operand-builder lines
// whose text was a hyperlink (e.g. the `.addImm(AArch64_AM::getShifterImm(
// AArch64_AM::LSL, 0))` operands at source lines 5558, 5565, 5607, 5612,
// 5629 and 5648) were lost in this dump; confirm against the original.
 const DebugLoc &DL, Register DestReg,
 Register SrcReg, bool KillSrc,
 bool RenamableDest,
 bool RenamableSrc) const {
 ++NumCopyInstrs;
 if (AArch64::GPR32spRegClass.contains(DestReg) &&
 AArch64::GPR32spRegClass.contains(SrcReg)) {
 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
 // If either operand is WSP, expand to ADD #0.
 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
 !Subtarget.hasZeroCycleRegMoveGPR32()) {
 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
 &AArch64::GPR64spRegClass);
 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
 &AArch64::GPR64spRegClass);
 // This instruction is reading and writing X registers. This may upset
 // the register scavenger and machine verifier, so we need to indicate
 // that we are reading an undefined value from SrcRegX, but a proper
 // value from SrcReg.
 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
 .addReg(SrcRegX, RegState::Undef)
 .addImm(0)
 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
 ++NumZCRegMoveInstrsGPR;
 } else {
 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc))
 .addImm(0)
 if (Subtarget.hasZeroCycleRegMoveGPR32())
 ++NumZCRegMoveInstrsGPR;
 }
 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
 !Subtarget.hasZeroCycleRegMoveGPR32()) {
 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
 &AArch64::GPR64spRegClass);
 assert(DestRegX.isValid() && "Destination super-reg not valid");
 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
 &AArch64::GPR64spRegClass);
 assert(SrcRegX.isValid() && "Source super-reg not valid");
 // This instruction is reading and writing X registers. This may upset
 // the register scavenger and machine verifier, so we need to indicate
 // that we are reading an undefined value from SrcRegX, but a proper
 // value from SrcReg.
 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
 .addReg(AArch64::XZR)
 .addReg(SrcRegX, RegState::Undef)
 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
 ++NumZCRegMoveInstrsGPR;
 } else {
 // Otherwise, expand to ORR WZR.
 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
 .addReg(AArch64::WZR)
 .addReg(SrcReg, getKillRegState(KillSrc));
 if (Subtarget.hasZeroCycleRegMoveGPR32())
 ++NumZCRegMoveInstrsGPR;
 }
 return;
 }

 // GPR32 zeroing
 if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
 if (Subtarget.hasZeroCycleZeroingGPR64() &&
 !Subtarget.hasZeroCycleZeroingGPR32()) {
 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
 &AArch64::GPR64spRegClass);
 assert(DestRegX.isValid() && "Destination super-reg not valid");
 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
 .addImm(0)
 ++NumZCZeroingInstrsGPR;
 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
 .addImm(0)
 ++NumZCZeroingInstrsGPR;
 } else {
 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
 .addReg(AArch64::WZR)
 .addReg(AArch64::WZR);
 }
 return;
 }

 if (AArch64::GPR64spRegClass.contains(DestReg) &&
 AArch64::GPR64spRegClass.contains(SrcReg)) {
 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
 // If either operand is SP, expand to ADD #0.
 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc))
 .addImm(0)
 if (Subtarget.hasZeroCycleRegMoveGPR64())
 ++NumZCRegMoveInstrsGPR;
 } else {
 // Otherwise, expand to ORR XZR.
 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
 .addReg(AArch64::XZR)
 .addReg(SrcReg, getKillRegState(KillSrc));
 if (Subtarget.hasZeroCycleRegMoveGPR64())
 ++NumZCRegMoveInstrsGPR;
 }
 return;
 }

 // GPR64 zeroing
 if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
 if (Subtarget.hasZeroCycleZeroingGPR64()) {
 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
 .addImm(0)
 ++NumZCZeroingInstrsGPR;
 } else {
 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
 .addReg(AArch64::XZR)
 .addReg(AArch64::XZR);
 }
 return;
 }

 // Copy a Predicate register by ORRing with itself.
 if (AArch64::PPRRegClass.contains(DestReg) &&
 AArch64::PPRRegClass.contains(SrcReg)) {
 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
 "Unexpected SVE register.");
 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
 .addReg(SrcReg) // Pg
 .addReg(SrcReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 return;
 }

 // Copy a predicate-as-counter register by ORRing with itself as if it
 // were a regular predicate (mask) register.
 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
 if (DestIsPNR || SrcIsPNR) {
 // Map PN0..PN15 onto the aliasing P0..P15 registers.
 auto ToPPR = [](MCRegister R) -> MCRegister {
 return (R - AArch64::PN0) + AArch64::P0;
 };
 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();

 if (PPRSrcReg != PPRDestReg) {
 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
 .addReg(PPRSrcReg) // Pg
 .addReg(PPRSrcReg)
 .addReg(PPRSrcReg, getKillRegState(KillSrc));
 if (DestIsPNR)
 NewMI.addDef(DestReg, RegState::Implicit);
 }
 return;
 }

 // Copy a Z register by ORRing with itself.
 if (AArch64::ZPRRegClass.contains(DestReg) &&
 AArch64::ZPRRegClass.contains(SrcReg)) {
 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
 "Unexpected SVE register.");
 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
 .addReg(SrcReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 return;
 }

 // Copy a Z register pair by copying the individual sub-registers.
 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
 (AArch64::ZPR2RegClass.contains(SrcReg) ||
 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
 "Unexpected SVE register.");
 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
 Indices);
 return;
 }

 // Copy a Z register triple by copying the individual sub-registers.
 if (AArch64::ZPR3RegClass.contains(DestReg) &&
 AArch64::ZPR3RegClass.contains(SrcReg)) {
 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
 "Unexpected SVE register.");
 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
 AArch64::zsub2};
 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
 Indices);
 return;
 }

 // Copy a Z register quad by copying the individual sub-registers.
 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
 (AArch64::ZPR4RegClass.contains(SrcReg) ||
 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
 "Unexpected SVE register.");
 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
 AArch64::zsub2, AArch64::zsub3};
 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
 Indices);
 return;
 }

 // Copy a DDDD register quad by copying the individual sub-registers.
 if (AArch64::DDDDRegClass.contains(DestReg) &&
 AArch64::DDDDRegClass.contains(SrcReg)) {
 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
 AArch64::dsub2, AArch64::dsub3};
 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
 Indices);
 return;
 }

 // Copy a DDD register triple by copying the individual sub-registers.
 if (AArch64::DDDRegClass.contains(DestReg) &&
 AArch64::DDDRegClass.contains(SrcReg)) {
 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
 AArch64::dsub2};
 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
 Indices);
 return;
 }

 // Copy a DD register pair by copying the individual sub-registers.
 if (AArch64::DDRegClass.contains(DestReg) &&
 AArch64::DDRegClass.contains(SrcReg)) {
 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
 Indices);
 return;
 }

 // Copy a QQQQ register quad by copying the individual sub-registers.
 if (AArch64::QQQQRegClass.contains(DestReg) &&
 AArch64::QQQQRegClass.contains(SrcReg)) {
 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
 AArch64::qsub2, AArch64::qsub3};
 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
 Indices);
 return;
 }

 // Copy a QQQ register triple by copying the individual sub-registers.
 if (AArch64::QQQRegClass.contains(DestReg) &&
 AArch64::QQQRegClass.contains(SrcReg)) {
 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
 AArch64::qsub2};
 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
 Indices);
 return;
 }

 // Copy a QQ register pair by copying the individual sub-registers.
 if (AArch64::QQRegClass.contains(DestReg) &&
 AArch64::QQRegClass.contains(SrcReg)) {
 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
 Indices);
 return;
 }

 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
 AArch64::XZR, Indices);
 return;
 }

 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
 AArch64::WZR, Indices);
 return;
 }

 if (AArch64::FPR128RegClass.contains(DestReg) &&
 AArch64::FPR128RegClass.contains(SrcReg)) {
 // In streaming regions, NEON is illegal but streaming-SVE is available.
 // Use SVE for copies if we're in a streaming region and SME is available.
 // With +sme-fa64, NEON is legal in streaming mode so we can use it.
 if ((Subtarget.isSVEorStreamingSVEAvailable() &&
 !Subtarget.isNeonAvailable()) ||
 mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
 } else if (Subtarget.isNeonAvailable()) {
 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
 .addReg(SrcReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 if (Subtarget.hasZeroCycleRegMoveFPR128())
 ++NumZCRegMoveInstrsFPR;
 } else {
 // Neither NEON nor SVE available: bounce the value through the stack.
 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
 .addReg(AArch64::SP, RegState::Define)
 .addReg(SrcReg, getKillRegState(KillSrc))
 .addReg(AArch64::SP)
 .addImm(-16);
 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
 .addReg(AArch64::SP, RegState::Define)
 .addReg(DestReg, RegState::Define)
 .addReg(AArch64::SP)
 .addImm(16);
 }
 return;
 }

 if (AArch64::FPR64RegClass.contains(DestReg) &&
 AArch64::FPR64RegClass.contains(SrcReg)) {
 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
 !Subtarget.hasZeroCycleRegMoveFPR64() &&
 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
 &AArch64::FPR128RegClass);
 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
 &AArch64::FPR128RegClass);
 // This instruction is reading and writing Q registers. This may upset
 // the register scavenger and machine verifier, so we need to indicate
 // that we are reading an undefined value from SrcRegQ, but a proper
 // value from SrcReg.
 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
 .addReg(SrcRegQ, RegState::Undef)
 .addReg(SrcRegQ, RegState::Undef)
 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
 ++NumZCRegMoveInstrsFPR;
 } else {
 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 if (Subtarget.hasZeroCycleRegMoveFPR64())
 ++NumZCRegMoveInstrsFPR;
 }
 return;
 }

 if (AArch64::FPR32RegClass.contains(DestReg) &&
 AArch64::FPR32RegClass.contains(SrcReg)) {
 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
 !Subtarget.hasZeroCycleRegMoveFPR64() &&
 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
 &AArch64::FPR128RegClass);
 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
 &AArch64::FPR128RegClass);
 // This instruction is reading and writing Q registers. This may upset
 // the register scavenger and machine verifier, so we need to indicate
 // that we are reading an undefined value from SrcRegQ, but a proper
 // value from SrcReg.
 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
 .addReg(SrcRegQ, RegState::Undef)
 .addReg(SrcRegQ, RegState::Undef)
 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
 ++NumZCRegMoveInstrsFPR;
 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
 !Subtarget.hasZeroCycleRegMoveFPR32()) {
 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
 &AArch64::FPR64RegClass);
 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
 &AArch64::FPR64RegClass);
 // This instruction is reading and writing D registers. This may upset
 // the register scavenger and machine verifier, so we need to indicate
 // that we are reading an undefined value from SrcRegD, but a proper
 // value from SrcReg.
 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
 .addReg(SrcRegD, RegState::Undef)
 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
 ++NumZCRegMoveInstrsFPR;
 } else {
 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 if (Subtarget.hasZeroCycleRegMoveFPR32())
 ++NumZCRegMoveInstrsFPR;
 }
 return;
 }

 if (AArch64::FPR16RegClass.contains(DestReg) &&
 AArch64::FPR16RegClass.contains(SrcReg)) {
 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
 !Subtarget.hasZeroCycleRegMoveFPR64() &&
 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
 &AArch64::FPR128RegClass);
 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
 &AArch64::FPR128RegClass);
 // This instruction is reading and writing Q registers. This may upset
 // the register scavenger and machine verifier, so we need to indicate
 // that we are reading an undefined value from SrcRegQ, but a proper
 // value from SrcReg.
 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
 .addReg(SrcRegQ, RegState::Undef)
 .addReg(SrcRegQ, RegState::Undef)
 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
 !Subtarget.hasZeroCycleRegMoveFPR32()) {
 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
 &AArch64::FPR64RegClass);
 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
 &AArch64::FPR64RegClass);
 // This instruction is reading and writing D registers. This may upset
 // the register scavenger and machine verifier, so we need to indicate
 // that we are reading an undefined value from SrcRegD, but a proper
 // value from SrcReg.
 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
 .addReg(SrcRegD, RegState::Undef)
 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
 } else {
 // No FMOV between H registers: widen to the containing S registers.
 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
 &AArch64::FPR32RegClass);
 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
 &AArch64::FPR32RegClass);
 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 }
 return;
 }

 if (AArch64::FPR8RegClass.contains(DestReg) &&
 AArch64::FPR8RegClass.contains(SrcReg)) {
 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
 !Subtarget.hasZeroCycleRegMoveFPR64() &&
 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
 &AArch64::FPR128RegClass);
 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
 &AArch64::FPR128RegClass);
 // This instruction is reading and writing Q registers. This may upset
 // the register scavenger and machine verifier, so we need to indicate
 // that we are reading an undefined value from SrcRegQ, but a proper
 // value from SrcReg.
 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
 .addReg(SrcRegQ, RegState::Undef)
 .addReg(SrcRegQ, RegState::Undef)
 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
 !Subtarget.hasZeroCycleRegMoveFPR32()) {
 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
 &AArch64::FPR64RegClass);
 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
 &AArch64::FPR64RegClass);
 // This instruction is reading and writing D registers. This may upset
 // the register scavenger and machine verifier, so we need to indicate
 // that we are reading an undefined value from SrcRegD, but a proper
 // value from SrcReg.
 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
 .addReg(SrcRegD, RegState::Undef)
 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
 } else {
 // No FMOV between B registers: widen to the containing S registers.
 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
 &AArch64::FPR32RegClass);
 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
 &AArch64::FPR32RegClass);
 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 }
 return;
 }

 // Copies between GPR64 and FPR64.
 if (AArch64::FPR64RegClass.contains(DestReg) &&
 AArch64::GPR64RegClass.contains(SrcReg)) {
 if (AArch64::XZR == SrcReg) {
 BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
 } else {
 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 }
 return;
 }
 if (AArch64::GPR64RegClass.contains(DestReg) &&
 AArch64::FPR64RegClass.contains(SrcReg)) {
 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 return;
 }
 // Copies between GPR32 and FPR32.
 if (AArch64::FPR32RegClass.contains(DestReg) &&
 AArch64::GPR32RegClass.contains(SrcReg)) {
 if (AArch64::WZR == SrcReg) {
 BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
 } else {
 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 }
 return;
 }
 if (AArch64::GPR32RegClass.contains(DestReg) &&
 AArch64::FPR32RegClass.contains(SrcReg)) {
 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
 .addReg(SrcReg, getKillRegState(KillSrc));
 return;
 }

 if (DestReg == AArch64::NZCV) {
 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
 BuildMI(MBB, I, DL, get(AArch64::MSR))
 .addImm(AArch64SysReg::NZCV)
 .addReg(SrcReg, getKillRegState(KillSrc))
 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
 return;
 }

 if (SrcReg == AArch64::NZCV) {
 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
 .addImm(AArch64SysReg::NZCV)
 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
 return;
 }

#ifndef NDEBUG
 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
 << "\n";
#endif
 llvm_unreachable("unimplemented reg-to-reg copy");
}
6061
6064                                   MachineBasicBlock::iterator InsertBefore,
6065                                   const MCInstrDesc &MCID,
6066                                   Register SrcReg, bool IsKill,
6067                                   unsigned SubIdx0, unsigned SubIdx1, int FI,
6068                                   MachineMemOperand *MMO) {
  // Emit a single paired store (MCID) of the SubIdx0/SubIdx1 halves of SrcReg
  // into frame index FI (immediate offset 0), attaching MMO for the memory
  // reference.
6069   Register SrcReg0 = SrcReg;
6070   Register SrcReg1 = SrcReg;
  // For a physical register the subregisters can be resolved up front, so the
  // operands carry no subreg index; a virtual register keeps the indices for
  // the register allocator to resolve later.
6071   if (SrcReg.isPhysical()) {
6072     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
6073     SubIdx0 = 0;
6074     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
6075     SubIdx1 = 0;
6076   }
6077   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6078       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
6079       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
6080       .addFrameIndex(FI)
6081       .addImm(0)
6082       .addMemOperand(MMO);
6083 }
6084
6087                                            Register SrcReg, bool isKill, int FI,
6088                                            const TargetRegisterClass *RC,
6089                                            Register VReg,
6090                                            MachineInstr::MIFlag Flags) const {
  // Spill SrcReg (register class RC) into stack slot FI. The store opcode is
  // selected from the spill size of RC; sequential-pair classes are lowered
  // to an STP of the two halves and return early.
6091   MachineFunction &MF = *MBB.getParent();
6092   MachineFrameInfo &MFI = MF.getFrameInfo();
6093
6095   MachineMemOperand *MMO =
6097       MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6098   unsigned Opc = 0;
  // Most spill opcodes take a scaled immediate offset of 0; the ST1*
  // structured stores below take no immediate, so Offset is cleared for them.
6099   bool Offset = true;
6101   unsigned StackID = TargetStackID::Default;
6102   switch (RI.getSpillSize(*RC)) {
6103   case 1:
6104     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6105       Opc = AArch64::STRBui;
6106     break;
6107   case 2: {
6108     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6109       Opc = AArch64::STRHui;
6110     else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6111              AArch64::PPRRegClass.hasSubClassEq(RC)) {
6112       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6113              "Unexpected register store without SVE store instructions");
6114       Opc = AArch64::STR_PXI;
6116     }
6117     break;
6118   }
6119   case 4:
6120     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6121       Opc = AArch64::STRWui;
       // STRWui cannot encode WSP; constrain virtual regs away from it and
       // assert for physical regs.
6122       if (SrcReg.isVirtual())
6123         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
6124       else
6125         assert(SrcReg != AArch64::WSP);
6126     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6127       Opc = AArch64::STRSui;
6128     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6129       Opc = AArch64::STR_PPXI;
6131     }
6132     break;
6133   case 8:
6134     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6135       Opc = AArch64::STRXui;
6136       if (SrcReg.isVirtual())
6137         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6138       else
6139         assert(SrcReg != AArch64::SP);
6140     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6141       Opc = AArch64::STRDui;
6142     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
       // W register pairs are stored as an STP of the two 32-bit halves.
6144                               get(AArch64::STPWi), SrcReg, isKill,
6145                               AArch64::sube32, AArch64::subo32, FI, MMO);
6146       return;
6147     }
6148     break;
6149   case 16:
6150     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6151       Opc = AArch64::STRQui;
6152     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6153       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6154       Opc = AArch64::ST1Twov1d;
6155       Offset = false;
6156     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
       // X register pairs are stored as an STP of the two 64-bit halves.
6158                               get(AArch64::STPXi), SrcReg, isKill,
6159                               AArch64::sube64, AArch64::subo64, FI, MMO);
6160       return;
6161     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6162       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6163              "Unexpected register store without SVE store instructions");
6164       Opc = AArch64::STR_ZXI;
6166     }
6167     break;
6168   case 24:
6169     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6170       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6171       Opc = AArch64::ST1Threev1d;
6172       Offset = false;
6173     }
6174     break;
6175   case 32:
6176     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6177       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6178       Opc = AArch64::ST1Fourv1d;
6179       Offset = false;
6180     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6181       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6182       Opc = AArch64::ST1Twov2d;
6183       Offset = false;
6184     } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6185       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6186              "Unexpected register store without SVE store instructions");
6187       Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6189     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6190       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6191              "Unexpected register store without SVE store instructions");
6192       Opc = AArch64::STR_ZZXI;
6194     }
6195     break;
6196   case 48:
6197     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6198       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6199       Opc = AArch64::ST1Threev2d;
6200       Offset = false;
6201     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6202       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6203              "Unexpected register store without SVE store instructions");
6204       Opc = AArch64::STR_ZZZXI;
6206     }
6207     break;
6208   case 64:
6209     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6210       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6211       Opc = AArch64::ST1Fourv2d;
6212       Offset = false;
6213     } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6214       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6215              "Unexpected register store without SVE store instructions");
6216       Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6218     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6219       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6220              "Unexpected register store without SVE store instructions");
6221       Opc = AArch64::STR_ZZZZXI;
6223     }
6224     break;
6225   }
6226   assert(Opc && "Unknown register class");
  // Record the stack ID chosen above for this slot.
6227   MFI.setStackID(FI, StackID);
6228
6230       .addReg(SrcReg, getKillRegState(isKill))
6231       .addFrameIndex(FI);
6232
6233   if (Offset)
6234     MI.addImm(0);
  // NOTE(review): PNRReg presumably carries a predicate-as-counter alias to be
  // marked as an implicit def of the store — its setup is not obvious from
  // this code; verify against the PNR handling in loadRegFromStackSlot.
6235   if (PNRReg.isValid())
6236     MI.addDef(PNRReg, RegState::Implicit);
6237   MI.addMemOperand(MMO);
6238 }
6239
6242                                    MachineBasicBlock::iterator InsertBefore,
6243                                    const MCInstrDesc &MCID,
6244                                    Register DestReg, unsigned SubIdx0,
6245                                    unsigned SubIdx1, int FI,
6246                                    MachineMemOperand *MMO) {
  // Emit a single paired load (MCID) from frame index FI (immediate offset 0)
  // into the SubIdx0/SubIdx1 halves of DestReg, attaching MMO for the memory
  // reference.
6247   Register DestReg0 = DestReg;
6248   Register DestReg1 = DestReg;
6249   bool IsUndef = true;
  // For a physical register the subregisters are resolved now and fully
  // defined by the load; for a virtual register each half-def reads an
  // undefined super-register, hence the undef flag.
6250   if (DestReg.isPhysical()) {
6251     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6252     SubIdx0 = 0;
6253     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6254     SubIdx1 = 0;
6255     IsUndef = false;
6256   }
6257   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6258       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6259       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6260       .addFrameIndex(FI)
6261       .addImm(0)
6262       .addMemOperand(MMO);
6263 }
6264
6267                                             Register DestReg, int FI,
6268                                             const TargetRegisterClass *RC,
6269                                             Register VReg, unsigned SubReg,
6270                                             MachineInstr::MIFlag Flags) const {
  // Reload DestReg (register class RC) from stack slot FI. The load opcode is
  // selected from the spill size of RC; sequential-pair classes are lowered
  // to an LDP of the two halves and return early. Mirrors
  // storeRegToStackSlot above.
6271   MachineFunction &MF = *MBB.getParent();
6272   MachineFrameInfo &MFI = MF.getFrameInfo();
6274   MachineMemOperand *MMO =
6276       MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6277
6278   unsigned Opc = 0;
  // LD1* structured loads take no immediate offset; Offset is cleared for
  // them below.
6279   bool Offset = true;
6280   unsigned StackID = TargetStackID::Default;
6282   switch (TRI.getSpillSize(*RC)) {
6283   case 1:
6284     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6285       Opc = AArch64::LDRBui;
6286     break;
6287   case 2: {
6288     bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6289     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6290       Opc = AArch64::LDRHui;
6291     else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6292       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6293              "Unexpected register load without SVE load instructions");
       // Remember the predicate-as-counter destination so it can be attached
       // as an implicit def at the end of this function.
6294       if (IsPNR)
6295         PNRReg = DestReg;
6296       Opc = AArch64::LDR_PXI;
6298     }
6299     break;
6300   }
6301   case 4:
6302     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6303       Opc = AArch64::LDRWui;
       // LDRWui cannot encode WSP; constrain virtual regs away from it and
       // assert for physical regs.
6304       if (DestReg.isVirtual())
6305         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6306       else
6307         assert(DestReg != AArch64::WSP);
6308     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6309       Opc = AArch64::LDRSui;
6310     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6311       Opc = AArch64::LDR_PPXI;
6313     }
6314     break;
6315   case 8:
6316     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6317       Opc = AArch64::LDRXui;
6318       if (DestReg.isVirtual())
6319         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6320       else
6321         assert(DestReg != AArch64::SP);
6322     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6323       Opc = AArch64::LDRDui;
6324     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
       // W register pairs are reloaded as an LDP of the two 32-bit halves.
6326                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
6327                                AArch64::subo32, FI, MMO);
6328       return;
6329     }
6330     break;
6331   case 16:
6332     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6333       Opc = AArch64::LDRQui;
6334     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6335       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6336       Opc = AArch64::LD1Twov1d;
6337       Offset = false;
6338     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
       // X register pairs are reloaded as an LDP of the two 64-bit halves.
6340                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
6341                                AArch64::subo64, FI, MMO);
6342       return;
6343     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6344       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6345              "Unexpected register load without SVE load instructions");
6346       Opc = AArch64::LDR_ZXI;
6348     }
6349     break;
6350   case 24:
6351     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6352       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6353       Opc = AArch64::LD1Threev1d;
6354       Offset = false;
6355     }
6356     break;
6357   case 32:
6358     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6359       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6360       Opc = AArch64::LD1Fourv1d;
6361       Offset = false;
6362     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6363       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6364       Opc = AArch64::LD1Twov2d;
6365       Offset = false;
6366     } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6367       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6368              "Unexpected register load without SVE load instructions");
6369       Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6371     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6372       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6373              "Unexpected register load without SVE load instructions");
6374       Opc = AArch64::LDR_ZZXI;
6376     }
6377     break;
6378   case 48:
6379     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6380       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6381       Opc = AArch64::LD1Threev2d;
6382       Offset = false;
6383     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6384       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6385              "Unexpected register load without SVE load instructions");
6386       Opc = AArch64::LDR_ZZZXI;
6388     }
6389     break;
6390   case 64:
6391     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6392       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6393       Opc = AArch64::LD1Fourv2d;
6394       Offset = false;
6395     } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6396       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6397              "Unexpected register load without SVE load instructions");
6398       Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6400     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6401       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6402              "Unexpected register load without SVE load instructions");
6403       Opc = AArch64::LDR_ZZZZXI;
6405     }
6406     break;
6407   }
6408
6409   assert(Opc && "Unknown register class");
  // Record the stack ID chosen above for this slot.
6410   MFI.setStackID(FI, StackID);
6411
6413       .addReg(DestReg, getDefRegState(true))
6414       .addFrameIndex(FI);
6415   if (Offset)
6416     MI.addImm(0);
  // NOTE(review): for physical predicate-as-counter destinations, also mark
  // the PNR register itself as implicitly defined by the load.
6417   if (PNRReg.isValid() && !PNRReg.isVirtual())
6418     MI.addDef(PNRReg, RegState::Implicit);
6419   MI.addMemOperand(MMO);
6420 }
6421
6423 const MachineInstr &UseMI,
6424 const TargetRegisterInfo *TRI) {
6425 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6426 UseMI.getIterator()),
6427 [TRI](const MachineInstr &I) {
6428 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6429 I.readsRegister(AArch64::NZCV, TRI);
6430 });
6431}
6432
6433void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6434 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6435 // The smallest scalable element supported by scaled SVE addressing
6436 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6437 // byte offset must always be a multiple of 2.
6438 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6439
6440 // VGSized offsets are divided by '2', because the VG register is the
6441 // the number of 64bit granules as opposed to 128bit vector chunks,
6442 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6443 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6444 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6445 ByteSized = Offset.getFixed();
6446 VGSized = Offset.getScalable() / 2;
6447}
6448
6449/// Returns the offset in parts to which this frame offset can be
6450/// decomposed for the purpose of describing a frame offset.
6451/// For non-scalable offsets this is simply its byte size.
6452void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6453 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6454 int64_t &NumDataVectors) {
6455 // The smallest scalable element supported by scaled SVE addressing
6456 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6457 // byte offset must always be a multiple of 2.
6458 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6459
6460 NumBytes = Offset.getFixed();
6461 NumDataVectors = 0;
6462 NumPredicateVectors = Offset.getScalable() / 2;
6463 // This method is used to get the offsets to adjust the frame offset.
6464 // If the function requires ADDPL to be used and needs more than two ADDPL
6465 // instructions, part of the offset is folded into NumDataVectors so that it
6466 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6467 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6468 NumPredicateVectors > 62) {
6469 NumDataVectors = NumPredicateVectors / 8;
6470 NumPredicateVectors -= NumDataVectors * 8;
6471 }
6472}
6473
6474 // Convenience function to create a DWARF expression for: Constant `Operation`.
6475 // This helper emits compact sequences for common cases. For example, for `-15
6476 // DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
  // Small negative addends become "push literal; subtract", which avoids the
  // multi-byte SLEB128 encoding of DW_OP_consts.
6479   if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6480     // -Constant (1 to 31)
6481     Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6482     Operation = dwarf::DW_OP_minus;
6483   } else if (Constant >= 0 && Constant <= 31) {
6484     // Literal value 0 to 31
6485     Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6486   } else {
6487     // Signed constant
6488     Expr.push_back(dwarf::DW_OP_consts);
6490   }
  // Finally apply the requested (possibly rewritten) operation.
6491   return Expr.push_back(Operation);
6492 }
6493
6494 // Convenience function to create a DWARF expression for a register.
6495 static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
  // DW_OP_bregx takes a ULEB128 register number followed by an SLEB128
  // offset; an offset of 0 pushes the register's value unmodified.
6496   Expr.push_back((char)dwarf::DW_OP_bregx);
6498   Expr.push_back(0);
6499 }
6500
6501// Convenience function to create a DWARF expression for loading a register from
6502// a CFA offset.
6504 int64_t OffsetFromDefCFA) {
6505 // This assumes the top of the DWARF stack contains the CFA.
6506 Expr.push_back(dwarf::DW_OP_dup);
6507 // Add the offset to the register.
6508 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6509 // Dereference the address (loads a 64 bit value)..
6510 Expr.push_back(dwarf::DW_OP_deref);
6511}
6512
6513// Convenience function to create a comment for
6514// (+/-) NumBytes (* RegScale)?
6515static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6516 StringRef RegScale = {}) {
6517 if (NumBytes) {
6518 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6519 if (!RegScale.empty())
6520 Comment << ' ' << RegScale;
6521 }
6522}
6523
6524 // Creates an MCCFIInstruction:
6525 //    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
  // Describes CFA = Reg + NumBytes + VG * NumVGScaledBytes as a DWARF escape,
  // with a human-readable comment mirroring the expression.
6527                                                unsigned Reg,
6528                                                const StackOffset &Offset) {
6529   int64_t NumBytes, NumVGScaledBytes;
6530   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6531                                                         NumVGScaledBytes);
6532   std::string CommentBuffer;
6533   llvm::raw_string_ostream Comment(CommentBuffer);
6534
6535   if (Reg == AArch64::SP)
6536     Comment << "sp";
6537   else if (Reg == AArch64::FP)
6538     Comment << "fp";
6539   else
6540     Comment << printReg(Reg, &TRI);
6541
6542   // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6543   SmallString<64> Expr;
6544   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
  // DW_OP_breg0..DW_OP_breg31 encode the base register in the opcode itself,
  // so only registers 0..31 can be used here.
6545   assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6546   // Reg + NumBytes
6547   Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6548   appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6549   appendOffsetComment(NumBytes, Comment);
6550   if (NumVGScaledBytes) {
6551     // + VG * NumVGScaledBytes
6552     appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6553     appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6554     appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6555     Expr.push_back(dwarf::DW_OP_plus);
6556   }
6557
6558   // Wrap this into DW_CFA_def_cfa.
6559   SmallString<64> DefCfaExpr;
6560   DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6561   appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6562   DefCfaExpr.append(Expr.str());
6563   return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6564                                         Comment.str());
6565 }
6566
6568                                          unsigned FrameReg, unsigned Reg,
6569                                          const StackOffset &Offset,
6570                                          bool LastAdjustmentWasScalable) {
  // Create the CFI directive describing CFA = Reg + Offset.
  // A scalable offset cannot be expressed by the simple def_cfa forms and
  // needs a full DWARF expression.
6571   if (Offset.getScalable())
6572     return createDefCFAExpression(TRI, Reg, Offset);
6573
  // Same base register as before: only the offset changes.
6574   if (FrameReg == Reg && !LastAdjustmentWasScalable)
6575     return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6576
  // Otherwise redefine both the CFA register and offset.
6577   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6578   return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6579 }
6580
6583                                        const StackOffset &OffsetFromDefCFA,
6584     std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
  // Create the CFI directive describing where Reg is saved relative to the
  // CFA. If the offset has a scalable component, a DW_CFA_expression escape
  // is emitted; the scalable part is scaled either by the current VG or, if
  // IncomingVGOffsetFromDefCFA is provided, by the incoming VG value loaded
  // from the stack.
6585   int64_t NumBytes, NumVGScaledBytes;
6586   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6587       OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6588
6589   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6590
6591   // Non-scalable offsets can use DW_CFA_offset directly.
6592   if (!NumVGScaledBytes)
6593     return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6594
6595   std::string CommentBuffer;
6596   llvm::raw_string_ostream Comment(CommentBuffer);
6597   Comment << printReg(Reg, &TRI) << "  @ cfa";
6598
6599   // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6600   assert(NumVGScaledBytes && "Expected scalable offset");
6601   SmallString<64> OffsetExpr;
6602   // + VG * NumVGScaledBytes
6603   StringRef VGRegScale;
6604   if (IncomingVGOffsetFromDefCFA) {
6605     appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6606     VGRegScale = "* IncomingVG";
6607   } else {
6608     appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6609     VGRegScale = "* VG";
6610   }
6611   appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6612   appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6613   OffsetExpr.push_back(dwarf::DW_OP_plus);
6614   if (NumBytes) {
6615     // + NumBytes
6616     appendOffsetComment(NumBytes, Comment);
6617     appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6618   }
6619
6620   // Wrap this into DW_CFA_expression
6621   SmallString<64> CfaExpr;
6622   CfaExpr.push_back(dwarf::DW_CFA_expression);
6623   appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6624   appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6625   CfaExpr.append(OffsetExpr.str());
6626
6627   return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6628                                         Comment.str());
6629 }
6630
6631 // Helper function to emit a frame offset adjustment from a given
6632 // pointer (SrcReg), stored into DestReg. This function is explicit
6633 // in that it requires the opcode.
  // The offset is materialized as a chain of add/sub instructions, each chunk
  // limited by the opcode's immediate encoding range. Optionally emits CFA
  // (EmitCFAOffset) and Windows CFI (NeedsWinCFI) directives alongside.
6636                                const DebugLoc &DL, unsigned DestReg,
6637                                unsigned SrcReg, int64_t Offset, unsigned Opc,
6638                                const TargetInstrInfo *TII,
6639                                MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6640                                bool *HasWinCFI, bool EmitCFAOffset,
6641                                StackOffset CFAOffset, unsigned FrameReg) {
6642   int Sign = 1;
6643   unsigned MaxEncoding, ShiftSize;
6644   switch (Opc) {
6645   case AArch64::ADDXri:
6646   case AArch64::ADDSXri:
6647   case AArch64::SUBXri:
6648   case AArch64::SUBSXri:
     // 12-bit immediate, optionally shifted left by 12.
6649     MaxEncoding = 0xfff;
6650     ShiftSize = 12;
6651     break;
6652   case AArch64::ADDVL_XXI:
6653   case AArch64::ADDPL_XXI:
6654   case AArch64::ADDSVL_XXI:
6655   case AArch64::ADDSPL_XXI:
     // 6-bit signed immediate [-32, 31]; fold the sign into the operand.
6656     MaxEncoding = 31;
6657     ShiftSize = 0;
6658     if (Offset < 0) {
6659       MaxEncoding = 32;
6660       Sign = -1;
6661       Offset = -Offset;
6662     }
6663     break;
6664   default:
6665     llvm_unreachable("Unsupported opcode");
6666   }
6667
6668   // `Offset` can be in bytes or in "scalable bytes".
6669   int VScale = 1;
6670   if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6671     VScale = 16;
6672   else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6673     VScale = 2;
6674
6675   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6676   // scratch register.  If DestReg is a virtual register, use it as the
6677   // scratch register; otherwise, create a new virtual register (to be
6678   // replaced by the scavenger at the end of PEI).  That case can be optimized
6679   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6680   // register can be loaded with offset%8 and the add/sub can use an extending
6681   // instruction with LSL#3.
6682   // Currently the function handles any offsets but generates a poor sequence
6683   // of code.
6684   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6685
6686   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6687   Register TmpReg = DestReg;
  // XZR as destination means "discard the result"; compute into a scratch
  // virtual register instead (assigned by the scavenger later).
6688   if (TmpReg == AArch64::XZR)
6689     TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6690         &AArch64::GPR64RegClass);
6691   do {
6692     uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6693     unsigned LocalShiftSize = 0;
6694     if (ThisVal > MaxEncoding) {
6695       ThisVal = ThisVal >> ShiftSize;
6696       LocalShiftSize = ShiftSize;
6697     }
6698     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6699            "Encoding cannot handle value that big");
6700
6701     Offset -= ThisVal << LocalShiftSize;
     // The final chunk writes the real destination; earlier chunks
     // accumulate into the scratch register.
6702     if (Offset == 0)
6703       TmpReg = DestReg;
6704     auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6705                    .addReg(SrcReg)
6706                    .addImm(Sign * (int)ThisVal);
6707     if (ShiftSize)
6708       MBI = MBI.addImm(
6710     MBI = MBI.setMIFlag(Flag);
6711
     // Keep CFAOffset in sync with the adjustment just emitted.
6712     auto Change =
6713         VScale == 1
6714             ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6715             : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6716     if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6717       CFAOffset += Change;
6718     else
6719       CFAOffset -= Change;
     // Only describe the CFA once the real destination has been written.
6720     if (EmitCFAOffset && DestReg == TmpReg) {
6721       MachineFunction &MF = *MBB.getParent();
6722       const TargetSubtargetInfo &STI = MF.getSubtarget();
6723       const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6724
6725       unsigned CFIIndex = MF.addFrameInst(
6726           createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6727       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6728           .addCFIIndex(CFIIndex)
6729           .setMIFlags(Flag);
6730     }
6731
6732     if (NeedsWinCFI) {
6733       int Imm = (int)(ThisVal << LocalShiftSize);
6734       if (VScale != 1 && DestReg == AArch64::SP) {
6735         if (HasWinCFI)
6736           *HasWinCFI = true;
6737         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6738             .addImm(ThisVal)
6739             .setMIFlag(Flag);
6740       } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6741                  (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6742         assert(VScale == 1 && "Expected non-scalable operation");
6743         if (HasWinCFI)
6744           *HasWinCFI = true;
6745         if (Imm == 0)
6746           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6747         else
6748           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6749               .addImm(Imm)
6750               .setMIFlag(Flag);
6751         assert(Offset == 0 && "Expected remaining offset to be zero to "
6752                               "emit a single SEH directive");
6753       } else if (DestReg == AArch64::SP) {
6754         assert(VScale == 1 && "Expected non-scalable operation");
6755         if (HasWinCFI)
6756           *HasWinCFI = true;
6757         assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6758         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6759             .addImm(Imm)
6760             .setMIFlag(Flag);
6761       }
6762     }
6763
     // Chain subsequent chunks off the value just produced.
6764     SrcReg = TmpReg;
6765   } while (Offset);
6766 }
6767
6770                            unsigned DestReg, unsigned SrcReg,
6772                            MachineInstr::MIFlag Flag, bool SetNZCV,
6773                            bool NeedsWinCFI, bool *HasWinCFI,
6774                            bool EmitCFAOffset, StackOffset CFAOffset,
6775                            unsigned FrameReg) {
  // Materialize DestReg = SrcReg + Offset, splitting the offset into a fixed
  // byte part, SVE data vectors (ADDVL) and predicate vectors (ADDPL), and
  // delegating each part to emitFrameOffsetAdj.
6776   // If a function is marked as arm_locally_streaming, then the runtime value of
6777   // vscale in the prologue/epilogue is different the runtime value of vscale
6778   // in the function's body. To avoid having to consider multiple vscales,
6779   // we can use `addsvl` to allocate any scalable stack-slots, which under
6780   // most circumstances will be only locals, not callee-save slots.
6781   const Function &F = MBB.getParent()->getFunction();
6782   bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6783
6784   int64_t Bytes, NumPredicateVectors, NumDataVectors;
6785   AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6786       Offset, Bytes, NumPredicateVectors, NumDataVectors);
6787
6788   // Insert ADDSXri for scalable offset at the end.
6789   bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6790   if (NeedsFinalDefNZCV)
6791     SetNZCV = false;
6792
6793   // First emit non-scalable frame offsets, or a simple 'mov'.
6794   if (Bytes || (!Offset && SrcReg != DestReg)) {
6795     assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6796            "SP increment/decrement not 8-byte aligned");
6797     unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6798     if (Bytes < 0) {
6799       Bytes = -Bytes;
6800       Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6801     }
6802     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
6803                        NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6804                        FrameReg);
6805     CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6806                      ? StackOffset::getFixed(-Bytes)
6807                      : StackOffset::getFixed(Bytes);
     // Later parts build on the partially-adjusted destination.
6808     SrcReg = DestReg;
6809     FrameReg = DestReg;
6810   }
6811
6812   assert(!(NeedsWinCFI && NumPredicateVectors) &&
6813          "WinCFI can't allocate fractions of an SVE data vector");
6814
6815   if (NumDataVectors) {
6816     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
6817                        UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6818                        Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6819                        FrameReg);
6820     CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
6821     SrcReg = DestReg;
6822   }
6823
6824   if (NumPredicateVectors) {
6825     assert(DestReg != AArch64::SP && "Unaligned access to SP");
6826     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
6827                        UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6828                        Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6829                        FrameReg);
6830   }
6831
  // The scalable adjustments above did not set NZCV as requested; define it
  // now with a flag-setting add of #0 on the final value.
6832   if (NeedsFinalDefNZCV)
6833     BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
6834         .addReg(DestReg)
6835         .addImm(0)
6836         .addImm(0);
6837 }
6838
6841     int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS,
6842     VirtRegMap *VRM) const {
  // Attempt to fold the stack slot FrameIndex into MI (only COPYs are
  // handled), producing a direct spill/fill instruction inserted before
  // InsertPt. Returns the new instruction, or nullptr if nothing was folded.
6844   // This is a bit of a hack. Consider this instruction:
6845   //
6846   //   %0 = COPY %sp; GPR64all:%0
6847   //
6848   // We explicitly chose GPR64all for the virtual register so such a copy might
6849   // be eliminated by RegisterCoalescer. However, that may not be possible, and
6850   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6851   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6852   //
6853   // To prevent that, we are going to constrain the %0 register class here.
6854   if (MI.isFullCopy()) {
6855     Register DstReg = MI.getOperand(0).getReg();
6856     Register SrcReg = MI.getOperand(1).getReg();
6857     if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6858       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
6859       return nullptr;
6860     }
6861     if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6862       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6863       return nullptr;
6864     }
6865     // Nothing can be folded with copy from/to NZCV.
6866     if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6867       return nullptr;
6868   }
6869
6870   // Handle the case where a copy is being spilled or filled but the source
6871   // and destination register class don't match.  For example:
6872   //
6873   //   %0 = COPY %xzr; GPR64common:%0
6874   //
6875   // In this case we can still safely fold away the COPY and generate the
6876   // following spill code:
6877   //
6878   //   STRXui %xzr, %stack.0
6879   //
6880   // This also eliminates spilled cross register class COPYs (e.g. between x and
6881   // d regs) of the same size.  For example:
6882   //
6883   //   %0 = COPY %1; GPR64:%0, FPR64:%1
6884   //
6885   // will be filled as
6886   //
6887   //   LDRDui %0, fi<#0>
6888   //
6889   // instead of
6890   //
6891   //   LDRXui %Temp, fi<#0>
6892   //   %0 = FMOV %Temp
6893   //
6894   if (MI.isCopy() && Ops.size() == 1 &&
6895       // Make sure we're only folding the explicit COPY defs/uses.
6896       (Ops[0] == 0 || Ops[0] == 1)) {
     // Operand 0 of a COPY is the def (spill), operand 1 the use (fill).
6897     bool IsSpill = Ops[0] == 0;
6898     bool IsFill = !IsSpill;
6900     const MachineRegisterInfo &MRI = MF.getRegInfo();
6901     MachineBasicBlock &MBB = *MI.getParent();
6902     const MachineOperand &DstMO = MI.getOperand(0);
6903     const MachineOperand &SrcMO = MI.getOperand(1);
6904     Register DstReg = DstMO.getReg();
6905     Register SrcReg = SrcMO.getReg();
6906     // This is slightly expensive to compute for physical regs since
6907     // getMinimalPhysRegClass is slow.
6908     auto getRegClass = [&](unsigned Reg) {
6909       return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6910                                               : TRI.getMinimalPhysRegClass(Reg);
6911     };
6912
     // Same-size full-register COPY: spill the source or fill the
     // destination directly, using that operand's own register class.
6913     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6914       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6915                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6916              "Mismatched register size in non subreg COPY");
6917       if (IsSpill)
6918         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6919                             getRegClass(SrcReg), Register());
6920       else
6921         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6922                              getRegClass(DstReg), Register());
       // The freshly inserted instruction sits just before InsertPt.
6923       return &*--InsertPt;
6924     }
6925
6926     // Handle cases like spilling def of:
6927     //
6928     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6929     //
6930     // where the physical register source can be widened and stored to the full
6931     // virtual reg destination stack slot, in this case producing:
6932     //
6933     //   STRXui %xzr, %stack.0
6934     //
6935     if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6936         TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6937       assert(SrcMO.getSubReg() == 0 &&
6938              "Unexpected subreg on physical register");
6939       storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6940                           FrameIndex, &AArch64::GPR64RegClass, Register());
6941       return &*--InsertPt;
6942     }
6943
6944     // Handle cases like filling use of:
6945     //
6946     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6947     //
6948     // where we can load the full virtual reg source stack slot, into the subreg
6949     // destination, in this case producing:
6950     //
6951     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
6952     //
6953     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6954       const TargetRegisterClass *FillRC = nullptr;
6955       switch (DstMO.getSubReg()) {
6956       default:
6957         break;
6958       case AArch64::sub_32:
6959         if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
6960           FillRC = &AArch64::GPR32RegClass;
6961         break;
6962       case AArch64::ssub:
6963         FillRC = &AArch64::FPR32RegClass;
6964         break;
6965       case AArch64::dsub:
6966         FillRC = &AArch64::FPR64RegClass;
6967         break;
6968       }
6969
6970       if (FillRC) {
6971         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6972                    TRI.getRegSizeInBits(*FillRC) &&
6973                "Mismatched regclass size on folded subreg COPY");
6974         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
6975                              Register());
6976         MachineInstr &LoadMI = *--InsertPt;
6977         MachineOperand &LoadDst = LoadMI.getOperand(0);
6978         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
         // Retarget the load's def to the COPY's subreg destination.
6979         LoadDst.setSubReg(DstMO.getSubReg());
6980         LoadDst.setIsUndef();
6981         return &LoadMI;
6982       }
6983     }
6984   }
6985
6986   // Cannot fold.
6987   return nullptr;
6988 }
6989
                                    StackOffset &SOffset,
                                    bool *OutUseUnscaledOp,
                                    unsigned *OutUnscaledOp,
                                    int64_t *EmittableOffset) {
  // Decide whether SOffset can be folded into MI's immediate field, possibly
  // by switching MI to its unscaled addressing twin. On exit the out
  // parameters describe the rewrite, and SOffset holds the residual offset
  // that still has to be materialized separately by the caller.
  // Set output values in case of early exit.
  if (EmittableOffset)
    *EmittableOffset = 0;
  if (OutUseUnscaledOp)
    *OutUseUnscaledOp = false;
  if (OutUnscaledOp)
    *OutUnscaledOp = 0;

  // Exit early for structured vector spills/fills as they can't take an
  // immediate offset.
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::LD1Rv1d:
  case AArch64::LD1Rv2s:
  case AArch64::LD1Rv2d:
  case AArch64::LD1Rv4h:
  case AArch64::LD1Rv4s:
  case AArch64::LD1Rv8b:
  case AArch64::LD1Rv8h:
  case AArch64::LD1Rv16b:
  case AArch64::LD1Twov2d:
  case AArch64::LD1Threev2d:
  case AArch64::LD1Fourv2d:
  case AArch64::LD1Twov1d:
  case AArch64::LD1Threev1d:
  case AArch64::LD1Fourv1d:
  case AArch64::ST1Twov2d:
  case AArch64::ST1Threev2d:
  case AArch64::ST1Fourv2d:
  case AArch64::ST1Twov1d:
  case AArch64::ST1Threev1d:
  case AArch64::ST1Fourv1d:
  case AArch64::ST1i8:
  case AArch64::ST1i16:
  case AArch64::ST1i32:
  case AArch64::ST1i64:
  case AArch64::IRG:
  case AArch64::IRGstack:
  case AArch64::STGloop:
  case AArch64::STZGloop:
    // NOTE(review): the early-exit return statement for these opcodes is not
    // visible in this excerpt.
  }

  // Get the min/max offset and the scale.
  TypeSize ScaleValue(0U, false), Width(0U, false);
  int64_t MinOff, MaxOff;
  if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
                                      MaxOff))
    llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");

  // Construct the complete offset. Scalable (VL-multiplied) offsets are kept
  // separate from fixed ones; only the component matching this operation's
  // addressing mode is folded here.
  bool IsMulVL = ScaleValue.isScalable();
  unsigned Scale = ScaleValue.getKnownMinValue();
  int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();

  const MachineOperand &ImmOpnd =
      MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
  Offset += ImmOpnd.getImm() * Scale;

  // If the offset doesn't match the scale, we rewrite the instruction to
  // use the unscaled instruction instead. Likewise, if we have a negative
  // offset and there is an unscaled op to use.
  std::optional<unsigned> UnscaledOp =
      // NOTE(review): the initializer expression is not visible in this
      // excerpt.
  bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
  if (useUnscaledOp &&
      !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
                                      MaxOff))
    llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");

  Scale = ScaleValue.getKnownMinValue();
  assert(IsMulVL == ScaleValue.isScalable() &&
         "Unscaled opcode has different value for scalable");

  int64_t Remainder = Offset % Scale;
  assert(!(Remainder && useUnscaledOp) &&
         "Cannot have remainder when using unscaled op");

  assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
  // Clamp the scaled offset into the instruction's encodable [MinOff, MaxOff]
  // range; whatever does not fit stays behind in Offset (and thus SOffset).
  int64_t NewOffset = Offset / Scale;
  if (MinOff <= NewOffset && NewOffset <= MaxOff)
    Offset = Remainder;
  else {
    NewOffset = NewOffset < 0 ? MinOff : MaxOff;
    Offset = Offset - (NewOffset * Scale);
  }

  if (EmittableOffset)
    *EmittableOffset = NewOffset;
  if (OutUseUnscaledOp)
    *OutUseUnscaledOp = useUnscaledOp;
  if (OutUnscaledOp && UnscaledOp)
    *OutUnscaledOp = *UnscaledOp;

  // Write the residual back into the matching component of SOffset.
  if (IsMulVL)
    SOffset = StackOffset::get(SOffset.getFixed(), Offset);
  else
    SOffset = StackOffset::get(Offset, SOffset.getScalable());
  // NOTE(review): the first line of the return expression is not visible in
  // this excerpt.
         (SOffset ? 0 : AArch64FrameOffsetIsLegal);
}
7097
                                unsigned FrameReg, StackOffset &Offset,
                                const AArch64InstrInfo *TII) {
  // Rewrite MI's frame-index operand (at FrameRegIdx) in terms of FrameReg
  // plus as much of Offset as the instruction can encode. Returns true when
  // MI was fully handled (rewritten or erased); Offset is reduced to the part
  // that the caller must still materialize.
  unsigned Opcode = MI.getOpcode();
  // For AArch64 load/store forms the immediate directly follows the base
  // register operand.
  unsigned ImmIdx = FrameRegIdx + 1;

  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
    // ADD(S) of a frame index: fold its existing immediate into Offset, emit
    // a fresh offset computation in its place, and delete the original.
    Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
    emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
                    MI.getOperand(0).getReg(), FrameReg, Offset, TII,
                    MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
    MI.eraseFromParent();
    Offset = StackOffset();
    return true;
  }

  int64_t NewOffset;
  unsigned UnscaledOp;
  bool UseUnscaledOp;
  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
                                         &UnscaledOp, &NewOffset);
  // NOTE(review): the enclosing 'if' testing Status is not visible in this
  // excerpt.
    // Replace the FrameIndex with FrameReg.
    MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
    if (UseUnscaledOp)
      MI.setDesc(TII->get(UnscaledOp));

    MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
    // Fully handled only when no residual offset remains.
    return !Offset;
  }

  return false;
}
7132
7138
7139MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7140
7141// AArch64 supports MachineCombiner.
7142bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7143
7144// True when Opc sets flag
7145static bool isCombineInstrSettingFlag(unsigned Opc) {
7146 switch (Opc) {
7147 case AArch64::ADDSWrr:
7148 case AArch64::ADDSWri:
7149 case AArch64::ADDSXrr:
7150 case AArch64::ADDSXri:
7151 case AArch64::SUBSWrr:
7152 case AArch64::SUBSXrr:
7153 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7154 case AArch64::SUBSWri:
7155 case AArch64::SUBSXri:
7156 return true;
7157 default:
7158 break;
7159 }
7160 return false;
7161}
7162
7163// 32b Opcodes that can be combined with a MUL
7164static bool isCombineInstrCandidate32(unsigned Opc) {
7165 switch (Opc) {
7166 case AArch64::ADDWrr:
7167 case AArch64::ADDWri:
7168 case AArch64::SUBWrr:
7169 case AArch64::ADDSWrr:
7170 case AArch64::ADDSWri:
7171 case AArch64::SUBSWrr:
7172 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7173 case AArch64::SUBWri:
7174 case AArch64::SUBSWri:
7175 return true;
7176 default:
7177 break;
7178 }
7179 return false;
7180}
7181
7182// 64b Opcodes that can be combined with a MUL
7183static bool isCombineInstrCandidate64(unsigned Opc) {
7184 switch (Opc) {
7185 case AArch64::ADDXrr:
7186 case AArch64::ADDXri:
7187 case AArch64::SUBXrr:
7188 case AArch64::ADDSXrr:
7189 case AArch64::ADDSXri:
7190 case AArch64::SUBSXrr:
7191 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7192 case AArch64::SUBXri:
7193 case AArch64::SUBSXri:
7194 case AArch64::ADDv8i8:
7195 case AArch64::ADDv16i8:
7196 case AArch64::ADDv4i16:
7197 case AArch64::ADDv8i16:
7198 case AArch64::ADDv2i32:
7199 case AArch64::ADDv4i32:
7200 case AArch64::SUBv8i8:
7201 case AArch64::SUBv16i8:
7202 case AArch64::SUBv4i16:
7203 case AArch64::SUBv8i16:
7204 case AArch64::SUBv2i32:
7205 case AArch64::SUBv4i32:
7206 return true;
7207 default:
7208 break;
7209 }
7210 return false;
7211}
7212
// FP Opcodes that can be combined with a FMUL.
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
  switch (Inst.getOpcode()) {
  default:
    break;
  // Scalar FADD/FSUB (half, single, double) and their Advanced SIMD vector
  // forms are all candidates for FMUL fusion.
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv2f64:
  case AArch64::FADDv4f32:
  case AArch64::FSUBHrr:
  case AArch64::FSUBSrr:
  case AArch64::FSUBDrr:
  case AArch64::FSUBv4f16:
  case AArch64::FSUBv8f16:
  case AArch64::FSUBv2f32:
  case AArch64::FSUBv2f64:
  case AArch64::FSUBv4f32:
    // NOTE(review): the declaration of 'Options' (presumably the target's
    // TargetOptions) is not visible in this excerpt.
    // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
    // the target options or if FADD/FSUB has the contract fast-math flag.
    return Options.AllowFPOpFusion == FPOpFusion::Fast ||
           // NOTE(review): the second operand of '||' (the contract fast-math
           // flag check) is not visible in this excerpt.
  }
  return false;
}
7242
7243// Opcodes that can be combined with a MUL
7247
//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
                         unsigned CombineOpc, unsigned ZeroReg = 0,
                         bool CheckZeroReg = false) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineInstr *MI = nullptr;

  // Only virtual registers have a unique, traceable definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    MI = MRI.getUniqueVRegDef(MO.getReg());
  // And it needs to be in the trace (otherwise, it won't have a depth).
  if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
    return false;
  // Must only used by the user we combine with.
  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
    return false;

  if (CheckZeroReg) {
    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
           MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
    // The third input reg must be zero.
    if (MI->getOperand(3).getReg() != ZeroReg)
      return false;
  }

  // A flag-setting definition is only combinable when its NZCV def is dead
  // here; otherwise the transformation would clobber a live flags value.
  if (isCombineInstrSettingFlag(CombineOpc) &&
      MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
    return false;

  return true;
}
7281
//
// Is \param MO defined by an integer multiply and can be combined?
                              unsigned MulOpc, unsigned ZeroReg) {
  // Delegates to canCombine, additionally requiring the multiply's addend to
  // be the zero register (i.e. the defining MADD/MSUB is a pure MUL).
  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}
7288
//
// Is \param MO defined by a floating-point multiply and can be combined?
                               unsigned MulOpc) {
  // No zero-register check needed: FP multiplies have no addend operand.
  return canCombine(MBB, MO, MulOpc);
}
7295
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
// 3. Other forms of the same operation (intrinsics and other variants)
bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
                                                   bool Invert) const {
  // Inverted reassociation (treating the op as its inverse) is not supported.
  if (Invert)
    return false;
  switch (Inst.getOpcode()) {
  // == Floating-point types ==
  // -- Floating-point instructions --
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FMULHrr:
  case AArch64::FMULSrr:
  case AArch64::FMULDrr:
  case AArch64::FMULX16:
  case AArch64::FMULX32:
  case AArch64::FMULX64:
  // -- Advanced SIMD instructions --
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv4f32:
  case AArch64::FADDv2f64:
  case AArch64::FMULv4f16:
  case AArch64::FMULv8f16:
  case AArch64::FMULv2f32:
  case AArch64::FMULv4f32:
  case AArch64::FMULv2f64:
  case AArch64::FMULXv4f16:
  case AArch64::FMULXv8f16:
  case AArch64::FMULXv2f32:
  case AArch64::FMULXv4f32:
  case AArch64::FMULXv2f64:
  // -- SVE instructions --
  // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
  // in the SVE instruction set (though there are predicated ones).
  case AArch64::FADD_ZZZ_H:
  case AArch64::FADD_ZZZ_S:
  case AArch64::FADD_ZZZ_D:
  case AArch64::FMUL_ZZZ_H:
  case AArch64::FMUL_ZZZ_S:
  case AArch64::FMUL_ZZZ_D:
    // NOTE(review): the return expression for the floating-point cases
    // (upstream gates FP reassociation on fast-math conditions) is not
    // visible in this excerpt.

  // == Integer types ==
  // -- Base instructions --
  // Opcodes MULWrr and MULXrr don't exist because
  // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
  // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
  // The machine-combiner does not support three-source-operands machine
  // instruction. So we cannot reassociate MULs.
  case AArch64::ADDWrr:
  case AArch64::ADDXrr:
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  // -- Advanced SIMD instructions --
  // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
  // in the Advanced SIMD instruction set.
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::ADDv1i64:
  case AArch64::ADDv2i64:
  case AArch64::MULv8i8:
  case AArch64::MULv16i8:
  case AArch64::MULv4i16:
  case AArch64::MULv8i16:
  case AArch64::MULv2i32:
  case AArch64::MULv4i32:
  case AArch64::ANDv8i8:
  case AArch64::ANDv16i8:
  case AArch64::ORRv8i8:
  case AArch64::ORRv16i8:
  case AArch64::EORv8i8:
  case AArch64::EORv16i8:
  // -- SVE instructions --
  case AArch64::ADD_ZZZ_B:
  case AArch64::ADD_ZZZ_H:
  case AArch64::ADD_ZZZ_S:
  case AArch64::ADD_ZZZ_D:
  case AArch64::MUL_ZZZ_B:
  case AArch64::MUL_ZZZ_H:
  case AArch64::MUL_ZZZ_S:
  case AArch64::MUL_ZZZ_D:
  case AArch64::AND_ZZZ:
  case AArch64::ORR_ZZZ:
  case AArch64::EOR_ZZZ:
    return true;

  default:
    return false;
  }
}
7402
/// Find instructions that can be turned into madd.
                            SmallVectorImpl<unsigned> &Patterns) {
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  // NOTE(review): the candidate-opcode guard preceding this early return is
  // not visible in this excerpt.
    return false;
  // NOTE(review): the enclosing 'if' (upstream: flag-setting roots only) is
  // not visible in this excerpt. The block below rewrites a flag-setting root
  // to its non-flag-setting twin when NZCV is dead.
    int Cmp_NZCV =
        Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
    // When NZCV is live bail out.
    if (Cmp_NZCV == -1)
      return false;
    unsigned NewOpc = convertToNonFlagSettingOpc(Root);
    // When opcode can't change bail out.
    // CHECKME: do we miss any cases for opcode conversion?
    if (NewOpc == Opc)
      return false;
    Opc = NewOpc;
  }

  // Record Pattern when Root's operand 'Operand' is defined by a pure scalar
  // multiply (MADD with zero addend 'ZeroReg').
  auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
                      unsigned Pattern) {
    if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
      Patterns.push_back(Pattern);
      Found = true;
    }
  };

  // Same, for vector multiplies (no zero-register requirement).
  auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
    if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
      Patterns.push_back(Pattern);
      Found = true;
    }
  };

  // NOTE(review): 'MCP' is used below, presumably as an alias of
  // AArch64MachineCombinerPattern; its declaring line is not visible in this
  // excerpt.

  switch (Opc) {
  default:
    break;
  case AArch64::ADDWrr:
    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
           "ADDWrr does not have register operands");
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
    break;
  case AArch64::ADDXrr:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
    break;
  case AArch64::SUBWrr:
    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
    break;
  case AArch64::SUBXrr:
    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
    break;
  case AArch64::ADDWri:
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
    break;
  case AArch64::ADDXri:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
    break;
  case AArch64::SUBWri:
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
    break;
  case AArch64::SUBXri:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
    break;
  case AArch64::ADDv8i8:
    setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
    setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
    break;
  case AArch64::ADDv16i8:
    setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
    setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
    break;
  case AArch64::ADDv4i16:
    setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
    setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
    break;
  case AArch64::ADDv8i16:
    setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
    setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
    break;
  case AArch64::ADDv2i32:
    setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
    setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
    break;
  case AArch64::ADDv4i32:
    setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
    setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
    break;
  case AArch64::SUBv8i8:
    setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
    setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
    break;
  case AArch64::SUBv16i8:
    setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
    setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
    break;
  case AArch64::SUBv4i16:
    setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
    setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
    break;
  case AArch64::SUBv8i16:
    setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
    setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
    break;
  case AArch64::SUBv2i32:
    setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
    setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
    break;
  case AArch64::SUBv4i32:
    setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
    setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
    break;
  }
  return Found;
}
7543
7544bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7545 switch (Opcode) {
7546 default:
7547 break;
7548 case AArch64::UABALB_ZZZ_D:
7549 case AArch64::UABALB_ZZZ_H:
7550 case AArch64::UABALB_ZZZ_S:
7551 case AArch64::UABALT_ZZZ_D:
7552 case AArch64::UABALT_ZZZ_H:
7553 case AArch64::UABALT_ZZZ_S:
7554 case AArch64::SABALB_ZZZ_D:
7555 case AArch64::SABALB_ZZZ_S:
7556 case AArch64::SABALB_ZZZ_H:
7557 case AArch64::SABALT_ZZZ_D:
7558 case AArch64::SABALT_ZZZ_S:
7559 case AArch64::SABALT_ZZZ_H:
7560 case AArch64::UABALv16i8_v8i16:
7561 case AArch64::UABALv2i32_v2i64:
7562 case AArch64::UABALv4i16_v4i32:
7563 case AArch64::UABALv4i32_v2i64:
7564 case AArch64::UABALv8i16_v4i32:
7565 case AArch64::UABALv8i8_v8i16:
7566 case AArch64::UABAv16i8:
7567 case AArch64::UABAv2i32:
7568 case AArch64::UABAv4i16:
7569 case AArch64::UABAv4i32:
7570 case AArch64::UABAv8i16:
7571 case AArch64::UABAv8i8:
7572 case AArch64::SABALv16i8_v8i16:
7573 case AArch64::SABALv2i32_v2i64:
7574 case AArch64::SABALv4i16_v4i32:
7575 case AArch64::SABALv4i32_v2i64:
7576 case AArch64::SABALv8i16_v4i32:
7577 case AArch64::SABALv8i8_v8i16:
7578 case AArch64::SABAv16i8:
7579 case AArch64::SABAv2i32:
7580 case AArch64::SABAv4i16:
7581 case AArch64::SABAv4i32:
7582 case AArch64::SABAv8i16:
7583 case AArch64::SABAv8i8:
7584 return true;
7585 }
7586
7587 return false;
7588}
7589
7590unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7591 unsigned AccumulationOpcode) const {
7592 switch (AccumulationOpcode) {
7593 default:
7594 llvm_unreachable("Unsupported accumulation Opcode!");
7595 case AArch64::UABALB_ZZZ_D:
7596 return AArch64::UABDLB_ZZZ_D;
7597 case AArch64::UABALB_ZZZ_H:
7598 return AArch64::UABDLB_ZZZ_H;
7599 case AArch64::UABALB_ZZZ_S:
7600 return AArch64::UABDLB_ZZZ_S;
7601 case AArch64::UABALT_ZZZ_D:
7602 return AArch64::UABDLT_ZZZ_D;
7603 case AArch64::UABALT_ZZZ_H:
7604 return AArch64::UABDLT_ZZZ_H;
7605 case AArch64::UABALT_ZZZ_S:
7606 return AArch64::UABDLT_ZZZ_S;
7607 case AArch64::UABALv16i8_v8i16:
7608 return AArch64::UABDLv16i8_v8i16;
7609 case AArch64::UABALv2i32_v2i64:
7610 return AArch64::UABDLv2i32_v2i64;
7611 case AArch64::UABALv4i16_v4i32:
7612 return AArch64::UABDLv4i16_v4i32;
7613 case AArch64::UABALv4i32_v2i64:
7614 return AArch64::UABDLv4i32_v2i64;
7615 case AArch64::UABALv8i16_v4i32:
7616 return AArch64::UABDLv8i16_v4i32;
7617 case AArch64::UABALv8i8_v8i16:
7618 return AArch64::UABDLv8i8_v8i16;
7619 case AArch64::UABAv16i8:
7620 return AArch64::UABDv16i8;
7621 case AArch64::UABAv2i32:
7622 return AArch64::UABDv2i32;
7623 case AArch64::UABAv4i16:
7624 return AArch64::UABDv4i16;
7625 case AArch64::UABAv4i32:
7626 return AArch64::UABDv4i32;
7627 case AArch64::UABAv8i16:
7628 return AArch64::UABDv8i16;
7629 case AArch64::UABAv8i8:
7630 return AArch64::UABDv8i8;
7631 case AArch64::SABALB_ZZZ_D:
7632 return AArch64::SABDLB_ZZZ_D;
7633 case AArch64::SABALB_ZZZ_S:
7634 return AArch64::SABDLB_ZZZ_S;
7635 case AArch64::SABALB_ZZZ_H:
7636 return AArch64::SABDLB_ZZZ_H;
7637 case AArch64::SABALT_ZZZ_D:
7638 return AArch64::SABDLT_ZZZ_D;
7639 case AArch64::SABALT_ZZZ_S:
7640 return AArch64::SABDLT_ZZZ_S;
7641 case AArch64::SABALT_ZZZ_H:
7642 return AArch64::SABDLT_ZZZ_H;
7643 case AArch64::SABALv16i8_v8i16:
7644 return AArch64::SABDLv16i8_v8i16;
7645 case AArch64::SABALv2i32_v2i64:
7646 return AArch64::SABDLv2i32_v2i64;
7647 case AArch64::SABALv4i16_v4i32:
7648 return AArch64::SABDLv4i16_v4i32;
7649 case AArch64::SABALv4i32_v2i64:
7650 return AArch64::SABDLv4i32_v2i64;
7651 case AArch64::SABALv8i16_v4i32:
7652 return AArch64::SABDLv8i16_v4i32;
7653 case AArch64::SABALv8i8_v8i16:
7654 return AArch64::SABDLv8i8_v8i16;
7655 case AArch64::SABAv16i8:
7656 return AArch64::SABDv16i8;
7657 case AArch64::SABAv2i32:
7658 return AArch64::SABAv2i32;
7659 case AArch64::SABAv4i16:
7660 return AArch64::SABDv4i16;
7661 case AArch64::SABAv4i32:
7662 return AArch64::SABDv4i32;
7663 case AArch64::SABAv8i16:
7664 return AArch64::SABDv8i16;
7665 case AArch64::SABAv8i8:
7666 return AArch64::SABDv8i8;
7667 }
7668}
7669
/// Floating-Point Support

/// Find instructions that can be turned into madd.
                           SmallVectorImpl<unsigned> &Patterns) {

  if (!isCombineInstrCandidateFP(Root))
    return false;

  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  // Record Pattern when Root's operand 'Operand' is defined by a single-use
  // FMUL of opcode 'Opcode' in the same block.
  auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
    if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
      Patterns.push_back(Pattern);
      return true;
    }
    return false;
  };

  // NOTE(review): 'MCP' is used below, presumably as an alias of
  // AArch64MachineCombinerPattern; its declaring line is not visible in this
  // excerpt.

  switch (Root.getOpcode()) {
  default:
    assert(false && "Unsupported FP instruction in combiner\n");
    break;
  case AArch64::FADDHrr:
    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
           "FADDHrr does not have register operands");

    Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
    break;
  case AArch64::FADDSrr:
    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
           "FADDSrr does not have register operands");

    Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
             Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);

    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
    break;
  case AArch64::FADDDrr:
    Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
             Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);

    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
    break;
  case AArch64::FADDv4f16:
    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
             Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);

    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
             Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
    break;
  case AArch64::FADDv8f16:
    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
             Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);

    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
             Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
    break;
  case AArch64::FADDv2f32:
    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
             Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);

    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
             Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
    break;
  case AArch64::FADDv2f64:
    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
             Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);

    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
             Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
    break;
  case AArch64::FADDv4f32:
    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
             Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);

    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
             Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
    break;
  case AArch64::FSUBHrr:
    Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
    Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
    break;
  case AArch64::FSUBSrr:
    Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);

    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);

    Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
    break;
  case AArch64::FSUBDrr:
    Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);

    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);

    Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
    break;
  case AArch64::FSUBv4f16:
    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
             Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);

    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
             Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
    break;
  case AArch64::FSUBv8f16:
    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
             Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);

    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
             Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
    break;
  case AArch64::FSUBv2f32:
    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
             Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);

    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
             Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
    break;
  case AArch64::FSUBv2f64:
    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
             Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);

    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
             Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
    break;
  case AArch64::FSUBv4f32:
    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
             Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);

    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
             Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
    break;
  }
  return Found;
}
7814
                            SmallVectorImpl<unsigned> &Patterns) {
  // Match FMUL fed by a DUP-from-lane on either operand so it can be
  // rewritten as an indexed (by-element) FMUL.
  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  // Record Pattern when Root's operand 'Operand' is defined by 'Opcode',
  // looking through a single register-to-register COPY.
  auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    MachineOperand &MO = Root.getOperand(Operand);
    MachineInstr *MI = nullptr;
    if (MO.isReg() && MO.getReg().isVirtual())
      MI = MRI.getUniqueVRegDef(MO.getReg());
    // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
    if (MI && MI->getOpcode() == TargetOpcode::COPY &&
        MI->getOperand(1).getReg().isVirtual())
      MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
    if (MI && MI->getOpcode() == Opcode) {
      Patterns.push_back(Pattern);
      return true;
    }
    return false;
  };

  // NOTE(review): 'MCP' is used below, presumably as an alias of
  // AArch64MachineCombinerPattern; its declaring line is not visible in this
  // excerpt.

  switch (Root.getOpcode()) {
  default:
    return false;
  case AArch64::FMULv2f32:
    Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
    Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
    break;
  case AArch64::FMULv2f64:
    Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
    Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
    break;
  case AArch64::FMULv4f16:
    Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
    Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
    break;
  case AArch64::FMULv4f32:
    Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
    Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
    break;
  case AArch64::FMULv8f16:
    Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
    Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
    break;
  }

  return Found;
}
7866
                             SmallVectorImpl<unsigned> &Patterns) {
  // Match FNEG of a single-use FMADD so the pair can be combined to FNMADD.
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
    MachineOperand &MO = Root.getOperand(1);
    // NOTE(review): the definition of 'MI' (presumably the unique vreg def of
    // MO) is not visible in this excerpt.
    if (MI != nullptr && (MI->getOpcode() == Opcode) &&
        MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
        // NOTE(review): additional fast-math-flag conjuncts are not visible
        // in this excerpt.
        MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
      Patterns.push_back(Pattern);
      return true;
    }
    return false;
  };

  switch (Opc) {
  default:
    break;
  case AArch64::FNEGDr:
    return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
  case AArch64::FNEGSr:
    return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
  }

  return false;
}
7899
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
// NOTE(review): the signature line of this function is not visible in this
// excerpt.
  switch (Pattern) {
  default:
    break;
  // NOTE(review): the (large) list of case labels that reach the
  // 'return true' below is not visible in this excerpt.
    return true;
  } // end switch (Pattern)
  return false;
}
8016
8017/// Find other MI combine patterns.
8019 SmallVectorImpl<unsigned> &Patterns) {
8020 // A - (B + C) ==> (A - B) - C or (A - C) - B
8021 unsigned Opc = Root.getOpcode();
8022 MachineBasicBlock &MBB = *Root.getParent();
8023
8024 switch (Opc) {
8025 case AArch64::SUBWrr:
8026 case AArch64::SUBSWrr:
8027 case AArch64::SUBXrr:
8028 case AArch64::SUBSXrr:
8029 // Found candidate root.
8030 break;
8031 default:
8032 return false;
8033 }
8034
8036 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
8037 -1)
8038 return false;
8039
8040 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
8041 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
8042 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
8043 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
8046 return true;
8047 }
8048
8049 return false;
8050}
8051
8052/// Check if the given instruction forms a gather load pattern that can be
8053/// optimized for better Memory-Level Parallelism (MLP). This function
8054/// identifies chains of NEON lane load instructions that load data from
8055/// different memory addresses into individual lanes of a 128-bit vector
8056/// register, then attempts to split the pattern into parallel loads to break
8057/// the serial dependency between instructions.
8058///
8059/// Pattern Matched:
8060/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
8061/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
8062///
8063/// Transformed Into:
8064/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
8065/// to combine the results, enabling better memory-level parallelism.
8066///
8067/// Supported Element Types:
8068/// - 32-bit elements (LD1i32, 4 lanes total)
8069/// - 16-bit elements (LD1i16, 8 lanes total)
8070/// - 8-bit elements (LD1i8, 16 lanes total)
8072 SmallVectorImpl<unsigned> &Patterns,
8073 unsigned LoadLaneOpCode, unsigned NumLanes) {
8074 const MachineFunction *MF = Root.getMF();
8075
8076 // Early exit if optimizing for size.
8077 if (MF->getFunction().hasMinSize())
8078 return false;
8079
8080 const MachineRegisterInfo &MRI = MF->getRegInfo();
8082
8083 // The root of the pattern must load into the last lane of the vector.
8084 if (Root.getOperand(2).getImm() != NumLanes - 1)
8085 return false;
8086
8087 // Check that we have load into all lanes except lane 0.
8088 // For each load we also want to check that:
8089 // 1. It has a single non-debug use (since we will be replacing the virtual
8090 // register)
8091 // 2. That the addressing mode only uses a single pointer operand
8092 auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8093 auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
8094 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
8096 while (!RemainingLanes.empty() && CurrInstr &&
8097 CurrInstr->getOpcode() == LoadLaneOpCode &&
8098 MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
8099 CurrInstr->getNumOperands() == 4) {
8100 RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
8101 LoadInstrs.push_back(CurrInstr);
8102 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8103 }
8104
8105 // Check that we have found a match for lanes N-1.. 1.
8106 if (!RemainingLanes.empty())
8107 return false;
8108
8109 // Match the SUBREG_TO_REG sequence.
8110 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8111 return false;
8112
8113 // Verify that the subreg to reg loads an integer into the first lane.
8114 auto Lane0LoadReg = CurrInstr->getOperand(1).getReg();
8115 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8116 if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8117 return false;
8118
8119 // Verify that it also has a single non debug use.
8120 if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
8121 return false;
8122
8123 LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
8124
8125 // If there is any chance of aliasing, do not apply the pattern.
8126 // Walk backward through the MBB starting from Root.
8127 // Exit early if we've encountered all load instructions or hit the search
8128 // limit.
8129 auto MBBItr = Root.getIterator();
8130 unsigned RemainingSteps = GatherOptSearchLimit;
8131 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8132 RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
8133 const MachineBasicBlock *MBB = Root.getParent();
8134
8135 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8136 !RemainingLoadInstrs.empty();
8137 --MBBItr, --RemainingSteps) {
8138 const MachineInstr &CurrInstr = *MBBItr;
8139
8140 // Remove this instruction from remaining loads if it's one we're tracking.
8141 RemainingLoadInstrs.erase(&CurrInstr);
8142
8143 // Check for potential aliasing with any of the load instructions to
8144 // optimize.
8145 if (CurrInstr.isLoadFoldBarrier())
8146 return false;
8147 }
8148
8149 // If we hit the search limit without finding all load instructions,
8150 // don't match the pattern.
8151 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8152 return false;
8153
8154 switch (NumLanes) {
8155 case 4:
8157 break;
8158 case 8:
8160 break;
8161 case 16:
8163 break;
8164 default:
8165 llvm_unreachable("Got bad number of lanes for gather pattern.");
8166 }
8167
8168 return true;
8169}
8170
8171/// Search for patterns of LD instructions we can optimize.
8173 SmallVectorImpl<unsigned> &Patterns) {
8174
8175 // The pattern searches for loads into single lanes.
8176 switch (Root.getOpcode()) {
8177 case AArch64::LD1i32:
8178 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
8179 case AArch64::LD1i16:
8180 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
8181 case AArch64::LD1i8:
8182 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
8183 default:
8184 return false;
8185 }
8186}
8187
8188/// Generate optimized instruction sequence for gather load patterns to improve
8189/// Memory-Level Parallelism (MLP). This function transforms a chain of
8190/// sequential NEON lane loads into parallel vector loads that can execute
8191/// concurrently.
8192static void
8196 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8197 unsigned Pattern, unsigned NumLanes) {
8198 MachineFunction &MF = *Root.getParent()->getParent();
8199 MachineRegisterInfo &MRI = MF.getRegInfo();
8201
8202 // Gather the initial load instructions to build the pattern.
8203 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8204 MachineInstr *CurrInstr = &Root;
8205 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8206 LoadToLaneInstrs.push_back(CurrInstr);
8207 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8208 }
8209
8210 // Sort the load instructions according to the lane.
8211 llvm::sort(LoadToLaneInstrs,
8212 [](const MachineInstr *A, const MachineInstr *B) {
8213 return A->getOperand(2).getImm() > B->getOperand(2).getImm();
8214 });
8215
8216 MachineInstr *SubregToReg = CurrInstr;
8217 LoadToLaneInstrs.push_back(
8218 MRI.getUniqueVRegDef(SubregToReg->getOperand(1).getReg()));
8219 auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
8220
8221 const TargetRegisterClass *FPR128RegClass =
8222 MRI.getRegClass(Root.getOperand(0).getReg());
8223
8224 // Helper lambda to create a LD1 instruction.
8225 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8226 Register SrcRegister, unsigned Lane,
8227 Register OffsetRegister,
8228 bool OffsetRegisterKillState) {
8229 auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8230 MachineInstrBuilder LoadIndexIntoRegister =
8231 BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8232 NewRegister)
8233 .addReg(SrcRegister)
8234 .addImm(Lane)
8235 .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState));
8236 InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8237 InsInstrs.push_back(LoadIndexIntoRegister);
8238 return NewRegister;
8239 };
8240
8241 // Helper to create load instruction based on the NumLanes in the NEON
8242 // register we are rewriting.
8243 auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
8244 Register OffsetReg,
8245 bool KillState) -> MachineInstrBuilder {
8246 unsigned Opcode;
8247 switch (NumLanes) {
8248 case 4:
8249 Opcode = AArch64::LDRSui;
8250 break;
8251 case 8:
8252 Opcode = AArch64::LDRHui;
8253 break;
8254 case 16:
8255 Opcode = AArch64::LDRBui;
8256 break;
8257 default:
8259 "Got unsupported number of lanes in machine-combiner gather pattern");
8260 }
8261 // Immediate offset load
8262 return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
8263 .addReg(OffsetReg)
8264 .addImm(0);
8265 };
8266
8267 // Load the remaining lanes into register 0.
8268 auto LanesToLoadToReg0 =
8269 llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
8270 LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8271 Register PrevReg = SubregToReg->getOperand(0).getReg();
8272 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
8273 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8274 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8275 OffsetRegOperand.getReg(),
8276 OffsetRegOperand.isKill());
8277 DelInstrs.push_back(LoadInstr);
8278 }
8279 Register LastLoadReg0 = PrevReg;
8280
8281 // First load into register 1. Perform an integer load to zero out the upper
8282 // lanes in a single instruction.
8283 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8284 MachineInstr *OriginalSplitLoad =
8285 *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
8286 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8287 MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8288
8289 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8290 OriginalSplitLoad->getOperand(3);
8291 MachineInstrBuilder MiddleIndexLoadInstr =
8292 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8293 OriginalSplitToLoadOffsetOperand.getReg(),
8294 OriginalSplitToLoadOffsetOperand.isKill());
8295
8296 InstrIdxForVirtReg.insert(
8297 std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
8298 InsInstrs.push_back(MiddleIndexLoadInstr);
8299 DelInstrs.push_back(OriginalSplitLoad);
8300
8301 // Subreg To Reg instruction for register 1.
8302 Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8303 unsigned SubregType;
8304 switch (NumLanes) {
8305 case 4:
8306 SubregType = AArch64::ssub;
8307 break;
8308 case 8:
8309 SubregType = AArch64::hsub;
8310 break;
8311 case 16:
8312 SubregType = AArch64::bsub;
8313 break;
8314 default:
8316 "Got invalid NumLanes for machine-combiner gather pattern");
8317 }
8318
8319 auto SubRegToRegInstr =
8320 BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
8321 DestRegForSubregToReg)
8322 .addReg(DestRegForMiddleIndex, getKillRegState(true))
8323 .addImm(SubregType);
8324 InstrIdxForVirtReg.insert(
8325 std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8326 InsInstrs.push_back(SubRegToRegInstr);
8327
8328 // Load remaining lanes into register 1.
8329 auto LanesToLoadToReg1 =
8330 llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8331 LoadToLaneInstrsAscending.end());
8332 PrevReg = SubRegToRegInstr->getOperand(0).getReg();
8333 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
8334 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8335 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8336 OffsetRegOperand.getReg(),
8337 OffsetRegOperand.isKill());
8338
8339 // Do not add the last reg to DelInstrs - it will be removed later.
8340 if (Index == NumLanes / 2 - 2) {
8341 break;
8342 }
8343 DelInstrs.push_back(LoadInstr);
8344 }
8345 Register LastLoadReg1 = PrevReg;
8346
8347 // Create the final zip instruction to combine the results.
8348 MachineInstrBuilder ZipInstr =
8349 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8350 Root.getOperand(0).getReg())
8351 .addReg(LastLoadReg0)
8352 .addReg(LastLoadReg1);
8353 InsInstrs.push_back(ZipInstr);
8354}
8355
8369
8370/// Return true when there is potentially a faster code sequence for an
8371/// instruction chain ending in \p Root. All potential patterns are listed in
8372/// the \p Pattern vector. Pattern should be sorted in priority order since the
8373/// pattern evaluator stops checking as soon as it finds a faster sequence.
8374
8375bool AArch64InstrInfo::getMachineCombinerPatterns(
8376 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8377 bool DoRegPressureReduce) const {
8378 // Integer patterns
8379 if (getMaddPatterns(Root, Patterns))
8380 return true;
8381 // Floating point patterns
8382 if (getFMULPatterns(Root, Patterns))
8383 return true;
8384 if (getFMAPatterns(Root, Patterns))
8385 return true;
8386 if (getFNEGPatterns(Root, Patterns))
8387 return true;
8388
8389 // Other patterns
8390 if (getMiscPatterns(Root, Patterns))
8391 return true;
8392
8393 // Load patterns
8394 if (getLoadPatterns(Root, Patterns))
8395 return true;
8396
8397 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8398 DoRegPressureReduce);
8399}
8400
8402/// genFusedMultiply - Generate fused multiply instructions.
8403/// This function supports both integer and floating point instructions.
8404/// A typical example:
8405/// F|MUL I=A,B,0
8406/// F|ADD R,I,C
8407/// ==> F|MADD R,A,B,C
8408/// \param MF Containing MachineFunction
8409/// \param MRI Register information
8410/// \param TII Target information
8411/// \param Root is the F|ADD instruction
8412/// \param [out] InsInstrs is a vector of machine instructions and will
8413/// contain the generated madd instruction
8414/// \param IdxMulOpd is index of operand in Root that is the result of
8415/// the F|MUL. In the example above IdxMulOpd is 1.
8416/// \param MaddOpc the opcode fo the f|madd instruction
8417/// \param RC Register class of operands
8418/// \param kind of fma instruction (addressing mode) to be generated
8419/// \param ReplacedAddend is the result register from the instruction
8420/// replacing the non-combined operand, if any.
8421static MachineInstr *
8423 const TargetInstrInfo *TII, MachineInstr &Root,
8424 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8425 unsigned MaddOpc, const TargetRegisterClass *RC,
8427 const Register *ReplacedAddend = nullptr) {
8428 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8429
8430 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8431 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8432 Register ResultReg = Root.getOperand(0).getReg();
8433 Register SrcReg0 = MUL->getOperand(1).getReg();
8434 bool Src0IsKill = MUL->getOperand(1).isKill();
8435 Register SrcReg1 = MUL->getOperand(2).getReg();
8436 bool Src1IsKill = MUL->getOperand(2).isKill();
8437
8438 Register SrcReg2;
8439 bool Src2IsKill;
8440 if (ReplacedAddend) {
8441 // If we just generated a new addend, we must be it's only use.
8442 SrcReg2 = *ReplacedAddend;
8443 Src2IsKill = true;
8444 } else {
8445 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
8446 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
8447 }
8448
8449 if (ResultReg.isVirtual())
8450 MRI.constrainRegClass(ResultReg, RC);
8451 if (SrcReg0.isVirtual())
8452 MRI.constrainRegClass(SrcReg0, RC);
8453 if (SrcReg1.isVirtual())
8454 MRI.constrainRegClass(SrcReg1, RC);
8455 if (SrcReg2.isVirtual())
8456 MRI.constrainRegClass(SrcReg2, RC);
8457
8459 if (kind == FMAInstKind::Default)
8460 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8461 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8462 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8463 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8464 else if (kind == FMAInstKind::Indexed)
8465 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8466 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8467 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8468 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8469 .addImm(MUL->getOperand(3).getImm());
8470 else if (kind == FMAInstKind::Accumulator)
8471 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8472 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8473 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8474 .addReg(SrcReg1, getKillRegState(Src1IsKill));
8475 else
8476 assert(false && "Invalid FMA instruction kind \n");
8477 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
8478 InsInstrs.push_back(MIB);
8479 return MUL;
8480}
8481
8482static MachineInstr *
8484 const TargetInstrInfo *TII, MachineInstr &Root,
8486 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8487
8488 unsigned Opc = 0;
8489 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
8490 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8491 Opc = AArch64::FNMADDSrrr;
8492 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8493 Opc = AArch64::FNMADDDrrr;
8494 else
8495 return nullptr;
8496
8497 Register ResultReg = Root.getOperand(0).getReg();
8498 Register SrcReg0 = MAD->getOperand(1).getReg();
8499 Register SrcReg1 = MAD->getOperand(2).getReg();
8500 Register SrcReg2 = MAD->getOperand(3).getReg();
8501 bool Src0IsKill = MAD->getOperand(1).isKill();
8502 bool Src1IsKill = MAD->getOperand(2).isKill();
8503 bool Src2IsKill = MAD->getOperand(3).isKill();
8504 if (ResultReg.isVirtual())
8505 MRI.constrainRegClass(ResultReg, RC);
8506 if (SrcReg0.isVirtual())
8507 MRI.constrainRegClass(SrcReg0, RC);
8508 if (SrcReg1.isVirtual())
8509 MRI.constrainRegClass(SrcReg1, RC);
8510 if (SrcReg2.isVirtual())
8511 MRI.constrainRegClass(SrcReg2, RC);
8512
8514 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
8515 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8516 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8517 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8518 InsInstrs.push_back(MIB);
8519
8520 return MAD;
8521}
8522
8523/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
8524static MachineInstr *
8527 unsigned IdxDupOp, unsigned MulOpc,
8528 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8529 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8530 "Invalid index of FMUL operand");
8531
8532 MachineFunction &MF = *Root.getMF();
8534
8535 MachineInstr *Dup =
8536 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
8537
8538 if (Dup->getOpcode() == TargetOpcode::COPY)
8539 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
8540
8541 Register DupSrcReg = Dup->getOperand(1).getReg();
8542 MRI.clearKillFlags(DupSrcReg);
8543 MRI.constrainRegClass(DupSrcReg, RC);
8544
8545 unsigned DupSrcLane = Dup->getOperand(2).getImm();
8546
8547 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8548 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
8549
8550 Register ResultReg = Root.getOperand(0).getReg();
8551
8553 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
8554 .add(MulOp)
8555 .addReg(DupSrcReg)
8556 .addImm(DupSrcLane);
8557
8558 InsInstrs.push_back(MIB);
8559 return &Root;
8560}
8561
8562/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8563/// instructions.
8564///
8565/// \see genFusedMultiply
8569 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8570 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8572}
8573
8574/// genNeg - Helper to generate an intermediate negation of the second operand
8575/// of Root
8577 const TargetInstrInfo *TII, MachineInstr &Root,
8579 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8580 unsigned MnegOpc, const TargetRegisterClass *RC) {
8581 Register NewVR = MRI.createVirtualRegister(RC);
8583 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
8584 .add(Root.getOperand(2));
8585 InsInstrs.push_back(MIB);
8586
8587 assert(InstrIdxForVirtReg.empty());
8588 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8589
8590 return NewVR;
8591}
8592
8593/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8594/// instructions with an additional negation of the accumulator
8598 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8599 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8600 assert(IdxMulOpd == 1);
8601
8602 Register NewVR =
8603 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8604 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8605 FMAInstKind::Accumulator, &NewVR);
8606}
8607
8608/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8609/// instructions.
8610///
8611/// \see genFusedMultiply
8615 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8616 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8618}
8619
8620/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8621/// instructions with an additional negation of the accumulator
8625 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8626 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8627 assert(IdxMulOpd == 1);
8628
8629 Register NewVR =
8630 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8631
8632 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8633 FMAInstKind::Indexed, &NewVR);
8634}
8635
8636/// genMaddR - Generate madd instruction and combine mul and add using
8637/// an extra virtual register
8638/// Example - an ADD intermediate needs to be stored in a register:
8639/// MUL I=A,B,0
8640/// ADD R,I,Imm
8641/// ==> ORR V, ZR, Imm
8642/// ==> MADD R,A,B,V
8643/// \param MF Containing MachineFunction
8644/// \param MRI Register information
8645/// \param TII Target information
8646/// \param Root is the ADD instruction
8647/// \param [out] InsInstrs is a vector of machine instructions and will
8648/// contain the generated madd instruction
8649/// \param IdxMulOpd is index of operand in Root that is the result of
8650/// the MUL. In the example above IdxMulOpd is 1.
8651/// \param MaddOpc the opcode fo the madd instruction
8652/// \param VR is a virtual register that holds the value of an ADD operand
8653/// (V in the example above).
8654/// \param RC Register class of operands
8656 const TargetInstrInfo *TII, MachineInstr &Root,
8658 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8659 const TargetRegisterClass *RC) {
8660 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8661
8662 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8663 Register ResultReg = Root.getOperand(0).getReg();
8664 Register SrcReg0 = MUL->getOperand(1).getReg();
8665 bool Src0IsKill = MUL->getOperand(1).isKill();
8666 Register SrcReg1 = MUL->getOperand(2).getReg();
8667 bool Src1IsKill = MUL->getOperand(2).isKill();
8668
8669 if (ResultReg.isVirtual())
8670 MRI.constrainRegClass(ResultReg, RC);
8671 if (SrcReg0.isVirtual())
8672 MRI.constrainRegClass(SrcReg0, RC);
8673 if (SrcReg1.isVirtual())
8674 MRI.constrainRegClass(SrcReg1, RC);
8676 MRI.constrainRegClass(VR, RC);
8677
8679 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8680 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8681 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8682 .addReg(VR);
8683 // Insert the MADD
8684 InsInstrs.push_back(MIB);
8685 return MUL;
8686}
8687
8688/// Do the following transformation
8689/// A - (B + C) ==> (A - B) - C
8690/// A - (B + C) ==> (A - C) - B
8692 const TargetInstrInfo *TII, MachineInstr &Root,
8695 unsigned IdxOpd1,
8696 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8697 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8698 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8699 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
8700
8701 Register ResultReg = Root.getOperand(0).getReg();
8702 Register RegA = Root.getOperand(1).getReg();
8703 bool RegAIsKill = Root.getOperand(1).isKill();
8704 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
8705 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
8706 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
8707 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
8708 Register NewVR =
8710
8711 unsigned Opcode = Root.getOpcode();
8712 if (Opcode == AArch64::SUBSWrr)
8713 Opcode = AArch64::SUBWrr;
8714 else if (Opcode == AArch64::SUBSXrr)
8715 Opcode = AArch64::SUBXrr;
8716 else
8717 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8718 "Unexpected instruction opcode.");
8719
8720 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
8721 Flags &= ~MachineInstr::NoSWrap;
8722 Flags &= ~MachineInstr::NoUWrap;
8723
8724 MachineInstrBuilder MIB1 =
8725 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
8726 .addReg(RegA, getKillRegState(RegAIsKill))
8727 .addReg(RegB, getKillRegState(RegBIsKill))
8728 .setMIFlags(Flags);
8729 MachineInstrBuilder MIB2 =
8730 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
8731 .addReg(NewVR, getKillRegState(true))
8732 .addReg(RegC, getKillRegState(RegCIsKill))
8733 .setMIFlags(Flags);
8734
8735 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8736 InsInstrs.push_back(MIB1);
8737 InsInstrs.push_back(MIB2);
8738 DelInstrs.push_back(AddMI);
8739 DelInstrs.push_back(&Root);
8740}
8741
8742unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8743 unsigned int AccumulatorOpCode) const {
8744 switch (AccumulatorOpCode) {
8745 case AArch64::UABALB_ZZZ_D:
8746 case AArch64::SABALB_ZZZ_D:
8747 case AArch64::UABALT_ZZZ_D:
8748 case AArch64::SABALT_ZZZ_D:
8749 return AArch64::ADD_ZZZ_D;
8750 case AArch64::UABALB_ZZZ_H:
8751 case AArch64::SABALB_ZZZ_H:
8752 case AArch64::UABALT_ZZZ_H:
8753 case AArch64::SABALT_ZZZ_H:
8754 return AArch64::ADD_ZZZ_H;
8755 case AArch64::UABALB_ZZZ_S:
8756 case AArch64::SABALB_ZZZ_S:
8757 case AArch64::UABALT_ZZZ_S:
8758 case AArch64::SABALT_ZZZ_S:
8759 return AArch64::ADD_ZZZ_S;
8760 case AArch64::UABALv16i8_v8i16:
8761 case AArch64::SABALv8i8_v8i16:
8762 case AArch64::SABAv8i16:
8763 case AArch64::UABAv8i16:
8764 return AArch64::ADDv8i16;
8765 case AArch64::SABALv2i32_v2i64:
8766 case AArch64::UABALv2i32_v2i64:
8767 case AArch64::SABALv4i32_v2i64:
8768 return AArch64::ADDv2i64;
8769 case AArch64::UABALv4i16_v4i32:
8770 case AArch64::SABALv4i16_v4i32:
8771 case AArch64::SABALv8i16_v4i32:
8772 case AArch64::SABAv4i32:
8773 case AArch64::UABAv4i32:
8774 return AArch64::ADDv4i32;
8775 case AArch64::UABALv4i32_v2i64:
8776 return AArch64::ADDv2i64;
8777 case AArch64::UABALv8i16_v4i32:
8778 return AArch64::ADDv4i32;
8779 case AArch64::UABALv8i8_v8i16:
8780 case AArch64::SABALv16i8_v8i16:
8781 return AArch64::ADDv8i16;
8782 case AArch64::UABAv16i8:
8783 case AArch64::SABAv16i8:
8784 return AArch64::ADDv16i8;
8785 case AArch64::UABAv4i16:
8786 case AArch64::SABAv4i16:
8787 return AArch64::ADDv4i16;
8788 case AArch64::UABAv2i32:
8789 case AArch64::SABAv2i32:
8790 return AArch64::ADDv2i32;
8791 case AArch64::UABAv8i8:
8792 case AArch64::SABAv8i8:
8793 return AArch64::ADDv8i8;
8794 default:
8795 llvm_unreachable("Unknown accumulator opcode");
8796 }
8797}
8798
8799/// When getMachineCombinerPatterns() finds potential patterns,
8800/// this function generates the instructions that could replace the
8801/// original code sequence
8802void AArch64InstrInfo::genAlternativeCodeSequence(
8803 MachineInstr &Root, unsigned Pattern,
8806 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8807 MachineBasicBlock &MBB = *Root.getParent();
8808 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8809 MachineFunction &MF = *MBB.getParent();
8810 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8811
8812 MachineInstr *MUL = nullptr;
8813 const TargetRegisterClass *RC;
8814 unsigned Opc;
8815 switch (Pattern) {
8816 default:
8817 // Reassociate instructions.
8818 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8819 DelInstrs, InstrIdxForVirtReg);
8820 return;
8822 // A - (B + C)
8823 // ==> (A - B) - C
8824 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
8825 InstrIdxForVirtReg);
8826 return;
8828 // A - (B + C)
8829 // ==> (A - C) - B
8830 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
8831 InstrIdxForVirtReg);
8832 return;
8835 // MUL I=A,B,0
8836 // ADD R,I,C
8837 // ==> MADD R,A,B,C
8838 // --- Create(MADD);
8840 Opc = AArch64::MADDWrrr;
8841 RC = &AArch64::GPR32RegClass;
8842 } else {
8843 Opc = AArch64::MADDXrrr;
8844 RC = &AArch64::GPR64RegClass;
8845 }
8846 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8847 break;
8850 // MUL I=A,B,0
8851 // ADD R,C,I
8852 // ==> MADD R,A,B,C
8853 // --- Create(MADD);
8855 Opc = AArch64::MADDWrrr;
8856 RC = &AArch64::GPR32RegClass;
8857 } else {
8858 Opc = AArch64::MADDXrrr;
8859 RC = &AArch64::GPR64RegClass;
8860 }
8861 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8862 break;
8867 // MUL I=A,B,0
8868 // ADD/SUB R,I,Imm
8869 // ==> MOV V, Imm/-Imm
8870 // ==> MADD R,A,B,V
8871 // --- Create(MADD);
8872 const TargetRegisterClass *RC;
8873 unsigned BitSize, MovImm;
8876 MovImm = AArch64::MOVi32imm;
8877 RC = &AArch64::GPR32spRegClass;
8878 BitSize = 32;
8879 Opc = AArch64::MADDWrrr;
8880 RC = &AArch64::GPR32RegClass;
8881 } else {
8882 MovImm = AArch64::MOVi64imm;
8883 RC = &AArch64::GPR64spRegClass;
8884 BitSize = 64;
8885 Opc = AArch64::MADDXrrr;
8886 RC = &AArch64::GPR64RegClass;
8887 }
8888 Register NewVR = MRI.createVirtualRegister(RC);
8889 uint64_t Imm = Root.getOperand(2).getImm();
8890
8891 if (Root.getOperand(3).isImm()) {
8892 unsigned Val = Root.getOperand(3).getImm();
8893 Imm = Imm << Val;
8894 }
8895 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8897 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
8898 // Check that the immediate can be composed via a single instruction.
8900 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
8901 if (Insn.size() != 1)
8902 return;
8903 MachineInstrBuilder MIB1 =
8904 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
8905 .addImm(IsSub ? -Imm : Imm);
8906 InsInstrs.push_back(MIB1);
8907 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8908 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8909 break;
8910 }
8913 // MUL I=A,B,0
8914 // SUB R,I, C
8915 // ==> SUB V, 0, C
8916 // ==> MADD R,A,B,V // = -C + A*B
8917 // --- Create(MADD);
8918 const TargetRegisterClass *SubRC;
8919 unsigned SubOpc, ZeroReg;
8921 SubOpc = AArch64::SUBWrr;
8922 SubRC = &AArch64::GPR32spRegClass;
8923 ZeroReg = AArch64::WZR;
8924 Opc = AArch64::MADDWrrr;
8925 RC = &AArch64::GPR32RegClass;
8926 } else {
8927 SubOpc = AArch64::SUBXrr;
8928 SubRC = &AArch64::GPR64spRegClass;
8929 ZeroReg = AArch64::XZR;
8930 Opc = AArch64::MADDXrrr;
8931 RC = &AArch64::GPR64RegClass;
8932 }
8933 Register NewVR = MRI.createVirtualRegister(SubRC);
8934 // SUB NewVR, 0, C
8935 MachineInstrBuilder MIB1 =
8936 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
8937 .addReg(ZeroReg)
8938 .add(Root.getOperand(2));
8939 InsInstrs.push_back(MIB1);
8940 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8941 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8942 break;
8943 }
8946 // MUL I=A,B,0
8947 // SUB R,C,I
8948 // ==> MSUB R,A,B,C (computes C - A*B)
8949 // --- Create(MSUB);
8951 Opc = AArch64::MSUBWrrr;
8952 RC = &AArch64::GPR32RegClass;
8953 } else {
8954 Opc = AArch64::MSUBXrrr;
8955 RC = &AArch64::GPR64RegClass;
8956 }
8957 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8958 break;
8960 Opc = AArch64::MLAv8i8;
8961 RC = &AArch64::FPR64RegClass;
8962 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8963 break;
8965 Opc = AArch64::MLAv8i8;
8966 RC = &AArch64::FPR64RegClass;
8967 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8968 break;
8970 Opc = AArch64::MLAv16i8;
8971 RC = &AArch64::FPR128RegClass;
8972 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8973 break;
8975 Opc = AArch64::MLAv16i8;
8976 RC = &AArch64::FPR128RegClass;
8977 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8978 break;
8980 Opc = AArch64::MLAv4i16;
8981 RC = &AArch64::FPR64RegClass;
8982 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8983 break;
8985 Opc = AArch64::MLAv4i16;
8986 RC = &AArch64::FPR64RegClass;
8987 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8988 break;
8990 Opc = AArch64::MLAv8i16;
8991 RC = &AArch64::FPR128RegClass;
8992 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8993 break;
8995 Opc = AArch64::MLAv8i16;
8996 RC = &AArch64::FPR128RegClass;
8997 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8998 break;
9000 Opc = AArch64::MLAv2i32;
9001 RC = &AArch64::FPR64RegClass;
9002 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9003 break;
9005 Opc = AArch64::MLAv2i32;
9006 RC = &AArch64::FPR64RegClass;
9007 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9008 break;
9010 Opc = AArch64::MLAv4i32;
9011 RC = &AArch64::FPR128RegClass;
9012 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9013 break;
9015 Opc = AArch64::MLAv4i32;
9016 RC = &AArch64::FPR128RegClass;
9017 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9018 break;
9019
9021 Opc = AArch64::MLAv8i8;
9022 RC = &AArch64::FPR64RegClass;
9023 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9024 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
9025 RC);
9026 break;
9028 Opc = AArch64::MLSv8i8;
9029 RC = &AArch64::FPR64RegClass;
9030 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9031 break;
9033 Opc = AArch64::MLAv16i8;
9034 RC = &AArch64::FPR128RegClass;
9035 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9036 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
9037 RC);
9038 break;
9040 Opc = AArch64::MLSv16i8;
9041 RC = &AArch64::FPR128RegClass;
9042 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9043 break;
9045 Opc = AArch64::MLAv4i16;
9046 RC = &AArch64::FPR64RegClass;
9047 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9048 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9049 RC);
9050 break;
9052 Opc = AArch64::MLSv4i16;
9053 RC = &AArch64::FPR64RegClass;
9054 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9055 break;
9057 Opc = AArch64::MLAv8i16;
9058 RC = &AArch64::FPR128RegClass;
9059 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9060 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9061 RC);
9062 break;
9064 Opc = AArch64::MLSv8i16;
9065 RC = &AArch64::FPR128RegClass;
9066 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9067 break;
9069 Opc = AArch64::MLAv2i32;
9070 RC = &AArch64::FPR64RegClass;
9071 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9072 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9073 RC);
9074 break;
9076 Opc = AArch64::MLSv2i32;
9077 RC = &AArch64::FPR64RegClass;
9078 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9079 break;
9081 Opc = AArch64::MLAv4i32;
9082 RC = &AArch64::FPR128RegClass;
9083 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9084 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9085 RC);
9086 break;
9088 Opc = AArch64::MLSv4i32;
9089 RC = &AArch64::FPR128RegClass;
9090 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9091 break;
9092
9094 Opc = AArch64::MLAv4i16_indexed;
9095 RC = &AArch64::FPR64RegClass;
9096 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9097 break;
9099 Opc = AArch64::MLAv4i16_indexed;
9100 RC = &AArch64::FPR64RegClass;
9101 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9102 break;
9104 Opc = AArch64::MLAv8i16_indexed;
9105 RC = &AArch64::FPR128RegClass;
9106 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9107 break;
9109 Opc = AArch64::MLAv8i16_indexed;
9110 RC = &AArch64::FPR128RegClass;
9111 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9112 break;
9114 Opc = AArch64::MLAv2i32_indexed;
9115 RC = &AArch64::FPR64RegClass;
9116 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9117 break;
9119 Opc = AArch64::MLAv2i32_indexed;
9120 RC = &AArch64::FPR64RegClass;
9121 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9122 break;
9124 Opc = AArch64::MLAv4i32_indexed;
9125 RC = &AArch64::FPR128RegClass;
9126 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9127 break;
9129 Opc = AArch64::MLAv4i32_indexed;
9130 RC = &AArch64::FPR128RegClass;
9131 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9132 break;
9133
9135 Opc = AArch64::MLAv4i16_indexed;
9136 RC = &AArch64::FPR64RegClass;
9137 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9138 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9139 RC);
9140 break;
9142 Opc = AArch64::MLSv4i16_indexed;
9143 RC = &AArch64::FPR64RegClass;
9144 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9145 break;
9147 Opc = AArch64::MLAv8i16_indexed;
9148 RC = &AArch64::FPR128RegClass;
9149 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9150 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9151 RC);
9152 break;
9154 Opc = AArch64::MLSv8i16_indexed;
9155 RC = &AArch64::FPR128RegClass;
9156 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9157 break;
9159 Opc = AArch64::MLAv2i32_indexed;
9160 RC = &AArch64::FPR64RegClass;
9161 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9162 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9163 RC);
9164 break;
9166 Opc = AArch64::MLSv2i32_indexed;
9167 RC = &AArch64::FPR64RegClass;
9168 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9169 break;
9171 Opc = AArch64::MLAv4i32_indexed;
9172 RC = &AArch64::FPR128RegClass;
9173 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9174 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9175 RC);
9176 break;
9178 Opc = AArch64::MLSv4i32_indexed;
9179 RC = &AArch64::FPR128RegClass;
9180 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9181 break;
9182
9183 // Floating Point Support
9185 Opc = AArch64::FMADDHrrr;
9186 RC = &AArch64::FPR16RegClass;
9187 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9188 break;
9190 Opc = AArch64::FMADDSrrr;
9191 RC = &AArch64::FPR32RegClass;
9192 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9193 break;
9195 Opc = AArch64::FMADDDrrr;
9196 RC = &AArch64::FPR64RegClass;
9197 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9198 break;
9199
9201 Opc = AArch64::FMADDHrrr;
9202 RC = &AArch64::FPR16RegClass;
9203 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9204 break;
9206 Opc = AArch64::FMADDSrrr;
9207 RC = &AArch64::FPR32RegClass;
9208 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9209 break;
9211 Opc = AArch64::FMADDDrrr;
9212 RC = &AArch64::FPR64RegClass;
9213 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9214 break;
9215
9217 Opc = AArch64::FMLAv1i32_indexed;
9218 RC = &AArch64::FPR32RegClass;
9219 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9221 break;
9223 Opc = AArch64::FMLAv1i32_indexed;
9224 RC = &AArch64::FPR32RegClass;
9225 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9227 break;
9228
9230 Opc = AArch64::FMLAv1i64_indexed;
9231 RC = &AArch64::FPR64RegClass;
9232 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9234 break;
9236 Opc = AArch64::FMLAv1i64_indexed;
9237 RC = &AArch64::FPR64RegClass;
9238 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9240 break;
9241
9243 RC = &AArch64::FPR64RegClass;
9244 Opc = AArch64::FMLAv4i16_indexed;
9245 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9247 break;
9249 RC = &AArch64::FPR64RegClass;
9250 Opc = AArch64::FMLAv4f16;
9251 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9253 break;
9255 RC = &AArch64::FPR64RegClass;
9256 Opc = AArch64::FMLAv4i16_indexed;
9257 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9259 break;
9261 RC = &AArch64::FPR64RegClass;
9262 Opc = AArch64::FMLAv4f16;
9263 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9265 break;
9266
9269 RC = &AArch64::FPR64RegClass;
9271 Opc = AArch64::FMLAv2i32_indexed;
9272 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9274 } else {
9275 Opc = AArch64::FMLAv2f32;
9276 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9278 }
9279 break;
9282 RC = &AArch64::FPR64RegClass;
9284 Opc = AArch64::FMLAv2i32_indexed;
9285 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9287 } else {
9288 Opc = AArch64::FMLAv2f32;
9289 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9291 }
9292 break;
9293
9295 RC = &AArch64::FPR128RegClass;
9296 Opc = AArch64::FMLAv8i16_indexed;
9297 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9299 break;
9301 RC = &AArch64::FPR128RegClass;
9302 Opc = AArch64::FMLAv8f16;
9303 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9305 break;
9307 RC = &AArch64::FPR128RegClass;
9308 Opc = AArch64::FMLAv8i16_indexed;
9309 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9311 break;
9313 RC = &AArch64::FPR128RegClass;
9314 Opc = AArch64::FMLAv8f16;
9315 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9317 break;
9318
9321 RC = &AArch64::FPR128RegClass;
9323 Opc = AArch64::FMLAv2i64_indexed;
9324 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9326 } else {
9327 Opc = AArch64::FMLAv2f64;
9328 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9330 }
9331 break;
9334 RC = &AArch64::FPR128RegClass;
9336 Opc = AArch64::FMLAv2i64_indexed;
9337 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9339 } else {
9340 Opc = AArch64::FMLAv2f64;
9341 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9343 }
9344 break;
9345
9348 RC = &AArch64::FPR128RegClass;
9350 Opc = AArch64::FMLAv4i32_indexed;
9351 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9353 } else {
9354 Opc = AArch64::FMLAv4f32;
9355 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9357 }
9358 break;
9359
9362 RC = &AArch64::FPR128RegClass;
9364 Opc = AArch64::FMLAv4i32_indexed;
9365 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9367 } else {
9368 Opc = AArch64::FMLAv4f32;
9369 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9371 }
9372 break;
9373
9375 Opc = AArch64::FNMSUBHrrr;
9376 RC = &AArch64::FPR16RegClass;
9377 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9378 break;
9380 Opc = AArch64::FNMSUBSrrr;
9381 RC = &AArch64::FPR32RegClass;
9382 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9383 break;
9385 Opc = AArch64::FNMSUBDrrr;
9386 RC = &AArch64::FPR64RegClass;
9387 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9388 break;
9389
9391 Opc = AArch64::FNMADDHrrr;
9392 RC = &AArch64::FPR16RegClass;
9393 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9394 break;
9396 Opc = AArch64::FNMADDSrrr;
9397 RC = &AArch64::FPR32RegClass;
9398 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9399 break;
9401 Opc = AArch64::FNMADDDrrr;
9402 RC = &AArch64::FPR64RegClass;
9403 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9404 break;
9405
9407 Opc = AArch64::FMSUBHrrr;
9408 RC = &AArch64::FPR16RegClass;
9409 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9410 break;
9412 Opc = AArch64::FMSUBSrrr;
9413 RC = &AArch64::FPR32RegClass;
9414 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9415 break;
9417 Opc = AArch64::FMSUBDrrr;
9418 RC = &AArch64::FPR64RegClass;
9419 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9420 break;
9421
9423 Opc = AArch64::FMLSv1i32_indexed;
9424 RC = &AArch64::FPR32RegClass;
9425 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9427 break;
9428
9430 Opc = AArch64::FMLSv1i64_indexed;
9431 RC = &AArch64::FPR64RegClass;
9432 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9434 break;
9435
9438 RC = &AArch64::FPR64RegClass;
9439 Register NewVR = MRI.createVirtualRegister(RC);
9440 MachineInstrBuilder MIB1 =
9441 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9442 .add(Root.getOperand(2));
9443 InsInstrs.push_back(MIB1);
9444 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9446 Opc = AArch64::FMLAv4f16;
9447 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9448 FMAInstKind::Accumulator, &NewVR);
9449 } else {
9450 Opc = AArch64::FMLAv4i16_indexed;
9451 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9452 FMAInstKind::Indexed, &NewVR);
9453 }
9454 break;
9455 }
9457 RC = &AArch64::FPR64RegClass;
9458 Opc = AArch64::FMLSv4f16;
9459 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9461 break;
9463 RC = &AArch64::FPR64RegClass;
9464 Opc = AArch64::FMLSv4i16_indexed;
9465 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9467 break;
9468
9471 RC = &AArch64::FPR64RegClass;
9473 Opc = AArch64::FMLSv2i32_indexed;
9474 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9476 } else {
9477 Opc = AArch64::FMLSv2f32;
9478 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9480 }
9481 break;
9482
9485 RC = &AArch64::FPR128RegClass;
9486 Register NewVR = MRI.createVirtualRegister(RC);
9487 MachineInstrBuilder MIB1 =
9488 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9489 .add(Root.getOperand(2));
9490 InsInstrs.push_back(MIB1);
9491 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9493 Opc = AArch64::FMLAv8f16;
9494 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9495 FMAInstKind::Accumulator, &NewVR);
9496 } else {
9497 Opc = AArch64::FMLAv8i16_indexed;
9498 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9499 FMAInstKind::Indexed, &NewVR);
9500 }
9501 break;
9502 }
9504 RC = &AArch64::FPR128RegClass;
9505 Opc = AArch64::FMLSv8f16;
9506 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9508 break;
9510 RC = &AArch64::FPR128RegClass;
9511 Opc = AArch64::FMLSv8i16_indexed;
9512 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9514 break;
9515
9518 RC = &AArch64::FPR128RegClass;
9520 Opc = AArch64::FMLSv2i64_indexed;
9521 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9523 } else {
9524 Opc = AArch64::FMLSv2f64;
9525 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9527 }
9528 break;
9529
9532 RC = &AArch64::FPR128RegClass;
9534 Opc = AArch64::FMLSv4i32_indexed;
9535 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9537 } else {
9538 Opc = AArch64::FMLSv4f32;
9539 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9541 }
9542 break;
9545 RC = &AArch64::FPR64RegClass;
9546 Register NewVR = MRI.createVirtualRegister(RC);
9547 MachineInstrBuilder MIB1 =
9548 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9549 .add(Root.getOperand(2));
9550 InsInstrs.push_back(MIB1);
9551 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9553 Opc = AArch64::FMLAv2i32_indexed;
9554 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9555 FMAInstKind::Indexed, &NewVR);
9556 } else {
9557 Opc = AArch64::FMLAv2f32;
9558 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9559 FMAInstKind::Accumulator, &NewVR);
9560 }
9561 break;
9562 }
9565 RC = &AArch64::FPR128RegClass;
9566 Register NewVR = MRI.createVirtualRegister(RC);
9567 MachineInstrBuilder MIB1 =
9568 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9569 .add(Root.getOperand(2));
9570 InsInstrs.push_back(MIB1);
9571 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9573 Opc = AArch64::FMLAv4i32_indexed;
9574 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9575 FMAInstKind::Indexed, &NewVR);
9576 } else {
9577 Opc = AArch64::FMLAv4f32;
9578 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9579 FMAInstKind::Accumulator, &NewVR);
9580 }
9581 break;
9582 }
9585 RC = &AArch64::FPR128RegClass;
9586 Register NewVR = MRI.createVirtualRegister(RC);
9587 MachineInstrBuilder MIB1 =
9588 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9589 .add(Root.getOperand(2));
9590 InsInstrs.push_back(MIB1);
9591 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9593 Opc = AArch64::FMLAv2i64_indexed;
9594 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9595 FMAInstKind::Indexed, &NewVR);
9596 } else {
9597 Opc = AArch64::FMLAv2f64;
9598 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9599 FMAInstKind::Accumulator, &NewVR);
9600 }
9601 break;
9602 }
9605 unsigned IdxDupOp =
9607 : 2;
9608 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9609 &AArch64::FPR128RegClass, MRI);
9610 break;
9611 }
9614 unsigned IdxDupOp =
9616 : 2;
9617 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9618 &AArch64::FPR128RegClass, MRI);
9619 break;
9620 }
9623 unsigned IdxDupOp =
9625 : 2;
9626 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9627 &AArch64::FPR128_loRegClass, MRI);
9628 break;
9629 }
9632 unsigned IdxDupOp =
9634 : 2;
9635 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9636 &AArch64::FPR128RegClass, MRI);
9637 break;
9638 }
9641 unsigned IdxDupOp =
9643 : 2;
9644 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9645 &AArch64::FPR128_loRegClass, MRI);
9646 break;
9647 }
9649 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9650 break;
9651 }
9653 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9654 Pattern, 4);
9655 break;
9656 }
9658 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9659 Pattern, 8);
9660 break;
9661 }
9663 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9664 Pattern, 16);
9665 break;
9666 }
9667
9668 } // end switch (Pattern)
9669 // Record MUL and ADD/SUB for deletion
9670 if (MUL)
9671 DelInstrs.push_back(MUL);
9672 DelInstrs.push_back(&Root);
9673
9674 // Set the flags on the inserted instructions to be the merged flags of the
9675 // instructions that we have combined.
9676 uint32_t Flags = Root.getFlags();
9677 if (MUL)
9678 Flags = Root.mergeFlagsWith(*MUL);
9679 for (auto *MI : InsInstrs)
9680 MI->setFlags(Flags);
9681}
9682
9683/// Replace csincr-branch sequence by simple conditional branch
9684///
9685/// Examples:
9686/// 1. \code
9687/// csinc w9, wzr, wzr, <condition code>
9688/// tbnz w9, #0, 0x44
9689/// \endcode
9690/// to
9691/// \code
9692/// b.<inverted condition code>
9693/// \endcode
9694///
9695/// 2. \code
9696/// csinc w9, wzr, wzr, <condition code>
9697/// tbz w9, #0, 0x44
9698/// \endcode
9699/// to
9700/// \code
9701/// b.<condition code>
9702/// \endcode
9703///
9704/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
9705/// compare's constant operand is power of 2.
9706///
9707/// Examples:
9708/// \code
9709/// and w8, w8, #0x400
9710/// cbnz w8, L1
9711/// \endcode
9712/// to
9713/// \code
9714/// tbnz w8, #10, L1
9715/// \endcode
9716///
9717/// \param MI Conditional Branch
9718/// \return True when the simple conditional branch is generated
9719///
// NOTE(review): this file is a doxygen listing scrape — each line keeps its
// original source line number, and several original lines were dropped by
// the scrape (the function signature at 9720, plus 9797, 9850 and 9856
// below). Only comments are added here; the scraped code text is unchanged.
// Classify the branch: which operand holds the target MBB, whether we branch
// on the "non-zero"/"set" sense, and whether it is a test-and-branch form.
9721 bool IsNegativeBranch = false;
9722 bool IsTestAndBranch = false;
9723 unsigned TargetBBInMI = 0;
9724 switch (MI.getOpcode()) {
9725 default:
9726 llvm_unreachable("Unknown branch instruction?");
// Bcc and the CB* compare-and-branch forms are not rewritten here.
9727 case AArch64::Bcc:
9728 case AArch64::CBWPri:
9729 case AArch64::CBXPri:
9730 case AArch64::CBBAssertExt:
9731 case AArch64::CBHAssertExt:
9732 case AArch64::CBWPrr:
9733 case AArch64::CBXPrr:
9734 return false;
9735 case AArch64::CBZW:
9736 case AArch64::CBZX:
9737 TargetBBInMI = 1;
9738 break;
9739 case AArch64::CBNZW:
9740 case AArch64::CBNZX:
9741 TargetBBInMI = 1;
9742 IsNegativeBranch = true;
9743 break;
9744 case AArch64::TBZW:
9745 case AArch64::TBZX:
9746 TargetBBInMI = 2;
9747 IsTestAndBranch = true;
9748 break;
9749 case AArch64::TBNZW:
9750 case AArch64::TBNZX:
9751 TargetBBInMI = 2;
9752 IsNegativeBranch = true;
9753 IsTestAndBranch = true;
9754 break;
9755 }
9756 // So we increment a zero register and test for bits other
9757 // than bit 0? Conservatively bail out in case the verifier
9758 // missed this case.
9759 if (IsTestAndBranch && MI.getOperand(1).getImm())
9760 return false;
9761
9762 // Find Definition.
9763 assert(MI.getParent() && "Incomplete machine instruction\n");
9764 MachineBasicBlock *MBB = MI.getParent();
9765 MachineFunction *MF = MBB->getParent();
9766 MachineRegisterInfo *MRI = &MF->getRegInfo();
// Only virtual registers can be traced back to a unique defining instruction.
9767 Register VReg = MI.getOperand(0).getReg();
9768 if (!VReg.isVirtual())
9769 return false;
9770
9771 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9772
9773 // Look through COPY instructions to find definition.
// Each copied source must have exactly one def and one non-debug use, so
// rewriting the branch cannot change any other user of the chain.
9774 while (DefMI->isCopy()) {
9775 Register CopyVReg = DefMI->getOperand(1).getReg();
9776 if (!MRI->hasOneNonDBGUse(CopyVReg))
9777 return false;
9778 if (!MRI->hasOneDef(CopyVReg))
9779 return false;
9780 DefMI = MRI->getVRegDef(CopyVReg);
9781 }
9782
// Try to fold the defining instruction into the branch itself.
9783 switch (DefMI->getOpcode()) {
9784 default:
9785 return false;
9786 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9787 case AArch64::ANDWri:
9788 case AArch64::ANDXri: {
9789 if (IsTestAndBranch)
9790 return false;
9791 if (DefMI->getParent() != MBB)
9792 return false;
9793 if (!MRI->hasOneNonDBGUse(VReg))
9794 return false;
9795
9796 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
// NOTE(review): orig. line 9797 was dropped by the scrape — presumably the
// `uint64_t Mask = AArch64_AM::decodeLogicalImmediate(` opener of this call;
// verify against the upstream LLVM source.
9798 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
9799 if (!isPowerOf2_64(Mask))
9800 return false;
9801
9802 MachineOperand &MO = DefMI->getOperand(1);
9803 Register NewReg = MO.getReg();
9804 if (!NewReg.isVirtual())
9805 return false;
9806
9807 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9808
9809 MachineBasicBlock &RefToMBB = *MBB;
9810 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
9811 DebugLoc DL = MI.getDebugLoc();
// Branch on the single set bit of the AND mask.
9812 unsigned Imm = Log2_64(Mask);
9813 unsigned Opc = (Imm < 32)
9814 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9815 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9816 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
9817 .addReg(NewReg)
9818 .addImm(Imm)
9819 .addMBB(TBB);
9820 // Register lives on to the CBZ now.
9821 MO.setIsKill(false);
9822
9823 // For immediate smaller than 32, we need to use the 32-bit
9824 // variant (W) in all cases. Indeed the 64-bit variant does not
9825 // allow to encode them.
9826 // Therefore, if the input register is 64-bit, we need to take the
9827 // 32-bit sub-part.
9828 if (!Is32Bit && Imm < 32)
9829 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
9830 MI.eraseFromParent();
9831 return true;
9832 }
9833 // Look for CSINC
// Only the `csinc wzr/xzr, wzr/xzr, cc` idiom (materializing 0/1 from a
// condition) qualifies, and the CSINC must not itself define NZCV.
9834 case AArch64::CSINCWr:
9835 case AArch64::CSINCXr: {
9836 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
9837 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
9838 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
9839 DefMI->getOperand(2).getReg() == AArch64::XZR))
9840 return false;
9841
9842 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
9843 true) != -1)
9844 return false;
9845
9846 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
9847 // Convert only when the condition code is not modified between
9848 // the CSINC and the branch. The CC may be used by other
9849 // instructions in between.
// NOTE(review): the guarding `if` (orig. line 9850, the NZCV-modified-between
// check) was dropped by the scrape; only its early-return body remains.
9851 return false;
9852 MachineBasicBlock &RefToMBB = *MBB;
9853 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
9854 DebugLoc DL = MI.getDebugLoc();
9855 if (IsNegativeBranch)
// NOTE(review): orig. line 9856 — presumably inverting CC for the negative
// branch (`CC = AArch64CC::getInvertedCondCode(CC);`) — was dropped by the
// scrape.
9857 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
9858 MI.eraseFromParent();
9859 return true;
9860 }
9861 }
9862 }
9863
9864std::pair<unsigned, unsigned>
9865AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9866 const unsigned Mask = AArch64II::MO_FRAGMENT;
9867 return std::make_pair(TF & Mask, TF & ~Mask);
9868}
9869
// Table of the mutually-exclusive ("direct") AArch64 operand target flags and
// the names the MIR serializer prints for them; presumably these are exactly
// the MO_FRAGMENT values split out by decomposeMachineOperandsTargetFlags —
// verify against AArch64BaseInfo.h.
// NOTE(review): doxygen scrape — the return-type line (orig. 9870) was
// dropped; each remaining line keeps its original source line number.
9871 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9872 using namespace AArch64II;
9873
9874 static const std::pair<unsigned, const char *> TargetFlags[] = {
9875 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9876 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9877 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9878 {MO_HI12, "aarch64-hi12"}};
9879 return ArrayRef(TargetFlags);
9880 }
9881
// Table of the OR-able ("bitmask") AArch64 operand target flags and the names
// the MIR serializer prints for them.
// NOTE(review): doxygen scrape — the return-type line (orig. 9882) was
// dropped; each remaining line keeps its original source line number.
9883 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9884 using namespace AArch64II;
9885
9886 static const std::pair<unsigned, const char *> TargetFlags[] = {
9887 {MO_COFFSTUB, "aarch64-coffstub"},
9888 {MO_GOT, "aarch64-got"},
9889 {MO_NC, "aarch64-nc"},
9890 {MO_S, "aarch64-s"},
9891 {MO_TLS, "aarch64-tls"},
9892 {MO_DLLIMPORT, "aarch64-dllimport"},
9893 {MO_PREL, "aarch64-prel"},
9894 {MO_TAGGED, "aarch64-tagged"},
9895 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9896 };
9897 return ArrayRef(TargetFlags);
9898 }
9899
// Target-specific MachineMemOperand flags and their MIR serialization names
// (pair-suppression and strided-access markers).
// NOTE(review): doxygen scrape — the return-type line (orig. 9900) was
// dropped; each remaining line keeps its original source line number.
9901 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9902 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9903 {{MOSuppressPair, "aarch64-suppress-pair"},
9904 {MOStridedAccess, "aarch64-strided-access"}};
9905 return ArrayRef(TargetFlags);
9906 }
9907
9908/// Constants defining how certain sequences should be outlined.
9909/// This encompasses how an outlined function should be called, and what kind of
9910/// frame should be emitted for that outlined function.
9911///
9912/// \p MachineOutlinerDefault implies that the function should be called with
9913/// a save and restore of LR to the stack.
9914///
9915/// That is,
9916///
9917/// I1 Save LR OUTLINED_FUNCTION:
9918/// I2 --> BL OUTLINED_FUNCTION I1
9919/// I3 Restore LR I2
9920/// I3
9921/// RET
9922///
9923/// * Call construction overhead: 3 (save + BL + restore)
9924/// * Frame construction overhead: 1 (ret)
9925/// * Requires stack fixups? Yes
9926///
9927/// \p MachineOutlinerTailCall implies that the function is being created from
9928/// a sequence of instructions ending in a return.
9929///
9930/// That is,
9931///
9932/// I1 OUTLINED_FUNCTION:
9933/// I2 --> B OUTLINED_FUNCTION I1
9934/// RET I2
9935/// RET
9936///
9937/// * Call construction overhead: 1 (B)
9938/// * Frame construction overhead: 0 (Return included in sequence)
9939/// * Requires stack fixups? No
9940///
9941/// \p MachineOutlinerNoLRSave implies that the function should be called using
9942/// a BL instruction, but doesn't require LR to be saved and restored. This
9943/// happens when LR is known to be dead.
9944///
9945/// That is,
9946///
9947/// I1 OUTLINED_FUNCTION:
9948/// I2 --> BL OUTLINED_FUNCTION I1
9949/// I3 I2
9950/// I3
9951/// RET
9952///
9953/// * Call construction overhead: 1 (BL)
9954/// * Frame construction overhead: 1 (RET)
9955/// * Requires stack fixups? No
9956///
9957/// \p MachineOutlinerThunk implies that the function is being created from
9958/// a sequence of instructions ending in a call. The outlined function is
9959/// called with a BL instruction, and the outlined function tail-calls the
9960/// original call destination.
9961///
9962/// That is,
9963///
9964/// I1 OUTLINED_FUNCTION:
9965/// I2 --> BL OUTLINED_FUNCTION I1
9966/// BL f I2
9967/// B f
9968/// * Call construction overhead: 1 (BL)
9969/// * Frame construction overhead: 0
9970/// * Requires stack fixups? No
9971///
9972/// \p MachineOutlinerRegSave implies that the function should be called with a
9973/// save and restore of LR to an available register. This allows us to avoid
9974/// stack fixups. Note that this outlining variant is compatible with the
9975/// NoLRSave case.
9976///
9977/// That is,
9978///
9979/// I1 Save LR OUTLINED_FUNCTION:
9980/// I2 --> BL OUTLINED_FUNCTION I1
9981/// I3 Restore LR I2
9982/// I3
9983/// RET
9984///
9985/// * Call construction overhead: 3 (save + BL + restore)
9986/// * Frame construction overhead: 1 (ret)
9987/// * Requires stack fixups? No
// NOTE(review): doxygen scrape — the `enum MachineOutlinerClass {` opener
// (orig. line 9988) was dropped. The enumerators below select the outlined
// call/frame lowering strategies documented in the comment block above.
9989 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
9990 MachineOutlinerTailCall, /// Only emit a branch.
9991 MachineOutlinerNoLRSave, /// Emit a call and return.
9992 MachineOutlinerThunk, /// Emit a call and tail-call.
9993 MachineOutlinerRegSave /// Same as default, but save to a register.
9994 };
9995
10001
// Pick a GPR64 that can hold LR around an outlined call: it must not be
// reserved, must not be LR/X16/X17, and must be free both inside the
// candidate sequence and across/after it. Returns an invalid Register()
// when no such register exists.
// NOTE(review): doxygen scrape — the return-type line (orig. 10002) was
// dropped; each remaining line keeps its original source line number.
10003 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
10004 MachineFunction *MF = C.getMF();
10005 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
10006 const AArch64RegisterInfo *ARI =
10007 static_cast<const AArch64RegisterInfo *>(&TRI);
10008 // Check if there is an available register across the sequence that we can
10009 // use.
10010 for (unsigned Reg : AArch64::GPR64RegClass) {
10011 if (!ARI->isReservedReg(*MF, Reg) &&
10012 Reg != AArch64::LR && // LR is not reserved, but don't use it.
10013 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
10014 Reg != AArch64::X17 && // Ditto for X17.
10015 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
10016 C.isAvailableInsideSeq(Reg, TRI))
10017 return Reg;
10018 }
10019 return Register();
10020 }
10021
// Returns true when both outlining candidates' functions agree on their
// return-address-signing condition, so one outlined function can serve both.
// NOTE(review): doxygen scrape — the signature line carrying the function
// name (orig. 10023) and the right-hand side of the comparison (orig. 10029,
// presumably `MFIb->getSignReturnAddressCondition();`) were dropped; verify
// against the upstream LLVM source.
10022 static bool
10024 const outliner::Candidate &b) {
10025 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10026 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10027
10028 return MFIa->getSignReturnAddressCondition() ==
10030 }
10031
// Returns true when both outlining candidates' functions sign their return
// addresses with the same key (B-key vs not).
// NOTE(review): doxygen scrape — the signature line carrying the function
// name (orig. 10033) was dropped; each remaining line keeps its original
// source line number.
10032 static bool
10034 const outliner::Candidate &b) {
10035 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10036 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10037
10038 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10039 }
10040
// Returns true when both candidates' subtargets agree on v8.3a support, so a
// single outlined function can uniformly use (or avoid) v8.3a instructions.
// NOTE(review): doxygen scrape — the signature line carrying the function
// name (orig. 10041) and SubtargetA's initializer line (orig. 10044,
// presumably `a.getMF()->getSubtarget<AArch64Subtarget>();`) were dropped;
// verify against the upstream LLVM source.
10042 const outliner::Candidate &b) {
10043 const AArch64Subtarget &SubtargetA =
10045 const AArch64Subtarget &SubtargetB =
10046 b.getMF()->getSubtarget<AArch64Subtarget>();
10047 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10048 }
10049
10050std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10051AArch64InstrInfo::getOutliningCandidateInfo(
10052 const MachineModuleInfo &MMI,
10053 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10054 unsigned MinRepeats) const {
10055 unsigned SequenceSize = 0;
10056 for (auto &MI : RepeatedSequenceLocs[0])
10057 SequenceSize += getInstSizeInBytes(MI);
10058
10059 unsigned NumBytesToCreateFrame = 0;
10060
10061 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
10062 // These instructions are fused together by the scheduler.
10063 // Any candidate where ADRP is the last instruction should be rejected
10064 // as that will lead to splitting ADRP pair.
10065 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10066 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10067 if (LastMI.getOpcode() == AArch64::ADRP &&
10068 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10069 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10070 return std::nullopt;
10071 }
10072
10073 // Similarly any candidate where the first instruction is ADD/LDR with a
10074 // page offset should be rejected to avoid ADRP splitting.
10075 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10076 FirstMI.getOpcode() == AArch64::LDRXui) &&
10077 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10078 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10079 return std::nullopt;
10080 }
10081
10082 // We only allow outlining for functions having exactly matching return
10083 // address signing attributes, i.e., all share the same value for the
10084 // attribute "sign-return-address" and all share the same type of key they
10085 // are signed with.
10086 // Additionally we require all functions to simultaneously either support
10087 // v8.3a features or not. Otherwise an outlined function could get signed
10088 // using dedicated v8.3 instructions and a call from a function that doesn't
10089 // support v8.3 instructions would therefore be invalid.
10090 if (std::adjacent_find(
10091 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
10092 [](const outliner::Candidate &a, const outliner::Candidate &b) {
10093 // Return true if a and b are non-equal w.r.t. return address
10094 // signing or support of v8.3a features
10095 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10096 outliningCandidatesSigningKeyConsensus(a, b) &&
10097 outliningCandidatesV8_3OpsConsensus(a, b)) {
10098 return false;
10099 }
10100 return true;
10101 }) != RepeatedSequenceLocs.end()) {
10102 return std::nullopt;
10103 }
10104
10105 // Since at this point all candidates agree on their return address signing
10106 // picking just one is fine. If the candidate functions potentially sign their
10107 // return addresses, the outlined function should do the same. Note that in
10108 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10109 // not certainly true that the outlined function will have to sign its return
10110 // address but this decision is made later, when the decision to outline
10111 // has already been made.
10112 // The same holds for the number of additional instructions we need: On
10113 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10114 // necessary. However, at this point we don't know if the outlined function
10115 // will have a RET instruction so we assume the worst.
10116 const TargetRegisterInfo &TRI = getRegisterInfo();
10117 // Performing a tail call may require extra checks when PAuth is enabled.
10118 // If PAuth is disabled, set it to zero for uniformity.
10119 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10120 const auto RASignCondition = RepeatedSequenceLocs[0]
10121 .getMF()
10122 ->getInfo<AArch64FunctionInfo>()
10123 ->getSignReturnAddressCondition();
10124 if (RASignCondition != SignReturnAddress::None) {
10125 // One PAC and one AUT instructions
10126 NumBytesToCreateFrame += 8;
10127
10128 // PAuth is enabled - set extra tail call cost, if any.
10129 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10130 *RepeatedSequenceLocs[0].getMF());
10131 NumBytesToCheckLRInTCEpilogue =
10133 // Checking the authenticated LR value may significantly impact
10134 // SequenceSize, so account for it for more precise results.
10135 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
10136 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10137
10138 // We have to check if sp modifying instructions would get outlined.
10139 // If so we only allow outlining if sp is unchanged overall, so matching
10140 // sub and add instructions are okay to outline, all other sp modifications
10141 // are not
10142 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10143 int SPValue = 0;
10144 for (auto &MI : C) {
10145 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
10146 switch (MI.getOpcode()) {
10147 case AArch64::ADDXri:
10148 case AArch64::ADDWri:
10149 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10150 assert(MI.getOperand(2).isImm() &&
10151 "Expected operand to be immediate");
10152 assert(MI.getOperand(1).isReg() &&
10153 "Expected operand to be a register");
10154 // Check if the add just increments sp. If so, we search for
10155 // matching sub instructions that decrement sp. If not, the
10156 // modification is illegal
10157 if (MI.getOperand(1).getReg() == AArch64::SP)
10158 SPValue += MI.getOperand(2).getImm();
10159 else
10160 return true;
10161 break;
10162 case AArch64::SUBXri:
10163 case AArch64::SUBWri:
10164 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10165 assert(MI.getOperand(2).isImm() &&
10166 "Expected operand to be immediate");
10167 assert(MI.getOperand(1).isReg() &&
10168 "Expected operand to be a register");
10169 // Check if the sub just decrements sp. If so, we search for
10170 // matching add instructions that increment sp. If not, the
10171 // modification is illegal
10172 if (MI.getOperand(1).getReg() == AArch64::SP)
10173 SPValue -= MI.getOperand(2).getImm();
10174 else
10175 return true;
10176 break;
10177 default:
10178 return true;
10179 }
10180 }
10181 }
10182 if (SPValue)
10183 return true;
10184 return false;
10185 };
10186 // Remove candidates with illegal stack modifying instructions
10187 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10188
10189 // If the sequence doesn't have enough candidates left, then we're done.
10190 if (RepeatedSequenceLocs.size() < MinRepeats)
10191 return std::nullopt;
10192 }
10193
10194 // Properties about candidate MBBs that hold for all of them.
10195 unsigned FlagsSetInAll = 0xF;
10196
10197 // Compute liveness information for each candidate, and set FlagsSetInAll.
10198 for (outliner::Candidate &C : RepeatedSequenceLocs)
10199 FlagsSetInAll &= C.Flags;
10200
10201 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10202
10203 // Helper lambda which sets call information for every candidate.
10204 auto SetCandidateCallInfo =
10205 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10206 for (outliner::Candidate &C : RepeatedSequenceLocs)
10207 C.setCallInfo(CallID, NumBytesForCall);
10208 };
10209
10210 unsigned FrameID = MachineOutlinerDefault;
10211 NumBytesToCreateFrame += 4;
10212
10213 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10214 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10215 });
10216
10217 // We check to see if CFI Instructions are present, and if they are
10218 // we find the number of CFI Instructions in the candidates.
10219 unsigned CFICount = 0;
10220 for (auto &I : RepeatedSequenceLocs[0]) {
10221 if (I.isCFIInstruction())
10222 CFICount++;
10223 }
10224
10225 // We compare the number of found CFI Instructions to the number of CFI
10226 // instructions in the parent function for each candidate. We must check this
10227 // since if we outline one of the CFI instructions in a function, we have to
10228 // outline them all for correctness. If we do not, the address offsets will be
10229 // incorrect between the two sections of the program.
10230 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10231 std::vector<MCCFIInstruction> CFIInstructions =
10232 C.getMF()->getFrameInstructions();
10233
10234 if (CFICount > 0 && CFICount != CFIInstructions.size())
10235 return std::nullopt;
10236 }
10237
10238 // Returns true if an instructions is safe to fix up, false otherwise.
10239 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10240 if (MI.isCall())
10241 return true;
10242
10243 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10244 !MI.readsRegister(AArch64::SP, &TRI))
10245 return true;
10246
10247 // Any modification of SP will break our code to save/restore LR.
10248 // FIXME: We could handle some instructions which add a constant
10249 // offset to SP, with a bit more work.
10250 if (MI.modifiesRegister(AArch64::SP, &TRI))
10251 return false;
10252
10253 // At this point, we have a stack instruction that we might need to
10254 // fix up. We'll handle it if it's a load or store.
10255 if (MI.mayLoadOrStore()) {
10256 const MachineOperand *Base; // Filled with the base operand of MI.
10257 int64_t Offset; // Filled with the offset of MI.
10258 bool OffsetIsScalable;
10259
10260 // Does it allow us to offset the base operand and is the base the
10261 // register SP?
10262 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10263 !Base->isReg() || Base->getReg() != AArch64::SP)
10264 return false;
10265
10266 // Fixe-up code below assumes bytes.
10267 if (OffsetIsScalable)
10268 return false;
10269
10270 // Find the minimum/maximum offset for this instruction and check
10271 // if fixing it up would be in range.
10272 int64_t MinOffset,
10273 MaxOffset; // Unscaled offsets for the instruction.
10274 // The scale to multiply the offsets by.
10275 TypeSize Scale(0U, false), DummyWidth(0U, false);
10276 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10277
10278 Offset += 16; // Update the offset to what it would be if we outlined.
10279 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10280 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10281 return false;
10282
10283 // It's in range, so we can outline it.
10284 return true;
10285 }
10286
10287 // FIXME: Add handling for instructions like "add x0, sp, #8".
10288
10289 // We can't fix it up, so don't outline it.
10290 return false;
10291 };
10292
10293 // True if it's possible to fix up each stack instruction in this sequence.
10294 // Important for frames/call variants that modify the stack.
10295 bool AllStackInstrsSafe =
10296 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10297
10298 // If the last instruction in any candidate is a terminator, then we should
10299 // tail call all of the candidates.
10300 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10301 FrameID = MachineOutlinerTailCall;
10302 NumBytesToCreateFrame = 0;
10303 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10304 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10305 }
10306
10307 else if (LastInstrOpcode == AArch64::BL ||
10308 ((LastInstrOpcode == AArch64::BLR ||
10309 LastInstrOpcode == AArch64::BLRNoIP) &&
10310 !HasBTI)) {
10311 // FIXME: Do we need to check if the code after this uses the value of LR?
10312 FrameID = MachineOutlinerThunk;
10313 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10314 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10315 }
10316
10317 else {
10318 // We need to decide how to emit calls + frames. We can always emit the same
10319 // frame if we don't need to save to the stack. If we have to save to the
10320 // stack, then we need a different frame.
10321 unsigned NumBytesNoStackCalls = 0;
10322 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10323
10324 // Check if we have to save LR.
10325 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10326 bool LRAvailable =
10328 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10329 : true;
10330 // If we have a noreturn caller, then we're going to be conservative and
10331 // say that we have to save LR. If we don't have a ret at the end of the
10332 // block, then we can't reason about liveness accurately.
10333 //
10334 // FIXME: We can probably do better than always disabling this in
10335 // noreturn functions by fixing up the liveness info.
10336 bool IsNoReturn =
10337 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10338
10339 // Is LR available? If so, we don't need a save.
10340 if (LRAvailable && !IsNoReturn) {
10341 NumBytesNoStackCalls += 4;
10342 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10343 CandidatesWithoutStackFixups.push_back(C);
10344 }
10345
10346 // Is an unused register available? If so, we won't modify the stack, so
10347 // we can outline with the same frame type as those that don't save LR.
10348 else if (findRegisterToSaveLRTo(C)) {
10349 NumBytesNoStackCalls += 12;
10350 C.setCallInfo(MachineOutlinerRegSave, 12);
10351 CandidatesWithoutStackFixups.push_back(C);
10352 }
10353
10354 // Is SP used in the sequence at all? If not, we don't have to modify
10355 // the stack, so we are guaranteed to get the same frame.
10356 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10357 NumBytesNoStackCalls += 12;
10358 C.setCallInfo(MachineOutlinerDefault, 12);
10359 CandidatesWithoutStackFixups.push_back(C);
10360 }
10361
10362 // If we outline this, we need to modify the stack. Pretend we don't
10363 // outline this by saving all of its bytes.
10364 else {
10365 NumBytesNoStackCalls += SequenceSize;
10366 }
10367 }
10368
10369 // If there are no places where we have to save LR, then note that we
10370 // don't have to update the stack. Otherwise, give every candidate the
10371 // default call type, as long as it's safe to do so.
10372 if (!AllStackInstrsSafe ||
10373 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10374 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10375 FrameID = MachineOutlinerNoLRSave;
10376 if (RepeatedSequenceLocs.size() < MinRepeats)
10377 return std::nullopt;
10378 } else {
10379 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10380
10381 // Bugzilla ID: 46767
10382 // TODO: Check if fixing up the stack more than once is safe so we can
10383 // outline these.
10384 //
10385 // An outline resulting in a caller that requires stack fixups at the
10386 // callsite to a callee that also requires stack fixups can happen when
10387 // there are no available registers at the candidate callsite for a
10388 // candidate that itself also has calls.
10389 //
10390 // In other words if function_containing_sequence in the following pseudo
10391 // assembly requires that we save LR at the point of the call, but there
10392 // are no available registers: in this case we save using SP and as a
10393 // result the SP offsets requires stack fixups by multiples of 16.
10394 //
10395 // function_containing_sequence:
10396 // ...
10397 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10398 // call OUTLINED_FUNCTION_N
10399 // restore LR from SP
10400 // ...
10401 //
10402 // OUTLINED_FUNCTION_N:
10403 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10404 // ...
10405 // bl foo
10406 // restore LR from SP
10407 // ret
10408 //
10409 // Because the code to handle more than one stack fixup does not
10410 // currently have the proper checks for legality, these cases will assert
10411 // in the AArch64 MachineOutliner. This is because the code to do this
10412 // needs more hardening, testing, better checks that generated code is
10413 // legal, etc and because it is only verified to handle a single pass of
10414 // stack fixup.
10415 //
10416 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10417 // these cases until they are known to be handled. Bugzilla 46767 is
10418 // referenced in comments at the assert site.
10419 //
10420 // To avoid asserting (or generating non-legal code on noassert builds)
10421 // we remove all candidates which would need more than one stack fixup by
10422 // pruning the cases where the candidate has calls while also having no
10423 // available LR and having no available general purpose registers to copy
10424 // LR to (ie one extra stack save/restore).
10425 //
10426 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10427 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10428 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10429 return (llvm::any_of(C, IsCall)) &&
10430 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10431 !findRegisterToSaveLRTo(C));
10432 });
10433 }
10434 }
10435
10436 // If we dropped all of the candidates, bail out here.
10437 if (RepeatedSequenceLocs.size() < MinRepeats)
10438 return std::nullopt;
10439 }
10440
10441 // Does every candidate's MBB contain a call? If so, then we might have a call
10442 // in the range.
10443 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10444 // Check if the range contains a call. These require a save + restore of the
10445 // link register.
10446 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10447 bool ModStackToSaveLR = false;
10448 if (any_of(drop_end(FirstCand),
10449 [](const MachineInstr &MI) { return MI.isCall(); }))
10450 ModStackToSaveLR = true;
10451
10452 // Handle the last instruction separately. If this is a tail call, then the
10453 // last instruction is a call. We don't want to save + restore in this case.
10454 // However, it could be possible that the last instruction is a call without
10455 // it being valid to tail call this sequence. We should consider this as
10456 // well.
10457 else if (FrameID != MachineOutlinerThunk &&
10458 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10459 ModStackToSaveLR = true;
10460
10461 if (ModStackToSaveLR) {
10462 // We can't fix up the stack. Bail out.
10463 if (!AllStackInstrsSafe)
10464 return std::nullopt;
10465
10466 // Save + restore LR.
10467 NumBytesToCreateFrame += 8;
10468 }
10469 }
10470
10471 // If we have CFI instructions, we can only outline if the outlined section
10472 // can be a tail call
10473 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10474 return std::nullopt;
10475
10476 return std::make_unique<outliner::OutlinedFunction>(
10477 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10478}
10479
10480void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10481 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10482 // If a bunch of candidates reach this point they must agree on their return
10483 // address signing. It is therefore enough to just consider the signing
10484 // behaviour of one of them
10485 const auto &CFn = Candidates.front().getMF()->getFunction();
10486
10487 if (CFn.hasFnAttribute("ptrauth-returns"))
10488 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10489 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10490 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10491 // Since all candidates belong to the same module, just copy the
10492 // function-level attributes of an arbitrary function.
10493 if (CFn.hasFnAttribute("sign-return-address"))
10494 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10495 if (CFn.hasFnAttribute("sign-return-address-key"))
10496 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10497
10498 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10499}
10500
10501bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10502 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10503 const Function &F = MF.getFunction();
10504
10505 // Can F be deduplicated by the linker? If it can, don't outline from it.
10506 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10507 return false;
10508
10509 // Don't outline from functions with section markings; the program could
10510 // expect that all the code is in the named section.
10511 // FIXME: Allow outlining from multiple functions with the same section
10512 // marking.
10513 if (F.hasSection())
10514 return false;
10515
10516 // Outlining from functions with redzones is unsafe since the outliner may
10517 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10518 // outline from it.
10519 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10520 if (!AFI || AFI->hasRedZone().value_or(true))
10521 return false;
10522
10523 // FIXME: Determine whether it is safe to outline from functions which contain
10524 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10525 // outlined together and ensure it is safe to outline with async unwind info,
10526 // required for saving & restoring VG around calls.
10527 if (AFI->hasStreamingModeChanges())
10528 return false;
10529
10530 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10532 return false;
10533
10534 // It's safe to outline from MF.
10535 return true;
10536}
10537
10539AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10540 unsigned &Flags) const {
10542 "Must track liveness!");
10544 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10545 Ranges;
10546 // According to the AArch64 Procedure Call Standard, the following are
10547 // undefined on entry/exit from a function call:
10548 //
10549 // * Registers x16, x17, (and thus w16, w17)
10550 // * Condition codes (and thus the NZCV register)
10551 //
10552 // If any of these registers are used inside or live across an outlined
10553 // function, then they may be modified later, either by the compiler or
10554 // some other tool (like the linker).
10555 //
10556 // To avoid outlining in these situations, partition each block into ranges
10557 // where these registers are dead. We will only outline from those ranges.
10558 LiveRegUnits LRU(getRegisterInfo());
10559 auto AreAllUnsafeRegsDead = [&LRU]() {
10560 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10561 LRU.available(AArch64::NZCV);
10562 };
10563
10564 // We need to know if LR is live across an outlining boundary later on in
10565 // order to decide how we'll create the outlined call, frame, etc.
10566 //
10567 // It's pretty expensive to check this for *every candidate* within a block.
10568 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10569 // to compute liveness from the end of the block for O(n) candidates within
10570 // the block.
10571 //
10572 // So, to improve the average case, let's keep track of liveness from the end
10573 // of the block to the beginning of *every outlinable range*. If we know that
10574 // LR is available in every range we could outline from, then we know that
10575 // we don't need to check liveness for any candidate within that range.
10576 bool LRAvailableEverywhere = true;
10577 // Compute liveness bottom-up.
10578 LRU.addLiveOuts(MBB);
10579 // Update flags that require info about the entire MBB.
10580 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10581 if (MI.isCall() && !MI.isTerminator())
10583 };
10584 // Range: [RangeBegin, RangeEnd)
10585 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10586 unsigned RangeLen;
10587 auto CreateNewRangeStartingAt =
10588 [&RangeBegin, &RangeEnd,
10589 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10590 RangeBegin = NewBegin;
10591 RangeEnd = std::next(RangeBegin);
10592 RangeLen = 0;
10593 };
10594 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10595 // At least one unsafe register is not dead. We do not want to outline at
10596 // this point. If it is long enough to outline from and does not cross a
10597 // bundle boundary, save the range [RangeBegin, RangeEnd).
10598 if (RangeLen <= 1)
10599 return;
10600 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10601 return;
10602 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10603 return;
10604 Ranges.emplace_back(RangeBegin, RangeEnd);
10605 };
10606 // Find the first point where all unsafe registers are dead.
10607 // FIND: <safe instr> <-- end of first potential range
10608 // SKIP: <unsafe def>
10609 // SKIP: ... everything between ...
10610 // SKIP: <unsafe use>
10611 auto FirstPossibleEndPt = MBB.instr_rbegin();
10612 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10613 LRU.stepBackward(*FirstPossibleEndPt);
10614 // Update flags that impact how we outline across the entire block,
10615 // regardless of safety.
10616 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10617 if (AreAllUnsafeRegsDead())
10618 break;
10619 }
10620 // If we exhausted the entire block, we have no safe ranges to outline.
10621 if (FirstPossibleEndPt == MBB.instr_rend())
10622 return Ranges;
10623 // Current range.
10624 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10625 // StartPt points to the first place where all unsafe registers
10626 // are dead (if there is any such point). Begin partitioning the MBB into
10627 // ranges.
10628 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10629 LRU.stepBackward(MI);
10630 UpdateWholeMBBFlags(MI);
10631 if (!AreAllUnsafeRegsDead()) {
10632 SaveRangeIfNonEmpty();
10633 CreateNewRangeStartingAt(MI.getIterator());
10634 continue;
10635 }
10636 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10637 RangeBegin = MI.getIterator();
10638 ++RangeLen;
10639 }
10640 // Above loop misses the last (or only) range. If we are still safe, then
10641 // let's save the range.
10642 if (AreAllUnsafeRegsDead())
10643 SaveRangeIfNonEmpty();
10644 if (Ranges.empty())
10645 return Ranges;
10646 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
10647 // the order.
10648 std::reverse(Ranges.begin(), Ranges.end());
10649 // If there is at least one outlinable range where LR is unavailable
10650 // somewhere, remember that.
10651 if (!LRAvailableEverywhere)
10653 return Ranges;
10654}
10655
10657AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10659 unsigned Flags) const {
10660 MachineInstr &MI = *MIT;
10661
10662 // Don't outline anything used for return address signing. The outlined
10663 // function will get signed later if needed
10664 switch (MI.getOpcode()) {
10665 case AArch64::PACM:
10666 case AArch64::PACIASP:
10667 case AArch64::PACIBSP:
10668 case AArch64::PACIASPPC:
10669 case AArch64::PACIBSPPC:
10670 case AArch64::AUTIASP:
10671 case AArch64::AUTIBSP:
10672 case AArch64::AUTIASPPCi:
10673 case AArch64::AUTIASPPCr:
10674 case AArch64::AUTIBSPPCi:
10675 case AArch64::AUTIBSPPCr:
10676 case AArch64::RETAA:
10677 case AArch64::RETAB:
10678 case AArch64::RETAASPPCi:
10679 case AArch64::RETAASPPCr:
10680 case AArch64::RETABSPPCi:
10681 case AArch64::RETABSPPCr:
10682 case AArch64::EMITBKEY:
10683 case AArch64::PAUTH_PROLOGUE:
10684 case AArch64::PAUTH_EPILOGUE:
10686 }
10687
10688 // We can only outline these if we will tail call the outlined function, or
10689 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10690 // in a tail call.
10691 //
10692 // FIXME: If the proper fixups for the offset are implemented, this should be
10693 // possible.
10694 if (MI.isCFIInstruction())
10696
10697 // Is this a terminator for a basic block?
10698 if (MI.isTerminator())
10699 // TargetInstrInfo::getOutliningType has already filtered out anything
10700 // that would break this, so we can allow it here.
10702
10703 // Make sure none of the operands are un-outlinable.
10704 for (const MachineOperand &MOP : MI.operands()) {
10705 // A check preventing CFI indices was here before, but only CFI
10706 // instructions should have those.
10707 assert(!MOP.isCFIIndex());
10708
10709 // If it uses LR or W30 explicitly, then don't touch it.
10710 if (MOP.isReg() && !MOP.isImplicit() &&
10711 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10713 }
10714
10715 // Special cases for instructions that can always be outlined, but will fail
10716 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
10717 // be outlined because they don't require a *specific* value to be in LR.
10718 if (MI.getOpcode() == AArch64::ADRP)
10720
10721 // If MI is a call we might be able to outline it. We don't want to outline
10722 // any calls that rely on the position of items on the stack. When we outline
10723 // something containing a call, we have to emit a save and restore of LR in
10724 // the outlined function. Currently, this always happens by saving LR to the
10725 // stack. Thus, if we outline, say, half the parameters for a function call
10726 // plus the call, then we'll break the callee's expectations for the layout
10727 // of the stack.
10728 //
10729 // FIXME: Allow calls to functions which construct a stack frame, as long
10730 // as they don't access arguments on the stack.
10731 // FIXME: Figure out some way to analyze functions defined in other modules.
10732 // We should be able to compute the memory usage based on the IR calling
10733 // convention, even if we can't see the definition.
10734 if (MI.isCall()) {
10735 // Get the function associated with the call. Look at each operand and find
10736 // the one that represents the callee and get its name.
10737 const Function *Callee = nullptr;
10738 for (const MachineOperand &MOP : MI.operands()) {
10739 if (MOP.isGlobal()) {
10740 Callee = dyn_cast<Function>(MOP.getGlobal());
10741 break;
10742 }
10743 }
10744
10745 // Never outline calls to mcount. There isn't any rule that would require
10746 // this, but the Linux kernel's "ftrace" feature depends on it.
10747 if (Callee && Callee->getName() == "\01_mcount")
10749
10750 // If we don't know anything about the callee, assume it depends on the
10751 // stack layout of the caller. In that case, it's only legal to outline
10752 // as a tail-call. Explicitly list the call instructions we know about so we
10753 // don't get unexpected results with call pseudo-instructions.
10754 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10755 if (MI.getOpcode() == AArch64::BLR ||
10756 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10757 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10758
10759 if (!Callee)
10760 return UnknownCallOutlineType;
10761
10762 // We have a function we have information about. Check it if it's something
10763 // can safely outline.
10764 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10765
10766 // We don't know what's going on with the callee at all. Don't touch it.
10767 if (!CalleeMF)
10768 return UnknownCallOutlineType;
10769
10770 // Check if we know anything about the callee saves on the function. If we
10771 // don't, then don't touch it, since that implies that we haven't
10772 // computed anything about its stack frame yet.
10773 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10774 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10775 MFI.getNumObjects() > 0)
10776 return UnknownCallOutlineType;
10777
10778 // At this point, we can say that CalleeMF ought to not pass anything on the
10779 // stack. Therefore, we can outline it.
10781 }
10782
10783 // Don't touch the link register or W30.
10784 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
10785 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
10787
10788 // Don't outline BTI instructions, because that will prevent the outlining
10789 // site from being indirectly callable.
10790 if (hasBTISemantics(MI))
10792
10794}
10795
10796void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10797 for (MachineInstr &MI : MBB) {
10798 const MachineOperand *Base;
10799 TypeSize Width(0, false);
10800 int64_t Offset;
10801 bool OffsetIsScalable;
10802
10803 // Is this a load or store with an immediate offset with SP as the base?
10804 if (!MI.mayLoadOrStore() ||
10805 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
10806 &RI) ||
10807 (Base->isReg() && Base->getReg() != AArch64::SP))
10808 continue;
10809
10810 // It is, so we have to fix it up.
10811 TypeSize Scale(0U, false);
10812 int64_t Dummy1, Dummy2;
10813
10814 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
10815 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10816 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
10817 assert(Scale != 0 && "Unexpected opcode!");
10818 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10819
10820 // We've pushed the return address to the stack, so add 16 to the offset.
10821 // This is safe, since we already checked if it would overflow when we
10822 // checked if this instruction was legal to outline.
10823 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10824 StackOffsetOperand.setImm(NewImm);
10825 }
10826}
10827
10829 const AArch64InstrInfo *TII,
10830 bool ShouldSignReturnAddr) {
10831 if (!ShouldSignReturnAddr)
10832 return;
10833
10834 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
10836 TII->createPauthEpilogueInstr(MBB, DebugLoc());
10837}
10838
// Materialise the frame of an outlined function: rewrite a thunk's trailing
// call into a tail call, save/restore LR around interior calls (with CFI),
// sign the return address when required, and insert a RET for plain
// "Function"-style outlining.
// NOTE(review): several lines were lost in extraction (the MF/MBB parameter
// line, the erase of the rewritten call, and the It/Et iterator
// declarations); comments annotate the visible code only.
void AArch64InstrInfo::buildOutlinedFrame(
    const outliner::OutlinedFunction &OF) const {

  AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();

  // Record the outlining style for later queries/remarks.
  if (OF.FrameConstructionID == MachineOutlinerTailCall)
    FI->setOutliningStyle("Tail Call");
  else if (OF.FrameConstructionID == MachineOutlinerThunk) {
    // For thunk outlining, rewrite the last instruction from a call to a
    // tail-call.
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR ||
             Call->getOpcode() == AArch64::BLRNoIP);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                           .add(Call->getOperand(0))
                           .addImm(0);
    MBB.insert(MBB.end(), TC);
    // NOTE(review): the removal of the original call instruction appears to
    // have been dropped by extraction here.

    FI->setOutliningStyle("Thunk");
  }

  bool IsLeafFunction = true;

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](const MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };

  if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.

    // Bugzilla ID: 46767
    // TODO: Check if fixing up twice is safe so we can outline these.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    IsLeafFunction = false;

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    // NOTE(review): the declarations of the insertion iterators (It/Et) were
    // lost in extraction; Et is repositioned below so the restore lands
    // before a trailing tail-call/return.

    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region
    // STRXpre pushes LR and moves SP down 16 bytes in one instruction.
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
    It = MBB.insert(It, STRXpre);

    if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
      CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);

      // Add a CFI saying the stack was moved 16 B down.
      CFIBuilder.buildDefCFAOffset(16);

      // Add a CFI saying that the LR that we want to find is now 16 B higher
      // than before.
      CFIBuilder.buildOffset(AArch64::LR, -16);
    }

    // Insert a restore before the terminator for the function.
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  // Decide whether the return address must be signed with pointer auth.
  auto RASignCondition = FI->getSignReturnAddressCondition();
  bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
      RASignCondition, !IsLeafFunction);

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(AArch64::LR))
    MBB.addLiveIn(AArch64::LR);

  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR);
  MBB.insert(MBB.end(), ret);

  signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}
10959
// Emit the call to the outlined function for the given candidate, according
// to its call-construction style (tail call, plain call, LR saved in a
// register, LR saved on the stack), and return an iterator to the call.
// NOTE(review): the lines declaring this method's parameters and the
// declaration of CallPt were lost in extraction; comments annotate the
// visible code only.
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so that
    // we don't have to recompute the register.
    Register Reg = findRegisterToSaveLRTo(C);
    assert(Reg && "No callee-saved register available?");

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    // Save and restore LR from Reg.
    // ORR Xd, XZR, Xm, lsl #0 is the canonical register-move alias.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
    // Pre-indexed store pushes LR moving SP down 16 bytes; the post-indexed
    // load pops it back after the call.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}
11035
11036bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11037 MachineFunction &MF) const {
11038 return MF.getFunction().hasMinSize();
11039}
11040
// Emit the cheapest available instruction sequence that zeroes \p Reg (used
// e.g. for register scrubbing).
// NOTE(review): the line declaring the insertion-iterator parameter (used
// below as `Iter`) was lost in extraction.
void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
                                          DebugLoc &DL,
                                          bool AllowSideEffects) const {
  const MachineFunction &MF = *MBB.getParent();
  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();

  if (TRI.isGeneralPurposeRegister(MF, Reg)) {
    // GPR: MOVZ Reg, #0.
    BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
  } else if (STI.isSVEorStreamingSVEAvailable()) {
    // SVE (or streaming SVE): splat zero across the whole Z register.
    BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
        .addImm(0)
        .addImm(0);
  } else if (STI.isNeonAvailable()) {
    // NEON: MOVI of zero into the full 128-bit vector register.
    BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
        .addImm(0);
  } else {
    // This is a streaming-compatible function without SVE. We don't have full
    // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
    // So given `movi v..` would be illegal use `fmov d..` instead.
    assert(STI.hasNEON() && "Expected to have NEON.");
    Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
    BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
  }
}
11067
// Recognise ORR-with-zero-register moves as plain register copies and report
// the (destination, source) operand pair.
// NOTE(review): the line naming this function (presumably
// `AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {`) was
// lost in extraction.
std::optional<DestSourcePair>

  // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
  // and zero immediate operands used as an alias for mov instruction.
  if (((MI.getOpcode() == AArch64::ORRWrs &&
        MI.getOperand(1).getReg() == AArch64::WZR &&
        MI.getOperand(3).getImm() == 0x0) ||
       (MI.getOpcode() == AArch64::ORRWrr &&
        MI.getOperand(1).getReg() == AArch64::WZR)) &&
      // Check that the w->w move is not a zero-extending w->x mov.
      (!MI.getOperand(0).getReg().isVirtual() ||
       MI.getOperand(0).getSubReg() == 0) &&
      (!MI.getOperand(0).getReg().isPhysical() ||
       // A physical W-def that also defines the enclosing X register is a
       // zero-extending move, not a plain copy.
       MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
                                    /*TRI=*/nullptr) == -1))
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};

  // ORR Xd, XZR, Xm, lsl #0 is the canonical 64-bit MOV alias.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0)
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};

  return std::nullopt;
}
11093
// Like the copy recogniser above, but for "copy-like" queries: accepts the
// 32-bit ORR moves without the extra zero-extension checks applied there.
// NOTE(review): the line naming this function (presumably
// `AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {`)
// was lost in extraction.
std::optional<DestSourcePair>
  if ((MI.getOpcode() == AArch64::ORRWrs &&
       MI.getOperand(1).getReg() == AArch64::WZR &&
       MI.getOperand(3).getImm() == 0x0) ||
      (MI.getOpcode() == AArch64::ORRWrr &&
       MI.getOperand(1).getReg() == AArch64::WZR))
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  return std::nullopt;
}
11104
11105std::optional<RegImmPair>
11106AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11107 int Sign = 1;
11108 int64_t Offset = 0;
11109
11110 // TODO: Handle cases where Reg is a super- or sub-register of the
11111 // destination register.
11112 const MachineOperand &Op0 = MI.getOperand(0);
11113 if (!Op0.isReg() || Reg != Op0.getReg())
11114 return std::nullopt;
11115
11116 switch (MI.getOpcode()) {
11117 default:
11118 return std::nullopt;
11119 case AArch64::SUBWri:
11120 case AArch64::SUBXri:
11121 case AArch64::SUBSWri:
11122 case AArch64::SUBSXri:
11123 Sign *= -1;
11124 [[fallthrough]];
11125 case AArch64::ADDSWri:
11126 case AArch64::ADDSXri:
11127 case AArch64::ADDWri:
11128 case AArch64::ADDXri: {
11129 // TODO: Third operand can be global address (usually some string).
11130 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
11131 !MI.getOperand(2).isImm())
11132 return std::nullopt;
11133 int Shift = MI.getOperand(3).getImm();
11134 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11135 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
11136 }
11137 }
11138 return RegImmPair{MI.getOperand(1).getReg(), Offset};
11139}
11140
/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
// NOTE(review): the line naming this helper and its first parameters
// (presumably `describeORRLoadedValue(const MachineInstr &MI, Register
// DescribedReg,`) was lost in extraction.
static std::optional<ParamLoadedValue>
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  // Only ORR moves that the target recognises as copy-like are handled.
  auto DestSrc = TII->isCopyLikeInstr(MI);
  if (!DestSrc)
    return std::nullopt;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  if (!DestReg.isValid() || !SrcReg.isValid())
    return std::nullopt;

  // An empty DWARF expression: the described value is the source register
  // itself.
  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of a ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return std::nullopt;
}
11181
bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
  // Functions cannot be split to different sections on AArch64 if they have
  // a red zone. This is because relaxing a cross-section branch may require
  // incrementing the stack pointer to spill a register, which would overwrite
  // the red zone.
  // Note: an unknown red-zone state (nullopt) is conservatively treated as
  // having one.
  if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
    return false;

  // NOTE(review): the final statement (presumably `return true;`) was lost
  // in extraction of this file.
}
11192
11193bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11194 const MachineBasicBlock &MBB) const {
11195 // Asm Goto blocks can contain conditional branches to goto labels, which can
11196 // get moved out of range of the branch instruction.
11197 auto isAsmGoto = [](const MachineInstr &MI) {
11198 return MI.getOpcode() == AArch64::INLINEASM_BR;
11199 };
11200 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11201 return false;
11202
11203 // Because jump tables are label-relative instead of table-relative, they all
11204 // must be in the same section or relocation fixup handling will fail.
11205
11206 // Check if MBB is a jump table target
11207 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11208 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11209 return llvm::is_contained(JTE.MBBs, &MBB);
11210 };
11211 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11212 return false;
11213
11214 // Check if MBB contains a jump table lookup
11215 for (const MachineInstr &MI : MBB) {
11216 switch (MI.getOpcode()) {
11217 case TargetOpcode::G_BRJT:
11218 case AArch64::JumpTableDest32:
11219 case AArch64::JumpTableDest16:
11220 case AArch64::JumpTableDest8:
11221 return false;
11222 default:
11223 continue;
11224 }
11225 }
11226
11227 // MBB isn't a special case, so it's safe to be split to the cold section.
11228 return true;
11229}
11230
// Describe the value loaded into \p Reg by \p MI for debug-info (call-site
// parameter) purposes.
std::optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return std::nullopt;

    // Only the plain-immediate form can be described.
    if (!MI.getOperand(1).isImm())
      return std::nullopt;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    // The loaded value is the immediate shifted into position.
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    // ORR-based register moves are handled by the helper defined above.
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  // NOTE(review): the fallback statement (presumably delegating to
  // `TargetInstrInfo::describeLoadedValue(MI, Reg)`) was lost in extraction.
}
11258
11259bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11260 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11261 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11262 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11263 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11264
11265 // Anyexts are nops.
11266 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11267 return true;
11268
11269 Register DefReg = ExtMI.getOperand(0).getReg();
11270 if (!MRI.hasOneNonDBGUse(DefReg))
11271 return false;
11272
11273 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11274 // addressing mode.
11275 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11276 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11277}
11278
11279uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11280 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11281}
11282
11283bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11284 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11285}
11286
11287bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11288 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11289}
11290
11291unsigned int
11292AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11293 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11294}
11295
11296bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11297 unsigned Scale) const {
11298 if (Offset && Scale)
11299 return false;
11300
11301 // Check Reg + Imm
11302 if (!Scale) {
11303 // 9-bit signed offset
11304 if (isInt<9>(Offset))
11305 return true;
11306
11307 // 12-bit unsigned offset
11308 unsigned Shift = Log2_64(NumBytes);
11309 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11310 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11311 (Offset >> Shift) << Shift == Offset)
11312 return true;
11313 return false;
11314 }
11315
11316 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11317 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11318}
11319
  // NOTE(review): the signature line of this function (presumably
  // `unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {`) was lost
  // in extraction.
  // With SLS (straight-line speculation) BLR hardening enabled, indirect
  // calls must use the BLRNoIP pseudo instead of plain BLR.
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}
11326
                                                DebugLoc DL) const {
  // NOTE(review): the line naming this method and its first parameter, plus
  // a setter line on the BuildMI below, were lost in extraction.
  // Emit the pointer-auth epilogue pseudo just before the block terminator.
  MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
  auto Builder = BuildMI(MBB, InsertPt, DL, get(AArch64::PAUTH_EPILOGUE))

  const auto *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  // With PAuthLR branch protection but no hardware PAuthLR support, X16 is
  // additionally marked as implicitly defined by the pseudo.
  if (AFI->branchProtectionPAuthLR() && !Subtarget.hasPAuthLR())
    Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
}
11337
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                   Register TargetReg, bool FrameSetup) const {
  // Lower a probed stack allocation: decrement SP in ProbeSize steps,
  // touching each new region, until SP reaches TargetReg.
  // NOTE(review): the return-type line and several builder-operand /
  // memoperand lines in this function were lost in extraction; comments
  // annotate the visible code only.
  assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");

  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  // Create the loop-test, loop-body, and exit blocks right after MBB.
  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
  MachineBasicBlock *LoopTestMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopTestMBB);
  MachineBasicBlock *LoopBodyMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopBodyMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, ExitMBB);
  MachineInstr::MIFlag Flags =

  // LoopTest:
  // SUB SP, SP, #ProbeSize
  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);

  // CMP SP, TargetReg
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
          AArch64::XZR)
      .addReg(AArch64::SP)
      .addReg(TargetReg)
      .setMIFlags(Flags);

  // B.<Cond> LoopExit
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
      .addMBB(ExitMBB)
      .setMIFlags(Flags);

  // LDR XZR, [SP]
  // Touch the newly allocated region with a load.
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::LDRXui))
      .addDef(AArch64::XZR)
      .addReg(AArch64::SP)
      .addImm(0)
      Align(8)))
      .setMIFlags(Flags);

  // B loop
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
      .addMBB(LoopTestMBB)
      .setMIFlags(Flags);

  // LoopExit:
  // MOV SP, TargetReg
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
      .addReg(TargetReg)
      .addImm(0)
      .setMIFlags(Flags);

  // LDR XZR, [SP]
  // Final probe at the new top of stack.
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
      .addReg(AArch64::XZR, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  // Move the remainder of the original block into the exit block and wire up
  // the control-flow graph.
  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());

  LoopTestMBB->addSuccessor(ExitMBB);
  LoopTestMBB->addSuccessor(LoopBodyMBB);
  LoopBodyMBB->addSuccessor(LoopTestMBB);
  MBB.addSuccessor(LoopTestMBB);

  // Update liveins.
  if (MF.getRegInfo().reservedRegsFrozen())
    fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});

  return ExitMBB->begin();
}
11426
namespace {
// State used by the MachinePipeliner to drive software pipelining of a
// single-basic-block loop: it records the loop-control compare/branch, the
// counter-update instruction, and the counter's initial value.
class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
  MachineFunction *MF;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MachineRegisterInfo &MRI;

  /// The block of the loop
  MachineBasicBlock *LoopBB;
  /// The conditional branch of the loop
  MachineInstr *CondBranch;
  /// The compare instruction for loop control
  MachineInstr *Comp;
  /// The number of the operand of the loop counter value in Comp
  unsigned CompCounterOprNum;
  /// The instruction that updates the loop counter value
  MachineInstr *Update;
  /// The number of the operand of the loop counter value in Update
  unsigned UpdateCounterOprNum;
  /// The initial value of the loop counter
  Register Init;
  /// True iff Update is a predecessor of Comp
  bool IsUpdatePriorComp;

  /// The normalized condition used by createTripCountGreaterCondition()
  // NOTE(review): the member declaration that belongs here (a container of
  // MachineOperand named Cond, judging by its uses below) was lost in
  // extraction.

public:
  AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
                           MachineInstr *Comp, unsigned CompCounterOprNum,
                           MachineInstr *Update, unsigned UpdateCounterOprNum,
                           Register Init, bool IsUpdatePriorComp,
                           const SmallVectorImpl<MachineOperand> &Cond)
      : MF(Comp->getParent()->getParent()),
        TII(MF->getSubtarget().getInstrInfo()),
        TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
        LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
        CompCounterOprNum(CompCounterOprNum), Update(Update),
        UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
        IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}

  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
    // Make the instructions for loop control be placed in stage 0.
    // The predecessors of Comp are considered by the caller.
    return MI == Comp;
  }

  std::optional<bool> createTripCountGreaterCondition(
      int TC, MachineBasicBlock &MBB,
      SmallVectorImpl<MachineOperand> &CondParam) override {
    // A branch instruction will be inserted as "if (Cond) goto epilogue".
    // Cond is normalized for such use.
    // The predecessors of the branch are assumed to have already been inserted.
    CondParam = Cond;
    return {};
  }

  void createRemainingIterationsGreaterCondition(
      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
      DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;

  // The pipeliner handles preheader/trip-count adjustment itself here.
  void setPreheader(MachineBasicBlock *NewPreheader) override {}

  void adjustTripCount(int TripCountAdjust) override {}

  bool isMVEExpanderSupported() override { return true; }
};
} // namespace
11495
11496/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11497/// is replaced by ReplaceReg. The output register is newly created.
11498/// The other operands are unchanged from MI.
11499static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11500 Register ReplaceReg, MachineBasicBlock &MBB,
11501 MachineBasicBlock::iterator InsertTo) {
11502 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11503 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11504 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11505 Register Result = 0;
11506 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11507 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11508 Result = MRI.createVirtualRegister(
11509 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11510 NewMI->getOperand(I).setReg(Result);
11511 } else if (I == ReplaceOprNum) {
11512 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11513 NewMI->getOperand(I).setReg(ReplaceReg);
11514 }
11515 }
11516 MBB.insert(InsertTo, NewMI);
11517 return Result;
11518}
11519
void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
  // NOTE(review): this method's parameter lines and several statement lines
  // (the CondCode declaration and its inversion, the lambda's CondCode
  // parameter, one CSINC builder operand, and the final push of the compare
  // result into Cond) were lost in extraction; comments annotate the visible
  // code only.
  // Create and accumulate conditions for next TC iterations.
  // Example:
  // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
  // # iteration of the kernel
  //
  // # insert the following instructions
  // cond = CSINCXr 0, 0, C, implicit $nzcv
  // counter = ADDXri counter, 1 # clone from this->Update
  // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
  // cond = CSINCXr cond, cond, C, implicit $nzcv
  // ... (repeat TC times)
  // SUBSXri cond, 0, implicit-def $nzcv

  assert(CondBranch->getOpcode() == AArch64::Bcc);
  // CondCode to exit the loop
      (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
  if (CondBranch->getOperand(1).getMBB() == LoopBB)

  // Accumulate conditions to exit the loop
  Register AccCond = AArch64::XZR;

  // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
  auto AccumulateCond = [&](Register CurCond,
    Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
    BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
        .addReg(NewCond, RegState::Define)
        .addReg(CurCond)
        .addReg(CurCond)
    return NewCond;
  };

  if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
    // Update and Comp for I==0 are already exists in MBB
    // (MBB is an unrolled kernel)
    Register Counter;
    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      // After the first iteration, re-emit the compare against the advanced
      // counter.
      if (I != 0)
        NextCounter =
            cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());

      AccCond = AccumulateCond(AccCond, CC);

      if (I != TC) {
        if (I == 0) {
          if (Update != Comp && IsUpdatePriorComp) {
            // The counter value compared in the kernel is pre-update; clone
            // the update to advance it once.
            Counter =
                LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
            NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
                                     MBB.end());
          } else {
            // can use already calculated value
            NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
          }
        } else if (Update != Comp) {
          NextCounter =
              cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
        }
      }
      Counter = NextCounter;
    }
  } else {
    Register Counter;
    if (LastStage0Insts.empty()) {
      // use initial counter value (testing if the trip count is sufficient to
      // be executed by pipelined code)
      Counter = Init;
      if (IsUpdatePriorComp)
        Counter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
    } else {
      // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
      Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
    }

    // Emit compare + update pairs for each of the TC+1 tested iterations.
    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      NextCounter =
          cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
      AccCond = AccumulateCond(AccCond, CC);
      if (I != TC && Update != Comp)
        NextCounter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
      Counter = NextCounter;
    }
  }

  // If AccCond == 0, the remainder is greater than TC.
  BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
      .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
      .addReg(AccCond)
      .addImm(0)
      .addImm(0);
  Cond.clear();
}
11623
11624static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11625 Register &RegMBB, Register &RegOther) {
11626 assert(Phi.getNumOperands() == 5);
11627 if (Phi.getOperand(2).getMBB() == MBB) {
11628 RegMBB = Phi.getOperand(1).getReg();
11629 RegOther = Phi.getOperand(3).getReg();
11630 } else {
11631 assert(Phi.getOperand(4).getMBB() == MBB);
11632 RegMBB = Phi.getOperand(3).getReg();
11633 RegOther = Phi.getOperand(1).getReg();
11634 }
11635}
11636
  // NOTE(review): the signature line of this helper was lost in extraction;
  // from its uses it takes (Register Reg, const MachineBasicBlock *BB) and
  // reports whether the virtual register's defining instruction lies outside
  // BB. Non-virtual registers are conservatively reported as not defined
  // outside.
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  return MRI.getVRegDef(Reg)->getParent() != BB;
}
11643
/// If Reg is an induction variable, return true and set some parameters
///
/// \param Reg                 register holding the candidate counter value.
/// \param LoopBB              the single-block loop being analyzed.
/// \param [out] UpdateInst    the ADD/SUB that advances the counter.
/// \param [out] UpdateCounterOprNum operand index of the counter in
///                            UpdateInst.
/// \param [out] InitReg       counter value incoming from the preheader.
/// \param [out] IsUpdatePriorComp true iff the update is reached on the
///                            use-def walk from Reg before the loop PHI,
///                            i.e. Reg holds the already-updated value.
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
                          MachineInstr *&UpdateInst,
                          unsigned &UpdateCounterOprNum, Register &InitReg,
                          bool &IsUpdatePriorComp) {
  // Example:
  //
  // Preheader:
  //   InitReg = ...
  // LoopBB:
  //   Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
  //   Reg = COPY Reg0 ; COPY is ignored.
  //   Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
  //                     ; Reg is the value calculated in the previous
  //                     ; iteration, so IsUpdatePriorComp == false.

  // The loop must have exactly one backedge and one preheader edge.
  if (LoopBB->pred_size() != 2)
    return false;
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
  UpdateInst = nullptr;
  UpdateCounterOprNum = 0;
  InitReg = 0;
  IsUpdatePriorComp = true;
  // Walk the use-def chain from Reg until it closes back on itself through
  // the loop-header PHI.
  Register CurReg = Reg;
  while (true) {
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    // Every link of the chain must live inside the loop block.
    if (Def->getParent() != LoopBB)
      return false;
    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      // At most one PHI (the loop-header PHI) may appear in the chain.
      if (InitReg != 0)
        return false;
      // Reaching the PHI before any update means Reg is the pre-update value.
      if (!UpdateInst)
        IsUpdatePriorComp = false;
      // Continue along the backedge value; record the preheader value.
      extractPhiReg(*Def, LoopBB, CurReg, InitReg);
    } else {
      // At most one arithmetic update of the counter is allowed.
      if (UpdateInst)
        return false;
      switch (Def->getOpcode()) {
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDXri:
      case AArch64::ADDWri:
      case AArch64::SUBXri:
      case AArch64::SUBWri:
        // Immediate forms: the counter is always operand 1.
        UpdateInst = Def;
        UpdateCounterOprNum = 1;
        break;
      case AArch64::ADDSXrr:
      case AArch64::ADDSWrr:
      case AArch64::SUBSXrr:
      case AArch64::SUBSWrr:
      case AArch64::ADDXrr:
      case AArch64::ADDWrr:
      case AArch64::SUBXrr:
      case AArch64::SUBWrr:
        // Register forms: the other operand must be loop invariant; the
        // counter is whichever operand is defined inside the loop.
        UpdateInst = Def;
        if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
          UpdateCounterOprNum = 1;
        else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
          UpdateCounterOprNum = 2;
        else
          return false;
        break;
      default:
        return false;
      }
      CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
    }

    // Stop once the chain wraps around to the starting register.
    if (!CurReg.isVirtual())
      return false;
    if (Reg == CurReg)
      break;
  }

  // A valid induction variable must actually be updated inside the loop.
  if (!UpdateInst)
    return false;

  return true;
}
11733
11734std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11736 // Accept loops that meet the following conditions
11737 // * The conditional branch is BCC
11738 // * The compare instruction is ADDS/SUBS/WHILEXX
11739 // * One operand of the compare is an induction variable and the other is a
11740 // loop invariant value
11741 // * The induction variable is incremented/decremented by a single instruction
11742 // * Does not contain CALL or instructions which have unmodeled side effects
11743
11744 for (MachineInstr &MI : *LoopBB)
11745 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11746 // This instruction may use NZCV, which interferes with the instruction to
11747 // be inserted for loop control.
11748 return nullptr;
11749
11750 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11752 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11753 return nullptr;
11754
11755 // Infinite loops are not supported
11756 if (TBB == LoopBB && FBB == LoopBB)
11757 return nullptr;
11758
11759 // Must be conditional branch
11760 if (TBB != LoopBB && FBB == nullptr)
11761 return nullptr;
11762
11763 assert((TBB == LoopBB || FBB == LoopBB) &&
11764 "The Loop must be a single-basic-block loop");
11765
11766 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11768
11769 if (CondBranch->getOpcode() != AArch64::Bcc)
11770 return nullptr;
11771
11772 // Normalization for createTripCountGreaterCondition()
11773 if (TBB == LoopBB)
11775
11776 MachineInstr *Comp = nullptr;
11777 unsigned CompCounterOprNum = 0;
11778 for (MachineInstr &MI : reverse(*LoopBB)) {
11779 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
11780 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11781 // operands is a loop invariant value
11782
11783 switch (MI.getOpcode()) {
11784 case AArch64::SUBSXri:
11785 case AArch64::SUBSWri:
11786 case AArch64::ADDSXri:
11787 case AArch64::ADDSWri:
11788 Comp = &MI;
11789 CompCounterOprNum = 1;
11790 break;
11791 case AArch64::ADDSWrr:
11792 case AArch64::ADDSXrr:
11793 case AArch64::SUBSWrr:
11794 case AArch64::SUBSXrr:
11795 Comp = &MI;
11796 break;
11797 default:
11798 if (isWhileOpcode(MI.getOpcode())) {
11799 Comp = &MI;
11800 break;
11801 }
11802 return nullptr;
11803 }
11804
11805 if (CompCounterOprNum == 0) {
11806 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
11807 CompCounterOprNum = 2;
11808 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
11809 CompCounterOprNum = 1;
11810 else
11811 return nullptr;
11812 }
11813 break;
11814 }
11815 }
11816 if (!Comp)
11817 return nullptr;
11818
11819 MachineInstr *Update = nullptr;
11820 Register Init;
11821 bool IsUpdatePriorComp;
11822 unsigned UpdateCounterOprNum;
11823 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
11824 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
11825 return nullptr;
11826
11827 return std::make_unique<AArch64PipelinerLoopInfo>(
11828 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
11829 Init, IsUpdatePriorComp, Cond);
11830}
11831
11832/// verifyInstruction - Perform target specific instruction verification.
11833bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11834 StringRef &ErrInfo) const {
11835 // Verify that immediate offsets on load/store instructions are within range.
11836 // Stack objects with an FI operand are excluded as they can be fixed up
11837 // during PEI.
11838 TypeSize Scale(0U, false), Width(0U, false);
11839 int64_t MinOffset, MaxOffset;
11840 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11841 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
11842 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
11843 int64_t Imm = MI.getOperand(ImmIdx).getImm();
11844 if (Imm < MinOffset || Imm > MaxOffset) {
11845 ErrInfo = "Unexpected immediate on load/store instruction";
11846 return false;
11847 }
11848 }
11849 }
11850
11851 const MCInstrDesc &MCID = MI.getDesc();
11852 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11853 const MachineOperand &MO = MI.getOperand(Op);
11854 switch (MCID.operands()[Op].OperandType) {
11856 if (!MO.isImm() || MO.getImm() != 0) {
11857 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11858 return false;
11859 }
11860 break;
11862 if (!MO.isImm() ||
11864 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
11865 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
11866 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11867 return false;
11868 }
11869 break;
11870 default:
11871 break;
11872 }
11873 }
11874 return true;
11875}
11876
11877#define GET_INSTRINFO_HELPERS
11878#define GET_INSTRMAP_INFO
11879#include "AArch64GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if in a streaming call site region without SME-FA64.
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static std::optional< unsigned > getLFIInstSizeInBytes(const MachineInstr &MI)
Return the maximum number of bytes of code the specified instruction may be after LFI rewriting.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if the instruction at I is in a streaming call site region, within a single basic block.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, RegState State, const TargetRegisterInfo *TRI)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerRegSave
Emit a call and tail-call.
@ MachineOutlinerNoLRSave
Only emit a branch.
@ MachineOutlinerThunk
Emit a call and return.
@ MachineOutlinerDefault
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation A - (B + C) ==> (A - B) - C A - (B + C) ==> (A - C) - B.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
@ Default
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Check Debug Module
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
PowerPC Reduce CR logical Operation
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if opcode Opc is a memory operation.
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
static bool isZExtLoad(const MachineInstr &MI)
Returns whether the instruction is a zero-extending load.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that set flags when possible.
void createPauthEpilogueInstr(MachineBasicBlock &MBB, DebugLoc DL) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operator of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operator of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if pairing the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSExtLoad(const MachineInstr &MI)
Returns whether the instruction is a sign-extending load.
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace csincr-branch sequence by simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operator of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:123
bool empty() const
Definition DenseMap.h:109
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:64
bool usesWindowsCFI() const
Definition MCAsmInfo.h:665
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:576
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:618
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:591
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:688
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Retuns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
bool def_empty(Register RegNo) const
def_empty - Return true if there are no instructions defining the specified register (it may be live-...
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
bool hasOneDef(Register RegNo) const
Return true if there is exactly one operand defining the specified register.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:207
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
constexpr double e
InstrType
Represents how an instruction should be mapped by the outliner.
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
iterator end() const
Definition BasicBlock.h:89
LLVM_ABI Instruction & back() const
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr RegState getDefRegState(bool B)
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
static MCRegister getXRegFromWReg(MCRegister Reg)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2192
constexpr bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
constexpr RegState getUndefRegState(bool B)
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-in's for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.