AArch64ISelDAGToDAG.cpp (LLVM 16.0.0git)
1 //===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the AArch64 target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64MachineFunctionInfo.h"
14 #include "AArch64TargetMachine.h"
15 #include "MCTargetDesc/AArch64AddressingModes.h"
16 #include "llvm/ADT/APSInt.h"
17 #include "llvm/CodeGen/SelectionDAGISel.h"
18 #include "llvm/IR/Function.h" // To access function attributes.
19 #include "llvm/IR/GlobalValue.h"
20 #include "llvm/IR/Intrinsics.h"
21 #include "llvm/IR/IntrinsicsAArch64.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/KnownBits.h"
25 #include "llvm/Support/MathExtras.h"
26 #include "llvm/Support/raw_ostream.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "aarch64-isel"
31 
32 //===--------------------------------------------------------------------===//
33 /// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
34 /// instructions for SelectionDAG operations.
35 ///
36 namespace {
37 
38 class AArch64DAGToDAGISel : public SelectionDAGISel {
39 
40  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
41  /// make the right decision when generating code for different targets.
42  const AArch64Subtarget *Subtarget;
43 
44 public:
45  explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
46  CodeGenOpt::Level OptLevel)
47  : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr) {}
48 
49  StringRef getPassName() const override {
50  return "AArch64 Instruction Selection";
51  }
52 
53  bool runOnMachineFunction(MachineFunction &MF) override {
54  Subtarget = &MF.getSubtarget<AArch64Subtarget>();
55  return SelectionDAGISel::runOnMachineFunction(MF);
56  }
57 
58  void Select(SDNode *Node) override;
59 
60  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
61  /// inline asm expressions.
62  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
63  unsigned ConstraintID,
64  std::vector<SDValue> &OutOps) override;
65 
66  template <signed Low, signed High, signed Scale>
67  bool SelectRDVLImm(SDValue N, SDValue &Imm);
68 
69  bool tryMLAV64LaneV128(SDNode *N);
70  bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
71  bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
72  bool SelectArithUXTXRegister(SDValue N, SDValue &Reg, SDValue &Shift);
73  bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
74  bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
75  bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
76  return SelectShiftedRegister(N, false, Reg, Shift);
77  }
78  bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
79  return SelectShiftedRegister(N, true, Reg, Shift);
80  }
81  bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
82  return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
83  }
84  bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
85  return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
86  }
87  bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
88  return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
89  }
90  bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
91  return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
92  }
93  bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
94  return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
95  }
96  bool SelectAddrModeIndexedS9S128(SDValue N, SDValue &Base, SDValue &OffImm) {
97  return SelectAddrModeIndexedBitWidth(N, true, 9, 16, Base, OffImm);
98  }
99  bool SelectAddrModeIndexedU6S128(SDValue N, SDValue &Base, SDValue &OffImm) {
100  return SelectAddrModeIndexedBitWidth(N, false, 6, 16, Base, OffImm);
101  }
102  bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
103  return SelectAddrModeIndexed(N, 1, Base, OffImm);
104  }
105  bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
106  return SelectAddrModeIndexed(N, 2, Base, OffImm);
107  }
108  bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
109  return SelectAddrModeIndexed(N, 4, Base, OffImm);
110  }
111  bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
112  return SelectAddrModeIndexed(N, 8, Base, OffImm);
113  }
114  bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
115  return SelectAddrModeIndexed(N, 16, Base, OffImm);
116  }
117  bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
118  return SelectAddrModeUnscaled(N, 1, Base, OffImm);
119  }
120  bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
121  return SelectAddrModeUnscaled(N, 2, Base, OffImm);
122  }
123  bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
124  return SelectAddrModeUnscaled(N, 4, Base, OffImm);
125  }
126  bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
127  return SelectAddrModeUnscaled(N, 8, Base, OffImm);
128  }
129  bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
130  return SelectAddrModeUnscaled(N, 16, Base, OffImm);
131  }
132  template <unsigned Size, unsigned Max>
133  bool SelectAddrModeIndexedUImm(SDValue N, SDValue &Base, SDValue &OffImm) {
134  // Test if there is an appropriate addressing mode and check if the
135  // immediate fits.
136  bool Found = SelectAddrModeIndexed(N, Size, Base, OffImm);
137  if (Found) {
138  if (auto *CI = dyn_cast<ConstantSDNode>(OffImm)) {
139  int64_t C = CI->getSExtValue();
140  if (C <= Max)
141  return true;
142  }
143  }
144 
145  // Otherwise, base only, materialize address in register.
146  Base = N;
147  OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
148  return true;
149  }
150 
151  template<int Width>
152  bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
153  SDValue &SignExtend, SDValue &DoShift) {
154  return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
155  }
156 
157  template<int Width>
158  bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
159  SDValue &SignExtend, SDValue &DoShift) {
160  return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
161  }
162 
163  bool SelectExtractHigh(SDValue N, SDValue &Res) {
164  if (Subtarget->isLittleEndian() && N->getOpcode() == ISD::BITCAST)
165  N = N->getOperand(0);
166  if (N->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
167  !isa<ConstantSDNode>(N->getOperand(1)))
168  return false;
169  EVT VT = N->getValueType(0);
170  EVT LVT = N->getOperand(0).getValueType();
171  unsigned Index = N->getConstantOperandVal(1);
172  if (!VT.is64BitVector() || !LVT.is128BitVector() ||
173  Index != VT.getVectorNumElements())
174  return false;
175  Res = N->getOperand(0);
176  return true;
177  }
178 
179  bool SelectDupZeroOrUndef(SDValue N) {
180  switch(N->getOpcode()) {
181  case ISD::UNDEF:
182  return true;
183  case AArch64ISD::DUP:
184  case ISD::SPLAT_VECTOR: {
185  auto Opnd0 = N->getOperand(0);
186  if (isNullConstant(Opnd0))
187  return true;
188  if (isNullFPConstant(Opnd0))
189  return true;
190  break;
191  }
192  default:
193  break;
194  }
195 
196  return false;
197  }
198 
199  bool SelectDupZero(SDValue N) {
200  switch(N->getOpcode()) {
201  case AArch64ISD::DUP:
202  case ISD::SPLAT_VECTOR: {
203  auto Opnd0 = N->getOperand(0);
204  if (isNullConstant(Opnd0))
205  return true;
206  if (isNullFPConstant(Opnd0))
207  return true;
208  break;
209  }
210  }
211 
212  return false;
213  }
214 
215  template<MVT::SimpleValueType VT>
216  bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
217  return SelectSVEAddSubImm(N, VT, Imm, Shift);
218  }
219 
220  template <MVT::SimpleValueType VT>
221  bool SelectSVECpyDupImm(SDValue N, SDValue &Imm, SDValue &Shift) {
222  return SelectSVECpyDupImm(N, VT, Imm, Shift);
223  }
224 
225  template <MVT::SimpleValueType VT, bool Invert = false>
226  bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
227  return SelectSVELogicalImm(N, VT, Imm, Invert);
228  }
229 
230  template <MVT::SimpleValueType VT>
231  bool SelectSVEArithImm(SDValue N, SDValue &Imm) {
232  return SelectSVEArithImm(N, VT, Imm);
233  }
234 
235  template <unsigned Low, unsigned High, bool AllowSaturation = false>
236  bool SelectSVEShiftImm(SDValue N, SDValue &Imm) {
237  return SelectSVEShiftImm(N, Low, High, AllowSaturation, Imm);
238  }
239 
240  bool SelectSVEShiftSplatImmR(SDValue N, SDValue &Imm) {
241  if (N->getOpcode() != ISD::SPLAT_VECTOR)
242  return false;
243 
244  EVT EltVT = N->getValueType(0).getVectorElementType();
245  return SelectSVEShiftImm(N->getOperand(0), /* Low */ 1,
246  /* High */ EltVT.getFixedSizeInBits(),
247  /* AllowSaturation */ true, Imm);
248  }
249 
250  // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
251  template<signed Min, signed Max, signed Scale, bool Shift>
252  bool SelectCntImm(SDValue N, SDValue &Imm) {
253  if (!isa<ConstantSDNode>(N))
254  return false;
255 
256  int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
257  if (Shift)
258  MulImm = 1LL << MulImm;
259 
260  if ((MulImm % std::abs(Scale)) != 0)
261  return false;
262 
263  MulImm /= Scale;
264  if ((MulImm >= Min) && (MulImm <= Max)) {
265  Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32);
266  return true;
267  }
268 
269  return false;
270  }
271 
272  template <signed Max, signed Scale>
273  bool SelectEXTImm(SDValue N, SDValue &Imm) {
274  if (!isa<ConstantSDNode>(N))
275  return false;
276 
277  int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
278 
279  if (MulImm >= 0 && MulImm <= Max) {
280  MulImm *= Scale;
281  Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32);
282  return true;
283  }
284 
285  return false;
286  }
287 
288  template <unsigned BaseReg> bool ImmToTile(SDValue N, SDValue &Imm) {
289  if (auto *CI = dyn_cast<ConstantSDNode>(N)) {
290  uint64_t C = CI->getZExtValue();
291  Imm = CurDAG->getRegister(BaseReg + C, MVT::Other);
292  return true;
293  }
294  return false;
295  }
296 
297  /// Form sequences of consecutive 64/128-bit registers for use in NEON
298  /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
299 /// between 1 and 4 elements. If it contains a single element, that element is
300 /// returned unchanged; otherwise a REG_SEQUENCE value is returned.
301  SDValue createDTuple(ArrayRef<SDValue> Vecs);
302  SDValue createQTuple(ArrayRef<SDValue> Vecs);
303  // Form a sequence of SVE registers for instructions using list of vectors,
304  // e.g. structured loads and stores (ldN, stN).
305  SDValue createZTuple(ArrayRef<SDValue> Vecs);
306 
307  /// Generic helper for the createDTuple/createQTuple
308  /// functions. Those should almost always be called instead.
309  SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
310  const unsigned SubRegs[]);
311 
312  void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
313 
314  bool tryIndexedLoad(SDNode *N);
315 
316  bool trySelectStackSlotTagP(SDNode *N);
317  void SelectTagP(SDNode *N);
318 
319  void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
320  unsigned SubRegIdx);
321  void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
322  unsigned SubRegIdx);
323  void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
324  void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
325  void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
326  unsigned Opc_rr, unsigned Opc_ri,
327  bool IsIntr = false);
328 
329  bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
330  /// SVE Reg+Imm addressing mode.
331  template <int64_t Min, int64_t Max>
332  bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
333  SDValue &OffImm);
334  /// SVE Reg+Reg address mode.
335  template <unsigned Scale>
336  bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
337  return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
338  }
339 
340  template <unsigned Scale>
341  bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
342  return SelectSMETileSlice(N, Scale, Vector, Offset);
343  }
344 
345  void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
346  void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
347  void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
348  void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
349  void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale,
350  unsigned Opc_rr, unsigned Opc_ri);
351  std::tuple<unsigned, SDValue, SDValue>
352  findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri,
353  const SDValue &OldBase, const SDValue &OldOffset,
354  unsigned Scale);
355 
356  bool tryBitfieldExtractOp(SDNode *N);
357  bool tryBitfieldExtractOpFromSExt(SDNode *N);
358  bool tryBitfieldInsertOp(SDNode *N);
359  bool tryBitfieldInsertInZeroOp(SDNode *N);
360  bool tryShiftAmountMod(SDNode *N);
361  bool tryHighFPExt(SDNode *N);
362 
363  bool tryReadRegister(SDNode *N);
364  bool tryWriteRegister(SDNode *N);
365 
366 // Include the pieces autogenerated from the target description.
367 #include "AArch64GenDAGISel.inc"
368 
369 private:
370  bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
371  SDValue &Shift);
372  bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
373  SDValue &OffImm) {
374  return SelectAddrModeIndexedBitWidth(N, true, 7, Size, Base, OffImm);
375  }
376  bool SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, unsigned BW,
377  unsigned Size, SDValue &Base,
378  SDValue &OffImm);
379  bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
380  SDValue &OffImm);
381  bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
382  SDValue &OffImm);
383  bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
384  SDValue &Offset, SDValue &SignExtend,
385  SDValue &DoShift);
386  bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
387  SDValue &Offset, SDValue &SignExtend,
388  SDValue &DoShift);
389  bool isWorthFolding(SDValue V) const;
390  bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
391  SDValue &Offset, SDValue &SignExtend);
392 
393  template<unsigned RegWidth>
394  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
395  return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
396  }
397 
398  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
399 
400  bool SelectCMP_SWAP(SDNode *N);
401 
402  bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
403  bool SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
404  bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm, bool Invert);
405 
406  bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
407  bool SelectSVEShiftImm(SDValue N, uint64_t Low, uint64_t High,
408  bool AllowSaturation, SDValue &Imm);
409 
410  bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
411  bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
412  SDValue &Offset);
413  bool SelectSMETileSlice(SDValue N, unsigned Scale, SDValue &Vector,
414  SDValue &Offset);
415 
416  bool SelectAllActivePredicate(SDValue N);
417 };
418 } // end anonymous namespace
419 
420 /// isIntImmediate - This method tests to see if the node is a constant
421 /// operand. If so Imm will receive the 32-bit value.
422 static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
423  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
424  Imm = C->getZExtValue();
425  return true;
426  }
427  return false;
428 }
429 
430 // isIntImmediate - This method tests to see if N is a constant operand.
431 // If so Imm will receive the value.
432 static bool isIntImmediate(SDValue N, uint64_t &Imm) {
433  return isIntImmediate(N.getNode(), Imm);
434 }
435 
436 // isOpcWithIntImmediate - This method tests to see if the node is a specific
437 // opcode and that it has an immediate integer right operand.
438 // If so Imm will receive the 32 bit value.
439 static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
440  uint64_t &Imm) {
441  return N->getOpcode() == Opc &&
442  isIntImmediate(N->getOperand(1).getNode(), Imm);
443 }
444 
445 // isIntImmediateEq - This method tests to see if N is a constant operand that
446 // is equivalent to 'ImmExpected'.
447 #ifndef NDEBUG
448 static bool isIntImmediateEq(SDValue N, const uint64_t ImmExpected) {
449  uint64_t Imm;
450  if (!isIntImmediate(N.getNode(), Imm))
451  return false;
452  return Imm == ImmExpected;
453 }
454 #endif
455 
456 bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
457  const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
458  switch(ConstraintID) {
459  default:
460  llvm_unreachable("Unexpected asm memory constraint");
461  case InlineAsm::Constraint_m:
462  case InlineAsm::Constraint_o:
463  case InlineAsm::Constraint_Q:
464  // We need to make sure that this one operand does not end up in XZR, thus
465  // require the address to be in a PointerRegClass register.
466  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
467  const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF);
468  SDLoc dl(Op);
469  SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64);
470  SDValue NewOp =
471  SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
472  dl, Op.getValueType(),
473  Op, RC), 0);
474  OutOps.push_back(NewOp);
475  return false;
476  }
477  return true;
478 }
479 
480 /// SelectArithImmed - Select an immediate value that can be represented as
481 /// a 12-bit value shifted left by either 0 or 12. If so, return true with
482 /// Val set to the 12-bit value and Shift set to the shifter operand.
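/// For example, 0xabc matches as (Val=0xabc, LSL #0) and 0xabc000 matches as
/// (Val=0xabc, LSL #12), while 0xabc00 has bits in both halves and is rejected.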
483 bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
484  SDValue &Shift) {
485  // This function is called from the addsub_shifted_imm ComplexPattern,
486  // which lists [imm] as the list of opcodes it's interested in; however,
487  // we still need to check whether the operand is actually an immediate
488  // here because the ComplexPattern opcode list is only used in
489  // root-level opcode matching.
490  if (!isa<ConstantSDNode>(N.getNode()))
491  return false;
492 
493  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
494  unsigned ShiftAmt;
495 
496  if (Immed >> 12 == 0) {
497  ShiftAmt = 0;
498  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
499  ShiftAmt = 12;
500  Immed = Immed >> 12;
501  } else
502  return false;
503 
504  unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
505  SDLoc dl(N);
506  Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32);
507  Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32);
508  return true;
509 }
510 
511 /// SelectNegArithImmed - As above, but negates the value before trying to
512 /// select it.
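/// This lets an ADD of a negative constant be selected as a SUB of the negated
/// value (and vice versa), e.g. an add of -3 becomes "sub x0, x1, #3", since
/// the ADD/SUB immediate field is an unsigned 12-bit value.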
513 bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
514  SDValue &Shift) {
515  // This function is called from the addsub_shifted_imm ComplexPattern,
516  // which lists [imm] as the list of opcodes it's interested in; however,
517  // we still need to check whether the operand is actually an immediate
518  // here because the ComplexPattern opcode list is only used in
519  // root-level opcode matching.
520  if (!isa<ConstantSDNode>(N.getNode()))
521  return false;
522 
523  // The immediate operand must be a 24-bit zero-extended immediate.
524  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
525 
526  // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
527  // have the opposite effect on the C flag, so this pattern mustn't match under
528  // those circumstances.
529  if (Immed == 0)
530  return false;
531 
532  if (N.getValueType() == MVT::i32)
533  Immed = ~((uint32_t)Immed) + 1;
534  else
535  Immed = ~Immed + 1ULL;
536  if (Immed & 0xFFFFFFFFFF000000ULL)
537  return false;
538 
539  Immed &= 0xFFFFFFULL;
540  return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val,
541  Shift);
542 }
543 
544 /// getShiftTypeForNode - Translate a shift node to the corresponding
545 /// ShiftType value.
546 static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
547  switch (N.getOpcode()) {
548  default:
549  return AArch64_AM::InvalidShiftExtend;
550  case ISD::SHL:
551  return AArch64_AM::LSL;
552  case ISD::SRL:
553  return AArch64_AM::LSR;
554  case ISD::SRA:
555  return AArch64_AM::ASR;
556  case ISD::ROTR:
557  return AArch64_AM::ROR;
558  }
559 }
560 
561 /// Determine whether it is worth it to fold SHL into the addressing
562 /// mode.
563 static bool isWorthFoldingSHL(SDValue V) {
564  assert(V.getOpcode() == ISD::SHL && "invalid opcode");
565  // It is worth folding a logical shift of up to three places.
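  // Shifts of 1-3 correspond to the free LSL-by-log2(size) scaling that the
  // register-offset addressing modes provide for 2-, 4- and 8-byte accesses.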
566  auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
567  if (!CSD)
568  return false;
569  unsigned ShiftVal = CSD->getZExtValue();
570  if (ShiftVal > 3)
571  return false;
572 
573  // Check if this particular node is reused in any non-memory related
574  // operation. If yes, do not try to fold this node into the address
575  // computation, since the computation will be kept.
576  const SDNode *Node = V.getNode();
577  for (SDNode *UI : Node->uses())
578  if (!isa<MemSDNode>(*UI))
579  for (SDNode *UII : UI->uses())
580  if (!isa<MemSDNode>(*UII))
581  return false;
582  return true;
583 }
584 
585 /// Determine whether it is worth folding V into an extended register.
586 bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
587  // Trivial if we are optimizing for code size or if there is only
588  // one use of the value.
589  if (CurDAG->shouldOptForSize() || V.hasOneUse())
590  return true;
591  // If a subtarget has a fastpath LSL we can fold a logical shift into
592  // the addressing mode and save a cycle.
593  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
594  isWorthFoldingSHL(V))
595  return true;
596  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
597  const SDValue LHS = V.getOperand(0);
598  const SDValue RHS = V.getOperand(1);
599  if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
600  return true;
601  if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
602  return true;
603  }
604 
605  // It hurts otherwise, since the value will be reused.
606  return false;
607 }
608 
609 /// SelectShiftedRegister - Select a "shifted register" operand. If the value
610 /// is not shifted, set the Shift operand to default of "LSL 0". The logical
611 /// instructions allow the shifted register to be rotated, but the arithmetic
612 /// instructions do not. The AllowROR parameter specifies whether ROR is
613 /// supported.
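/// For example, (or x, (shl y, 12)) becomes "orr x0, x1, x2, lsl #12", and the
/// logical forms may also fold a rotate, e.g. "and x0, x1, x2, ror #7".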
614 bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
615  SDValue &Reg, SDValue &Shift) {
616  AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
617  if (ShType == AArch64_AM::InvalidShiftExtend)
618  return false;
619  if (!AllowROR && ShType == AArch64_AM::ROR)
620  return false;
621 
622  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
623  unsigned BitSize = N.getValueSizeInBits();
624  unsigned Val = RHS->getZExtValue() & (BitSize - 1);
625  unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
626 
627  Reg = N.getOperand(0);
628  Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
629  return isWorthFolding(N);
630  }
631 
632  return false;
633 }
634 
635 /// getExtendTypeForNode - Translate an extend node to the corresponding
636 /// ExtendType value.
637 static AArch64_AM::ShiftExtendType
638 getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
639  if (N.getOpcode() == ISD::SIGN_EXTEND ||
640  N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
641  EVT SrcVT;
642  if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
643  SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
644  else
645  SrcVT = N.getOperand(0).getValueType();
646 
647  if (!IsLoadStore && SrcVT == MVT::i8)
648  return AArch64_AM::SXTB;
649  else if (!IsLoadStore && SrcVT == MVT::i16)
650  return AArch64_AM::SXTH;
651  else if (SrcVT == MVT::i32)
652  return AArch64_AM::SXTW;
653  assert(SrcVT != MVT::i64 && "extend from 64-bits?");
654 
655  return AArch64_AM::InvalidShiftExtend;
656  } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
657  N.getOpcode() == ISD::ANY_EXTEND) {
658  EVT SrcVT = N.getOperand(0).getValueType();
659  if (!IsLoadStore && SrcVT == MVT::i8)
660  return AArch64_AM::UXTB;
661  else if (!IsLoadStore && SrcVT == MVT::i16)
662  return AArch64_AM::UXTH;
663  else if (SrcVT == MVT::i32)
664  return AArch64_AM::UXTW;
665  assert(SrcVT != MVT::i64 && "extend from 64-bits?");
666 
667  return AArch64_AM::InvalidShiftExtend;
668  } else if (N.getOpcode() == ISD::AND) {
669  ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
670  if (!CSD)
671  return AArch64_AM::InvalidShiftExtend;
672  uint64_t AndMask = CSD->getZExtValue();
673 
674  switch (AndMask) {
675  default:
676  return AArch64_AM::InvalidShiftExtend;
677  case 0xFF:
678  return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
679  case 0xFFFF:
680  return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
681  case 0xFFFFFFFF:
682  return AArch64_AM::UXTW;
683  }
684  }
685 
686  return AArch64_AM::InvalidShiftExtend;
687 }
688 
689 // Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
690 static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
691  if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
692  DL->getOpcode() != AArch64ISD::DUPLANE32)
693  return false;
694 
695  SDValue SV = DL->getOperand(0);
696  if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
697  return false;
698 
699  SDValue EV = SV.getOperand(1);
700  if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
701  return false;
702 
703  ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
704  ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
705  LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
706  LaneOp = EV.getOperand(0);
707 
708  return true;
709 }
710 
711 // Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
712 // high lane extract.
713 static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
714  SDValue &LaneOp, int &LaneIdx) {
715 
716  if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
717  std::swap(Op0, Op1);
718  if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
719  return false;
720  }
721  StdOp = Op1;
722  return true;
723 }
724 
725 /// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
726 /// is a lane in the upper half of a 128-bit vector. Recognize and select this
727 /// so that we don't emit unnecessary lane extracts.
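/// For example, a v4i16 MLA whose second multiplicand is lane 5 of a v8i16 can
/// be selected as "mla v0.4h, v1.4h, v2.h[5]" without an explicit extract.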
728 bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
729  SDLoc dl(N);
730  SDValue Op0 = N->getOperand(0);
731  SDValue Op1 = N->getOperand(1);
732  SDValue MLAOp1; // Will hold ordinary multiplicand for MLA.
733  SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA.
734  int LaneIdx = -1; // Will hold the lane index.
735 
736  if (Op1.getOpcode() != ISD::MUL ||
737  !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
738  LaneIdx)) {
739  std::swap(Op0, Op1);
740  if (Op1.getOpcode() != ISD::MUL ||
741  !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
742  LaneIdx))
743  return false;
744  }
745 
746  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
747 
748  SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
749 
750  unsigned MLAOpc = ~0U;
751 
752  switch (N->getSimpleValueType(0).SimpleTy) {
753  default:
754  llvm_unreachable("Unrecognized MLA.");
755  case MVT::v4i16:
756  MLAOpc = AArch64::MLAv4i16_indexed;
757  break;
758  case MVT::v8i16:
759  MLAOpc = AArch64::MLAv8i16_indexed;
760  break;
761  case MVT::v2i32:
762  MLAOpc = AArch64::MLAv2i32_indexed;
763  break;
764  case MVT::v4i32:
765  MLAOpc = AArch64::MLAv4i32_indexed;
766  break;
767  }
768 
769  ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
770  return true;
771 }
772 
773 bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
774  SDLoc dl(N);
775  SDValue SMULLOp0;
776  SDValue SMULLOp1;
777  int LaneIdx;
778 
779  if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
780  LaneIdx))
781  return false;
782 
783  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
784 
785  SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
786 
787  unsigned SMULLOpc = ~0U;
788 
789  if (IntNo == Intrinsic::aarch64_neon_smull) {
790  switch (N->getSimpleValueType(0).SimpleTy) {
791  default:
792  llvm_unreachable("Unrecognized SMULL.");
793  case MVT::v4i32:
794  SMULLOpc = AArch64::SMULLv4i16_indexed;
795  break;
796  case MVT::v2i64:
797  SMULLOpc = AArch64::SMULLv2i32_indexed;
798  break;
799  }
800  } else if (IntNo == Intrinsic::aarch64_neon_umull) {
801  switch (N->getSimpleValueType(0).SimpleTy) {
802  default:
803  llvm_unreachable("Unrecognized SMULL.");
804  case MVT::v4i32:
805  SMULLOpc = AArch64::UMULLv4i16_indexed;
806  break;
807  case MVT::v2i64:
808  SMULLOpc = AArch64::UMULLv2i32_indexed;
809  break;
810  }
811  } else
812  llvm_unreachable("Unrecognized intrinsic.");
813 
814  ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
815  return true;
816 }
817 
818 /// Instructions that accept extend modifiers like UXTW expect the register
819 /// being extended to be a GPR32, but the incoming DAG might be acting on a
820 /// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
821 /// this is the case.
822 static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
823  if (N.getValueType() == MVT::i32)
824  return N;
825 
826  SDLoc dl(N);
827  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
828  MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
829  dl, MVT::i32, N, SubReg);
830  return SDValue(Node, 0);
831 }
832 
833 // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
834 template<signed Low, signed High, signed Scale>
835 bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
836  if (!isa<ConstantSDNode>(N))
837  return false;
838 
839  int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
840  if ((MulImm % std::abs(Scale)) == 0) {
841  int64_t RDVLImm = MulImm / Scale;
842  if ((RDVLImm >= Low) && (RDVLImm <= High)) {
843  Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32);
844  return true;
845  }
846  }
847 
848  return false;
849 }
850 
851 /// SelectArithExtendedRegister - Select a "extended register" operand. This
852 /// operand folds in an extend followed by an optional left shift.
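/// For example, (add x0, (shl (sext_inreg x1, i16), 2)) can be selected as
/// "add x0, x0, w1, sxth #2".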
853 bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
854  SDValue &Shift) {
855  unsigned ShiftVal = 0;
856  AArch64_AM::ShiftExtendType Ext;
857 
858  if (N.getOpcode() == ISD::SHL) {
859  ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
860  if (!CSD)
861  return false;
862  ShiftVal = CSD->getZExtValue();
863  if (ShiftVal > 4)
864  return false;
865 
866  Ext = getExtendTypeForNode(N.getOperand(0));
867  if (Ext == AArch64_AM::InvalidShiftExtend)
868  return false;
869 
870  Reg = N.getOperand(0).getOperand(0);
871  } else {
872  Ext = getExtendTypeForNode(N);
873  if (Ext == AArch64_AM::InvalidShiftExtend)
874  return false;
875 
876  Reg = N.getOperand(0);
877 
878  // Don't match if free 32-bit -> 64-bit zext can be used instead. Use the
879  // isDef32 as a heuristic for when the operand is likely to be a 32bit def.
880  auto isDef32 = [](SDValue N) {
881  unsigned Opc = N.getOpcode();
882  return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
883  Opc != ISD::CopyFromReg && Opc != ISD::AssertSext &&
884  Opc != ISD::AssertZext && Opc != ISD::AssertAlign &&
885  Opc != ISD::FREEZE;
886  };
887  if (Ext == AArch64_AM::UXTW && Reg->getValueType(0).getSizeInBits() == 32 &&
888  isDef32(Reg))
889  return false;
890  }
891 
892  // AArch64 mandates that the RHS of the operation must use the smallest
893  // register class that could contain the size being extended from. Thus,
894  // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
895  // there might not be an actual 32-bit value in the program. We can
896  // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
898  Reg = narrowIfNeeded(CurDAG, Reg);
899  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
900  MVT::i32);
901  return isWorthFolding(N);
902 }
903 
904 /// SelectArithUXTXRegister - Select a "UXTX register" operand. This
905 /// operand is used by instructions that have an SP operand.
906 bool AArch64DAGToDAGISel::SelectArithUXTXRegister(SDValue N, SDValue &Reg,
907  SDValue &Shift) {
908  unsigned ShiftVal = 0;
909  AArch64_AM::ShiftExtendType Ext;
910 
911  if (N.getOpcode() != ISD::SHL)
912  return false;
913 
914  ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
915  if (!CSD)
916  return false;
917  ShiftVal = CSD->getZExtValue();
918  if (ShiftVal > 4)
919  return false;
920 
921  Ext = AArch64_AM::UXTX;
922  Reg = N.getOperand(0);
923  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
924  MVT::i32);
925  return isWorthFolding(N);
926 }
927 
928 /// If there's a use of this ADDlow that's not itself a load/store then we'll
929 /// need to create a real ADD instruction from it anyway and there's no point in
930 /// folding it into the mem op. Theoretically, it shouldn't matter, but there's
931 /// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
932 /// leads to duplicated ADRP instructions.
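/// Folding is profitable when every user is a load or store, since the low 12
/// bits of the address can then ride along on the memory op, e.g.:
///   adrp x8, var
///   ldr  w0, [x8, :lo12:var]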
933 static bool isWorthFoldingADDlow(SDValue N) {
934  for (auto *Use : N->uses()) {
935  if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
936  Use->getOpcode() != ISD::ATOMIC_LOAD &&
937  Use->getOpcode() != ISD::ATOMIC_STORE)
938  return false;
939 
940  // ldar and stlr have much more restrictive addressing modes (just a
941  // register).
942  if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getSuccessOrdering()))
943  return false;
944  }
945 
946  return true;
947 }
948 
949 /// SelectAddrModeIndexedBitWidth - Select a "register plus scaled (un)signed BW-bit
950 /// immediate" address. The "Size" argument is the size in bytes of the memory
951 /// reference, which determines the scale.
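/// For example, with a signed 7-bit immediate and Size == 8 (LDP/STP of X
/// registers), the byte offset must be a multiple of 8 in the range [-512, 504].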
952 bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm,
953  unsigned BW, unsigned Size,
954  SDValue &Base,
955  SDValue &OffImm) {
956  SDLoc dl(N);
957  const DataLayout &DL = CurDAG->getDataLayout();
958  const TargetLowering *TLI = getTargetLowering();
959  if (N.getOpcode() == ISD::FrameIndex) {
960  int FI = cast<FrameIndexSDNode>(N)->getIndex();
961  Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
962  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
963  return true;
964  }
965 
966  // As opposed to the (12-bit) Indexed addressing mode below, the 7/9-bit signed
967  // addressing selected here doesn't support labels/immediates, only base+offset.
968  if (CurDAG->isBaseWithConstantOffset(N)) {
969  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
970  if (IsSignedImm) {
971  int64_t RHSC = RHS->getSExtValue();
972  unsigned Scale = Log2_32(Size);
973  int64_t Range = 0x1LL << (BW - 1);
974 
975  if ((RHSC & (Size - 1)) == 0 && RHSC >= -(Range << Scale) &&
976  RHSC < (Range << Scale)) {
977  Base = N.getOperand(0);
978  if (Base.getOpcode() == ISD::FrameIndex) {
979  int FI = cast<FrameIndexSDNode>(Base)->getIndex();
980  Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
981  }
982  OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
983  return true;
984  }
985  } else {
986  // unsigned Immediate
987  uint64_t RHSC = RHS->getZExtValue();
988  unsigned Scale = Log2_32(Size);
989  uint64_t Range = 0x1ULL << BW;
990 
991  if ((RHSC & (Size - 1)) == 0 && RHSC < (Range << Scale)) {
992  Base = N.getOperand(0);
993  if (Base.getOpcode() == ISD::FrameIndex) {
994  int FI = cast<FrameIndexSDNode>(Base)->getIndex();
995  Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
996  }
997  OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
998  return true;
999  }
1000  }
1001  }
1002  }
1003  // Base only. The address will be materialized into a register before
1004  // the memory is accessed.
1005  // add x0, Xbase, #offset
1006  // stp x1, x2, [x0]
1007  Base = N;
1008  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
1009  return true;
1010 }
1011 
1012 /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
1013 /// immediate" address. The "Size" argument is the size in bytes of the memory
1014 /// reference, which determines the scale.
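/// For example, an 8-byte LDR/STR accepts byte offsets that are multiples of 8
/// in the range [0, 32760] (i.e. 4095 * 8).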
1015 bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
1016  SDValue &Base, SDValue &OffImm) {
1017  SDLoc dl(N);
1018  const DataLayout &DL = CurDAG->getDataLayout();
1019  const TargetLowering *TLI = getTargetLowering();
1020  if (N.getOpcode() == ISD::FrameIndex) {
1021  int FI = cast<FrameIndexSDNode>(N)->getIndex();
1022  Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
1023  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
1024  return true;
1025  }
1026 
1027  if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
1028  GlobalAddressSDNode *GAN =
1029  dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
1030  Base = N.getOperand(0);
1031  OffImm = N.getOperand(1);
1032  if (!GAN)
1033  return true;
1034 
1035  if (GAN->getOffset() % Size == 0 &&
1036  GAN->getGlobal()->getPointerAlignment(DL) >= Size)
1037  return true;
1038  }
1039 
1040  if (CurDAG->isBaseWithConstantOffset(N)) {
1041  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1042  int64_t RHSC = (int64_t)RHS->getZExtValue();
1043  unsigned Scale = Log2_32(Size);
1044  if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
1045  Base = N.getOperand(0);
1046  if (Base.getOpcode() == ISD::FrameIndex) {
1047  int FI = cast<FrameIndexSDNode>(Base)->getIndex();
1048  Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
1049  }
1050  OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
1051  return true;
1052  }
1053  }
1054  }
1055 
1056  // Before falling back to our general case, check if the unscaled
1057  // instructions can handle this. If so, that's preferable.
1058  if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
1059  return false;
1060 
1061  // Base only. The address will be materialized into a register before
1062  // the memory is accessed.
1063  // add x0, Xbase, #offset
1064  // ldr x0, [x0]
1065  Base = N;
1066  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
1067  return true;
1068 }
1069 
1070 /// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
1071 /// immediate" address. This should only match when there is an offset that
1072 /// is not valid for a scaled immediate addressing mode. The "Size" argument
1073 /// is the size in bytes of the memory reference, which is needed here to know
1074 /// what is valid for a scaled immediate.
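/// Such offsets select the LDUR/STUR instruction forms, which accept any byte
/// offset in [-256, 255] regardless of the access size.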
1075 bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
1076  SDValue &Base,
1077  SDValue &OffImm) {
1078  if (!CurDAG->isBaseWithConstantOffset(N))
1079  return false;
1080  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1081  int64_t RHSC = RHS->getSExtValue();
1082  // If the offset is valid as a scaled immediate, don't match here.
1083  if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
1084  RHSC < (0x1000 << Log2_32(Size)))
1085  return false;
1086  if (RHSC >= -256 && RHSC < 256) {
1087  Base = N.getOperand(0);
1088  if (Base.getOpcode() == ISD::FrameIndex) {
1089  int FI = cast<FrameIndexSDNode>(Base)->getIndex();
1090  const TargetLowering *TLI = getTargetLowering();
1091  Base = CurDAG->getTargetFrameIndex(
1092  FI, TLI->getPointerTy(CurDAG->getDataLayout()));
1093  }
1094  OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
1095  return true;
1096  }
1097  }
1098  return false;
1099 }
1100 
1101 static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
1102  SDLoc dl(N);
1103  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
1104  SDValue ImpDef = SDValue(
1105  CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0);
1106  MachineSDNode *Node = CurDAG->getMachineNode(
1107  TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
1108  return SDValue(Node, 0);
1109 }
1110 
1111 /// Check if the given SHL node (\p N), can be used to form an
1112 /// extended register for an addressing mode.
1113 bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
1114  bool WantExtend, SDValue &Offset,
1115  SDValue &SignExtend) {
1116  assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
1117  ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
1118  if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
1119  return false;
1120 
1121  SDLoc dl(N);
1122  if (WantExtend) {
1123  AArch64_AM::ShiftExtendType Ext =
1124  getExtendTypeForNode(N.getOperand(0), true);
1125  if (Ext == AArch64_AM::InvalidShiftExtend)
1126  return false;
1127 
1128  Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
1129  SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1130  MVT::i32);
1131  } else {
1132  Offset = N.getOperand(0);
1133  SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
1134  }
1135 
1136  unsigned LegalShiftVal = Log2_32(Size);
1137  unsigned ShiftVal = CSD->getZExtValue();
1138 
1139  if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
1140  return false;
1141 
1142  return isWorthFolding(N);
1143 }
1144 
1145 bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
1146  SDValue &Base, SDValue &Offset,
1147  SDValue &SignExtend,
1148  SDValue &DoShift) {
1149  if (N.getOpcode() != ISD::ADD)
1150  return false;
1151  SDValue LHS = N.getOperand(0);
1152  SDValue RHS = N.getOperand(1);
1153  SDLoc dl(N);
1154 
1155  // We don't want to match immediate adds here, because they are better lowered
1156  // to the register-immediate addressing modes.
1157  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
1158  return false;
1159 
1160  // Check if this particular node is reused in any non-memory related
1161  // operation. If yes, do not try to fold this node into the address
1162  // computation, since the computation will be kept.
1163  const SDNode *Node = N.getNode();
1164  for (SDNode *UI : Node->uses()) {
1165  if (!isa<MemSDNode>(*UI))
1166  return false;
1167  }
1168 
1169  // Remember if it is worth folding N when it produces extended register.
1170  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
1171 
1172  // Try to match a shifted extend on the RHS.
1173  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
1174  SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
1175  Base = LHS;
1176  DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
1177  return true;
1178  }
1179 
1180  // Try to match a shifted extend on the LHS.
1181  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
1182  SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
1183  Base = RHS;
1184  DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
1185  return true;
1186  }
1187 
1188  // There was no shift, whatever else we find.
1189  DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
1190 
1191  AArch64_AM::ShiftExtendType Ext;
1192  // Try to match an unshifted extend on the LHS.
1193  if (IsExtendedRegisterWorthFolding &&
1194  (Ext = getExtendTypeForNode(LHS, true)) !=
1195  AArch64_AM::InvalidShiftExtend) {
1196  Base = RHS;
1197  Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
1198  SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1199  MVT::i32);
1200  if (isWorthFolding(LHS))
1201  return true;
1202  }
1203 
1204  // Try to match an unshifted extend on the RHS.
1205  if (IsExtendedRegisterWorthFolding &&
1206  (Ext = getExtendTypeForNode(RHS, true)) !=
1207  AArch64_AM::InvalidShiftExtend) {
1208  Base = LHS;
1209  Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
1210  SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1211  MVT::i32);
1212  if (isWorthFolding(RHS))
1213  return true;
1214  }
1215 
1216  return false;
1217 }
1218 
1219 // Check if the given immediate is preferred by ADD. If an immediate can be
1220 // encoded in an ADD, or it can be encoded in an "ADD LSL #12" and can not be
1221 // encoded by one MOVZ, return true.
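// For example, 0x45000 is preferred (it needs "add ..., #0x45, lsl #12"), while
// 0x30000 is not, since a single "movz xN, #0x3, lsl #16" already materializes it.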
1222 static bool isPreferredADD(int64_t ImmOff) {
1223  // Constant in [0x0, 0xfff] can be encoded in ADD.
1224  if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
1225  return true;
1226  // Check if it can be encoded in an "ADD LSL #12".
1227  if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
1228  // As a single MOVZ is faster than an "ADD with LSL #12", ignore such a constant.
1229  return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
1230  (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
1231  return false;
1232 }
1233 
1234 bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
1235  SDValue &Base, SDValue &Offset,
1236  SDValue &SignExtend,
1237  SDValue &DoShift) {
1238  if (N.getOpcode() != ISD::ADD)
1239  return false;
1240  SDValue LHS = N.getOperand(0);
1241  SDValue RHS = N.getOperand(1);
1242  SDLoc DL(N);
1243 
1244  // Check if this particular node is reused in any non-memory related
1245  // operation. If yes, do not try to fold this node into the address
1246  // computation, since the computation will be kept.
1247  const SDNode *Node = N.getNode();
1248  for (SDNode *UI : Node->uses()) {
1249  if (!isa<MemSDNode>(*UI))
1250  return false;
1251  }
1252 
1253  // Watch out if RHS is a wide immediate, it can not be selected into
1254  // [BaseReg+Imm] addressing mode. Also it may not be able to be encoded into
1255  // ADD/SUB. Instead it will use [BaseReg + 0] address mode and generate
1256  // instructions like:
1257  // MOV X0, WideImmediate
1258  // ADD X1, BaseReg, X0
1259  // LDR X2, [X1, 0]
1260  // For such situation, using [BaseReg, XReg] addressing mode can save one
1261  // ADD/SUB:
1262  // MOV X0, WideImmediate
1263  // LDR X2, [BaseReg, X0]
1264  if (isa<ConstantSDNode>(RHS)) {
1265  int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
1266  unsigned Scale = Log2_32(Size);
1267  // Skip if the immediate can be selected by the load/store addressing mode.
1268  // Also skip if the immediate can be encoded by a single ADD (SUB is also
1269  // checked by using -ImmOff).
1270  if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
1271  isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
1272  return false;
1273 
1274  SDValue Ops[] = { RHS };
1275  SDNode *MOVI =
1276  CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
1277  SDValue MOVIV = SDValue(MOVI, 0);
1278  // This ADD of two X register will be selected into [Reg+Reg] mode.
1279  N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
1280  }
1281 
1282  // Remember if it is worth folding N when it produces extended register.
1283  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
1284 
1285  // Try to match a shifted extend on the RHS.
1286  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
1287  SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
1288  Base = LHS;
1289  DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1290  return true;
1291  }
1292 
1293  // Try to match a shifted extend on the LHS.
1294  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
1295  SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
1296  Base = RHS;
1297  DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1298  return true;
1299  }
1300 
1301  // Match any non-shifted, non-extend, non-immediate add expression.
1302  Base = LHS;
1303  Offset = RHS;
1304  SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
1305  DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
1306  // Reg1 + Reg2 is free: no check needed.
1307  return true;
1308 }
1309 
1310 SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
1311  static const unsigned RegClassIDs[] = {
1312  AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
1313  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
1314  AArch64::dsub2, AArch64::dsub3};
1315 
1316  return createTuple(Regs, RegClassIDs, SubRegs);
1317 }
1318 
1319 SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
1320  static const unsigned RegClassIDs[] = {
1321  AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
1322  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
1323  AArch64::qsub2, AArch64::qsub3};
1324 
1325  return createTuple(Regs, RegClassIDs, SubRegs);
1326 }
1327 
1328 SDValue AArch64DAGToDAGISel::createZTuple(ArrayRef<SDValue> Regs) {
1329  static const unsigned RegClassIDs[] = {AArch64::ZPR2RegClassID,
1330  AArch64::ZPR3RegClassID,
1331  AArch64::ZPR4RegClassID};
1332  static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
1333  AArch64::zsub2, AArch64::zsub3};
1334 
1335  return createTuple(Regs, RegClassIDs, SubRegs);
1336 }
1337 
1338 SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
1339  const unsigned RegClassIDs[],
1340  const unsigned SubRegs[]) {
1341  // There's no special register-class for a vector-list of 1 element: it's just
1342  // a vector.
1343  if (Regs.size() == 1)
1344  return Regs[0];
1345 
1346  assert(Regs.size() >= 2 && Regs.size() <= 4);
1347 
1348  SDLoc DL(Regs[0]);
1349 
1350  SmallVector<SDValue, 4> Ops;
1351 
1352  // First operand of REG_SEQUENCE is the desired RegClass.
1353  Ops.push_back(
1354  CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
1355 
1356  // Then we get pairs of source & subregister-position for the components.
1357  for (unsigned i = 0; i < Regs.size(); ++i) {
1358  Ops.push_back(Regs[i]);
1359  Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
1360  }
1361 
1362  SDNode *N =
1363  CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
1364  return SDValue(N, 0);
1365 }
1366 
1367 void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
1368  bool isExt) {
1369  SDLoc dl(N);
1370  EVT VT = N->getValueType(0);
1371 
1372  unsigned ExtOff = isExt;
1373 
1374  // Form a REG_SEQUENCE to force register allocation.
1375  unsigned Vec0Off = ExtOff + 1;
1376  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
1377  N->op_begin() + Vec0Off + NumVecs);
1378  SDValue RegSeq = createQTuple(Regs);
1379 
1380  SmallVector<SDValue, 6> Ops;
1381  if (isExt)
1382  Ops.push_back(N->getOperand(1));
1383  Ops.push_back(RegSeq);
1384  Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
1385  ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
1386 }
1387 
1388 bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
1389  LoadSDNode *LD = cast<LoadSDNode>(N);
1390  if (LD->isUnindexed())
1391  return false;
1392  EVT VT = LD->getMemoryVT();
1393  EVT DstVT = N->getValueType(0);
1394  ISD::MemIndexedMode AM = LD->getAddressingMode();
1395  bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
1396 
1397  // We're not doing validity checking here. That was done when checking
1398  // if we should mark the load as indexed or not. We're just selecting
1399  // the right instruction.
1400  unsigned Opcode = 0;
1401 
1402  ISD::LoadExtType ExtType = LD->getExtensionType();
1403  bool InsertTo64 = false;
1404  if (VT == MVT::i64)
1405  Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
1406  else if (VT == MVT::i32) {
1407  if (ExtType == ISD::NON_EXTLOAD)
1408  Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1409  else if (ExtType == ISD::SEXTLOAD)
1410  Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
1411  else {
1412  Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1413  InsertTo64 = true;
1414  // The result of the load is only i32. It's the subreg_to_reg that makes
1415  // it into an i64.
1416  DstVT = MVT::i32;
1417  }
1418  } else if (VT == MVT::i16) {
1419  if (ExtType == ISD::SEXTLOAD) {
1420  if (DstVT == MVT::i64)
1421  Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
1422  else
1423  Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
1424  } else {
1425  Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
1426  InsertTo64 = DstVT == MVT::i64;
1427  // The result of the load is only i32. It's the subreg_to_reg that makes
1428  // it into an i64.
1429  DstVT = MVT::i32;
1430  }
1431  } else if (VT == MVT::i8) {
1432  if (ExtType == ISD::SEXTLOAD) {
1433  if (DstVT == MVT::i64)
1434  Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
1435  else
1436  Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
1437  } else {
1438  Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
1439  InsertTo64 = DstVT == MVT::i64;
1440  // The result of the load is only i32. It's the subreg_to_reg that makes
1441  // it into an i64.
1442  DstVT = MVT::i32;
1443  }
1444  } else if (VT == MVT::f16) {
1445  Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
1446  } else if (VT == MVT::bf16) {
1447  Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
1448  } else if (VT == MVT::f32) {
1449  Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
1450  } else if (VT == MVT::f64 || VT.is64BitVector()) {
1451  Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
1452  } else if (VT.is128BitVector()) {
1453  Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
1454  } else
1455  return false;
1456  SDValue Chain = LD->getChain();
1457  SDValue Base = LD->getBasePtr();
1458  ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
1459  int OffsetVal = (int)OffsetOp->getZExtValue();
1460  SDLoc dl(N);
1461  SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
1462  SDValue Ops[] = { Base, Offset, Chain };
1463  SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
1464  MVT::Other, Ops);
1465 
1466  // Transfer memoperands.
1467  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
1468  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Res), {MemOp});
1469 
1470  // Either way, we're replacing the node, so tell the caller that.
1471  SDValue LoadedVal = SDValue(Res, 1);
1472  if (InsertTo64) {
1473  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
1474  LoadedVal =
1475  SDValue(CurDAG->getMachineNode(
1476  AArch64::SUBREG_TO_REG, dl, MVT::i64,
1477  CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
1478  SubReg),
1479  0);
1480  }
1481 
1482  ReplaceUses(SDValue(N, 0), LoadedVal);
1483  ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
1484  ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
1485  CurDAG->RemoveDeadNode(N);
1486  return true;
1487 }
1488 
1489 void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
1490  unsigned SubRegIdx) {
1491  SDLoc dl(N);
1492  EVT VT = N->getValueType(0);
1493  SDValue Chain = N->getOperand(0);
1494 
1495  SDValue Ops[] = {N->getOperand(2), // Mem operand;
1496  Chain};
1497 
1498  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1499 
1500  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1501  SDValue SuperReg = SDValue(Ld, 0);
1502  for (unsigned i = 0; i < NumVecs; ++i)
1503  ReplaceUses(SDValue(N, i),
1504  CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1505 
1506  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1507 
1508  // Transfer memoperands. In the case of AArch64::LD64B, there won't be one,
1509  // because it's too simple to have needed special treatment during lowering.
1510  if (auto *MemIntr = dyn_cast<MemIntrinsicSDNode>(N)) {
1511  MachineMemOperand *MemOp = MemIntr->getMemOperand();
1512  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
1513  }
1514 
1515  CurDAG->RemoveDeadNode(N);
1516 }
1517 
1518 void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
1519  unsigned Opc, unsigned SubRegIdx) {
1520  SDLoc dl(N);
1521  EVT VT = N->getValueType(0);
1522  SDValue Chain = N->getOperand(0);
1523 
1524  SDValue Ops[] = {N->getOperand(1), // Mem operand
1525  N->getOperand(2), // Incremental
1526  Chain};
1527 
1528  const EVT ResTys[] = {MVT::i64, // Type of the write back register
1529  MVT::Untyped, MVT::Other};
1530 
1531  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1532 
1533  // Update uses of write back register
1534  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1535 
1536  // Update uses of vector list
1537  SDValue SuperReg = SDValue(Ld, 1);
1538  if (NumVecs == 1)
1539  ReplaceUses(SDValue(N, 0), SuperReg);
1540  else
1541  for (unsigned i = 0; i < NumVecs; ++i)
1542  ReplaceUses(SDValue(N, i),
1543  CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1544 
1545  // Update the chain
1546  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1547  CurDAG->RemoveDeadNode(N);
1548 }
1549 
1550 /// Optimize \param OldBase and \param OldOffset selecting the best addressing
1551 /// mode. Returns a tuple consisting of an Opcode, an SDValue representing the
1552 /// new Base and an SDValue representing the new offset.
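/// The reg+imm form encodes the offset in multiples of the SVE vector length,
/// e.g. "ld1w { z0.s }, p0/z, [x0, #1, mul vl]".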
1553 std::tuple<unsigned, SDValue, SDValue>
1554 AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr,
1555  unsigned Opc_ri,
1556  const SDValue &OldBase,
1557  const SDValue &OldOffset,
1558  unsigned Scale) {
1559  SDValue NewBase = OldBase;
1560  SDValue NewOffset = OldOffset;
1561  // Detect a possible Reg+Imm addressing mode.
1562  const bool IsRegImm = SelectAddrModeIndexedSVE</*Min=*/-8, /*Max=*/7>(
1563  N, OldBase, NewBase, NewOffset);
1564 
1565  // Detect a possible reg+reg addressing mode, but only if we haven't already
1566  // detected a Reg+Imm one.
1567  const bool IsRegReg =
1568  !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset);
1569 
1570  // Select the instruction.
1571  return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
1572 }
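// Editorial note (not part of the original source): roughly, with the Min/Max
// template arguments used above, a base matching a vector-length-scaled immediate
// offset in [-8, 7] selects the reg+imm opcode (Opc_ri); otherwise a reg+reg form
// is tried (Opc_rr); if neither matches, the base and offset passed in are returned
// unchanged together with Opc_ri.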
1573 
1574 void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
1575  unsigned Scale, unsigned Opc_ri,
1576  unsigned Opc_rr, bool IsIntr) {
1577  assert(Scale < 4 && "Invalid scaling value.");
1578  SDLoc DL(N);
1579  EVT VT = N->getValueType(0);
1580  SDValue Chain = N->getOperand(0);
1581 
1582  // Optimize addressing mode.
1583  SDValue Base, Offset;
1584  unsigned Opc;
1585  std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
1586  N, Opc_rr, Opc_ri, N->getOperand(IsIntr ? 3 : 2),
1587  CurDAG->getTargetConstant(0, DL, MVT::i64), Scale);
1588 
1589  SDValue Ops[] = {N->getOperand(IsIntr ? 2 : 1), // Predicate
1590  Base, // Memory operand
1591  Offset, Chain};
1592 
1593  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1594 
1595  SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
1596  SDValue SuperReg = SDValue(Load, 0);
1597  for (unsigned i = 0; i < NumVecs; ++i)
1598  ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
1599  AArch64::zsub0 + i, DL, VT, SuperReg));
1600 
1601  // Copy chain
1602  unsigned ChainIdx = NumVecs;
1603  ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
1604  CurDAG->RemoveDeadNode(N);
1605 }
1606 
1607 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
1608  unsigned Opc) {
1609  SDLoc dl(N);
1610  EVT VT = N->getOperand(2)->getValueType(0);
1611 
1612  // Form a REG_SEQUENCE to force register allocation.
1613  bool Is128Bit = VT.getSizeInBits() == 128;
1614  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1615  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1616 
1617  SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
1618  SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
1619 
1620  // Transfer memoperands.
1621  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1622  CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1623 
1624  ReplaceNode(N, St);
1625 }
1626 
1627 void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs,
1628  unsigned Scale, unsigned Opc_rr,
1629  unsigned Opc_ri) {
1630  SDLoc dl(N);
1631 
1632  // Form a REG_SEQUENCE to force register allocation.
1633  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1634  SDValue RegSeq = createZTuple(Regs);
1635 
1636  // Optimize addressing mode.
1637  unsigned Opc;
1638  SDValue Offset, Base;
1639  std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
1640  N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3),
1641  CurDAG->getTargetConstant(0, dl, MVT::i64), Scale);
1642 
1643  SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate
1644  Base, // address
1645  Offset, // offset
1646  N->getOperand(0)}; // chain
1647  SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
1648 
1649  ReplaceNode(N, St);
1650 }
1651 
1652 bool AArch64DAGToDAGISel::SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base,
1653  SDValue &OffImm) {
1654  SDLoc dl(N);
1655  const DataLayout &DL = CurDAG->getDataLayout();
1656  const TargetLowering *TLI = getTargetLowering();
1657 
1658  // Try to match it for the frame address
1659  if (auto FINode = dyn_cast<FrameIndexSDNode>(N)) {
1660  int FI = FINode->getIndex();
1661  Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
1662  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
1663  return true;
1664  }
1665 
1666  return false;
1667 }
1668 
1669 void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
1670  unsigned Opc) {
1671  SDLoc dl(N);
1672  EVT VT = N->getOperand(2)->getValueType(0);
1673  const EVT ResTys[] = {MVT::i64, // Type of the write back register
1674  MVT::Other}; // Type for the Chain
1675 
1676  // Form a REG_SEQUENCE to force register allocation.
1677  bool Is128Bit = VT.getSizeInBits() == 128;
1678  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1679  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1680 
1681  SDValue Ops[] = {RegSeq,
1682  N->getOperand(NumVecs + 1), // base register
1683  N->getOperand(NumVecs + 2), // Incremental
1684  N->getOperand(0)}; // Chain
1685  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1686 
1687  ReplaceNode(N, St);
1688 }
1689 
1690 namespace {
1691 /// WidenVector - Given a value in the V64 register class, produce the
1692 /// equivalent value in the V128 register class.
1693 class WidenVector {
1694  SelectionDAG &DAG;
1695 
1696 public:
1697  WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
1698 
1699  SDValue operator()(SDValue V64Reg) {
1700  EVT VT = V64Reg.getValueType();
1701  unsigned NarrowSize = VT.getVectorNumElements();
1702  MVT EltTy = VT.getVectorElementType().getSimpleVT();
1703  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
1704  SDLoc DL(V64Reg);
1705 
1706  SDValue Undef =
1707  SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
1708  return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
1709  }
1710 };
1711 } // namespace
1712 
1713 /// NarrowVector - Given a value in the V128 register class, produce the
1714 /// equivalent value in the V64 register class.
1715 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
1716  EVT VT = V128Reg.getValueType();
1717  unsigned WideSize = VT.getVectorNumElements();
1718  MVT EltTy = VT.getVectorElementType().getSimpleVT();
1719  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
1720 
1721  return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
1722  V128Reg);
1723 }
1724 
1725 void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
1726  unsigned Opc) {
1727  SDLoc dl(N);
1728  EVT VT = N->getValueType(0);
1729  bool Narrow = VT.getSizeInBits() == 64;
1730 
1731  // Form a REG_SEQUENCE to force register allocation.
1732  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1733 
1734  if (Narrow)
1735  transform(Regs, Regs.begin(),
1736  WidenVector(*CurDAG));
1737 
1738  SDValue RegSeq = createQTuple(Regs);
1739 
1740  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1741 
1742  unsigned LaneNo =
1743  cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1744 
1745  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1746  N->getOperand(NumVecs + 3), N->getOperand(0)};
1747  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1748  SDValue SuperReg = SDValue(Ld, 0);
1749 
1750  EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1751  static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1752  AArch64::qsub2, AArch64::qsub3 };
1753  for (unsigned i = 0; i < NumVecs; ++i) {
1754  SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
1755  if (Narrow)
1756  NV = NarrowVector(NV, *CurDAG);
1757  ReplaceUses(SDValue(N, i), NV);
1758  }
1759 
1760  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1761  CurDAG->RemoveDeadNode(N);
1762 }
1763 
1764 void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
1765  unsigned Opc) {
1766  SDLoc dl(N);
1767  EVT VT = N->getValueType(0);
1768  bool Narrow = VT.getSizeInBits() == 64;
1769 
1770  // Form a REG_SEQUENCE to force register allocation.
1771  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1772 
1773  if (Narrow)
1774  transform(Regs, Regs.begin(),
1775  WidenVector(*CurDAG));
1776 
1777  SDValue RegSeq = createQTuple(Regs);
1778 
1779  const EVT ResTys[] = {MVT::i64, // Type of the write back register
1780  RegSeq->getValueType(0), MVT::Other};
1781 
1782  unsigned LaneNo =
1783  cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1784 
1785  SDValue Ops[] = {RegSeq,
1786  CurDAG->getTargetConstant(LaneNo, dl,
1787  MVT::i64), // Lane Number
1788  N->getOperand(NumVecs + 2), // Base register
1789  N->getOperand(NumVecs + 3), // Incremental
1790  N->getOperand(0)};
1791  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1792 
1793  // Update uses of the write back register
1794  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1795 
1796  // Update uses of the vector list
1797  SDValue SuperReg = SDValue(Ld, 1);
1798  if (NumVecs == 1) {
1799  ReplaceUses(SDValue(N, 0),
1800  Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
1801  } else {
1802  EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1803  static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1804  AArch64::qsub2, AArch64::qsub3 };
1805  for (unsigned i = 0; i < NumVecs; ++i) {
1806  SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
1807  SuperReg);
1808  if (Narrow)
1809  NV = NarrowVector(NV, *CurDAG);
1810  ReplaceUses(SDValue(N, i), NV);
1811  }
1812  }
1813 
1814  // Update the Chain
1815  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1816  CurDAG->RemoveDeadNode(N);
1817 }
1818 
1819 void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
1820  unsigned Opc) {
1821  SDLoc dl(N);
1822  EVT VT = N->getOperand(2)->getValueType(0);
1823  bool Narrow = VT.getSizeInBits() == 64;
1824 
1825  // Form a REG_SEQUENCE to force register allocation.
1826  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1827 
1828  if (Narrow)
1829  transform(Regs, Regs.begin(),
1830  WidenVector(*CurDAG));
1831 
1832  SDValue RegSeq = createQTuple(Regs);
1833 
1834  unsigned LaneNo =
1835  cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1836 
1837  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1838  N->getOperand(NumVecs + 3), N->getOperand(0)};
1839  SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
1840 
1841  // Transfer memoperands.
1842  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1843  CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1844 
1845  ReplaceNode(N, St);
1846 }
1847 
1848 void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
1849  unsigned Opc) {
1850  SDLoc dl(N);
1851  EVT VT = N->getOperand(2)->getValueType(0);
1852  bool Narrow = VT.getSizeInBits() == 64;
1853 
1854  // Form a REG_SEQUENCE to force register allocation.
1855  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1856 
1857  if (Narrow)
1858  transform(Regs, Regs.begin(),
1859  WidenVector(*CurDAG));
1860 
1861  SDValue RegSeq = createQTuple(Regs);
1862 
1863  const EVT ResTys[] = {MVT::i64, // Type of the write back register
1864  MVT::Other};
1865 
1866  unsigned LaneNo =
1867  cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1868 
1869  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1870  N->getOperand(NumVecs + 2), // Base Register
1871  N->getOperand(NumVecs + 3), // Incremental
1872  N->getOperand(0)};
1873  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1874 
1875  // Transfer memoperands.
1876  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1877  CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1878 
1879  ReplaceNode(N, St);
1880 }
1881 
1883  unsigned &Opc, SDValue &Opd0,
1884  unsigned &LSB, unsigned &MSB,
1885  unsigned NumberOfIgnoredLowBits,
1886  bool BiggerPattern) {
1887  assert(N->getOpcode() == ISD::AND &&
1888  "N must be a AND operation to call this function");
1889 
1890  EVT VT = N->getValueType(0);
1891 
1892  // Here we could test the type of VT and return false when the type does
1893  // not match, but since that check is done prior to this call in the current
1894  // context, we turn it into an assert to avoid redundant code.
1895  assert((VT == MVT::i32 || VT == MVT::i64) &&
1896  "Type checking must have been done before calling this function");
1897 
1898  // FIXME: simplify-demanded-bits in DAGCombine will probably have
1899  // changed the AND node to a 32-bit mask operation. We'll have to
1900  // undo that as part of the transform here if we want to catch all
1901  // the opportunities.
1902  // Currently the NumberOfIgnoredLowBits argument helps to recover
1903  // from these situations when matching a bigger pattern (bitfield insert).
1904 
1905  // For unsigned extracts, check for a shift right and mask
1906  uint64_t AndImm = 0;
1907  if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
1908  return false;
1909 
1910  const SDNode *Op0 = N->getOperand(0).getNode();
1911 
1912  // Because of simplify-demanded-bits in DAGCombine, the mask may have been
1913  // simplified. Try to undo that
1914  AndImm |= maskTrailingOnes<uint64_t>(NumberOfIgnoredLowBits);
1915 
1916  // The immediate is a mask of the low bits iff imm & (imm+1) == 0
1917  if (AndImm & (AndImm + 1))
1918  return false;
1919 
1920  bool ClampMSB = false;
1921  uint64_t SrlImm = 0;
1922  // Handle the SRL + ANY_EXTEND case.
1923  if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
1924  isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
1925  // Extend the incoming operand of the SRL to 64-bit.
1926  Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
1927  // Make sure to clamp the MSB so that we preserve the semantics of the
1928  // original operations.
1929  ClampMSB = true;
1930  } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
1931  isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
1932  SrlImm)) {
1933  // If the shift result was truncated, we can still combine them.
1934  Opd0 = Op0->getOperand(0).getOperand(0);
1935 
1936  // Use the type of SRL node.
1937  VT = Opd0->getValueType(0);
1938  } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
1939  Opd0 = Op0->getOperand(0);
1940  ClampMSB = (VT == MVT::i32);
1941  } else if (BiggerPattern) {
1942  // Let's pretend a 0 shift right has been performed.
1943  // The resulting code will be at least as good as the original one
1944  // plus it may expose more opportunities for bitfield insert pattern.
1945  // FIXME: Currently we limit this to the bigger pattern, because
1946  // some optimizations expect AND and not UBFM.
1947  Opd0 = N->getOperand(0);
1948  } else
1949  return false;
1950 
1951  // Bail out on large immediates. This happens when no proper
1952  // combining/constant folding was performed.
1953  if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
1954  LLVM_DEBUG(
1955  (dbgs() << N
1956  << ": Found large shift immediate, this should not happen\n"));
1957  return false;
1958  }
1959 
1960  LSB = SrlImm;
1961  MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
1962  : countTrailingOnes<uint64_t>(AndImm)) -
1963  1;
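  // Editorial example (not part of the original source): for
  // (and (srl x, #4), 0xff), SrlImm == 4 and AndImm == 0xff, so LSB == 4 and
  // MSB == 4 + 8 - 1 == 11, i.e. the node is selected as UBFM x, #4, #11
  // (a UBFX of 8 bits starting at bit 4).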
1964  if (ClampMSB)
1965  // Since we're moving the extend before the right shift operation, we need
1966  // to clamp the MSB to make sure we don't shift in undefined bits instead of
1967  // the zeros which would get shifted in with the original right shift
1968  // operation.
1969  MSB = MSB > 31 ? 31 : MSB;
1970 
1971  Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
1972  return true;
1973 }
1974 
1975 static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
1976  SDValue &Opd0, unsigned &Immr,
1977  unsigned &Imms) {
1978  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
1979 
1980  EVT VT = N->getValueType(0);
1981  unsigned BitWidth = VT.getSizeInBits();
1982  assert((VT == MVT::i32 || VT == MVT::i64) &&
1983  "Type checking must have been done before calling this function");
1984 
1985  SDValue Op = N->getOperand(0);
1986  if (Op->getOpcode() == ISD::TRUNCATE) {
1987  Op = Op->getOperand(0);
1988  VT = Op->getValueType(0);
1989  BitWidth = VT.getSizeInBits();
1990  }
1991 
1992  uint64_t ShiftImm;
1993  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
1994  !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
1995  return false;
1996 
1997  unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
1998  if (ShiftImm + Width > BitWidth)
1999  return false;
2000 
2001  Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
2002  Opd0 = Op.getOperand(0);
2003  Immr = ShiftImm;
2004  Imms = ShiftImm + Width - 1;
2005  return true;
2006 }
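// Editorial example (not part of the original source): for
// (sign_extend_inreg (srl x, #8), i16), ShiftImm == 8 and Width == 16, giving
// Immr == 8 and Imms == 23, i.e. SBFM x, #8, #23 (an SBFX of 16 bits starting
// at bit 8).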
2007 
2008 static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
2009  SDValue &Opd0, unsigned &LSB,
2010  unsigned &MSB) {
2011  // We are looking for the following pattern, which basically extracts several
2012  // contiguous bits from the source value and places them at the LSB of the
2013  // destination value; all other bits of the destination value are set to zero:
2014  //
2015  // Value2 = AND Value, MaskImm
2016  // SRL Value2, ShiftImm
2017  //
2018  // with MaskImm >> ShiftImm to search for the bit width.
2019  //
2020  // This gets selected into a single UBFM:
2021  //
2022  // UBFM Value, ShiftImm, BitWide + SrlImm - 1
2023  //
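  // Editorial example (not part of the original source): with MaskImm == 0xff0
  // and ShiftImm == 4, MaskImm >> ShiftImm == 0xff is a mask of width 8
  // (BitWide == 8), so the pair is selected as UBFM Value, #4, #11.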
2024 
2025  if (N->getOpcode() != ISD::SRL)
2026  return false;
2027 
2028  uint64_t AndMask = 0;
2029  if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
2030  return false;
2031 
2032  Opd0 = N->getOperand(0).getOperand(0);
2033 
2034  uint64_t SrlImm = 0;
2035  if (!isIntImmediate(N->getOperand(1), SrlImm))
2036  return false;
2037 
2038  // Check whether we really have several bits extract here.
2039  unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
2040  if (BitWide && isMask_64(AndMask >> SrlImm)) {
2041  if (N->getValueType(0) == MVT::i32)
2042  Opc = AArch64::UBFMWri;
2043  else
2044  Opc = AArch64::UBFMXri;
2045 
2046  LSB = SrlImm;
2047  MSB = BitWide + SrlImm - 1;
2048  return true;
2049  }
2050 
2051  return false;
2052 }
2053 
2054 static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
2055  unsigned &Immr, unsigned &Imms,
2056  bool BiggerPattern) {
2057  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
2058  "N must be a SHR/SRA operation to call this function");
2059 
2060  EVT VT = N->getValueType(0);
2061 
2062  // Here we could test the type of VT and return false when the type does
2063  // not match, but since that check is done prior to this call in the current
2064  // context, we turn it into an assert to avoid redundant code.
2065  assert((VT == MVT::i32 || VT == MVT::i64) &&
2066  "Type checking must have been done before calling this function");
2067 
2068  // Check for AND + SRL doing several bits extract.
2069  if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
2070  return true;
2071 
2072  // We're looking for a shift of a shift.
2073  uint64_t ShlImm = 0;
2074  uint64_t TruncBits = 0;
2075  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
2076  Opd0 = N->getOperand(0).getOperand(0);
2077  } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
2078  N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
2079  // We are looking for a shift of truncate. Truncate from i64 to i32 could
2080  // be considered as setting high 32 bits as zero. Our strategy here is to
2081  // always generate 64bit UBFM. This consistency will help the CSE pass
2082  // later find more redundancy.
2083  Opd0 = N->getOperand(0).getOperand(0);
2084  TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
2085  VT = Opd0.getValueType();
2086  assert(VT == MVT::i64 && "the promoted type should be i64");
2087  } else if (BiggerPattern) {
2088  // Let's pretend a 0 shift left has been performed.
2089  // FIXME: Currently we limit this to the bigger pattern case,
2090  // because some optimizations expect AND and not UBFM
2091  Opd0 = N->getOperand(0);
2092  } else
2093  return false;
2094 
2095  // Missing combines/constant folding may have left us with strange
2096  // constants.
2097  if (ShlImm >= VT.getSizeInBits()) {
2098  LLVM_DEBUG(
2099  (dbgs() << N
2100  << ": Found large shift immediate, this should not happen\n"));
2101  return false;
2102  }
2103 
2104  uint64_t SrlImm = 0;
2105  if (!isIntImmediate(N->getOperand(1), SrlImm))
2106  return false;
2107 
2108  assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
2109  "bad amount in shift node!");
2110  int immr = SrlImm - ShlImm;
2111  Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
2112  Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
2113  // SRA requires a signed extraction
2114  if (VT == MVT::i32)
2115  Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
2116  else
2117  Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
2118  return true;
2119 }
2120 
2121 bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
2122  assert(N->getOpcode() == ISD::SIGN_EXTEND);
2123 
2124  EVT VT = N->getValueType(0);
2125  EVT NarrowVT = N->getOperand(0)->getValueType(0);
2126  if (VT != MVT::i64 || NarrowVT != MVT::i32)
2127  return false;
2128 
2129  uint64_t ShiftImm;
2130  SDValue Op = N->getOperand(0);
2131  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
2132  return false;
2133 
2134  SDLoc dl(N);
2135  // Extend the incoming operand of the shift to 64-bits.
2136  SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
2137  unsigned Immr = ShiftImm;
2138  unsigned Imms = NarrowVT.getSizeInBits() - 1;
2139  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
2140  CurDAG->getTargetConstant(Imms, dl, VT)};
2141  CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
2142  return true;
2143 }
2144 
2145 /// Try to form fcvtl2 instructions from a floating-point extend of a high-half
2146 /// extract of a subvector.
2147 bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
2148  assert(N->getOpcode() == ISD::FP_EXTEND);
2149 
2150  // There are 2 forms of fcvtl2 - extend to double or extend to float.
2151  SDValue Extract = N->getOperand(0);
2152  EVT VT = N->getValueType(0);
2153  EVT NarrowVT = Extract.getValueType();
2154  if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
2155  (VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
2156  return false;
2157 
2158  // Optionally look past a bitcast.
2159  Extract = peekThroughBitcasts(Extract);
2160  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
2161  return false;
2162 
2163  // Match extract from start of high half index.
2164  // Example: v8i16 -> v4i16 means the extract must begin at index 4.
2165  unsigned ExtractIndex = Extract.getConstantOperandVal(1);
2166  if (ExtractIndex != Extract.getValueType().getVectorNumElements())
2167  return false;
2168 
2169  auto Opcode = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
2170  CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0));
2171  return true;
2172 }
2173 
2174 static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
2175  SDValue &Opd0, unsigned &Immr, unsigned &Imms,
2176  unsigned NumberOfIgnoredLowBits = 0,
2177  bool BiggerPattern = false) {
2178  if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
2179  return false;
2180 
2181  switch (N->getOpcode()) {
2182  default:
2183  if (!N->isMachineOpcode())
2184  return false;
2185  break;
2186  case ISD::AND:
2187  return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
2188  NumberOfIgnoredLowBits, BiggerPattern);
2189  case ISD::SRL:
2190  case ISD::SRA:
2191  return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
2192 
2193  case ISD::SIGN_EXTEND_INREG:
2194  return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
2195  }
2196 
2197  unsigned NOpc = N->getMachineOpcode();
2198  switch (NOpc) {
2199  default:
2200  return false;
2201  case AArch64::SBFMWri:
2202  case AArch64::UBFMWri:
2203  case AArch64::SBFMXri:
2204  case AArch64::UBFMXri:
2205  Opc = NOpc;
2206  Opd0 = N->getOperand(0);
2207  Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
2208  Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
2209  return true;
2210  }
2211  // Unreachable
2212  return false;
2213 }
2214 
2215 bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
2216  unsigned Opc, Immr, Imms;
2217  SDValue Opd0;
2218  if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
2219  return false;
2220 
2221  EVT VT = N->getValueType(0);
2222  SDLoc dl(N);
2223 
2224  // If the bit extract operation is 64bit but the original type is 32bit, we
2225  // need to add one EXTRACT_SUBREG.
2226  if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
2227  SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
2228  CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
2229 
2230  SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
2231  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
2232  ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
2233  MVT::i32, SDValue(BFM, 0), SubReg));
2234  return true;
2235  }
2236 
2237  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
2238  CurDAG->getTargetConstant(Imms, dl, VT)};
2239  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2240  return true;
2241 }
2242 
2243 /// Does DstMask form a complementary pair with the mask provided by
2244 /// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
2245 /// this asks whether DstMask zeroes precisely those bits that will be set by
2246 /// the other half.
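/// Editorial example (not part of the original source): for VT == i32 with no
/// ignored high bits, DstMask == 0xffff0000 and BitsToBeInserted == 0x0000ffff
/// are complementary: their AND is zero and their OR is all ones.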
2247 static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
2248  unsigned NumberOfIgnoredHighBits, EVT VT) {
2249  assert((VT == MVT::i32 || VT == MVT::i64) &&
2250  "i32 or i64 mask type expected!");
2251  unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
2252 
2253  APInt SignificantDstMask = APInt(BitWidth, DstMask);
2254  APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
2255 
2256  return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
2257  (SignificantDstMask | SignificantBitsToBeInserted).isAllOnes();
2258 }
2259 
2260 // Look for bits that will be useful for later uses.
2261  // A bit is considered useless as soon as it is dropped and never used
2262  // before it has been dropped.
2263 // E.g., looking for useful bit of x
2264 // 1. y = x & 0x7
2265 // 2. z = y >> 2
2266  // After #1, the useful bits of x are 0x7; these useful bits then live
2267  // through y.
2268 // After #2, the useful bits of x are 0x4.
2269  // However, if x is used by an unpredictable instruction, then all its bits
2270 // are useful.
2271 // E.g.
2272 // 1. y = x & 0x7
2273 // 2. z = y >> 2
2274 // 3. str x, [@x]
2275 static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
2276 
2277 static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
2278  unsigned Depth) {
2279  uint64_t Imm =
2280  cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
2281  Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
2282  UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
2283  getUsefulBits(Op, UsefulBits, Depth + 1);
2284 }
2285 
2286 static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
2287  uint64_t Imm, uint64_t MSB,
2288  unsigned Depth) {
2289  // inherit the bitwidth value
2290  APInt OpUsefulBits(UsefulBits);
2291  OpUsefulBits = 1;
2292 
2293  if (MSB >= Imm) {
2294  OpUsefulBits <<= MSB - Imm + 1;
2295  --OpUsefulBits;
2296  // The interesting part will be in the lower part of the result
2297  getUsefulBits(Op, OpUsefulBits, Depth + 1);
2298  // The interesting part was starting at Imm in the argument
2299  OpUsefulBits <<= Imm;
2300  } else {
2301  OpUsefulBits <<= MSB + 1;
2302  --OpUsefulBits;
2303  // The interesting part will be shifted in the result
2304  OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm;
2305  getUsefulBits(Op, OpUsefulBits, Depth + 1);
2306  // The interesting part was at zero in the argument
2307  OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm);
2308  }
2309 
2310  UsefulBits &= OpUsefulBits;
2311 }
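// Editorial example (not part of the original source): for UBFM Xd, Xn, #8, #15
// (i.e. UBFX Xd, Xn, #8, #8), MSB >= Imm, so the result's useful bits restricted
// to its low 8 bits map to bits [15:8] of the operand: OpUsefulBits is set to
// eight ones, narrowed by the result's own useful bits, then shifted left by 8.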
2312 
2313 static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
2314  unsigned Depth) {
2315  uint64_t Imm =
2316  cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
2317  uint64_t MSB =
2318  cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2319 
2320  getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
2321 }
2322 
2323 static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
2324  unsigned Depth) {
2325  uint64_t ShiftTypeAndValue =
2326  cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2327  APInt Mask(UsefulBits);
2328  Mask.clearAllBits();
2329  Mask.flipAllBits();
2330 
2331  if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
2332  // Shift Left
2333  uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
2334  Mask <<= ShiftAmt;
2335  getUsefulBits(Op, Mask, Depth + 1);
2336  Mask.lshrInPlace(ShiftAmt);
2337  } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
2338  // Shift Right
2339  // We do not handle AArch64_AM::ASR, because the sign will change the
2340  // number of useful bits
2341  uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
2342  Mask.lshrInPlace(ShiftAmt);
2343  getUsefulBits(Op, Mask, Depth + 1);
2344  Mask <<= ShiftAmt;
2345  } else
2346  return;
2347 
2348  UsefulBits &= Mask;
2349 }
2350 
2351 static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
2352  unsigned Depth) {
2353  uint64_t Imm =
2354  cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2355  uint64_t MSB =
2356  cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
2357 
2358  APInt OpUsefulBits(UsefulBits);
2359  OpUsefulBits = 1;
2360 
2361  APInt ResultUsefulBits(UsefulBits.getBitWidth(), 0);
2362  ResultUsefulBits.flipAllBits();
2363  APInt Mask(UsefulBits.getBitWidth(), 0);
2364 
2365  getUsefulBits(Op, ResultUsefulBits, Depth + 1);
2366 
2367  if (MSB >= Imm) {
2368  // The instruction is a BFXIL.
2369  uint64_t Width = MSB - Imm + 1;
2370  uint64_t LSB = Imm;
2371 
2372  OpUsefulBits <<= Width;
2373  --OpUsefulBits;
2374 
2375  if (Op.getOperand(1) == Orig) {
2376  // Copy the low bits from the result to bits starting from LSB.
2377  Mask = ResultUsefulBits & OpUsefulBits;
2378  Mask <<= LSB;
2379  }
2380 
2381  if (Op.getOperand(0) == Orig)
2382  // Bits starting from LSB in the input contribute to the result.
2383  Mask |= (ResultUsefulBits & ~OpUsefulBits);
2384  } else {
2385  // The instruction is a BFI.
2386  uint64_t Width = MSB + 1;
2387  uint64_t LSB = UsefulBits.getBitWidth() - Imm;
2388 
2389  OpUsefulBits <<= Width;
2390  --OpUsefulBits;
2391  OpUsefulBits <<= LSB;
2392 
2393  if (Op.getOperand(1) == Orig) {
2394  // Copy the bits from the result to the zero bits.
2395  Mask = ResultUsefulBits & OpUsefulBits;
2396  Mask.lshrInPlace(LSB);
2397  }
2398 
2399  if (Op.getOperand(0) == Orig)
2400  Mask |= (ResultUsefulBits & ~OpUsefulBits);
2401  }
2402 
2403  UsefulBits &= Mask;
2404 }
2405 
2406 static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
2407  SDValue Orig, unsigned Depth) {
2408 
2409  // Users of this node should have already been instruction selected
2410  // FIXME: Can we turn that into an assert?
2411  if (!UserNode->isMachineOpcode())
2412  return;
2413 
2414  switch (UserNode->getMachineOpcode()) {
2415  default:
2416  return;
2417  case AArch64::ANDSWri:
2418  case AArch64::ANDSXri:
2419  case AArch64::ANDWri:
2420  case AArch64::ANDXri:
2421  // We increment Depth only when we call the getUsefulBits
2422  return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
2423  Depth);
2424  case AArch64::UBFMWri:
2425  case AArch64::UBFMXri:
2426  return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
2427 
2428  case AArch64::ORRWrs:
2429  case AArch64::ORRXrs:
2430  if (UserNode->getOperand(0) != Orig && UserNode->getOperand(1) == Orig)
2431  getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
2432  Depth);
2433  return;
2434  case AArch64::BFMWri:
2435  case AArch64::BFMXri:
2436  return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
2437 
2438  case AArch64::STRBBui:
2439  case AArch64::STURBBi:
2440  if (UserNode->getOperand(0) != Orig)
2441  return;
2442  UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
2443  return;
2444 
2445  case AArch64::STRHHui:
2446  case AArch64::STURHHi:
2447  if (UserNode->getOperand(0) != Orig)
2448  return;
2449  UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
2450  return;
2451  }
2452 }
2453 
2454 static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
2455  if (Depth >= SelectionDAG::MaxRecursionDepth)
2456  return;
2457  // Initialize UsefulBits
2458  if (!Depth) {
2459  unsigned Bitwidth = Op.getScalarValueSizeInBits();
2460  // At the beginning, assume every produced bits is useful
2461  UsefulBits = APInt(Bitwidth, 0);
2462  UsefulBits.flipAllBits();
2463  }
2464  APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
2465 
2466  for (SDNode *Node : Op.getNode()->uses()) {
2467  // A use cannot produce useful bits
2468  APInt UsefulBitsForUse = APInt(UsefulBits);
2469  getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
2470  UsersUsefulBits |= UsefulBitsForUse;
2471  }
2472  // UsefulBits contains the produced bits that are meaningful for the
2473  // current definition, thus a user cannot make a bit meaningful at
2474  // this point
2475  UsefulBits &= UsersUsefulBits;
2476 }
2477 
2478 /// Create a machine node performing a notional SHL of Op by ShlAmount. If
2479 /// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
2480 /// 0, return Op unchanged.
2481 static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
2482  if (ShlAmount == 0)
2483  return Op;
2484 
2485  EVT VT = Op.getValueType();
2486  SDLoc dl(Op);
2487  unsigned BitWidth = VT.getSizeInBits();
2488  unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
2489 
2490  SDNode *ShiftNode;
2491  if (ShlAmount > 0) {
2492  // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
2493  ShiftNode = CurDAG->getMachineNode(
2494  UBFMOpc, dl, VT, Op,
2495  CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT),
2496  CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT));
2497  } else {
2498  // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
2499  assert(ShlAmount < 0 && "expected right shift");
2500  int ShrAmount = -ShlAmount;
2501  ShiftNode = CurDAG->getMachineNode(
2502  UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT),
2503  CurDAG->getTargetConstant(BitWidth - 1, dl, VT));
2504  }
2505 
2506  return SDValue(ShiftNode, 0);
2507 }
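// Editorial example (not part of the original source): getLeftShift(CurDAG, Op, 8)
// on an i32 value emits UBFMWri Op, #24, #23 (i.e. LSL #8), while
// getLeftShift(CurDAG, Op, -8) emits UBFMWri Op, #8, #31 (i.e. LSR #8).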
2508 
2509 // For bit-field-positioning pattern "(and (shl VAL, N), ShiftedMask)".
2510 static bool isBitfieldPositioningOpFromAnd(SelectionDAG *CurDAG, SDValue Op,
2511  bool BiggerPattern,
2512  const uint64_t NonZeroBits,
2513  SDValue &Src, int &DstLSB,
2514  int &Width);
2515 
2516 // For bit-field-positioning pattern "(shl VAL, N)".
2517 static bool isBitfieldPositioningOpFromShl(SelectionDAG *CurDAG, SDValue Op,
2518  bool BiggerPattern,
2519  const uint64_t NonZeroBits,
2520  SDValue &Src, int &DstLSB,
2521  int &Width);
2522 
2523 /// Does this tree qualify as an attempt to move a bitfield into position,
2524 /// essentially "(and (shl VAL, N), Mask)" or (shl VAL, N).
2526  bool BiggerPattern, SDValue &Src,
2527  int &DstLSB, int &Width) {
2528  EVT VT = Op.getValueType();
2529  unsigned BitWidth = VT.getSizeInBits();
2530  (void)BitWidth;
2531  assert(BitWidth == 32 || BitWidth == 64);
2532 
2533  KnownBits Known = CurDAG->computeKnownBits(Op);
2534 
2535  // Non-zero in the sense that they're not provably zero, which is the key
2536  // point if we want to use this value
2537  const uint64_t NonZeroBits = (~Known.Zero).getZExtValue();
2538  if (!isShiftedMask_64(NonZeroBits))
2539  return false;
2540 
2541  switch (Op.getOpcode()) {
2542  default:
2543  break;
2544  case ISD::AND:
2545  return isBitfieldPositioningOpFromAnd(CurDAG, Op, BiggerPattern,
2546  NonZeroBits, Src, DstLSB, Width);
2547  case ISD::SHL:
2548  return isBitfieldPositioningOpFromShl(CurDAG, Op, BiggerPattern,
2549  NonZeroBits, Src, DstLSB, Width);
2550  }
2551 
2552  return false;
2553 }
2554 
2555 static bool isBitfieldPositioningOpFromAnd(SelectionDAG *CurDAG, SDValue Op,
2556  bool BiggerPattern,
2557  const uint64_t NonZeroBits,
2558  SDValue &Src, int &DstLSB,
2559  int &Width) {
2560  assert(isShiftedMask_64(NonZeroBits) && "Caller guaranteed");
2561 
2562  EVT VT = Op.getValueType();
2563  assert((VT == MVT::i32 || VT == MVT::i64) &&
2564  "Caller guarantees VT is one of i32 or i64");
2565  (void)VT;
2566 
2567  uint64_t AndImm;
2568  if (!isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm))
2569  return false;
2570 
2571  // If (~AndImm & NonZeroBits) is not zero at POS, we know that
2572  // 1) (AndImm & (1 << POS) == 0)
2573  // 2) the result of AND is not zero at POS bit (according to NonZeroBits)
2574  //
2575  // 1) and 2) don't agree so something must be wrong (e.g., in
2576  // 'SelectionDAG::computeKnownBits')
2577  assert((~AndImm & NonZeroBits) == 0 &&
2578  "Something must be wrong (e.g., in SelectionDAG::computeKnownBits)");
2579 
2580  SDValue AndOp0 = Op.getOperand(0);
2581 
2582  uint64_t ShlImm;
2583  SDValue ShlOp0;
2584  if (isOpcWithIntImmediate(AndOp0.getNode(), ISD::SHL, ShlImm)) {
2585  // For pattern "and(shl(val, N), shifted-mask)", 'ShlOp0' is set to 'val'.
2586  ShlOp0 = AndOp0.getOperand(0);
2587  } else if (VT == MVT::i64 && AndOp0.getOpcode() == ISD::ANY_EXTEND &&
2588  isOpcWithIntImmediate(AndOp0.getOperand(0).getNode(), ISD::SHL,
2589  ShlImm)) {
2590  // For pattern "and(any_extend(shl(val, N)), shifted-mask)"
2591 
2592  // ShlVal == shl(val, N), which is a left shift on a smaller type.
2593  SDValue ShlVal = AndOp0.getOperand(0);
2594 
2595  // Since this is after type legalization and ShlVal is extended to MVT::i64,
2596  // expect VT to be MVT::i32.
2597  assert((ShlVal.getValueType() == MVT::i32) && "Expect VT to be MVT::i32.");
2598 
2599  // Widens 'val' to MVT::i64 as the source of bit field positioning.
2600  ShlOp0 = Widen(CurDAG, ShlVal.getOperand(0));
2601  } else
2602  return false;
2603 
2604  // For !BiggerPattern, bail out if the AndOp0 has more than one use, since
2605  // then we'll end up generating AndOp0+UBFIZ instead of just keeping
2606  // AndOp0+AND.
2607  if (!BiggerPattern && !AndOp0.hasOneUse())
2608  return false;
2609 
2610  DstLSB = countTrailingZeros(NonZeroBits);
2611  Width = countTrailingOnes(NonZeroBits >> DstLSB);
2612 
2613  // Bail out on large Width. This happens when no proper combining / constant
2614  // folding was performed.
2615  if (Width >= (int)VT.getSizeInBits()) {
2616  // If VT is i64, Width > 64 is insensible since NonZeroBits is uint64_t, and
2617  // Width == 64 indicates a missed dag-combine from "(and val, AllOnes)" to
2618  // "val".
2619  // If VT is i32, what Width >= 32 means:
2620  // - For "(and (any_extend(shl val, N)), shifted-mask)", the`and` Op
2621  // demands at least 'Width' bits (after dag-combiner). This together with
2622  // `any_extend` Op (undefined higher bits) indicates missed combination
2623  // when lowering the 'and' IR instruction to a machine IR instruction.
2624  LLVM_DEBUG(
2625  dbgs()
2626  << "Found large Width in bit-field-positioning -- this indicates no "
2627  "proper combining / constant folding was performed\n");
2628  return false;
2629  }
2630 
2631  // BFI encompasses sufficiently many nodes that it's worth inserting an extra
2632  // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
2633  // amount. BiggerPattern is true when this pattern is being matched for BFI,
2634  // BiggerPattern is false when this pattern is being matched for UBFIZ, in
2635  // which case it is not profitable to insert an extra shift.
2636  if (ShlImm != uint64_t(DstLSB) && !BiggerPattern)
2637  return false;
2638 
2639  Src = getLeftShift(CurDAG, ShlOp0, ShlImm - DstLSB);
2640  return true;
2641 }
2642 
2643 // For node (shl (and val, mask), N)), returns true if the node is equivalent to
2644 // UBFIZ.
2645 static bool isSeveralBitsPositioningOpFromShl(const uint64_t ShlImm, SDValue Op,
2646  SDValue &Src, int &DstLSB,
2647  int &Width) {
2648  // Caller should have verified that N is a left shift with constant shift
2649  // amount; asserts that.
2650  assert(Op.getOpcode() == ISD::SHL &&
2651  "Op.getNode() should be a SHL node to call this function");
2652  assert(isIntImmediateEq(Op.getOperand(1), ShlImm) &&
2653  "Op.getNode() should shift ShlImm to call this function");
2654 
2655  uint64_t AndImm = 0;
2656  SDValue Op0 = Op.getOperand(0);
2657  if (!isOpcWithIntImmediate(Op0.getNode(), ISD::AND, AndImm))
2658  return false;
2659 
2660  const uint64_t ShiftedAndImm = ((AndImm << ShlImm) >> ShlImm);
2661  if (isMask_64(ShiftedAndImm)) {
2662  // AndImm is a superset of (AllOnes >> ShlImm); in other words, AndImm
2663  // should end with Mask, and could be prefixed with random bits if those
2664  // bits are shifted out.
2665  //
2666  // For example, xyz11111 (with {x,y,z} being 0 or 1) is fine if ShlImm >= 3;
2667  // the AND result corresponding to those bits are shifted out, so it's fine
2668  // to not extract them.
2669  Width = countTrailingOnes(ShiftedAndImm);
2670  DstLSB = ShlImm;
2671  Src = Op0.getOperand(0);
2672  return true;
2673  }
2674  return false;
2675 }
2676 
2677 static bool isBitfieldPositioningOpFromShl(SelectionDAG *CurDAG, SDValue Op,
2678  bool BiggerPattern,
2679  const uint64_t NonZeroBits,
2680  SDValue &Src, int &DstLSB,
2681  int &Width) {
2682  assert(isShiftedMask_64(NonZeroBits) && "Caller guaranteed");
2683 
2684  EVT VT = Op.getValueType();
2685  assert((VT == MVT::i32 || VT == MVT::i64) &&
2686  "Caller guarantees that type is i32 or i64");
2687  (void)VT;
2688 
2689  uint64_t ShlImm;
2690  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
2691  return false;
2692 
2693  if (!BiggerPattern && !Op.hasOneUse())
2694  return false;
2695 
2696  if (isSeveralBitsPositioningOpFromShl(ShlImm, Op, Src, DstLSB, Width))
2697  return true;
2698 
2699  DstLSB = countTrailingZeros(NonZeroBits);
2700  Width = countTrailingOnes(NonZeroBits >> DstLSB);
2701 
2702  if (ShlImm != uint64_t(DstLSB) && !BiggerPattern)
2703  return false;
2704 
2705  Src = getLeftShift(CurDAG, Op.getOperand(0), ShlImm - DstLSB);
2706  return true;
2707 }
2708 
2709 static bool isShiftedMask(uint64_t Mask, EVT VT) {
2710  assert(VT == MVT::i32 || VT == MVT::i64);
2711  if (VT == MVT::i32)
2712  return isShiftedMask_32(Mask);
2713  return isShiftedMask_64(Mask);
2714 }
2715 
2716 // Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
2717 // inserted only sets known zero bits.
2718 static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
2719  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2720 
2721  EVT VT = N->getValueType(0);
2722  if (VT != MVT::i32 && VT != MVT::i64)
2723  return false;
2724 
2725  unsigned BitWidth = VT.getSizeInBits();
2726 
2727  uint64_t OrImm;
2728  if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
2729  return false;
2730 
2731  // Skip this transformation if the ORR immediate can be encoded in the ORR.
2732  // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely
2733  // performance neutral.
2734  if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
2735  return false;
2736 
2737  uint64_t MaskImm;
2738  SDValue And = N->getOperand(0);
2739  // Must be a single use AND with an immediate operand.
2740  if (!And.hasOneUse() ||
2741  !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
2742  return false;
2743 
2744  // Compute the Known Zero for the AND as this allows us to catch more general
2745  // cases than just looking for AND with imm.
2746  KnownBits Known = CurDAG->computeKnownBits(And);
2747 
2748  // Non-zero in the sense that they're not provably zero, which is the key
2749  // point if we want to use this value.
2750  uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
2751 
2752  // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
2753  if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
2754  return false;
2755 
2756  // The bits being inserted must only set those bits that are known to be zero.
2757  if ((OrImm & NotKnownZero) != 0) {
2758  // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
2759  // currently handle this case.
2760  return false;
2761  }
2762 
2763  // BFI/BFXIL dst, src, #lsb, #width.
2764  int LSB = countTrailingOnes(NotKnownZero);
2765  int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
2766 
2767  // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
2768  unsigned ImmR = (BitWidth - LSB) % BitWidth;
2769  unsigned ImmS = Width - 1;
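  // Editorial example (not part of the original source): for i32 with LSB == 0
  // and Width == 8 (as in 'or (and X, 0xffffff00), 0x55'), ImmR == (32 - 0) % 32
  // == 0 and ImmS == 7, i.e. BFXIL Wd, Wn, #0, #8; a non-zero LSB would make
  // IsBFI true and produce a BFI instead.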
2770 
2771  // If we're creating a BFI instruction avoid cases where we need more
2772  // instructions to materialize the BFI constant as compared to the original
2773  // ORR. A BFXIL will use the same constant as the original ORR, so the code
2774  // should be no worse in this case.
2775  bool IsBFI = LSB != 0;
2776  uint64_t BFIImm = OrImm >> LSB;
2777  if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
2778  // We have a BFI instruction and we know the constant can't be materialized
2779  // with a ORR-immediate with the zero register.
2780  unsigned OrChunks = 0, BFIChunks = 0;
2781  for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
2782  if (((OrImm >> Shift) & 0xFFFF) != 0)
2783  ++OrChunks;
2784  if (((BFIImm >> Shift) & 0xFFFF) != 0)
2785  ++BFIChunks;
2786  }
2787  if (BFIChunks > OrChunks)
2788  return false;
2789  }
2790 
2791  // Materialize the constant to be inserted.
2792  SDLoc DL(N);
2793  unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
2794  SDNode *MOVI = CurDAG->getMachineNode(
2795  MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
2796 
2797  // Create the BFI/BFXIL instruction.
2798  SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
2799  CurDAG->getTargetConstant(ImmR, DL, VT),
2800  CurDAG->getTargetConstant(ImmS, DL, VT)};
2801  unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2802  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2803  return true;
2804 }
2805 
2806 static bool isWorthFoldingIntoOrrWithShift(SDValue Dst, SelectionDAG *CurDAG,
2807  SDValue &ShiftedOperand,
2808  uint64_t &EncodedShiftImm) {
2809  // Avoid folding Dst into ORR-with-shift if Dst has other uses than ORR.
2810  if (!Dst.hasOneUse())
2811  return false;
2812 
2813  EVT VT = Dst.getValueType();
2814  assert((VT == MVT::i32 || VT == MVT::i64) &&
2815  "Caller should guarantee that VT is one of i32 or i64");
2816  const unsigned SizeInBits = VT.getSizeInBits();
2817 
2818  SDLoc DL(Dst.getNode());
2819  uint64_t AndImm, ShlImm;
2820  if (isOpcWithIntImmediate(Dst.getNode(), ISD::AND, AndImm) &&
2821  isShiftedMask_64(AndImm)) {
2822  // Avoid transforming 'DstOp0' if it has other uses than the AND node.
2823  SDValue DstOp0 = Dst.getOperand(0);
2824  if (!DstOp0.hasOneUse())
2825  return false;
2826 
2827  // An example to illustrate the transformation
2828  // From:
2829  // lsr x8, x1, #1
2830  // and x8, x8, #0x3f80
2831  // bfxil x8, x1, #0, #7
2832  // To:
2833  // and x8, x23, #0x7f
2834  // ubfx x9, x23, #8, #7
2835  // orr x23, x8, x9, lsl #7
2836  //
2837  // The number of instructions remains the same, but ORR is faster than BFXIL
2838  // on many AArch64 processors (or as good as BFXIL if not faster). Besides,
2839  // the dependency chain is improved after the transformation.
2840  uint64_t SrlImm;
2841  if (isOpcWithIntImmediate(DstOp0.getNode(), ISD::SRL, SrlImm)) {
2842  uint64_t NumTrailingZeroInShiftedMask = countTrailingZeros(AndImm);
2843  if ((SrlImm + NumTrailingZeroInShiftedMask) < SizeInBits) {
2844  unsigned MaskWidth =
2845  countTrailingOnes(AndImm >> NumTrailingZeroInShiftedMask);
2846  unsigned UBFMOpc =
2847  (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
2848  SDNode *UBFMNode = CurDAG->getMachineNode(
2849  UBFMOpc, DL, VT, DstOp0.getOperand(0),
2850  CurDAG->getTargetConstant(SrlImm + NumTrailingZeroInShiftedMask, DL,
2851  VT),
2852  CurDAG->getTargetConstant(
2853  SrlImm + NumTrailingZeroInShiftedMask + MaskWidth - 1, DL, VT));
2854  ShiftedOperand = SDValue(UBFMNode, 0);
2855  EncodedShiftImm = AArch64_AM::getShifterImm(
2856  AArch64_AM::LSL, NumTrailingZeroInShiftedMask);
2857  return true;
2858  }
2859  }
2860  return false;
2861  }
2862 
2863  if (isOpcWithIntImmediate(Dst.getNode(), ISD::SHL, ShlImm)) {
2864  ShiftedOperand = Dst.getOperand(0);
2865  EncodedShiftImm = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm);
2866  return true;
2867  }
2868 
2869  uint64_t SrlImm;
2870  if (isOpcWithIntImmediate(Dst.getNode(), ISD::SRL, SrlImm)) {
2871  ShiftedOperand = Dst.getOperand(0);
2872  EncodedShiftImm = AArch64_AM::getShifterImm(AArch64_AM::LSR, SrlImm);
2873  return true;
2874  }
2875  return false;
2876 }
2877 
2878 // Given an 'ISD::OR' node that is going to be selected as BFM, analyze
2879 // the operands and select it to AArch64::ORR with shifted registers if
2880 // that's more efficient. Returns true iff selection to AArch64::ORR happens.
2881 static bool tryOrrWithShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
2882  SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
2883  const bool BiggerPattern) {
2884  EVT VT = N->getValueType(0);
2885  assert(N->getOpcode() == ISD::OR && "Expect N to be an OR node");
2886  assert(((N->getOperand(0) == OrOpd0 && N->getOperand(1) == OrOpd1) ||
2887  (N->getOperand(1) == OrOpd0 && N->getOperand(0) == OrOpd1)) &&
2888  "Expect OrOpd0 and OrOpd1 to be operands of ISD::OR");
2889  assert((VT == MVT::i32 || VT == MVT::i64) &&
2890  "Expect result type to be i32 or i64 since N is combinable to BFM");
2891  SDLoc DL(N);
2892 
2893  // Bail out if BFM simplifies away one node in BFM Dst.
2894  if (OrOpd1 != Dst)
2895  return false;
2896 
2897  const unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
2898  // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
2899  // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
2900  if (BiggerPattern) {
2901  uint64_t SrcAndImm;
2902  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::AND, SrcAndImm) &&
2903  isMask_64(SrcAndImm) && OrOpd0.getOperand(0) == Src) {
2904  // OrOpd0 = AND Src, #Mask
2905  // So BFM simplifies away one AND node from Src and doesn't simplify away
2906  // nodes from Dst. If ORR with left-shifted operand also simplifies away
2907  // one node (from Rd), ORR is better since it has higher throughput and
2908  // smaller latency than BFM on many AArch64 processors (and for the rest
2909  // ORR is at least as good as BFM).
2910  SDValue ShiftedOperand;
2911  uint64_t EncodedShiftImm;
2912  if (isWorthFoldingIntoOrrWithShift(Dst, CurDAG, ShiftedOperand,
2913  EncodedShiftImm)) {
2914  SDValue Ops[] = {OrOpd0, ShiftedOperand,
2915  CurDAG->getTargetConstant(EncodedShiftImm, DL, VT)};
2916  CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
2917  return true;
2918  }
2919  }
2920  return false;
2921  }
2922 
2923  assert((!BiggerPattern) && "BiggerPattern should be handled above");
2924 
2925  uint64_t ShlImm;
2926  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm)) {
2927  if (OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
2928  SDValue Ops[] = {
2929  Dst, Src,
2930  CurDAG->getTargetConstant(
2931  AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
2932  CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
2933  return true;
2934  }
2935 
2936  // Select the following pattern to left-shifted operand rather than BFI.
2937  // %val1 = op ..
2938  // %val2 = shl %val1, #imm
2939  // %res = or %val1, %val2
2940  //
2941  // If N is selected to be BFI, we know that
2942  // 1) OrOpd0 would be the operand from which extract bits (i.e., folded into
2943  // BFI) 2) OrOpd1 would be the destination operand (i.e., preserved)
2944  //
2945  // Instead of selecting N to BFI, fold OrOpd0 as a left shift directly.
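  // Editorial example (not part of the original source): for
  // %res = or %val1, (shl %val1, #4), this path emits
  // "orr Rd, Rval1, Rval1, lsl #4" rather than a BFI.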
2946  if (OrOpd0.getOperand(0) == OrOpd1) {
2947  SDValue Ops[] = {
2948  OrOpd1, OrOpd1,
2949  CurDAG->getTargetConstant(
2950  AArch64_AM::getShifterImm(AArch64_AM::LSL, ShlImm), DL, VT)};
2951  CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
2952  return true;
2953  }
2954  }
2955 
2956  uint64_t SrlImm;
2957  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SRL, SrlImm)) {
2958  // Select the following pattern to right-shifted operand rather than BFXIL.
2959  // %val1 = op ..
2960  // %val2 = lshr %val1, #imm
2961  // %res = or %val1, %val2
2962  //
2963  // If N is selected to be BFXIL, we know that
2964  // 1) OrOpd0 would be the operand from which extract bits (i.e., folded into
2965  // BFXIL) 2) OrOpd1 would be the destination operand (i.e., preserved)
2966  //
2967  // Instead of selecting N to BFXIL, fold OrOpd0 as a right shift directly.
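  // Editorial example (not part of the original source): for
  // %res = or %val1, (lshr %val1, #4), this path emits
  // "orr Rd, Rval1, Rval1, lsr #4" rather than a BFXIL.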
2968  if (OrOpd0.getOperand(0) == OrOpd1) {
2969  SDValue Ops[] = {
2970  OrOpd1, OrOpd1,
2971  CurDAG->getTargetConstant(
2972  AArch64_AM::getShifterImm(AArch64_AM::LSR, SrlImm), DL, VT)};
2973  CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
2974  return true;
2975  }
2976  }
2977 
2978  return false;
2979 }
2980 
2981 static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
2982  SelectionDAG *CurDAG) {
2983  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2984 
2985  EVT VT = N->getValueType(0);
2986  if (VT != MVT::i32 && VT != MVT::i64)
2987  return false;
2988 
2989  unsigned BitWidth = VT.getSizeInBits();
2990 
2991  // Because of simplify-demanded-bits in DAGCombine, involved masks may not
2992  // have the expected shape. Try to undo that.
2993 
2994  unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
2995  unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
2996 
2997  // Given a OR operation, check if we have the following pattern
2998  // ubfm c, b, imm, imm2 (or something that does the same jobs, see
2999  // isBitfieldExtractOp)
3000  // d = e & mask2 ; where mask2 is a binary sequence of 1..10..0 and
3001  // countTrailingZeros(mask2) == imm2 - imm + 1
3002  // f = d | c
3003  // if yes, replace the OR instruction with:
3004  // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
3005 
3006  // OR is commutative, check all combinations of operand order and values of
3007  // BiggerPattern, i.e.
3008  // Opd0, Opd1, BiggerPattern=false
3009  // Opd1, Opd0, BiggerPattern=false
3010  // Opd0, Opd1, BiggerPattern=true
3011  // Opd1, Opd0, BiggerPattern=true
3012  // Several of these combinations may match, so check with BiggerPattern=false
3013  // first since that will produce better results by matching more instructions
3014  // and/or inserting fewer extra instructions.
3015  for (int I = 0; I < 4; ++I) {
3016 
3017  SDValue Dst, Src;
3018  unsigned ImmR, ImmS;
3019  bool BiggerPattern = I / 2;
3020  SDValue OrOpd0Val = N->getOperand(I % 2);
3021  SDNode *OrOpd0 = OrOpd0Val.getNode();
3022  SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
3023  SDNode *OrOpd1 = OrOpd1Val.getNode();
3024 
3025  unsigned BFXOpc;
3026  int DstLSB, Width;
3027  if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
3028  NumberOfIgnoredLowBits, BiggerPattern)) {
3029  // Check that the returned opcode is compatible with the pattern,
3030  // i.e., same type and zero extended (U and not S)
3031  if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
3032  (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
3033  continue;
3034 
3035  // Compute the width of the bitfield insertion
3036  DstLSB = 0;
3037  Width = ImmS - ImmR + 1;
3038  // FIXME: This constraint is to catch bitfield insertion; we may
3039  // want to widen the pattern if we want to grab the general bitfield
3040  // move case.
3041  if (Width <= 0)
3042  continue;
3043 
3044  // If the mask on the insertee is correct, we have a BFXIL operation. We
3045  // can share the ImmR and ImmS values from the already-computed UBFM.
3046  } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
3047  BiggerPattern,
3048  Src, DstLSB, Width)) {
3049  ImmR = (BitWidth - DstLSB) % BitWidth;
3050  ImmS = Width - 1;
3051  } else
3052  continue;
3053 
3054  // Check the second part of the pattern
3055  EVT VT = OrOpd1Val.getValueType();
3056  assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
3057 
3058  // Compute the Known Zero for the candidate of the first operand.
3059  // This allows to catch more general case than just looking for
3060  // AND with imm. Indeed, simplify-demanded-bits may have removed
3061  // the AND instruction because it proves it was useless.
3062  KnownBits Known = CurDAG->computeKnownBits(OrOpd1Val);
3063 
3064  // Check if there is enough room for the second operand to appear
3065  // in the first one
3066  APInt BitsToBeInserted =
3067  APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width);
3068 
3069  if ((BitsToBeInserted & ~Known.Zero) != 0)
3070  continue;
3071 
3072  // Set the first operand
3073  uint64_t Imm;
3074  if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
3075  isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
3076  // In that case, we can eliminate the AND
3077  Dst = OrOpd1->getOperand(0);
3078  else
3079  // Maybe the AND has been removed by simplify-demanded-bits
3080  // or is useful because it discards more bits
3081  Dst = OrOpd1Val;
3082 
3083  // Before selecting ISD::OR node to AArch64::BFM, see if an AArch64::ORR
3084  // with shifted operand is more efficient.
3085  if (tryOrrWithShift(N, OrOpd0Val, OrOpd1Val, Src, Dst, CurDAG,
3086  BiggerPattern))
3087  return true;
3088 
3089  // both parts match
3090  SDLoc DL(N);
3091  SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
3092  CurDAG->getTargetConstant(ImmS, DL, VT)};
3093  unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
3094  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
3095  return true;
3096  }
3097 
3098  // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
3099  // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
3100  // mask (e.g., 0x000ffff0).
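// Illustrative example (editor's sketch, not part of the original source):
// for i32, "or (and X, 0xFFF0000F), (and Y, 0x000FFFF0)" has complementary
// masks and 0x000FFFF0 is a shifted mask, so it is selected as an LSR of Y
// by 4 (via UBFM) followed by "BFMWri X, (Y >> 4), 28, 15", which inserts
// bits [19:4] of Y into bits [19:4] of X.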
3101  uint64_t Mask0Imm, Mask1Imm;
3102  SDValue And0 = N->getOperand(0);
3103  SDValue And1 = N->getOperand(1);
3104  if (And0.hasOneUse() && And1.hasOneUse() &&
3105  isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
3106  isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
3107  APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
3108  (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
3109 
3110  // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
3111  // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
3112  // bits to be inserted.
3113  if (isShiftedMask(Mask0Imm, VT)) {
3114  std::swap(And0, And1);
3115  std::swap(Mask0Imm, Mask1Imm);
3116  }
3117 
3118  SDValue Src = And1->getOperand(0);
3119  SDValue Dst = And0->getOperand(0);
3120  unsigned LSB = countTrailingZeros(Mask1Imm);
3121  int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
3122 
3123  // The BFXIL inserts the low-order bits from a source register, so right
3124  // shift the needed bits into place.
3125  SDLoc DL(N);
3126  unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
3127  uint64_t LsrImm = LSB;
3128  if (Src->hasOneUse() &&
3129  isOpcWithIntImmediate(Src.getNode(), ISD::SRL, LsrImm) &&
3130  (LsrImm + LSB) < BitWidth) {
3131  Src = Src->getOperand(0);
3132  LsrImm += LSB;
3133  }
3134 
3135  SDNode *LSR = CurDAG->getMachineNode(
3136  ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LsrImm, DL, VT),
3137  CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
3138 
3139  // BFXIL is an alias of BFM, so translate to BFM operands.
3140  unsigned ImmR = (BitWidth - LSB) % BitWidth;
3141  unsigned ImmS = Width - 1;
3142 
3143  // Create the BFXIL instruction.
3144  SDValue Ops[] = {Dst, SDValue(LSR, 0),
3145  CurDAG->getTargetConstant(ImmR, DL, VT),
3146  CurDAG->getTargetConstant(ImmS, DL, VT)};
3147  unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
3148  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
3149  return true;
3150  }
3151 
3152  return false;
3153 }
3154 
3155 bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
3156  if (N->getOpcode() != ISD::OR)
3157  return false;
3158 
3159  APInt NUsefulBits;
3160  getUsefulBits(SDValue(N, 0), NUsefulBits);
3161 
3162  // If none of the bits are useful, just return UNDEF.
3163  if (!NUsefulBits) {
3164  CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
3165  return true;
3166  }
3167 
3168  if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
3169  return true;
3170 
3171  return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
3172 }
3173 
3174 /// tryBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
3175 /// equivalent of a left shift by a constant amount followed by an and masking
3176 /// out a contiguous set of bits.
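/// Illustrative example (editor's sketch, not part of the original source):
/// for i32, "and (shl x, 4), 0xFF0" positions an 8-bit field at bit 4 and is
/// selected as "UBFMWri x, 28, 7", i.e. the "ubfiz w, w, #4, #8" alias.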
3177 bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
3178  if (N->getOpcode() != ISD::AND)
3179  return false;
3180 
3181  EVT VT = N->getValueType(0);
3182  if (VT != MVT::i32 && VT != MVT::i64)
3183  return false;
3184 
3185  SDValue Op0;
3186  int DstLSB, Width;
3187  if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
3188  Op0, DstLSB, Width))
3189  return false;
3190 
3191  // ImmR is the rotate right amount.
3192  unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
3193  // ImmS is the most significant bit of the source to be moved.
3194  unsigned ImmS = Width - 1;
3195 
3196  SDLoc DL(N);
3197  SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
3198  CurDAG->getTargetConstant(ImmS, DL, VT)};
3199  unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
3200  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
3201  return true;
3202 }
3203 
3204 /// tryShiftAmountMod - Take advantage of built-in mod of shift amount in
3205 /// variable shift/rotate instructions.
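/// Illustrative example (editor's sketch, not part of the original source):
/// for an i64 shift, "srl x, (and amt, 63)" can drop the AND because LSRV
/// only reads the low 6 bits of the shift register, and "srl x, (add amt, 64)"
/// can shift by amt directly since adding a multiple of 64 is a no-op modulo
/// the register width.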
3206 bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3207  EVT VT = N->getValueType(0);
3208 
3209  unsigned Opc;
3210  switch (N->getOpcode()) {
3211  case ISD::ROTR:
3212  Opc = (VT == MVT::i32) ? AArch64::RORVWr : AArch64::RORVXr;
3213  break;
3214  case ISD::SHL:
3215  Opc = (VT == MVT::i32) ? AArch64::LSLVWr : AArch64::LSLVXr;
3216  break;
3217  case ISD::SRL:
3218  Opc = (VT == MVT::i32) ? AArch64::LSRVWr : AArch64::LSRVXr;
3219  break;
3220  case ISD::SRA:
3221  Opc = (VT == MVT::i32) ? AArch64::ASRVWr : AArch64::ASRVXr;
3222  break;
3223  default:
3224  return false;
3225  }
3226 
3227  uint64_t Size;
3228  uint64_t Bits;
3229  if (VT == MVT::i32) {
3230  Bits = 5;
3231  Size = 32;
3232  } else if (VT == MVT::i64) {
3233  Bits = 6;
3234  Size = 64;
3235  } else
3236  return false;
3237 
3238  SDValue ShiftAmt = N->getOperand(1);
3239  SDLoc DL(N);
3240  SDValue NewShiftAmt;
3241 
3242  // Skip over an extend of the shift amount.
3243  if (ShiftAmt->getOpcode() == ISD::ZERO_EXTEND ||
3244  ShiftAmt->getOpcode() == ISD::ANY_EXTEND)
3245  ShiftAmt = ShiftAmt->getOperand(0);
3246 
3247  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3248  SDValue Add0 = ShiftAmt->getOperand(0);
3249  SDValue Add1 = ShiftAmt->getOperand(1);
3250  uint64_t Add0Imm;
3251  uint64_t Add1Imm;
3252  if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0)) {
3253  // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3254  // to avoid the ADD/SUB.
3255  NewShiftAmt = Add0;
3256  } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3257  isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
3258  (Add0Imm % Size == 0)) {
3259  // If we are shifting by N-X where N == 0 mod Size, then just shift by -X
3260  // to generate a NEG instead of a SUB from a constant.
3261  unsigned NegOpc;
3262  unsigned ZeroReg;
3263  EVT SubVT = ShiftAmt->getValueType(0);
3264  if (SubVT == MVT::i32) {
3265  NegOpc = AArch64::SUBWrr;
3266  ZeroReg = AArch64::WZR;
3267  } else {
3268  assert(SubVT == MVT::i64);
3269  NegOpc = AArch64::SUBXrr;
3270  ZeroReg = AArch64::XZR;
3271  }
3272  SDValue Zero =
3273  CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
3274  MachineSDNode *Neg =
3275  CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
3276  NewShiftAmt = SDValue(Neg, 0);
3277  } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3278  isIntImmediate(Add0, Add0Imm) && (Add0Imm % Size == Size - 1)) {
3279  // If we are shifting by N-X where N == -1 mod Size, then just shift by ~X
3280  // to generate a NOT instead of a SUB from a constant.
3281  unsigned NotOpc;
3282  unsigned ZeroReg;
3283  EVT SubVT = ShiftAmt->getValueType(0);
3284  if (SubVT == MVT::i32) {
3285  NotOpc = AArch64::ORNWrr;
3286  ZeroReg = AArch64::WZR;
3287  } else {
3288  assert(SubVT == MVT::i64);
3289  NotOpc = AArch64::ORNXrr;
3290  ZeroReg = AArch64::XZR;
3291  }
3292  SDValue Zero =
3293  CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
3294  MachineSDNode *Not =
3295  CurDAG->getMachineNode(NotOpc, DL, SubVT, Zero, Add1);
3296  NewShiftAmt = SDValue(Not, 0);
3297  } else
3298  return false;
3299  } else {
3300  // If the shift amount is masked with an AND, check that the mask covers the
3301  // bits that are implicitly ANDed off by the above opcodes and if so, skip
3302  // the AND.
3303  uint64_t MaskImm;
3304  if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
3305  !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
3306  return false;
3307 
3308  if (countTrailingOnes(MaskImm) < Bits)
3309  return false;
3310 
3311  NewShiftAmt = ShiftAmt->getOperand(0);
3312  }
3313 
3314  // Narrow/widen the shift amount to match the size of the shift operation.
3315  if (VT == MVT::i32)
3316  NewShiftAmt = narrowIfNeeded(CurDAG, NewShiftAmt);
3317  else if (VT == MVT::i64 && NewShiftAmt->getValueType(0) == MVT::i32) {
3318  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, DL, MVT::i32);
3319  MachineSDNode *Ext = CurDAG->getMachineNode(
3320  AArch64::SUBREG_TO_REG, DL, VT,
3321  CurDAG->getTargetConstant(0, DL, MVT::i64), NewShiftAmt, SubReg);
3322  NewShiftAmt = SDValue(Ext, 0);
3323  }
3324 
3325  SDValue Ops[] = {N->getOperand(0), NewShiftAmt};
3326  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
3327  return true;
3328 }
3329 
3330 bool
3331 AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
3332  unsigned RegWidth) {
3333  APFloat FVal(0.0);
3334  if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
3335  FVal = CN->getValueAPF();
3336  else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
3337  // Some otherwise illegal constants are allowed in this case.
3338  if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
3339  !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
3340  return false;
3341 
3342  ConstantPoolSDNode *CN =
3343  dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
3344  FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
3345  } else
3346  return false;
3347 
3348  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
3349  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
3350  // x-register.
3351  //
3352  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
3353  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
3354  // integers.
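// Illustrative example (editor's sketch, not part of the original source):
// for (fp_to_sint (fmul x, 16.0)) targeting a w-register, 16.0 converts
// exactly to the integer 16 = 2^4, so FBits = 4 and the fixed-point form
// "fcvtzs w0, s0, #4" can be used.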
3355  bool IsExact;
3356 
3357  // fbits is between 1 and 64 in the worst-case, which means the fmul
3358  // could have 2^64 as an actual operand. Need 65 bits of precision.
3359  APSInt IntVal(65, true);
3360  FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
3361 
3362  // N.b. isPowerOf2 also checks for > 0.
3363  if (!IsExact || !IntVal.isPowerOf2()) return false;
3364  unsigned FBits = IntVal.logBase2();
3365 
3366  // Checks above should have guaranteed that we haven't lost information in
3367  // finding FBits, but it must still be in range.
3368  if (FBits == 0 || FBits > RegWidth) return false;
3369 
3370  FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
3371  return true;
3372 }
3373 
3374 // Inspects a register string of the form o0:op1:CRn:CRm:op2, gets the fields
3375 // of the string, obtains the integer values from them, and combines these
3376 // into a single value to be used in the MRS/MSR instruction.
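// Illustrative example (editor's sketch, using a hypothetical string): the
// register string "1:3:13:0:2" splits into Ops = {1, 3, 13, 0, 2} and
// combines to (1 << 14) | (3 << 11) | (13 << 7) | (0 << 3) | 2 = 0x5E82.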
3377 static int getIntOperandFromRegisterString(StringRef RegString) {
3378  SmallVector<StringRef, 5> Fields;
3379  RegString.split(Fields, ':');
3380 
3381  if (Fields.size() == 1)
3382  return -1;
3383 
3384  assert(Fields.size() == 5
3385  && "Invalid number of fields in read register string");
3386 
3387  SmallVector<int, 5> Ops;
3388  bool AllIntFields = true;
3389 
3390  for (StringRef Field : Fields) {
3391  unsigned IntField;
3392  AllIntFields &= !Field.getAsInteger(10, IntField);
3393  Ops.push_back(IntField);
3394  }
3395 
3396  assert(AllIntFields &&
3397  "Unexpected non-integer value in special register string.");
3398  (void)AllIntFields;
3399 
3400  // Need to combine the integer fields of the string into a single value
3401  // based on the bit encoding of the MRS/MSR instructions.
3402  return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
3403  (Ops[3] << 3) | (Ops[4]);
3404 }
3405 
3406 // Lower the read_register intrinsic to an MRS instruction node if the special
3407 // register string argument is either of the form detailed in the ACLE (the
3408 // form described in getIntOperandFromRegisterString) or is a named register
3409 // known by the MRS SysReg mapper.
3410 bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
3411  const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
3412  const auto *RegString = cast<MDString>(MD->getMD()->getOperand(0));
3413  SDLoc DL(N);
3414 
3415  int Reg = getIntOperandFromRegisterString(RegString->getString());
3416  if (Reg != -1) {
3417  ReplaceNode(N, CurDAG->getMachineNode(
3418  AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
3419  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
3420  N->getOperand(0)));
3421  return true;
3422  }
3423 
3424  // Use the sysreg mapper to map the remaining possible strings to the
3425  // value for the register to be used for the instruction operand.
3426  auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
3427  if (TheReg && TheReg->Readable &&
3428  TheReg->haveFeatures(Subtarget->getFeatureBits()))
3429  Reg = TheReg->Encoding;
3430  else
3431  Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
3432 
3433  if (Reg != -1) {
3434  ReplaceNode(N, CurDAG->getMachineNode(
3435  AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
3436  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
3437  N->getOperand(0)));
3438  return true;
3439  }
3440 
3441  if (RegString->getString() == "pc") {
3442  ReplaceNode(N, CurDAG->getMachineNode(
3443  AArch64::ADR, DL, N->getSimpleValueType(0), MVT::Other,
3444  CurDAG->getTargetConstant(0, DL, MVT::i32),
3445  N->getOperand(0)));
3446  return true;
3447  }
3448 
3449  return false;
3450 }
3451 
3452 // Lower the write_register intrinsic to an MSR instruction node if the special
3453 // register string argument is either of the form detailed in the ACLE (the
3454 // form described in getIntOperandFromRegisterString) or is a named register
3455 // known by the MSR SysReg mapper.
3456 bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
3457  const auto *MD = cast<MDNodeSDNode>(N->getOperand(1));
3458  const auto *RegString = cast<MDString>(MD->getMD()->getOperand(0));
3459  SDLoc DL(N);
3460 
3461  int Reg = getIntOperandFromRegisterString(RegString->getString());
3462  if (Reg != -1) {
3463  ReplaceNode(
3464  N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
3465  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
3466  N->getOperand(2), N->getOperand(0)));
3467  return true;
3468  }
3469 
3470  // Check if the register was one of those allowed as the pstatefield value in
3471  // the MSR (immediate) instruction. To accept the values allowed in the
3472  // pstatefield for the MSR (immediate) instruction, we also require that an
3473  // immediate value has been provided as an argument; we know that this is
3474  // the case because it has been ensured by semantic checking.
3475  auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
3476  if (PMapper) {
3477  assert (isa<ConstantSDNode>(N->getOperand(2))
3478  && "Expected a constant integer expression.");
3479  unsigned Reg = PMapper->Encoding;
3480  uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
3481  unsigned State;
3482  if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO || Reg == AArch64PState::SSBS) {
3483  assert(Immed < 2 && "Bad imm");
3484  State = AArch64::MSRpstateImm1;
3485  } else {
3486  assert(Immed < 16 && "Bad imm");
3487  State = AArch64::MSRpstateImm4;
3488  }
3489  ReplaceNode(N, CurDAG->getMachineNode(
3490  State, DL, MVT::Other,
3491  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
3492  CurDAG->getTargetConstant(Immed, DL, MVT::i16),
3493  N->getOperand(0)));
3494  return true;
3495  }
3496 
3497  // Use the sysreg mapper to attempt to map the remaining possible strings
3498  // to the value for the register to be used for the MSR (register)
3499  // instruction operand.
3500  auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
3501  if (TheReg && TheReg->Writeable &&
3502  TheReg->haveFeatures(Subtarget->getFeatureBits()))
3503  Reg = TheReg->Encoding;
3504  else
3505  Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
3506  if (Reg != -1) {
3507  ReplaceNode(N, CurDAG->getMachineNode(
3508  AArch64::MSR, DL, MVT::Other,
3509  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
3510  N->getOperand(2), N->getOperand(0)));
3511  return true;
3512  }
3513 
3514  return false;
3515 }
3516 
3517 /// We've got special pseudo-instructions for these compare-and-swap operations.
3518 bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
3519  unsigned Opcode;
3520  EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
3521 
3522  // Leave IR for LSE if subtarget supports it.
3523  if (Subtarget->hasLSE()) return false;
3524 
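// Editor's note (assumption, not from the original source): each CMP_SWAP_*
// pseudo selected below is expected to be expanded later into an exclusive
// load/store (LDAXR/STLXR) retry loop by the pseudo-expansion pass.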
3525  if (MemTy == MVT::i8)
3526  Opcode = AArch64::CMP_SWAP_8;
3527  else if (MemTy == MVT::i16)
3528  Opcode = AArch64::CMP_SWAP_16;
3529  else if (MemTy == MVT::i32)
3530  Opcode = AArch64::CMP_SWAP_32;
3531  else if (MemTy == MVT::i64)
3532  Opcode = AArch64::CMP_SWAP_64;
3533  else
3534  llvm_unreachable("Unknown AtomicCmpSwap type");
3535 
3536  MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
3537  SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
3538  N->getOperand(0)};
3539  SDNode *CmpSwap = CurDAG->getMachineNode(
3540  Opcode, SDLoc(N),
3541  CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
3542 
3543  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
3544  CurDAG->setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
3545 
3546  ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
3547  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
3548  CurDAG->RemoveDeadNode(N);
3549 
3550  return true;
3551 }
3552 
3553 bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm,
3554  SDValue &Shift) {
3555  if (!isa<ConstantSDNode>(N))
3556  return false;
3557 
3558  SDLoc DL(N);
3559  uint64_t Val = cast<ConstantSDNode>(N)
3560  ->getAPIntValue()
3561  .trunc(VT.getFixedSizeInBits())
3562  .getZExtValue();
3563 
3564  switch (VT.SimpleTy) {
3565  case MVT::i8:
3566  // All immediates are supported.
3567  Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
3568  Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32);
3569  return true;
3570  case MVT::i16:
3571  case MVT::i32:
3572  case MVT::i64:
3573  // Support 8bit unsigned immediates.
3574  if (Val <= 255) {
3575  Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
3576  Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32);
3577  return true;
3578  }
3579  // Support 16bit unsigned immediates that are a multiple of 256.
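// For example (editor's illustration): Val = 0x1200 is selected as
// Imm = 0x12 with Shift = 8 (an LSL #8 encoding).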
3580  if (Val <= 65280 && Val % 256 == 0) {
3581  Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
3582  Imm = CurDAG->getTargetConstant(Val >> 8, DL, MVT::i32);
3583  return true;
3584  }
3585  break;
3586  default:
3587  break;
3588  }
3589 
3590  return false;
3591 }
3592 
3593 bool AArch64DAGToDAGISel::SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm,
3594  SDValue &Shift) {
3595  if (!isa<ConstantSDNode>(N))
3596  return false;
3597 
3598  SDLoc DL(N);
3599  int64_t Val = cast<ConstantSDNode>(N)
3600  ->getAPIntValue()
3601  .trunc(VT.getFixedSizeInBits())
3602  .getSExtValue();
3603 
3604  switch (VT.SimpleTy) {
3605  case MVT::i8:
3606  // All immediates are supported.
3607  Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
3608  Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
3609  return true;
3610  case MVT::i16:
3611  case MVT::i32:
3612  case MVT::i64:
3613  // Support 8bit signed immediates.
3614  if (Val >= -128 && Val <= 127) {
3615  Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
3616  Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
3617  return true;
3618  }
3619  // Support 16bit signed immediates that are a multiple of 256.
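// For example (editor's illustration): Val = -4608 (0xFFFFEE00 as i32) is
// selected as Imm = 0xEE with Shift = 8 (an LSL #8 encoding).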
3620  if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) {
3621  Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
3622  Imm = CurDAG->getTargetConstant((Val >> 8) & 0xFF, DL, MVT::i32);
3623  return true;
3624  }
3625  break;
3626  default:
3627  break;
3628  }
3629 
3630  return false;
3631 }
3632 
3633 bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
3634  if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3635  int64_t ImmVal = CNode->getSExtValue();
3636  SDLoc DL(N);
3637  if (ImmVal >= -128 && ImmVal < 128) {
3638  Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
3639  return true;
3640  }
3641  }
3642  return false;
3643 }
3644 
3645 bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm) {
3646  if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3647  uint64_t ImmVal = CNode->getZExtValue();
3648 
3649  switch (VT.SimpleTy) {
3650  case MVT::i8:
3651  ImmVal &= 0xFF;
3652  break;
3653  case MVT::i16:
3654  ImmVal &= 0xFFFF;
3655  break;
3656  case MVT::i32:
3657  ImmVal &= 0xFFFFFFFF;
3658  break;
3659  case MVT::i64:
3660  break;
3661  default:
3662  llvm_unreachable("Unexpected type");
3663  }
3664 
3665  if (ImmVal < 256) {
3666  Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32);
3667  return true;
3668  }
3669  }
3670  return false;
3671 }
3672 
3673 bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm,
3674  bool Invert) {
3675  if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3676  uint64_t ImmVal = CNode->getZExtValue();
3677  SDLoc DL(N);
3678 
3679  if (Invert)
3680  ImmVal = ~ImmVal;
3681 
3682  // Shift mask depending on type size.
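// For example (editor's illustration): for MVT::i8 an immediate of 0xF0 is
// replicated to 0xF0F0F0F0F0F0F0F0 before being handed to
// processLogicalImmediate, which can encode it as a 64-bit logical immediate.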
3683  switch (VT.SimpleTy) {
3684  case MVT::i8:
3685  ImmVal &= 0xFF;
3686  ImmVal |= ImmVal << 8;
3687  ImmVal |= ImmVal << 16;
3688  ImmVal |= ImmVal << 32;
3689  break;
3690  case MVT::i16:
3691  ImmVal &= 0xFFFF;
3692  ImmVal |= ImmVal << 16;
3693  ImmVal |= ImmVal << 32;
3694  break;
3695  case MVT::i32:
3696  ImmVal &= 0xFFFFFFFF;
3697  ImmVal |= ImmVal << 32;
3698  break;
3699  case MVT::i64:
3700  break;
3701  default:
3702  llvm_unreachable("Unexpected type");
3703  }
3704 
3705  uint64_t encoding;
3706  if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) {
3707  Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64);
3708  return true;
3709  }
3710  }
3711  return false;
3712 }
3713 
3714 // SVE shift intrinsics allow shift amounts larger than the element's bitwidth.
3715 // Rather than attempt to normalise everything we can sometimes saturate the
3716 // shift amount during selection. This function also allows for consistent
3717 // isel patterns by ensuring the resulting "Imm" node is of the i32 type
3718 // required by the instructions.
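// Illustrative example (editor's sketch, not part of the original source):
// with Low = 1, High = 8 and AllowSaturation = true (a right shift on 8-bit
// elements), a requested shift amount of 27 is clamped to 8, whereas with
// AllowSaturation = false the same amount is rejected.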
3719 bool AArch64DAGToDAGISel::SelectSVEShiftImm(SDValue N, uint64_t Low,
3720  uint64_t High, bool AllowSaturation,
3721  SDValue &Imm) {
3722  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
3723  uint64_t ImmVal = CN->getZExtValue();
3724 
3725  // Reject shift amounts that are too small.
3726  if (ImmVal < Low)
3727  return false;
3728 
3729  // Reject or saturate shift amounts that are too big.
3730  if (ImmVal > High) {
3731  if (!AllowSaturation)
3732  return false;
3733  ImmVal = High;
3734  }
3735 
3736  Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32);
3737  return true;
3738  }
3739 
3740  return false;
3741 }
3742 
3743 bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
3744  // tagp(FrameIndex, IRGstack, tag_offset):
3745  // since the offset between FrameIndex and IRGstack is a compile-time
3746  // constant, this can be lowered to a single ADDG instruction.
3747  if (!(isa<FrameIndexSDNode>(N->getOperand(1)))) {
3748  return false;
3749  }
3750 
3751  SDValue IRG_SP = N->getOperand(2);
3752  if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
3753  cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
3754  Intrinsic::aarch64_irg_sp) {
3755  return false;
3756  }
3757 
3758  const TargetLowering *TLI = getTargetLowering();
3759  SDLoc DL(N);
3760  int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
3761  SDValue FiOp = CurDAG->getTargetFrameIndex(
3762  FI, TLI->getPointerTy(CurDAG->getDataLayout()));
3763  int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
3764 
3765  SDNode *Out = CurDAG->getMachineNode(
3766  AArch64::TAGPstack, DL, MVT::i64,
3767  {FiOp, CurDAG->getTargetConstant(0, DL, MVT::i64), N->getOperand(2),
3768  CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
3769  ReplaceNode(N, Out);
3770  return true;
3771 }
3772 
3773 void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
3774  assert(isa<ConstantSDNode>(N->getOperand(3)) &&
3775  "llvm.aarch64.tagp third argument must be an immediate");
3776  if (trySelectStackSlotTagP(N))
3777  return;
3778  // FIXME: the above applies in any case where the offset between Op1 and Op2
3779  // is a compile-time constant, not just for stack allocations.
3780 
3781  // General case for unrelated pointers in Op1 and Op2.
3782  SDLoc DL(N);
3783  int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
3784  SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64,
3785  {N->getOperand(1), N->getOperand(2)});
3786  SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64,
3787  {SDValue(N1, 0), N->getOperand(2)});
3788  SDNode *N3 = CurDAG->getMachineNode(
3789  AArch64::ADDG, DL, MVT::i64,
3790  {SDValue(N2, 0), CurDAG->getTargetConstant(0, DL, MVT::i64),
3791  CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
3792  ReplaceNode(N, N3);
3793 }
3794 
3795 // NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length
3796 // vector types larger than NEON don't have a matching SubRegIndex.
3797 static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
3798  assert(V.getValueType().isScalableVector() &&
3799  V.getValueType().getSizeInBits().getKnownMinValue() ==
3800  AArch64::SVEBitsPerBlock &&
3801  "Expected to extract from a packed scalable vector!");
3802  assert(VT.isFixedLengthVector() &&
3803  "Expected to extract a fixed length vector!");
3804 
3805  SDLoc DL(V);
3806  switch (VT.getSizeInBits()) {
3807  case 64: {
3808  auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
3809  return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
3810  }
3811  case 128: {
3812  auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
3813  return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
3814  }
3815  default: {
3816  auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
3817  return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
3818  }
3819  }
3820 }
3821 
3822 // NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length
3823 // vector types larger than NEON don't have a matching SubRegIndex.
3824 static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
3825  assert(VT.isScalableVector() &&
3826  VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock &&
3827  "Expected to insert into a packed scalable vector!");
3828  assert(V.getValueType().isFixedLengthVector() &&
3829  "Expected to insert a fixed length vector!");
3830 
3831  SDLoc DL(V);
3832  switch (V.getValueType().getSizeInBits()) {
3833  case 64: {
3834  auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
3835  auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
3836  return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
3837  SDValue(Container, 0), V, SubReg);
3838  }
3839  case 128: {
3840  auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
3841  auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
3842  return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
3843  SDValue(Container, 0), V, SubReg);
3844  }
3845  default: {
3846  auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
3847  return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
3848  }
3849  }
3850 }
3851 
3852 void AArch64DAGToDAGISel::Select(SDNode *Node) {
3853  // If we have a custom node, we already have selected!
3854  if (Node->isMachineOpcode()) {
3855  LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
3856  Node->setNodeId(-1);
3857  return;
3858  }
3859 
3860  // A few custom selection cases.
3861  EVT VT = Node->getValueType(0);
3862 
3863  switch (Node->getOpcode()) {
3864  default:
3865  break;
3866 
3867  case ISD::ATOMIC_CMP_SWAP:
3868  if (SelectCMP_SWAP(Node))
3869  return;
3870  break;
3871 
3872  case ISD::READ_REGISTER:
3873  if (tryReadRegister(Node))
3874  return;
3875  break;
3876 
3877  case ISD::WRITE_REGISTER:
3878  if (tryWriteRegister(Node))
3879  return;
3880  break;
3881 
3882  case ISD::ADD:
3883  if (tryMLAV64LaneV128(Node))
3884  return;
3885  break;
3886 
3887  case ISD::LOAD: {
3888  // Try to select as an indexed load. Fall through to normal processing
3889  // if we can't.
3890  if (tryIndexedLoad(Node))
3891  return;
3892  break;
3893  }
3894 
3895  case ISD::SRL:
3896  case ISD::AND:
3897  case ISD::SRA:
3898  case ISD::SIGN_EXTEND_INREG:
3899  if (tryBitfieldExtractOp(Node))
3900  return;
3901  if (tryBitfieldInsertInZeroOp(Node))
3902  return;
3903  [[fallthrough]];
3904  case ISD::ROTR:
3905  case ISD::SHL:
3906  if (tryShiftAmountMod(Node))
3907  return;
3908  break;
3909 
3910  case ISD::SIGN_EXTEND:
3911  if (tryBitfieldExtractOpFromSExt(Node))
3912  return;
3913  break;
3914 
3915  case ISD::FP_EXTEND:
3916  if (tryHighFPExt(Node))
3917  return;
3918  break;
3919 
3920  case ISD::OR:
3921  if (tryBitfieldInsertOp(Node))
3922  return;
3923  break;
3924 
3925  case ISD::EXTRACT_SUBVECTOR: {
3926  // Bail when not a "cast" like extract_subvector.
3927  if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0)
3928  break;
3929 
3930  // Bail when normal isel can do the job.
3931  EVT InVT = Node->getOperand(0).getValueType();
3932  if (VT.isScalableVector() || InVT.isFixedLengthVector())
3933  break;
3934 
3935  // NOTE: We can only get here when doing fixed length SVE code generation.
3936  // We do manual selection because the types involved are not linked to real
3937  // registers (despite being legal) and must be coerced into SVE registers.
3938  //
3939  // NOTE: If the above changes, be aware that selection will still not work
3940  // because the td definition of extract_vector does not support extracting
3941  // a fixed length vector from a scalable vector.
3942 
3943  ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0)));
3944  return;
3945  }
3946 
3947  case ISD::INSERT_SUBVECTOR: {
3948  // Bail when not a "cast" like insert_subvector.
3949  if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0)
3950  break;
3951  if (!Node->getOperand(0).isUndef())
3952  break;
3953 
3954  // Bail when normal isel should do the job.
3955  EVT InVT = Node->getOperand(1).getValueType();
3956  if (VT.isFixedLengthVector() || InVT.isScalableVector())
3957  break;
3958 
3959  // NOTE: We can only get here when doing fixed length SVE code generation.
3960  // We do manual selection because the types involved are not linked to real
3961  // registers (despite being legal) and must be coerced into SVE registers.
3962  //
3963  // NOTE: If the above changes, be aware that selection will still not work
3964  // because the td definition of insert_vector does not support inserting a
3965  // fixed length vector into a scalable vector.
3966 
3967  ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1)));
3968  return;
3969  }
3970 
3971  case ISD::Constant: {
3972  // Materialize zero constants as copies from WZR/XZR. This allows
3973  // the coalescer to propagate these into other instructions.
3974  ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
3975  if (ConstNode->isZero()) {
3976  if (VT == MVT::i32) {
3977  SDValue New = CurDAG->getCopyFromReg(
3978  CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
3979  ReplaceNode(Node, New.getNode());
3980  return;
3981  } else if (VT == MVT::i64) {
3982  SDValue New = CurDAG->getCopyFromReg(
3983  CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
3984  ReplaceNode(Node, New.getNode());
3985  return;
3986  }
3987  }
3988  break;
3989  }
3990 
3991  case ISD::FrameIndex: {
3992  // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
3993  int FI = cast<FrameIndexSDNode>(Node)->getIndex();
3994  unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
3995  const TargetLowering *TLI = getTargetLowering();
3996  SDValue TFI = CurDAG->getTargetFrameIndex(
3997  FI, TLI->getPointerTy(CurDAG->getDataLayout()));
3998  SDLoc DL(Node);
3999  SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
4000  CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
4001  CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
4002  return;
4003  }
4004  case ISD::INTRINSIC_W_CHAIN: {
4005  unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
4006  switch (IntNo) {
4007  default:
4008  break;
4009  case Intrinsic::aarch64_ldaxp:
4010  case Intrinsic::aarch64_ldxp: {
4011  unsigned Op =
4012  IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
4013  SDValue MemAddr = Node->getOperand(2);
4014  SDLoc DL(Node);
4015  SDValue Chain = Node->getOperand(0);
4016 
4017  SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
4018  MVT::Other, MemAddr, Chain);
4019 
4020  // Transfer memoperands.
4021  MachineMemOperand *MemOp =
4022  cast<MemIntrinsicSDNode>(Node)->getMemOperand();
4023  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
4024  ReplaceNode(Node, Ld);
4025  return;
4026  }
4027  case Intrinsic::aarch64_stlxp:
4028  case Intrinsic::aarch64_stxp: {
4029  unsigned Op =
4030  IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
4031  SDLoc DL(Node);
4032  SDValue Chain = Node->getOperand(0);
4033  SDValue ValLo = Node->getOperand(2);
4034  SDValue ValHi = Node->getOperand(3);
4035  SDValue MemAddr = Node->getOperand(4);
4036 
4037  // Place arguments in the right order.
4038  SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
4039 
4040  SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
4041  // Transfer memoperands.
4042  MachineMemOperand *MemOp =
4043  cast<MemIntrinsicSDNode>(Node)->getMemOperand();
4044  CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
4045 
4046  ReplaceNode(Node, St);
4047  return;
4048  }
4049  case Intrinsic::aarch64_neon_ld1x2:
4050  if (VT == MVT::v8i8) {
4051  SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
4052  return;
4053  } else if (VT == MVT::v16i8) {
4054  SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
4055  return;
4056  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4057  SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
4058  return;
4059  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4060  SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
4061  return;
4062  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4063  SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
4064  return;
4065  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4066  SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
4067  return;
4068  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4069  SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
4070  return;
4071  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4072  SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
4073  return;
4074  }
4075  break;
4076  case Intrinsic::aarch64_neon_ld1x3:
4077  if (VT == MVT::v8i8) {
4078  SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
4079  return;
4080  } else if (VT == MVT::v16i8) {
4081  SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
4082  return;
4083  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4084  SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
4085  return;
4086  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4087  SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
4088  return;
4089  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4090  SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
4091  return;
4092  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4093  SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
4094  return;
4095  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4096  SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
4097  return;
4098  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4099  SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
4100  return;
4101  }
4102  break;
4103  case Intrinsic::aarch64_neon_ld1x4:
4104  if (VT == MVT::v8i8) {
4105  SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
4106  return;
4107  } else if (VT == MVT::v16i8) {
4108  SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
4109  return;
4110  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4111  SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
4112  return;
4113  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4114  SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
4115  return;
4116  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4117  SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
4118  return;
4119  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4120  SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
4121  return;
4122  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4123  SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
4124  return;
4125  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4126  SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
4127  return;
4128  }
4129  break;
4130  case Intrinsic::aarch64_neon_ld2:
4131  if (VT == MVT::v8i8) {
4132  SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
4133  return;
4134  } else if (VT == MVT::v16i8) {
4135  SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
4136  return;
4137  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4138  SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
4139  return;
4140  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4141  SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
4142  return;
4143  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4144  SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
4145  return;
4146  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4147  SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
4148  return;
4149  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4150  SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
4151  return;
4152  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4153  SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
4154  return;
4155  }
4156  break;
4157  case Intrinsic::aarch64_neon_ld3:
4158  if (VT == MVT::v8i8) {
4159  SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
4160  return;
4161  } else if (VT == MVT::v16i8) {
4162  SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
4163  return;
4164  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4165  SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
4166  return;
4167  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4168  SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
4169  return;
4170  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4171  SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
4172  return;
4173  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4174  SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
4175  return;
4176  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4177  SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
4178  return;
4179  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4180  SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
4181  return;
4182  }
4183  break;
4184  case Intrinsic::aarch64_neon_ld4:
4185  if (VT == MVT::v8i8) {
4186  SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
4187  return;
4188  } else if (VT == MVT::v16i8) {
4189  SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
4190  return;
4191  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4192  SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
4193  return;
4194  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4195  SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
4196  return;
4197  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4198  SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
4199  return;
4200  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4201  SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
4202  return;
4203  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4204  SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
4205  return;
4206  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4207  SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
4208  return;
4209  }
4210  break;
4211  case Intrinsic::aarch64_neon_ld2r:
4212  if (VT == MVT::v8i8) {
4213  SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
4214  return;
4215  } else if (VT == MVT::v16i8) {
4216  SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
4217  return;
4218  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4219  SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
4220  return;
4221  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4222  SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
4223  return;
4224  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4225  SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
4226  return;
4227  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4228  SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
4229  return;
4230  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4231  SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
4232  return;
4233  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4234  SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
4235  return;
4236  }
4237  break;
4238  case Intrinsic::aarch64_neon_ld3r:
4239  if (VT == MVT::v8i8) {
4240  SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
4241  return;
4242  } else if (VT == MVT::v16i8) {
4243  SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
4244  return;
4245  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4246  SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
4247  return;
4248  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4249  SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
4250  return;
4251  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4252  SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
4253  return;
4254  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4255  SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
4256  return;
4257  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4258  SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
4259  return;
4260  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4261  SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
4262  return;
4263  }
4264  break;
4265  case Intrinsic::aarch64_neon_ld4r:
4266  if (VT == MVT::v8i8) {
4267  SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
4268  return;
4269  } else if (VT == MVT::v16i8) {
4270  SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
4271  return;
4272  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4273  SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
4274  return;
4275  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4276  SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
4277  return;
4278  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4279  SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
4280  return;
4281  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4282  SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
4283  return;
4284  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4285  SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
4286  return;
4287  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4288  SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
4289  return;
4290  }
4291  break;
4292  case Intrinsic::aarch64_neon_ld2lane:
4293  if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4294  SelectLoadLane(Node, 2, AArch64::LD2i8);
4295  return;
4296  } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4297  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4298  SelectLoadLane(Node, 2, AArch64::LD2i16);
4299  return;
4300  } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4301  VT == MVT::v2f32) {
4302  SelectLoadLane(Node, 2, AArch64::LD2i32);
4303  return;
4304  } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4305  VT == MVT::v1f64) {
4306  SelectLoadLane(Node, 2, AArch64::LD2i64);
4307  return;
4308  }
4309  break;
4310  case Intrinsic::aarch64_neon_ld3lane:
4311  if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4312  SelectLoadLane(Node, 3, AArch64::LD3i8);
4313  return;
4314  } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4315  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4316  SelectLoadLane(Node, 3, AArch64::LD3i16);
4317  return;
4318  } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4319  VT == MVT::v2f32) {
4320  SelectLoadLane(Node, 3, AArch64::LD3i32);
4321  return;
4322  } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4323  VT == MVT::v1f64) {
4324  SelectLoadLane(Node, 3, AArch64::LD3i64);
4325  return;
4326  }
4327  break;
4328  case Intrinsic::aarch64_neon_ld4lane:
4329  if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4330  SelectLoadLane(Node, 4, AArch64::LD4i8);
4331  return;
4332  } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4333  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4334  SelectLoadLane(Node, 4, AArch64::LD4i16);
4335  return;
4336  } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4337  VT == MVT::v2f32) {
4338  SelectLoadLane(Node, 4, AArch64::LD4i32);
4339  return;
4340  } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4341  VT == MVT::v1f64) {
4342  SelectLoadLane(Node, 4, AArch64::LD4i64);
4343  return;
4344  }
4345  break;
4346  case Intrinsic::aarch64_ld64b:
4347  SelectLoad(Node, 8, AArch64::LD64B, AArch64::x8sub_0);
4348  return;
4349  case Intrinsic::aarch64_sve_ld2_sret: {
4350  if (VT == MVT::nxv16i8) {
4351  SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B,
4352  true);
4353  return;
4354  } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4355  VT == MVT::nxv8bf16) {
4356  SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H,
4357  true);
4358  return;
4359  } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4360  SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W,
4361  true);
4362  return;
4363  } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4364  SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D,
4365  true);
4366  return;
4367  }
4368  break;
4369  }
4370  case Intrinsic::aarch64_sve_ld3_sret: {
4371  if (VT == MVT::nxv16i8) {
4372  SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B,
4373  true);
4374  return;
4375  } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4376  VT == MVT::nxv8bf16) {
4377  SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H,
4378  true);
4379  return;
4380  } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4381  SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W,
4382  true);
4383  return;
4384  } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4385  SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D,
4386  true);
4387  return;
4388  }
4389  break;
4390  }
4391  case Intrinsic::aarch64_sve_ld4_sret: {
4392  if (VT == MVT::nxv16i8) {
4393  SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B,
4394  true);
4395  return;
4396  } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4397  VT == MVT::nxv8bf16) {
4398  SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H,
4399  true);
4400  return;
4401  } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4402  SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W,
4403  true);
4404  return;
4405  } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4406  SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D,
4407  true);
4408  return;
4409  }
4410  break;
4411  }
4412  case Intrinsic::swift_async_context_addr: {
4413  SDLoc DL(Node);
4414  SDValue Chain = Node->getOperand(0);
4415  SDValue CopyFP = CurDAG->getCopyFromReg(Chain, DL, AArch64::FP, MVT::i64);
4416  SDValue Res = SDValue(
4417  CurDAG->getMachineNode(AArch64::SUBXri, DL, MVT::i64, CopyFP,
4418  CurDAG->getTargetConstant(8, DL, MVT::i32),
4419  CurDAG->getTargetConstant(0, DL, MVT::i32)),
4420  0);
4421  ReplaceUses(SDValue(Node, 0), Res);
4422  ReplaceUses(SDValue(Node, 1), CopyFP.getValue(1));
4423  CurDAG->RemoveDeadNode(Node);
4424 
4425  auto &MF = CurDAG->getMachineFunction();
4426  MF.getFrameInfo().setFrameAddressIsTaken(true);
4427  MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
4428  return;
4429  }
4430  }
4431  } break;
4432  case ISD::INTRINSIC_WO_CHAIN: {
4433  unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
4434  switch (IntNo) {
4435  default:
4436  break;
4437  case Intrinsic::aarch64_tagp:
4438  SelectTagP(Node);
4439  return;
4440  case Intrinsic::aarch64_neon_tbl2:
4441  SelectTable(Node, 2,
4442  VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
4443  false);
4444  return;
4445  case Intrinsic::aarch64_neon_tbl3:
4446  SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
4447  : AArch64::TBLv16i8Three,
4448  false);
4449  return;
4450  case Intrinsic::aarch64_neon_tbl4:
4451  SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
4452  : AArch64::TBLv16i8Four,
4453  false);
4454  return;
4455  case Intrinsic::aarch64_neon_tbx2:
4456  SelectTable(Node, 2,
4457  VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
4458  true);
4459  return;
4460  case Intrinsic::aarch64_neon_tbx3:
4461  SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
4462  : AArch64::TBXv16i8Three,
4463  true);
4464  return;
4465  case Intrinsic::aarch64_neon_tbx4:
4466  SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
4467  : AArch64::TBXv16i8Four,
4468  true);
4469  return;
4470  case Intrinsic::aarch64_neon_smull:
4471  case Intrinsic::aarch64_neon_umull:
4472  if (tryMULLV64LaneV128(IntNo, Node))
4473  return;
4474  break;
4475  }
4476  break;
4477  }
4478  case ISD::INTRINSIC_VOID: {
4479  unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
4480  if (Node->getNumOperands() >= 3)
4481  VT = Node->getOperand(2)->getValueType(0);
4482  switch (IntNo) {
4483  default:
4484  break;
4485  case Intrinsic::aarch64_neon_st1x2: {
4486  if (VT == MVT::v8i8) {
4487  SelectStore(Node, 2, AArch64::ST1Twov8b);
4488  return;
4489  } else if (VT == MVT::v16i8) {
4490  SelectStore(Node, 2, AArch64::ST1Twov16b);
4491  return;
4492  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4493  VT == MVT::v4bf16) {
4494  SelectStore(Node, 2, AArch64::ST1Twov4h);
4495  return;
4496  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4497  VT == MVT::v8bf16) {
4498  SelectStore(Node, 2, AArch64::ST1Twov8h);
4499  return;
4500  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4501  SelectStore(Node, 2, AArch64::ST1Twov2s);
4502  return;
4503  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4504  SelectStore(Node, 2, AArch64::ST1Twov4s);
4505  return;
4506  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4507  SelectStore(Node, 2, AArch64::ST1Twov2d);
4508  return;
4509  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4510  SelectStore(Node, 2, AArch64::ST1Twov1d);
4511  return;
4512  }
4513  break;
4514  }
4515  case Intrinsic::aarch64_neon_st1x3: {
4516  if (VT == MVT::v8i8) {
4517  SelectStore(Node, 3, AArch64::ST1Threev8b);
4518  return;
4519  } else if (VT == MVT::v16i8) {
4520  SelectStore(Node, 3, AArch64::ST1Threev16b);
4521  return;
4522  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4523  VT == MVT::v4bf16) {
4524  SelectStore(Node, 3, AArch64::ST1Threev4h);
4525  return;
4526  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4527  VT == MVT::v8bf16) {
4528  SelectStore(Node, 3, AArch64::ST1Threev8h);
4529  return;
4530  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4531  SelectStore(Node, 3, AArch64::ST1Threev2s);
4532  return;
4533  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4534  SelectStore(Node, 3, AArch64::ST1Threev4s);
4535  return;
4536  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4537  SelectStore(Node, 3, AArch64::ST1Threev2d);
4538  return;
4539  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4540  SelectStore(Node, 3, AArch64::ST1Threev1d);
4541  return;
4542  }
4543  break;
4544  }
4545  case Intrinsic::aarch64_neon_st1x4: {
4546  if (VT == MVT::v8i8) {
4547  SelectStore(Node, 4, AArch64::ST1Fourv8b);
4548  return;
4549  } else if (VT == MVT::v16i8) {
4550  SelectStore(Node, 4, AArch64::ST1Fourv16b);
4551  return;
4552  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4553  VT == MVT::v4bf16) {
4554  SelectStore(Node, 4, AArch64::ST1Fourv4h);
4555  return;
4556  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4557  VT == MVT::v8bf16) {
4558  SelectStore(Node, 4, AArch64::ST1Fourv8h);
4559  return;
4560  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4561  SelectStore(Node, 4, AArch64::ST1Fourv2s);
4562  return;
4563  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4564  SelectStore(Node, 4, AArch64::ST1Fourv4s);
4565  return;
4566  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4567  SelectStore(Node, 4, AArch64::ST1Fourv2d);
4568  return;
4569  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4570  SelectStore(Node, 4, AArch64::ST1Fourv1d);
4571  return;
4572  }
4573  break;
4574  }
4575  case Intrinsic::aarch64_neon_st2: {
4576  if (VT == MVT::v8i8) {
4577  SelectStore(Node, 2, AArch64::ST2Twov8b);
4578  return;
4579  } else if (VT == MVT::v16i8) {
4580  SelectStore(Node, 2, AArch64::ST2Twov16b);
4581  return;
4582  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4583  VT == MVT::v4bf16) {
4584  SelectStore(Node, 2, AArch64::ST2Twov4h);
4585  return;
4586  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4587  VT == MVT::v8bf16) {
4588  SelectStore(Node, 2, AArch64::ST2Twov8h);
4589  return;
4590  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4591  SelectStore(Node, 2, AArch64::ST2Twov2s);
4592  return;
4593  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4594  SelectStore(Node, 2, AArch64::ST2Twov4s);
4595  return;
4596  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4597  SelectStore(Node, 2, AArch64::ST2Twov2d);
4598  return;
4599  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4600  SelectStore(Node, 2, AArch64::ST1Twov1d);
4601  return;
4602  }
4603  break;
4604  }
4605  case Intrinsic::aarch64_neon_st3: {
4606  if (VT == MVT::v8i8) {
4607  SelectStore(Node, 3, AArch64::ST3Threev8b);
4608  return;
4609  } else if (VT == MVT::v16i8) {
4610  SelectStore(Node, 3, AArch64::ST3Threev16b);
4611  return;
4612  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4613  VT == MVT::v4bf16) {
4614  SelectStore(Node, 3, AArch64::ST3Threev4h);
4615  return;
4616  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4617  VT == MVT::v8bf16) {
4618  SelectStore(Node, 3, AArch64::ST3Threev8h);
4619  return;
4620  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4621  SelectStore(Node, 3, AArch64::ST3Threev2s);
4622  return;
4623  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4624  SelectStore(Node, 3, AArch64::ST3Threev4s);
4625  return;
4626  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4627  SelectStore(Node, 3, AArch64::ST3Threev2d);
4628  return;
4629  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4630  SelectStore(Node, 3, AArch64::ST1Threev1d);
4631  return;
4632  }
4633  break;
4634  }
4635  case Intrinsic::aarch64_neon_st4: {
4636  if (VT == MVT::v8i8) {
4637  SelectStore(Node, 4, AArch64::ST4Fourv8b);
4638  return;
4639  } else if (VT == MVT::v16i8) {
4640  SelectStore(Node, 4, AArch64::ST4Fourv16b);
4641  return;
4642  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4643  VT == MVT::v4bf16) {
4644  SelectStore(Node, 4, AArch64::ST4Fourv4h);
4645  return;
4646  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4647  VT == MVT::v8bf16) {
4648  SelectStore(Node, 4, AArch64::ST4Fourv8h);
4649  return;
4650  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4651  SelectStore(Node, 4, AArch64::ST4Fourv2s);
4652  return;
4653  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4654  SelectStore(Node, 4, AArch64::ST4Fourv4s);
4655  return;
4656  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4657  SelectStore(Node, 4, AArch64::ST4Fourv2d);
4658  return;
4659  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4660  SelectStore(Node, 4, AArch64::ST1Fourv1d);
4661  return;
4662  }
4663  break;
4664  }
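  // NEON single-lane structure stores (st2lane/st3lane/st4lane): the opcode
  // depends only on the element size, so the 64-bit and 128-bit vector forms of
  // each element type share the same ST<N>i<bits> instruction.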
4665  case Intrinsic::aarch64_neon_st2lane: {
4666  if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4667  SelectStoreLane(Node, 2, AArch64::ST2i8);
4668  return;
4669  } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4670  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4671  SelectStoreLane(Node, 2, AArch64::ST2i16);
4672  return;
4673  } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4674  VT == MVT::v2f32) {
4675  SelectStoreLane(Node, 2, AArch64::ST2i32);
4676  return;
4677  } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4678  VT == MVT::v1f64) {
4679  SelectStoreLane(Node, 2, AArch64::ST2i64);
4680  return;
4681  }
4682  break;
4683  }
4684  case Intrinsic::aarch64_neon_st3lane: {
4685  if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4686  SelectStoreLane(Node, 3, AArch64::ST3i8);
4687  return;
4688  } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4689  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4690  SelectStoreLane(Node, 3, AArch64::ST3i16);
4691  return;
4692  } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4693  VT == MVT::v2f32) {
4694  SelectStoreLane(Node, 3, AArch64::ST3i32);
4695  return;
4696  } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4697  VT == MVT::v1f64) {
4698  SelectStoreLane(Node, 3, AArch64::ST3i64);
4699  return;
4700  }
4701  break;
4702  }
4703  case Intrinsic::aarch64_neon_st4lane: {
4704  if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4705  SelectStoreLane(Node, 4, AArch64::ST4i8);
4706  return;
4707  } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4708  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4709  SelectStoreLane(Node, 4, AArch64::ST4i16);
4710  return;
4711  } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4712  VT == MVT::v2f32) {
4713  SelectStoreLane(Node, 4, AArch64::ST4i32);
4714  return;
4715  } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4716  VT == MVT::v1f64) {
4717  SelectStoreLane(Node, 4, AArch64::ST4i64);
4718  return;
4719  }
4720  break;
4721  }
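  // SVE predicated structure stores: SelectPredicatedStore takes the number of
  // vectors, the log2 element size (0 = B ... 3 = D) and both addressing-form
  // opcodes; the *_IMM variants are the scalar-plus-immediate forms, chosen when
  // the address fits that mode.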
4722  case Intrinsic::aarch64_sve_st2: {
4723  if (VT == MVT::nxv16i8) {
4724  SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM);
4725  return;
4726  } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4727  VT == MVT::nxv8bf16) {
4728  SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM);
4729  return;
4730  } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4731  SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM);
4732  return;
4733  } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4734  SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM);
4735  return;
4736  }
4737  break;
4738  }
4739  case Intrinsic::aarch64_sve_st3: {
4740  if (VT == MVT::nxv16i8) {
4741  SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM);
4742  return;
4743  } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4744  VT == MVT::nxv8bf16) {
4745  SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM);
4746  return;
4747  } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4748  SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM);
4749  return;
4750  } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4751  SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM);
4752  return;
4753  }
4754  break;
4755  }
4756  case Intrinsic::aarch64_sve_st4: {
4757  if (VT == MVT::nxv16i8) {
4758  SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM);
4759  return;
4760  } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4761  VT == MVT::nxv8bf16) {
4762  SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM);
4763  return;
4764  } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4765  SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM);
4766  return;
4767  } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4768  SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM);
4769  return;
4770  }
4771  break;
4772  }
4773  }
4774  break;
4775  }
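  // Post-incremented NEON structure loads. SelectPostLoad is passed the matching
  // *_POST opcode and the sub-register index used to extract the results:
  // dsub0 for 64-bit D-register tuples, qsub0 for 128-bit Q-register tuples.
  // As with the stores above, v1i64/v1f64 degenerate to the LD1 multi-register form.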
4776  case AArch64ISD::LD2post: {
4777  if (VT == MVT::v8i8) {
4778  SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
4779  return;
4780  } else if (VT == MVT::v16i8) {
4781  SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
4782  return;
4783  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4784  SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
4785  return;
4786  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4787  SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
4788  return;
4789  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4790  SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
4791  return;
4792  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4793  SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
4794  return;
4795  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4796  SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
4797  return;
4798  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4799  SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
4800  return;
4801  }
4802  break;
4803  }
4804  case AArch64ISD::LD3post: {
4805  if (VT == MVT::v8i8) {
4806  SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
4807  return;
4808  } else if (VT == MVT::v16i8) {
4809  SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
4810  return;
4811  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4812  SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
4813  return;
4814  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4815  SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
4816  return;
4817  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4818  SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
4819  return;
4820  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4821  SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
4822  return;
4823  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4824  SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
4825  return;
4826  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4827  SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
4828  return;
4829  }
4830  break;
4831  }
4832  case AArch64ISD::LD4post: {
4833  if (VT == MVT::v8i8) {
4834  SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
4835  return;
4836  } else if (VT == MVT::v16i8) {
4837  SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
4838  return;
4839  } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4840  SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
4841  return;
4842  } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4843  SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
4844  return;
4845  } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4846  SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
4847  return;
4848  } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4849  SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
4850  return;
4851  } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4852  SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
4853  return;
4854  } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4855  SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
4856  return;
4857  }
4858  break;
4859  }
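  // Post-incremented ld1 of a two-register tuple: the vectors are loaded
  // consecutively with no de-interleaving, otherwise selected like LD2post above.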
4860  case AArch64ISD::LD1x2post: {
4861  if (VT == MVT::v8i8) {
4862  SelectPostLoad(Node, 2, AArch64::LD1Twov8b_PO